
Commit 7fabc892 authored by Douwe Maan

Merge branch 'bvl-circuitbreaker-process' into 'master'

Check NFS mounts in a separate process

Closes #39847

See merge request gitlab-org/gitlab-ce!15426
parents 12d33b88 f1ae1e39
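
Taken together, these changes move the NFS mount check out of the request cycle: a new `POST /-/storage_check` endpoint runs `Gitlab::Git::Storage::Checker.check_all` and returns the configured `circuitbreaker_check_interval`, while the new `bin/storage_check` script polls that endpoint from a separate process. A minimal sketch of such a polling loop, assuming a locally reachable endpoint; the URL and loop structure are illustrative, not the script's actual implementation:

    # Illustrative polling loop; Gitlab::StorageCheck::CLI is not shown in this diff.
    require 'net/http'
    require 'json'
    require 'uri'

    uri = URI('http://localhost:3000/-/storage_check') # assumed local endpoint

    loop do
      response = Net::HTTP.post(uri, '') # the endpoint takes no request body
      payload = JSON.parse(response.body)

      payload['results'].each do |result|
        puts "#{result['storage']}: success=#{result['success'].inspect}"
      end

      # The server tells the checker how long to wait between checks.
      sleep payload.fetch('check_interval', 1)
    end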
@@ -5,7 +5,7 @@ def show
   end

   def reset_storage_health
-    Gitlab::Git::Storage::CircuitBreaker.reset_all!
+    Gitlab::Git::Storage::FailureInfo.reset_all!
     redirect_to admin_health_check_path,
       notice: _('Git storage health information has been reset')
   end
......
 class HealthController < ActionController::Base
-  protect_from_forgery with: :exception
+  protect_from_forgery with: :exception, except: :storage_check

   include RequiresWhitelistedMonitoringClient

   CHECKS = [
@@ -23,6 +23,15 @@ def liveness
     render_check_results(results)
   end

+  def storage_check
+    results = Gitlab::Git::Storage::Checker.check_all
+
+    render json: {
+      check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval,
+      results: results
+    }
+  end
+
   private

   def render_check_results(results)
......
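
Based on the result hashes built by `Checker#check_with_lease` (see the new `Gitlab::Git::Storage::Checker` below), the endpoint's JSON response would look roughly like this; the storage names are invented, and `success` is `null` when a check was skipped because another process held the lease:

    {
      "check_interval": 1,
      "results": [
        { "storage": "default", "success": true },
        { "storage": "nfs-shard-2", "success": false }
      ]
    }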
@@ -124,17 +124,6 @@ def circuitbreaker_access_retries_help_text
     _('The number of attempts GitLab will make to access a storage.')
   end

-  def circuitbreaker_backoff_threshold_help_text
-    _("The number of failures after which GitLab will start temporarily "\
-      "disabling access to a storage shard on a host")
-  end
-
-  def circuitbreaker_failure_wait_time_help_text
-    _("When access to a storage fails, GitLab will prevent access to the "\
-      "storage for the time specified here. This allows the filesystem to "\
-      "recover. Repositories on failing shards are temporarily unavailable")
-  end
-
   def circuitbreaker_failure_reset_time_help_text
     _("The time in seconds GitLab will keep failure information. When no "\
       "failures occur during this time, information about the mount is reset.")
@@ -145,6 +134,11 @@ def circuitbreaker_storage_timeout_help_text
       "timeout error will be raised.")
   end

+  def circuitbreaker_check_interval_help_text
+    _("The time in seconds between storage checks. When a previous check did "\
+      "not complete yet, GitLab will skip a check.")
+  end
+
   def visible_attributes
     [
       :admin_notification_email,
@@ -154,10 +148,9 @@ def visible_attributes
       :akismet_enabled,
       :auto_devops_enabled,
       :circuitbreaker_access_retries,
-      :circuitbreaker_backoff_threshold,
+      :circuitbreaker_check_interval,
       :circuitbreaker_failure_count_threshold,
       :circuitbreaker_failure_reset_time,
-      :circuitbreaker_failure_wait_time,
      :circuitbreaker_storage_timeout,
      :clientside_sentry_dsn,
      :clientside_sentry_enabled,
......
@@ -18,16 +18,12 @@ def message_for_circuit_breaker(circuit_breaker)
     current_failures = circuit_breaker.failure_count

     translation_params = { number_of_failures: current_failures,
-                           maximum_failures: maximum_failures,
-                           number_of_seconds: circuit_breaker.failure_wait_time }
+                           maximum_failures: maximum_failures }

     if circuit_breaker.circuit_broken?
       s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
         "retry automatically. Reset storage information when the problem is "\
         "resolved.") % translation_params
-    elsif circuit_breaker.backing_off?
-      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
-        "block access for %{number_of_seconds} seconds.") % translation_params
     else
       _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
         "allow access on the next attempt.") % translation_params
......
@@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base
             presence: true,
             numericality: { greater_than_or_equal_to: 0 }

-  validates :circuitbreaker_backoff_threshold,
-            :circuitbreaker_failure_count_threshold,
-            :circuitbreaker_failure_wait_time,
+  validates :circuitbreaker_failure_count_threshold,
             :circuitbreaker_failure_reset_time,
             :circuitbreaker_storage_timeout,
+            :circuitbreaker_check_interval,
             presence: true,
             numericality: { only_integer: true, greater_than_or_equal_to: 0 }
@@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base
             presence: true,
             numericality: { only_integer: true, greater_than_or_equal_to: 1 }

-  validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
-    if value.to_i >= record.circuitbreaker_failure_count_threshold
-      record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
-        "lower than the failure count threshold"))
-    end
-  end
-
   validates :gitaly_timeout_default,
             presence: true,
             numericality: { only_integer: true, greater_than_or_equal_to: 0 }
......
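
A console-style sketch of what the tightened validations accept, assuming an otherwise valid record (`ApplicationSetting.current` is the usual accessor):

    setting = ApplicationSetting.current

    setting.circuitbreaker_check_interval = 1
    setting.valid? # => true: an integer >= 0

    setting.circuitbreaker_check_interval = -1
    setting.valid? # => false: fails greater_than_or_equal_to: 0

    setting.circuitbreaker_check_interval = nil
    setting.valid? # => false: presence is required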
@@ -545,6 +545,12 @@
   %fieldset
     %legend Git Storage Circuitbreaker settings
+    .form-group
+      = f.label :circuitbreaker_check_interval, _('Check interval'), class: 'control-label col-sm-2'
+      .col-sm-10
+        = f.number_field :circuitbreaker_check_interval, class: 'form-control'
+        .help-block
+          = circuitbreaker_check_interval_help_text
     .form-group
       = f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
       .col-sm-10
@@ -557,18 +563,6 @@
         = f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
         .help-block
           = circuitbreaker_storage_timeout_help_text
-    .form-group
-      = f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
-      .col-sm-10
-        = f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
-        .help-block
-          = circuitbreaker_backoff_threshold_help_text
-    .form-group
-      = f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
-      .col-sm-10
-        = f.number_field :circuitbreaker_failure_wait_time, class: 'form-control'
-        .help-block
-          = circuitbreaker_failure_wait_time_help_text
     .form-group
       = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
       .col-sm-10
......
#!/usr/bin/env ruby

require 'optparse'
require 'net/http'
require 'json'
require 'socket'
require 'logger'

require_relative '../lib/gitlab/storage_check'

Gitlab::StorageCheck::CLI.start!(ARGV)
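
The script delegates everything to `Gitlab::StorageCheck::CLI.start!(ARGV)`, which is not part of this excerpt, so the accepted arguments are not visible here. A hypothetical invocation, with the target URL assumed rather than taken from the actual CLI:

    # Hypothetical usage; the real argument handling lives in lib/gitlab/storage_check.
    bin/storage_check http://localhost:3000/-/storage_check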
---
title: Monitor NFS shards for circuitbreaker in a separate process
merge_request: 15426
author:
type: changed
@@ -42,6 +42,7 @@
   scope path: '-' do
     get 'liveness' => 'health#liveness'
     get 'readiness' => 'health#readiness'
+    post 'storage_check' => 'health#storage_check'
     resources :metrics, only: [:index]
     mount Peek::Railtie => '/peek'
......
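
Note that `HealthController` includes `RequiresWhitelistedMonitoringClient`, so the new route only answers requests from whitelisted monitoring addresses. In a standard installation that whitelist lives in `gitlab.yml`, shown here with its common default; your installation may differ:

    monitoring:
      # Only addresses in this list may call the health/storage_check endpoints.
      ip_whitelist:
        - 127.0.0.0/8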
class AddCircuitbreakerCheckIntervalToApplicationSettings < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  # Set this constant to true if this migration requires downtime.
  DOWNTIME = false

  disable_ddl_transaction!

  def up
    add_column_with_default :application_settings,
                            :circuitbreaker_check_interval,
                            :integer,
                            default: 1
  end

  def down
    remove_column :application_settings,
                  :circuitbreaker_check_interval
  end
end
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class UpdateCircuitbreakerDefaults < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  class ApplicationSetting < ActiveRecord::Base; end

  def up
    change_column_default :application_settings,
                          :circuitbreaker_failure_count_threshold,
                          3
    change_column_default :application_settings,
                          :circuitbreaker_storage_timeout,
                          15

    ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 3,
                                  circuitbreaker_storage_timeout: 15)
  end

  def down
    change_column_default :application_settings,
                          :circuitbreaker_failure_count_threshold,
                          160
    change_column_default :application_settings,
                          :circuitbreaker_storage_timeout,
                          30

    ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 160,
                                  circuitbreaker_storage_timeout: 30)
  end
end
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class RemoveOldCircuitbreakerConfig < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  def up
    remove_column :application_settings,
                  :circuitbreaker_backoff_threshold
    remove_column :application_settings,
                  :circuitbreaker_failure_wait_time
  end

  def down
    add_column :application_settings,
               :circuitbreaker_backoff_threshold,
               :integer,
               default: 80
    add_column :application_settings,
               :circuitbreaker_failure_wait_time,
               :integer,
               default: 30
  end
end
@@ -135,12 +135,10 @@
     t.boolean "hashed_storage_enabled", default: false, null: false
     t.boolean "project_export_enabled", default: true, null: false
     t.boolean "auto_devops_enabled", default: false, null: false
-    t.integer "circuitbreaker_failure_count_threshold", default: 160
-    t.integer "circuitbreaker_failure_wait_time", default: 30
+    t.integer "circuitbreaker_failure_count_threshold", default: 3
     t.integer "circuitbreaker_failure_reset_time", default: 1800
-    t.integer "circuitbreaker_storage_timeout", default: 30
+    t.integer "circuitbreaker_storage_timeout", default: 15
     t.integer "circuitbreaker_access_retries", default: 3
-    t.integer "circuitbreaker_backoff_threshold", default: 80
     t.boolean "throttle_unauthenticated_enabled", default: false, null: false
     t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false
     t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false
@@ -150,6 +148,7 @@
     t.boolean "throttle_authenticated_web_enabled", default: false, null: false
     t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false
     t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false
+    t.integer "circuitbreaker_check_interval", default: 1, null: false
     t.boolean "password_authentication_enabled_for_web"
     t.boolean "password_authentication_enabled_for_git", default: true
     t.integer "gitaly_timeout_default", default: 55, null: false
......
@@ -70,10 +70,9 @@ PUT /application/settings
 | `akismet_api_key` | string | no | API key for akismet spam protection |
 | `akismet_enabled` | boolean | no | Enable or disable akismet spam protection |
 | `circuitbreaker_access_retries` | integer | no | The number of attempts GitLab will make to access a storage. |
-| `circuitbreaker_backoff_threshold` | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. |
+| `circuitbreaker_check_interval` | integer | no | Number of seconds between storage checks. |
 | `circuitbreaker_failure_count_threshold` | integer | no | The number of failures after which GitLab will completely prevent access to the storage. |
 | `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
-| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. |
 | `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt |
 | `clientside_sentry_dsn` | string | no | Required if `clientside_sentry_enabled` is enabled |
 | `clientside_sentry_enabled` | boolean | no | Enable Sentry error reporting for the client side |
......
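
The renamed setting is updated through the same `PUT /application/settings` endpoint this table documents; a typical call, using the docs' placeholder host and token:

    curl --request PUT --header "PRIVATE-TOKEN: <your_access_token>" \
      "https://gitlab.example.com/api/v4/application/settings?circuitbreaker_check_interval=1"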
@@ -41,7 +41,7 @@ def storage_health
         detail 'This feature was introduced in GitLab 9.5'
       end
       delete do
-        Gitlab::Git::Storage::CircuitBreaker.reset_all!
+        Gitlab::Git::Storage::FailureInfo.reset_all!
       end
     end
   end
......
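
This `delete` block backs the repository storage health API, so failure information can be reset without the admin UI. Assuming the endpoint path from the storage health API docs of this GitLab era:

    curl --request DELETE --header "PRIVATE-TOKEN: <your_access_token>" \
      "https://gitlab.example.com/api/v4/circuit_breakers/repository_storage"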
module Gitlab
  module Git
    module Storage
      class Checker
        include CircuitBreakerSettings

        attr_reader :storage_path, :storage, :hostname, :logger

        def self.check_all(logger = Rails.logger)
          threads = Gitlab.config.repositories.storages.keys.map do |storage_name|
            Thread.new do
              Thread.current[:result] = new(storage_name, logger).check_with_lease
            end
          end

          threads.map do |thread|
            thread.join
            thread[:result]
          end
        end

        def initialize(storage, logger = Rails.logger)
          @storage = storage
          config = Gitlab.config.repositories.storages[@storage]
          @storage_path = config['path']
          @logger = logger
          @hostname = Gitlab::Environment.hostname
        end

        def check_with_lease
          lease_key = "storage_check:#{cache_key}"
          lease = Gitlab::ExclusiveLease.new(lease_key, timeout: storage_timeout)
          result = { storage: storage, success: nil }

          if uuid = lease.try_obtain
            result[:success] = check
            Gitlab::ExclusiveLease.cancel(lease_key, uuid)
          else
            logger.warn("#{hostname}: #{storage}: Skipping check, previous check still running")
          end

          result
        end

        def check
          if Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout, access_retries)
            track_storage_accessible
            true
          else
            track_storage_inaccessible
            logger.error("#{hostname}: #{storage}: Not accessible.")
            false
          end
        end

        private

        def track_storage_inaccessible
          first_failure = current_failure_info.first_failure || Time.now
          last_failure = Time.now

          Gitlab::Git::Storage.redis.with do |redis|
            redis.pipelined do
              redis.hset(cache_key, :first_failure, first_failure.to_i)
              redis.hset(cache_key, :last_failure, last_failure.to_i)
              redis.hincrby(cache_key, :failure_count, 1)
              redis.expire(cache_key, failure_reset_time)
              maintain_known_keys(redis)
            end
          end
        end

        def track_storage_accessible
          Gitlab::Git::Storage.redis.with do |redis|
            redis.pipelined do
              redis.hset(cache_key, :first_failure, nil)
              redis.hset(cache_key, :last_failure, nil)
              redis.hset(cache_key, :failure_count, 0)
              maintain_known_keys(redis)
            end
          end
        end

        def maintain_known_keys(redis)
          expire_time = Time.now.to_i + failure_reset_time
          redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
          redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
        end

        def current_failure_info
          FailureInfo.load(cache_key)
        end
      end
    end
  end
end
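
A short sketch of how the new class is driven, mirroring the call in `HealthController#storage_check`; storage names come from `gitlab.yml`, and each result has the shape built in `check_with_lease`:

    # Runs one threaded check per configured storage and collects the results.
    results = Gitlab::Git::Storage::Checker.check_all

    results.each do |result|
      # result[:success] is true/false, or nil when the check was skipped
      # because another process still held the exclusive lease.
      puts "#{result[:storage]}: #{result[:success].inspect}"
    end

Running the forked storage checks in a dedicated process, instead of inside request workers, is the point of this MR: a hanging NFS mount then stalls the checker rather than the workers serving user traffic.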
@@ -4,22 +4,11 @@ module Storage
     class CircuitBreaker
       include CircuitBreakerSettings

-      FailureInfo = Struct.new(:last_failure, :failure_count)
-
       attr_reader :storage,
-                  :hostname,
-                  :storage_path
+                  :hostname

-      delegate :last_failure, :failure_count, to: :failure_info
-
-      def self.reset_all!
-        Gitlab::Git::Storage.redis.with do |redis|
-          all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
-          redis.del(*all_storage_keys) unless all_storage_keys.empty?
-        end
-
-        RequestStore.delete(:circuitbreaker_cache)
-      end
+      delegate :last_failure, :failure_count, :no_failures?,
+               to: :failure_info

       def self.for_storage(storage)
         cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
@@ -46,9 +35,6 @@ def self.build(storage, hostname = Gitlab::Environment.hostname)
       def initialize(storage, hostname)
         @storage = storage
         @hostname = hostname
-
-        config = Gitlab.config.repositories.storages[@storage]
-        @storage_path = config['path']
       end

       def perform
@@ -65,15 +51,6 @@ def circuit_broken?
         failure_count > failure_count_threshold
       end

-      def backing_off?
-        return false if no_failures?
-
-        recent_failure = last_failure > failure_wait_time.seconds.ago
-        too_many_failures = failure_count > backoff_threshold
-
-        recent_failure && too_many_failures
-      end
-
       private

       # The circuitbreaker can be enabled for the entire fleet using a Feature
@@ -86,88 +63,13 @@ def enabled?
       end

       def failure_info
-        @failure_info ||= get_failure_info
+        @failure_info ||= FailureInfo.load(cache_key)
       end

-      # Memoizing the `storage_available` call means we only do it once per
-      # request when the storage is available.
-      #
-      # When the storage appears not available, and the memoized value is `false`
-      # we might want to try again.
-      def storage_available?
-        return @storage_available if @storage_available
-
-        if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
-                                  .storage_available?(storage_path, storage_timeout, access_retries)
-          track_storage_accessible
-        else
-          track_storage_inaccessible
-        end
-
-        @storage_available
-      end