Commit f1ae1e39 authored by Bob Van Landuyt's avatar Bob Van Landuyt

Move the circuitbreaker check out in a separate process

Moving the check out of the general requests, makes sure we don't have
any slowdown in the regular requests.

To keep the process performing this checks small, the check is still
performed inside a unicorn. But that is called from a process running
on the same server.

Because the checks are now done outside normal request, we can have a
simpler failure strategy:

The check is now performed in the background every
`circuitbreaker_check_interval`. Failures are logged in redis. The
failures are reset when the check succeeds. Per check we will try
`circuitbreaker_access_retries` times within
`circuitbreaker_storage_timeout` seconds.

When the number of failures exceeds
`circuitbreaker_failure_count_threshold`, we will block access to the
storage.

After `failure_reset_time` of no checks, we will clear the stored
failures. This could happen when the process that performs the checks
is not running.
parent 12d33b88
......@@ -5,7 +5,7 @@ class Admin::HealthCheckController < Admin::ApplicationController
end
def reset_storage_health
Gitlab::Git::Storage::CircuitBreaker.reset_all!
Gitlab::Git::Storage::FailureInfo.reset_all!
redirect_to admin_health_check_path,
notice: _('Git storage health information has been reset')
end
......
class HealthController < ActionController::Base
protect_from_forgery with: :exception
protect_from_forgery with: :exception, except: :storage_check
include RequiresWhitelistedMonitoringClient
CHECKS = [
......@@ -23,6 +23,15 @@ class HealthController < ActionController::Base
render_check_results(results)
end
def storage_check
results = Gitlab::Git::Storage::Checker.check_all
render json: {
check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval,
results: results
}
end
private
def render_check_results(results)
......
......@@ -124,17 +124,6 @@ module ApplicationSettingsHelper
_('The number of attempts GitLab will make to access a storage.')
end
def circuitbreaker_backoff_threshold_help_text
_("The number of failures after which GitLab will start temporarily "\
"disabling access to a storage shard on a host")
end
def circuitbreaker_failure_wait_time_help_text
_("When access to a storage fails. GitLab will prevent access to the "\
"storage for the time specified here. This allows the filesystem to "\
"recover. Repositories on failing shards are temporarly unavailable")
end
def circuitbreaker_failure_reset_time_help_text
_("The time in seconds GitLab will keep failure information. When no "\
"failures occur during this time, information about the mount is reset.")
......@@ -145,6 +134,11 @@ module ApplicationSettingsHelper
"timeout error will be raised.")
end
def circuitbreaker_check_interval_help_text
_("The time in seconds between storage checks. When a previous check did "\
"complete yet, GitLab will skip a check.")
end
def visible_attributes
[
:admin_notification_email,
......@@ -154,10 +148,9 @@ module ApplicationSettingsHelper
:akismet_enabled,
:auto_devops_enabled,
:circuitbreaker_access_retries,
:circuitbreaker_backoff_threshold,
:circuitbreaker_check_interval,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_failure_wait_time,
:circuitbreaker_storage_timeout,
:clientside_sentry_dsn,
:clientside_sentry_enabled,
......
......@@ -18,16 +18,12 @@ module StorageHealthHelper
current_failures = circuit_breaker.failure_count
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures,
number_of_seconds: circuit_breaker.failure_wait_time }
maximum_failures: maximum_failures }
if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
elsif circuit_breaker.backing_off?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params
else
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"allow access on the next attempt.") % translation_params
......
......@@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { greater_than_or_equal_to: 0 }
validates :circuitbreaker_backoff_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
validates :circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
:circuitbreaker_check_interval,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
......@@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 1 }
validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
if value.to_i >= record.circuitbreaker_failure_count_threshold
record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
"lower than the failure count threshold"))
end
end
validates :gitaly_timeout_default,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
......
......@@ -545,6 +545,12 @@
%fieldset
%legend Git Storage Circuitbreaker settings
.form-group
= f.label :circuitbreaker_check_interval, _('Check interval'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_check_interval, class: 'form-control'
.help-block
= circuitbreaker_check_interval_help_text
.form-group
= f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
.col-sm-10
......@@ -557,18 +563,6 @@
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
.help-block
= circuitbreaker_storage_timeout_help_text
.form-group
= f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
.help-block
= circuitbreaker_backoff_threshold_help_text
.form-group
= f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_wait_time, class: 'form-control'
.help-block
= circuitbreaker_failure_wait_time_help_text
.form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
.col-sm-10
......
#!/usr/bin/env ruby
require 'optparse'
require 'net/http'
require 'json'
require 'socket'
require 'logger'
require_relative '../lib/gitlab/storage_check'
Gitlab::StorageCheck::CLI.start!(ARGV)
---
title: Monitor NFS shards for circuitbreaker in a separate process
merge_request: 15426
author:
type: changed
......@@ -42,6 +42,7 @@ Rails.application.routes.draw do
scope path: '-' do
get 'liveness' => 'health#liveness'
get 'readiness' => 'health#readiness'
post 'storage_check' => 'health#storage_check'
resources :metrics, only: [:index]
mount Peek::Railtie => '/peek'
......
class AddCircuitbreakerCheckIntervalToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
disable_ddl_transaction!
def up
add_column_with_default :application_settings,
:circuitbreaker_check_interval,
:integer,
default: 1
end
def down
remove_column :application_settings,
:circuitbreaker_check_interval
end
end
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class UpdateCircuitbreakerDefaults < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
class ApplicationSetting < ActiveRecord::Base; end
def up
change_column_default :application_settings,
:circuitbreaker_failure_count_threshold,
3
change_column_default :application_settings,
:circuitbreaker_storage_timeout,
15
ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 3,
circuitbreaker_storage_timeout: 15)
end
def down
change_column_default :application_settings,
:circuitbreaker_failure_count_threshold,
160
change_column_default :application_settings,
:circuitbreaker_storage_timeout,
30
ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 160,
circuitbreaker_storage_timeout: 30)
end
end
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class RemoveOldCircuitbreakerConfig < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def up
remove_column :application_settings,
:circuitbreaker_backoff_threshold
remove_column :application_settings,
:circuitbreaker_failure_wait_time
end
def down
add_column :application_settings,
:circuitbreaker_backoff_threshold,
:integer,
default: 80
add_column :application_settings,
:circuitbreaker_failure_wait_time,
:integer,
default: 30
end
end
......@@ -135,12 +135,10 @@ ActiveRecord::Schema.define(version: 20171205190711) do
t.boolean "hashed_storage_enabled", default: false, null: false
t.boolean "project_export_enabled", default: true, null: false
t.boolean "auto_devops_enabled", default: false, null: false
t.integer "circuitbreaker_failure_count_threshold", default: 160
t.integer "circuitbreaker_failure_wait_time", default: 30
t.integer "circuitbreaker_failure_count_threshold", default: 3
t.integer "circuitbreaker_failure_reset_time", default: 1800
t.integer "circuitbreaker_storage_timeout", default: 30
t.integer "circuitbreaker_storage_timeout", default: 15
t.integer "circuitbreaker_access_retries", default: 3
t.integer "circuitbreaker_backoff_threshold", default: 80
t.boolean "throttle_unauthenticated_enabled", default: false, null: false
t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false
t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false
......@@ -150,6 +148,7 @@ ActiveRecord::Schema.define(version: 20171205190711) do
t.boolean "throttle_authenticated_web_enabled", default: false, null: false
t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false
t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false
t.integer "circuitbreaker_check_interval", default: 1, null: false
t.boolean "password_authentication_enabled_for_web"
t.boolean "password_authentication_enabled_for_git", default: true
t.integer "gitaly_timeout_default", default: 55, null: false
......
......@@ -70,10 +70,9 @@ PUT /application/settings
| `akismet_api_key` | string | no | API key for akismet spam protection |
| `akismet_enabled` | boolean | no | Enable or disable akismet spam protection |
| `circuitbreaker_access_retries | integer | no | The number of attempts GitLab will make to access a storage. |
| `circuitbreaker_backoff_threshold | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. |
| `circuitbreaker_check_interval` | integer | no | Number of seconds in between storage checks. |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. |
| `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt |
| `clientside_sentry_dsn` | string | no | Required if `clientside_sentry_dsn` is enabled |
| `clientside_sentry_enabled` | boolean | no | Enable Sentry error reporting for the client side |
......
......@@ -41,7 +41,7 @@ module API
detail 'This feature was introduced in GitLab 9.5'
end
delete do
Gitlab::Git::Storage::CircuitBreaker.reset_all!
Gitlab::Git::Storage::FailureInfo.reset_all!
end
end
end
......
module Gitlab
module Git
module Storage
class Checker
include CircuitBreakerSettings
attr_reader :storage_path, :storage, :hostname, :logger
def self.check_all(logger = Rails.logger)
threads = Gitlab.config.repositories.storages.keys.map do |storage_name|
Thread.new do
Thread.current[:result] = new(storage_name, logger).check_with_lease
end
end
threads.map do |thread|
thread.join
thread[:result]
end
end
def initialize(storage, logger = Rails.logger)
@storage = storage
config = Gitlab.config.repositories.storages[@storage]
@storage_path = config['path']
@logger = logger
@hostname = Gitlab::Environment.hostname
end
def check_with_lease
lease_key = "storage_check:#{cache_key}"
lease = Gitlab::ExclusiveLease.new(lease_key, timeout: storage_timeout)
result = { storage: storage, success: nil }
if uuid = lease.try_obtain
result[:success] = check
Gitlab::ExclusiveLease.cancel(lease_key, uuid)
else
logger.warn("#{hostname}: #{storage}: Skipping check, previous check still running")
end
result
end
def check
if Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout, access_retries)
track_storage_accessible
true
else
track_storage_inaccessible
logger.error("#{hostname}: #{storage}: Not accessible.")
false
end
end
private
def track_storage_inaccessible
first_failure = current_failure_info.first_failure || Time.now
last_failure = Time.now
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :first_failure, first_failure.to_i)
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hincrby(cache_key, :failure_count, 1)
redis.expire(cache_key, failure_reset_time)
maintain_known_keys(redis)
end
end
end
def track_storage_accessible
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :first_failure, nil)
redis.hset(cache_key, :last_failure, nil)
redis.hset(cache_key, :failure_count, 0)
maintain_known_keys(redis)
end
end
end
def maintain_known_keys(redis)
expire_time = Time.now.to_i + failure_reset_time
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
end
def current_failure_info
FailureInfo.load(cache_key)
end
end
end
end
end
......@@ -4,22 +4,11 @@ module Gitlab
class CircuitBreaker
include CircuitBreakerSettings
FailureInfo = Struct.new(:last_failure, :failure_count)
attr_reader :storage,
:hostname,
:storage_path
delegate :last_failure, :failure_count, to: :failure_info
def self.reset_all!
Gitlab::Git::Storage.redis.with do |redis|
all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
redis.del(*all_storage_keys) unless all_storage_keys.empty?
end
:hostname
RequestStore.delete(:circuitbreaker_cache)
end
delegate :last_failure, :failure_count, :no_failures?,
to: :failure_info
def self.for_storage(storage)
cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
......@@ -46,9 +35,6 @@ module Gitlab
def initialize(storage, hostname)
@storage = storage
@hostname = hostname
config = Gitlab.config.repositories.storages[@storage]
@storage_path = config['path']
end
def perform
......@@ -65,15 +51,6 @@ module Gitlab
failure_count > failure_count_threshold
end
def backing_off?
return false if no_failures?
recent_failure = last_failure > failure_wait_time.seconds.ago
too_many_failures = failure_count > backoff_threshold
recent_failure && too_many_failures
end
private
# The circuitbreaker can be enabled for the entire fleet using a Feature
......@@ -86,88 +63,13 @@ module Gitlab
end
def failure_info
@failure_info ||= get_failure_info
end
# Memoizing the `storage_available` call means we only do it once per
# request when the storage is available.
#
# When the storage appears not available, and the memoized value is `false`
# we might want to try again.
def storage_available?
return @storage_available if @storage_available
if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
.storage_available?(storage_path, storage_timeout, access_retries)
track_storage_accessible
else
track_storage_inaccessible
end
@storage_available
@failure_info ||= FailureInfo.load(cache_key)
end
def check_storage_accessible!
if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
end
if backing_off?
raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
end
unless storage_available?
raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time)
end
end
def no_failures?
last_failure.blank? && failure_count == 0
end
def track_storage_inaccessible
@failure_info = FailureInfo.new(Time.now, failure_count + 1)
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hincrby(cache_key, :failure_count, 1)
redis.expire(cache_key, failure_reset_time)
maintain_known_keys(redis)
end
end
end
def track_storage_accessible
@failure_info = FailureInfo.new(nil, 0)
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :last_failure, nil)
redis.hset(cache_key, :failure_count, 0)
maintain_known_keys(redis)
end
end
end
def maintain_known_keys(redis)
expire_time = Time.now.to_i + failure_reset_time
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
end
def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count)
end
last_failure = Time.at(last_failure.to_i) if last_failure.present?
FailureInfo.new(last_failure, failure_count.to_i)
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
end
end
......
......@@ -6,10 +6,6 @@ module Gitlab
application_settings.circuitbreaker_failure_count_threshold
end
def failure_wait_time
application_settings.circuitbreaker_failure_wait_time
end
def failure_reset_time
application_settings.circuitbreaker_failure_reset_time
end
......@@ -22,8 +18,12 @@ module Gitlab
application_settings.circuitbreaker_access_retries
end
def backoff_threshold
application_settings.circuitbreaker_backoff_threshold
def check_interval
application_settings.circuitbreaker_check_interval
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
private
......
module Gitlab
module Git
module Storage
class FailureInfo
attr_accessor :first_failure, :last_failure, :failure_count
def self.reset_all!
Gitlab::Git::Storage.redis.with do |redis|
all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
redis.del(*all_storage_keys) unless all_storage_keys.empty?
end
RequestStore.delete(:circuitbreaker_cache)
end
def self.load(cache_key)
first_failure, last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :first_failure, :last_failure, :failure_count)
end
last_failure = Time.at(last_failure.to_i) if last_failure.present?
first_failure = Time.at(first_failure.to_i) if first_failure.present?
new(first_failure, last_failure, failure_count.to_i)
end
def initialize(first_failure, last_failure, failure_count)
@first_failure = first_failure
@last_failure = last_failure
@failure_count = failure_count
end
def no_failures?
first_failure.blank? && last_failure.blank? && failure_count == 0
end
end
end
end
end
......@@ -11,6 +11,9 @@ module Gitlab
# These will always have nil values