
Commit 5bf65c93 authored by Sean McGivern

Merge branch 'bvl-nfs-circuitbreaker' into 'master'

Circuitbreaker for storage paths

Closes #32207, #33117, gitlab-com/infrastructure#1946, and gitlab-com/infrastructure#1775

See merge request !11449
parents fa716921 fda83a61
class Admin::HealthCheckController < Admin::ApplicationController
  def show
    @errors = HealthCheck::Utils.process_checks(['standard'])
    @failing_storage_statuses = Gitlab::Git::Storage::Health.for_failing_storages
  end

  def reset_storage_health
    Gitlab::Git::Storage::CircuitBreaker.reset_all!
    redirect_to admin_health_check_path,
                notice: _('Git storage health information has been reset')
  end
end
......@@ -52,6 +52,15 @@ class ApplicationController < ActionController::Base
head :forbidden, retry_after: Gitlab::Auth::UniqueIpsLimiter.config.unique_ips_limit_time_window
end
rescue_from Gitlab::Git::Storage::Inaccessible, GRPC::Unavailable, Gitlab::Git::CommandError do |exception|
Raven.capture_exception(exception) if sentry_enabled?
log_exception(exception)
headers['Retry-After'] = exception.retry_after if exception.respond_to?(:retry_after)
render_503
end
def redirect_back_or_default(default: root_path, options: {})
redirect_to request.referer.present? ? :back : default, options
end
......@@ -152,6 +161,19 @@ class ApplicationController < ActionController::Base
head :unprocessable_entity
end
def render_503
respond_to do |format|
format.html do
render(
file: Rails.root.join("public", "503"),
layout: false,
status: :service_unavailable
)
end
format.any { head :service_unavailable }
end
end
def no_cache_headers
response.headers["Cache-Control"] = "no-cache, no-store, max-age=0, must-revalidate"
response.headers["Pragma"] = "no-cache"
......
module StorageHealthHelper
  def failing_storage_health_message(storage_health)
    storage_name = content_tag(:strong, h(storage_health.storage_name))
    host_names = h(storage_health.failing_on_hosts.to_sentence)
    translation_params = { storage_name: storage_name,
                           host_names: host_names,
                           failed_attempts: storage_health.total_failures }

    translation = n_('%{storage_name}: failed storage access attempt on host:',
                     '%{storage_name}: %{failed_attempts} failed storage access attempts:',
                     storage_health.total_failures) % translation_params

    translation.html_safe
  end

  def message_for_circuit_breaker(circuit_breaker)
    maximum_failures = circuit_breaker.failure_count_threshold
    current_failures = circuit_breaker.failure_count
    permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures

    translation_params = { number_of_failures: current_failures,
                           maximum_failures: maximum_failures,
                           number_of_seconds: circuit_breaker.failure_wait_time }

    if permanently_broken
      s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
         "retry automatically. Reset storage information when the problem is "\
         "resolved.") % translation_params
    elsif circuit_breaker.circuit_broken?
      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
        "block access for %{number_of_seconds} seconds.") % translation_params
    else
      _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
        "allow access on the next attempt.") % translation_params
    end
  end
end
......@@ -133,12 +133,13 @@ class Repository
ref ||= root_ref
args = %W(
#{Gitlab.config.git.bin_path} log #{ref} --pretty=%H --skip #{offset}
log #{ref} --pretty=%H --skip #{offset}
--max-count #{limit} --grep=#{query} --regexp-ignore-case
)
args = args.concat(%W(-- #{path})) if path.present?
git_log_results = Gitlab::Popen.popen(args, path_to_repo).first.lines
git_log_results = run_git(args).first.lines
git_log_results.map { |c| commit(c.chomp) }.compact
end
......@@ -687,8 +688,8 @@ class Repository
end
def refs_contains_sha(ref_type, sha)
args = %W(#{Gitlab.config.git.bin_path} #{ref_type} --contains #{sha})
names = Gitlab::Popen.popen(args, path_to_repo).first
args = %W(#{ref_type} --contains #{sha})
names = run_git(args).first
if names.respond_to?(:split)
names = names.split("\n").map(&:strip)
......@@ -966,15 +967,17 @@ class Repository
return [] if empty_repo? || query.blank?
offset = 2
args = %W(#{Gitlab.config.git.bin_path} grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref})
Gitlab::Popen.popen(args, path_to_repo).first.scrub.split(/^--$/)
args = %W(grep -i -I -n --before-context #{offset} --after-context #{offset} -E -e #{Regexp.escape(query)} #{ref || root_ref})
run_git(args).first.scrub.split(/^--$/)
end
def search_files_by_name(query, ref)
return [] if empty_repo? || query.blank?
args = %W(#{Gitlab.config.git.bin_path} ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)})
Gitlab::Popen.popen(args, path_to_repo).first.lines.map(&:strip)
args = %W(ls-tree --full-tree -r #{ref || root_ref} --name-status | #{Regexp.escape(query)})
run_git(args).first.lines.map(&:strip)
end
def with_repo_branch_commit(start_repository, start_branch_name)
......@@ -1019,8 +1022,8 @@ class Repository
end
def fetch_ref(source_path, source_ref, target_ref)
args = %W(#{Gitlab.config.git.bin_path} fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref})
Gitlab::Popen.popen(args, path_to_repo)
args = %W(fetch --no-tags -f #{source_path} #{source_ref}:#{target_ref})
run_git(args)
end
def create_ref(ref, ref_path)
......@@ -1101,6 +1104,12 @@ class Repository
private
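# All git shell-outs go through the circuit breaker, so repeated failures on
# this storage stop further access attempts.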
def run_git(args)
circuit_breaker.perform do
Gitlab::Popen.popen([Gitlab.config.git.bin_path, *args], path_to_repo)
end
end
def blob_data_at(sha, path)
blob = blob_at(sha, path)
return unless blob
......@@ -1110,7 +1119,9 @@ class Repository
end
def refs_directory_exists?
File.exist?(File.join(path_to_repo, 'refs'))
circuit_breaker.perform do
File.exist?(File.join(path_to_repo, 'refs'))
end
end
def cache
......@@ -1158,8 +1169,8 @@ class Repository
end
def last_commit_id_for_path_by_shelling_out(sha, path)
args = %W(#{Gitlab.config.git.bin_path} rev-list --max-count=1 #{sha} -- #{path})
Gitlab::Popen.popen(args, path_to_repo).first.strip
args = %W(rev-list --max-count=1 #{sha} -- #{path})
run_git(args).first.strip
end
def repository_storage_path
......@@ -1169,4 +1180,8 @@ class Repository
def initialize_raw_repository
Gitlab::Git::Repository.new(project.repository_storage, disk_path + '.git')
end
def circuit_breaker
@circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(project.repository_storage)
end
end
- if failing_storages.any?
  = _('There are problems accessing Git storage: ')
  %ul
    - failing_storages.each do |storage_health|
      %li
        = failing_storage_health_message(storage_health)
        %ul
          - storage_health.failing_circuit_breakers.each do |circuit_breaker|
            %li
              #{circuit_breaker.hostname}: #{message_for_circuit_breaker(circuit_breaker)}
  = _("Access to failing storages has been temporarily disabled to allow the mount to recover. Reset storage information after the issue has been resolved to allow access again.")
  .prepend-top-10
    = button_to _("Reset git storage health information"), reset_storage_health_admin_health_check_path,
      method: :post, class: 'btn btn-default'
- @no_container = true
- page_title "Health Check"
- page_title _('Health Check')
- no_errors = @errors.blank? && @failing_storage_statuses.blank?
= render 'admin/monitoring/head'
%div{ class: container_class }
%h3.page-title
Health Check
%h3.page-title= page_title
.bs-callout.clearfix
.pull-left
%p
Access token is
#{ s_('HealthCheck|Access token is') }
%code#health-check-token= current_application_settings.health_check_access_token
.prepend-top-10
= button_to "Reset health check access token", reset_health_check_token_admin_application_settings_path,
= button_to _("Reset health check access token"), reset_health_check_token_admin_application_settings_path,
method: :put, class: 'btn btn-default',
data: { confirm: 'Are you sure you want to reset the health check token?' }
data: { confirm: _('Are you sure you want to reset the health check token?') }
%p.light
Health information can be retrieved from the following endpoints. More information is available
= link_to 'here', help_page_path('user/admin_area/monitoring/health_check')
#{ _('Health information can be retrieved from the following endpoints. More information is available') }
= link_to s_('More information is available|here'), help_page_path('user/admin_area/monitoring/health_check')
%ul
%li
%code= readiness_url(token: current_application_settings.health_check_access_token)
......@@ -29,14 +29,15 @@
.panel.panel-default
.panel-heading
Current Status:
- if @errors.blank?
- if no_errors
= icon('circle', class: 'cgreen')
Healthy
#{ s_('HealthCheck|Healthy') }
- else
= icon('warning', class: 'cred')
Unhealthy
#{ s_('HealthCheck|Unhealthy') }
.panel-body
- if @errors.blank?
No Health Problems Detected
- if no_errors
#{ s_('HealthCheck|No Health Problems Detected') }
- else
= @errors
= render partial: 'failing_storages', object: @failing_storage_statuses
---
title: Block access to failing repository storage
merge_request: 11449
author:
......@@ -506,6 +506,11 @@ production: &base
path: /home/git/repositories/
gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port)
# gitaly_token: 'special token' # Optional: override global gitaly.token for this storage.
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after an access failure before allowing access again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
## Backup settings
backup:
......@@ -638,6 +643,10 @@ test:
default:
path: tmp/tests/repositories/
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
broken:
path: tmp/tests/non-existent-repositories
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
gitaly:
enabled: true
token: secret
......
......@@ -433,6 +433,17 @@ end
Settings.repositories.storages.values.each do |storage|
# Expand relative paths
storage['path'] = Settings.absolute(storage['path'])
# Set failure defaults
storage['failure_count_threshold'] ||= 10
storage['failure_wait_time'] ||= 30
storage['failure_reset_time'] ||= 1800
storage['storage_timeout'] ||= 5
# Turn strings into numbers
storage['failure_count_threshold'] = storage['failure_count_threshold'].to_i
storage['failure_wait_time'] = storage['failure_wait_time'].to_i
storage['failure_reset_time'] = storage['failure_reset_time'].to_i
# We might want to have a timeout shorter than 1 second.
storage['storage_timeout'] = storage['storage_timeout'].to_f
end
#
......
......@@ -7,6 +7,13 @@ def find_parent_path(name, path)
Gitlab.config.repositories.storages.detect do |n, rs|
name != n && Pathname.new(rs['path']).realpath == parent
end
rescue Errno::EIO, Errno::ENOENT => e
warning = "WARNING: couldn't verify #{path} (#{name}). "\
"If this is an external storage, it might be offline."
message = "#{warning}\n#{e.message}"
Rails.logger.error("#{message}\n\t" + e.backtrace.join("\n\t"))
nil
end
def storage_validation_error(message)
......@@ -29,6 +36,15 @@ def validate_storages_config
if !repository_storage.is_a?(Hash) || repository_storage['path'].nil?
storage_validation_error("#{name} is not a valid storage, because it has no `path` key. Refer to gitlab.yml.example for an updated example")
end
%w(failure_count_threshold failure_wait_time failure_reset_time storage_timeout).each do |setting|
# Falling back to the defaults is fine!
next if repository_storage[setting].nil?
unless repository_storage[setting].to_f > 0
storage_validation_error("#{setting}, for storage `#{name}` needs to be greater than 0")
end
end
end
end
......
......@@ -67,7 +67,9 @@ namespace :admin do
end
resource :logs, only: [:show]
resource :health_check, controller: 'health_check', only: [:show]
resource :health_check, controller: 'health_check', only: [:show] do
post :reset_storage_health
end
resource :background_jobs, controller: 'background_jobs', only: [:show]
resource :system_info, controller: 'system_info', only: [:show]
resources :requests_profiles, only: [:index, :show], param: :name, constraints: { name: /.+\.html/ }
......
......@@ -60,7 +60,7 @@ respectively.
path: /mnt/cephfs/repositories
```
1. [Restart GitLab] for the changes to take effect.
1. [Restart GitLab][restart-gitlab] for the changes to take effect.
>**Note:**
The [`gitlab_shell: repos_path` entry][repospath] in `gitlab.yml` will be
......@@ -97,9 +97,80 @@ be stored via the **Application Settings** in the Admin area.
Beginning with GitLab 8.13.4, multiple paths can be chosen. New projects will be
randomly placed on one of the selected paths.
## Handling failing repository storage
> [Introduced][ce-11449] in GitLab 9.5.
When GitLab detects that access to the repository storage fails repeatedly, it can
gracefully prevent attempts to access the storage. This might be useful when
the repositories are stored somewhere on the network.
The configuration could look as follows:
**For Omnibus installations**
1. Edit `/etc/gitlab/gitlab.rb`:
```ruby
git_data_dirs({
"default" => {
"path" => "/mnt/nfs-01/git-data",
"failure_count_threshold" => 10,
"failure_wait_time" => 30,
"failure_reset_time" => 1800,
"storage_timeout" => 5
}
})
```
1. Save the file and [reconfigure GitLab][reconfigure-gitlab] for the changes to take effect.
---
**For installations from source**
1. Edit `config/gitlab.yml`:
```yaml
repositories:
storages: # You must have at least a `default` storage path.
default:
path: /home/git/repositories/
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after last access failure before trying again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
```
1. Save the file and [restart GitLab][restart-gitlab] for the changes to take effect.
**`failure_count_threshold`:** The number of failures after which GitLab will
completely prevent access to the storage. The failure count can be reset in
the admin interface (`https://gitlab.example.com/admin/health_check`) or through
the [API](../api/repository_storage_health.md) to allow access to the storage again.

**`failure_wait_time`:** When access to a storage fails, GitLab will prevent
access to the storage for the time specified here. This allows the filesystem to
recover without GitLab continuing to hit it with new access attempts.

**`failure_reset_time`:** The time in seconds GitLab will keep failure
information. When no failures occur during this time, the failure information for
the mount is reset.

**`storage_timeout`:** The time in seconds GitLab will wait when trying to access
a storage. After this time a timeout error is raised.
When storage failures occur, this will be visible in the admin interface like this:
![failing storage](img/failing_storage.png)
To allow access to all storages, click the `Reset git storage health information` button.
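
If the admin interface is not reachable, the same information can be inspected and
reset from a Rails console. This is a minimal sketch based on the classes introduced
in this merge request (`Gitlab::Git::Storage::Health` and
`Gitlab::Git::Storage::CircuitBreaker`); it does the same thing as the reset button
and the Circuitbreaker API:

```ruby
# List the storages that currently have recorded failures.
Gitlab::Git::Storage::Health.for_failing_storages.each do |health|
  puts "#{health.storage_name}: #{health.total_failures} failures on #{health.failing_on_hosts.join(', ')}"
end

# Clear all failure information, allowing access to every storage again.
# This is what the admin button and the DELETE API endpoint call internally.
Gitlab::Git::Storage::CircuitBreaker.reset_all!
```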
[ce-4578]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/4578
[restart gitlab]: restart_gitlab.md#installations-from-source
[reconfigure gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure
[restart-gitlab]: restart_gitlab.md#installations-from-source
[reconfigure-gitlab]: restart_gitlab.md#omnibus-gitlab-reconfigure
[backups]: ../raketasks/backup_restore.md
[raketask]: https://gitlab.com/gitlab-org/gitlab-ce/blob/033e5423a2594e08a7ebcd2379bd2331f4c39032/lib/backup/repository.rb#L54-56
[repospath]: https://gitlab.com/gitlab-org/gitlab-ce/blob/8-9-stable/config/gitlab.yml.example#L457
[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
# Circuitbreaker API
> [Introduced][ce-11449] in GitLab 9.5.
The Circuitbreaker API is only accessible to administrators. All requests by
guests will respond with `401 Unauthorized`, and all requests by normal users
will respond with `403 Forbidden`.
## Repository Storages
### Get all storage information
Returns the health information of all currently configured storages.
```
GET /circuit_breakers/repository_storage
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
```json
[
{
"storage_name": "default",
"failing_on_hosts": [],
"total_failures": 0
},
{
"storage_name": "broken",
"failing_on_hosts": [
"web01", "worker01"
],
"total_failures": 1
}
]
```
### Get failing storages
This returns a list of all currently failing storages.
```
GET /circuit_breakers/repository_storage/failing
```
```bash
curl --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage/failing
```
```json
[
{
"storage_name":"broken",
"failing_on_hosts":["web01", "worker01"],
"total_failures":2
}
]
```
## Reset failing storage information
Use this to remove all failing storage information and allow access to the storage again.
```
DELETE /circuit_breakers/repository_storage
```
```bash
curl --request DELETE --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/circuit_breakers/repository_storage
```
[ce-11449]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/11449
......@@ -95,6 +95,7 @@ module API
mount ::API::Boards
mount ::API::Branches
mount ::API::BroadcastMessages
mount ::API::CircuitBreakers
mount ::API::Commits
mount ::API::CommitStatuses
mount ::API::DeployKeys
......
module API
  class CircuitBreakers < Grape::API
    before { authenticated_as_admin! }

    resource :circuit_breakers do
      params do
        requires :type,
                 type: String,
                 desc: "The type of circuitbreaker",
                 values: ['repository_storage']
      end
      resource ':type' do
        namespace '', requirements: { type: 'repository_storage' } do
          helpers do
            def failing_storage_health
              @failing_storage_health ||= Gitlab::Git::Storage::Health.for_failing_storages
            end

            def storage_health
              @storage_health ||= Gitlab::Git::Storage::Health.for_all_storages
            end
          end

          desc 'Get all git storages' do
            detail 'This feature was introduced in GitLab 9.5'
            success Entities::RepositoryStorageHealth
          end
          get do
            present storage_health, with: Entities::RepositoryStorageHealth
          end

          desc 'Get all failing git storages' do
            detail 'This feature was introduced in GitLab 9.5'
            success Entities::RepositoryStorageHealth
          end
          get 'failing' do
            present failing_storage_health, with: Entities::RepositoryStorageHealth
          end

          desc 'Reset all storage failures and open circuitbreaker' do
            detail 'This feature was introduced in GitLab 9.5'
          end
          delete do
            Gitlab::Git::Storage::CircuitBreaker.reset_all!
          end
        end
      end
    end
  end
end
......@@ -951,5 +951,11 @@ module API
expose :ip_address
expose :submitted, as: :akismet_submitted
end
class RepositoryStorageHealth < Grape::Entity
expose :storage_name
expose :failing_on_hosts
expose :total_failures
end
end
end
module Gitlab
  module Environment
    def self.hostname
      @hostname ||= ENV['HOSTNAME'] || Socket.gethostname
    end
  end
end
......@@ -59,11 +59,17 @@ module Gitlab
end
def rugged
@rugged ||= Rugged::Repository.new(path, alternates: alternate_object_directories)
@rugged ||= circuit_breaker.perform do
Rugged::Repository.new(path, alternates: alternate_object_directories)
end
rescue Rugged::RepositoryError, Rugged::OSError
raise NoRepository.new('no repository for such path')
end
def circuit_breaker
@circuit_breaker ||= Gitlab::Git::Storage::CircuitBreaker.for_storage(storage)
end
# Returns an Array of branch names
# sorted by name ASC
def branch_names
......
module Gitlab
  module Git
    module Storage
      class Inaccessible < StandardError
        attr_reader :retry_after

        def initialize(message = nil, retry_after = nil)
          super(message)
          @retry_after = retry_after
        end
      end

      CircuitOpen = Class.new(Inaccessible)
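
      # Redis keys that track circuit breaker state share this prefix, so
      # CircuitBreaker.reset_all! can clear them with a single pattern match.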
      REDIS_KEY_PREFIX = 'storage_accessible:'.freeze

      def self.redis
        Gitlab::Redis::SharedState
      end
    end
  end
end
module Gitlab
  module Git
    module Storage
      class CircuitBreaker
        FailureInfo = Struct.new(:last_failure, :failure_count)

        attr_reader :storage,
                    :hostname,
                    :storage_path,
                    :failure_count_threshold,
                    :failure_wait_time,
                    :failure_reset_time,
                    :storage_timeout

        delegate :last_failure, :failure_count, to: :failure_info

        def self.reset_all!
          pattern = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}*"

          Gitlab::Git::Storage.redis.with do |redis|
            all_storage_keys = redis.keys(pattern)
            redis.del(*all_storage_keys) unless all_storage_keys.empty?
          end

          RequestStore.delete(:circuitbreaker_cache)
        end
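
        # Circuit breakers are memoized per request via RequestStore, so all
        # storage access within a single request shares the same failure state.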
        def self.for_storage(storage)
          cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
            Hash.new do |hash, storage_name|
              hash[storage_name] = new(storage_name)
            end
          end

          cached_circuitbreakers[storage]
        end

        def initialize(storage, hostname = Gitlab::Environment.hostname)
          @storage = storage
          @hostname = hostname

          config = Gitlab.config.repositories.storages[@storage]
          @storage_path = config['path']
          @failure_count_threshold = config['failure_count_threshold']
          @failure_wait_time = config['failure_wait_time']
          @failure_reset_time = config['failure_reset_time']
          @storage_timeout = config['storage_timeout']
        end
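
        # Runs the block through the circuit breaker: when the
        # `git_storage_circuit_breaker` feature flag is disabled the block is
        # yielded directly, otherwise `check_storage_accessible!` runs before
        # the block is yielded.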
        def perform
          return yield unless Feature.enabled?('git_storage_circuit_breaker')

          check_storage_accessible!

          yield
        end

        def circuit_broken?
          return false if no_failures?