GitLab wird am Donnerstag, den 09. Dezember, zwischen 08:00 und 10:00 Uhr wegen wichtigen Wartungsarbeiten nicht zur Verfügung stehen.

Commit 0a9d771b authored by Kamil Trzciński
Browse files

Import common metrics into database.

This MR backports the PrometheusMetric model to CE
and adds two attributes, `common` and `identifier`, to figure out what kind of metric is used.
parent 05ee94be
......@@ -232,6 +232,8 @@ class Project < ActiveRecord::Base
has_many :clusters, through: :cluster_project, class_name: 'Clusters::Cluster'
has_many :cluster_ingresses, through: :clusters, source: :application_ingress, class_name: 'Clusters::Applications::Ingress'
has_many :prometheus_metrics
# Container repositories need to remove data from the container registry,
# which is not managed by the DB. Hence we're still using dependent: :destroy
# here.
......
# A single Prometheus query stored in the database.
#
# A record is either a *common* metric (`common: true`, no project) or a
# project-specific metric (`common: false`, project required); the
# `require_project` validation enforces that split.
class PrometheusMetric < ActiveRecord::Base
  belongs_to :project, validate: true, inverse_of: :prometheus_metrics

  enum group: {
    # built-in groups
    nginx_ingress: -1,
    ha_proxy: -2,
    aws_elb: -3,
    nginx: -4,
    kubernetes: -5,

    # custom/user groups
    business: 0,
    response: 1,
    system: 2
  }

  validates :title, presence: true
  validates :query, presence: true
  validates :group, presence: true
  validates :y_label, presence: true
  validates :unit, presence: true
  validate :require_project

  scope :common, -> { where(common: true) }

  # Human-readable title for every group.
  #
  # NOTE(review): `_()` is called once, while this constant is being assigned,
  # so the titles are presumably resolved with whatever locale is active when
  # the class is loaded rather than per request — confirm this is intended.
  GROUP_TITLES = {
    # built-in groups
    nginx_ingress: _('Response metrics (NGINX Ingress)'),
    ha_proxy: _('Response metrics (HA Proxy)'),
    aws_elb: _('Response metrics (AWS ELB)'),
    nginx: _('Response metrics (NGINX)'),
    kubernetes: _('System metrics (Kubernetes)'),

    # custom/user groups
    business: _('Business metrics (Custom)'),
    response: _('Response metrics (Custom)'),
    system: _('System metrics (Custom)')
  }.freeze

  # Prometheus series names listed per built-in group. Custom groups have no
  # entry here, so `required_metrics` returns an empty list for them.
  REQUIRED_METRICS = {
    nginx_ingress: %w(nginx_upstream_responses_total nginx_upstream_response_msecs_avg),
    ha_proxy: %w(haproxy_frontend_http_requests_total haproxy_frontend_http_responses_total),
    aws_elb: %w(aws_elb_request_count_sum aws_elb_latency_average aws_elb_httpcode_backend_5_xx_sum),
    nginx: %w(nginx_server_requests nginx_server_requestMsec),
    kubernetes: %w(container_memory_usage_bytes container_cpu_usage_seconds_total)
  }.freeze

  # Returns the title of this metric's group, looked up in GROUP_TITLES.
  def group_title
    GROUP_TITLES[group.to_sym]
  end

  # Returns the series names from REQUIRED_METRICS for this metric's group as
  # an array of strings (empty when the group has no entry).
  def required_metrics
    (REQUIRED_METRICS[group.to_sym] || []).map(&:to_s)
  end

  # Builds a Gitlab::Prometheus::Metric from this record. The weight is
  # always 0.
  def to_query_metric
    Gitlab::Prometheus::Metric.new(id: id, title: title, required_metrics: required_metrics, weight: 0, y_label: y_label, queries: queries)
  end

  # Returns the query definitions for this metric: a one-element array holding
  # a hash with the `query_range`, `unit`, `label` and `series` keys.
  def queries
    [
      {
        query_range: query,
        unit: unit,
        label: legend,
        series: query_series
      }
    ]
  end

  # Returns the series colouring rules for this metric, or nil when the legend
  # is anything other than 'Status Code'.
  #
  # The result is an Array of series definitions. The common metrics YAML
  # declares `series:` as a list (`- label: status_code`, `when: ...`), and
  # database-backed metrics are served in place of the YAML entries, so the
  # shape has to match. Returning a bare Hash here did not.
  def query_series
    case legend
    when 'Status Code'
      [{
        label: 'status_code',
        when: [
          { value: '2xx', color: 'green' },
          { value: '4xx', color: 'orange' },
          { value: '5xx', color: 'red' }
        ]
      }]
    end
  end

  private

  # Validation: a common metric must not have a project, and a
  # project-specific metric must have one.
  def require_project
    if project
      errors.add(:project, "cannot be set if this is common metric") if common?
    else
      errors.add(:project, "has to be set when this is project-specific metric") unless common?
    end
  end
end
---
title: Import all common metrics into database
merge_request:
author:
type: changed
......@@ -7,7 +7,8 @@
- nginx_upstream_responses_total
weight: 1
queries:
- query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
- id: response_metrics_nginx_ingress_throughput_status_code
query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
unit: req / sec
label: Status Code
series:
......@@ -25,7 +26,8 @@
- nginx_upstream_response_msecs_avg
weight: 1
queries:
- query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})'
- id: response_metrics_nginx_ingress_latency_pod_average
query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})'
label: Pod average
unit: ms
- title: "HTTP Error Rate"
......@@ -34,7 +36,8 @@
- nginx_upstream_responses_total
weight: 1
queries:
- query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100'
- id: response_metrics_nginx_ingress_http_error_rate
query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100'
label: 5xx Errors
unit: "%"
- group: Response metrics (HA Proxy)
......@@ -46,10 +49,12 @@
- haproxy_frontend_http_requests_total
weight: 1
queries:
- query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)'
- id: response_metrics_ha_proxy_throughput_status_code
query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)'
unit: req / sec
label: Status Code
series:
- label: code
- label: status_code
when:
- value: 2xx
color: green
......@@ -63,7 +68,8 @@
- haproxy_frontend_http_responses_total
weight: 1
queries:
- query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m]))'
- id: response_metrics_ha_proxy_http_error_rate
query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m]))'
label: HTTP Errors
unit: "%"
- group: Response metrics (AWS ELB)
......@@ -75,7 +81,8 @@
- aws_elb_request_count_sum
weight: 1
queries:
- query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60'
- id: response_metrics_aws_elb_throughput_requests
query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60'
label: Total
unit: req / sec
- title: "Latency"
......@@ -84,7 +91,8 @@
- aws_elb_latency_average
weight: 1
queries:
- query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000'
- id: response_metrics_aws_elb_latency_average
query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000'
label: Average
unit: ms
- title: "HTTP Error Rate"
......@@ -94,7 +102,8 @@
- aws_elb_httpcode_backend_5_xx_sum
weight: 1
queries:
- query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}})'
- id: response_metrics_aws_elb_http_error_rate
query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}})'
label: HTTP Errors
unit: "%"
- group: Response metrics (NGINX)
......@@ -106,7 +115,8 @@
- nginx_server_requests
weight: 1
queries:
- query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)'
- id: response_metrics_nginx_throughput_status_code
query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)'
unit: req / sec
label: Status Code
series:
......@@ -124,7 +134,8 @@
- nginx_server_requestMsec
weight: 1
queries:
- query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})'
- id: response_metrics_nginx_latency
query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})'
label: Upstream
unit: ms
- title: "HTTP Error Rate"
......@@ -133,7 +144,8 @@
- nginx_server_requests
weight: 1
queries:
- query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))'
- id: response_metrics_nginx_http_error_rate
query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))'
label: HTTP Errors
unit: "errors / sec"
- group: System metrics (Kubernetes)
......@@ -145,7 +157,8 @@
- container_memory_usage_bytes
weight: 4
queries:
- query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024'
- id: system_metrics_kubernetes_container_memory_total
query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024'
label: Total
unit: GB
- title: "Core Usage (Total)"
......@@ -154,7 +167,8 @@
- container_cpu_usage_seconds_total
weight: 3
queries:
- query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)'
- id: system_metrics_kubernetes_container_cores_total
query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)'
label: Total
unit: "cores"
- title: "Memory Usage (Pod average)"
......@@ -163,7 +177,8 @@
- container_memory_usage_bytes
weight: 2
queries:
- query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
- id: system_metrics_kubernetes_container_memory_average
query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
label: Pod average
unit: MB
- title: "Core Usage (Pod average)"
......@@ -172,6 +187,12 @@
- container_cpu_usage_seconds_total
weight: 1
queries:
- query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
- id: system_metrics_kubernetes_container_core_usage
query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
label: Pod average
unit: "cores"
\ No newline at end of file
unit: "cores"
- id: system_metrics_kubernetes_container_core_usage_canary
query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
label: Pod average
unit: "cores"
track: canary
# Seed script: loads the common metrics importer and runs it with its default
# arguments.
require_relative '../importers/common_metrics_importer.rb'
::Importers::CommonMetricsImporter.new.execute
# Seed script: loads the common metrics importer and runs it with its default
# arguments.
require_relative '../importers/common_metrics_importer.rb'
::Importers::CommonMetricsImporter.new.execute
# frozen_string_literal: true
module Importers
  # Importer-local copy of the PrometheusMetric model: it carries only the
  # `group` enum, the `common` scope and the group titles, with no validations.
  #
  # NOTE(review): presumably duplicated so that migrations and seeds do not
  # depend on the application model; the accompanying spec asserts that the
  # enum and GROUP_TITLES stay equal to ::PrometheusMetric.
  class PrometheusMetric < ActiveRecord::Base
    enum group: {
      # built-in groups
      nginx_ingress: -1,
      ha_proxy: -2,
      aws_elb: -3,
      nginx: -4,
      kubernetes: -5,

      # custom groups
      business: 0,
      response: 1,
      system: 2,
    }

    scope :common, -> { where(common: true) }

    GROUP_TITLES = {
      business: _('Business metrics (Custom)'),
      response: _('Response metrics (Custom)'),
      system: _('System metrics (Custom)'),
      nginx_ingress: _('Response metrics (NGINX Ingress)'),
      ha_proxy: _('Response metrics (HA Proxy)'),
      aws_elb: _('Response metrics (AWS ELB)'),
      nginx: _('Response metrics (NGINX)'),
      kubernetes: _('System metrics (Kubernetes)')
    }.freeze
  end

  # Reads a YAML file of metric groups and creates or updates one common
  # PrometheusMetric row per query, matched by the query's `id`.
  class CommonMetricsImporter
    # Raised when a query in the YAML file has no `id`.
    MissingQueryId = Class.new(StandardError)
    # Raised when a group title in the YAML file is not in GROUP_TITLES.
    UnknownGroupTitle = Class.new(StandardError)

    attr_reader :content

    # file - path of the YAML file to import.
    def initialize(file = 'config/prometheus/common_metrics.yml')
      @content = YAML.load_file(file)
    end

    # Creates or updates a common metric for every query in the file.
    #
    # Raises MissingQueryId for a query without `id`, UnknownGroupTitle for an
    # unrecognised group title, and whatever `update!` raises on failure.
    def execute
      process_content do |id, attributes|
        find_or_build_metric!(id)
          .update!(**attributes)
      end
    end

    private

    # Walks every group in the file, yielding (id, attributes) for each query.
    def process_content(&blk)
      content.map do |group|
        process_group(group, &blk)
      end
    end

    # Resolves the group's enum value, then walks the group's metrics.
    def process_group(group, &blk)
      attributes = {
        group: find_group_title_key(group['group'])
      }

      group['metrics'].map do |metric|
        process_metric(metric, attributes, &blk)
      end
    end

    # Adds the metric-level attributes, then walks the metric's queries.
    def process_metric(metric, attributes, &blk)
      attributes = attributes.merge(
        title: metric['title'],
        y_label: metric['y_label'])

      metric['queries'].map do |query|
        process_metric_query(query, attributes, &blk)
      end
    end

    # Adds the query-level attributes and yields them with the query id.
    def process_metric_query(query, attributes, &blk)
      attributes = attributes.merge(
        legend: query['label'],
        query: query['query_range'],
        unit: query['unit'])

      blk.call(query['id'], attributes)
    end

    # Returns the existing common metric with the given identifier, or a new
    # unsaved one. Raises MissingQueryId when id is nil or false.
    def find_or_build_metric!(id)
      raise MissingQueryId unless id

      PrometheusMetric.common.find_by(identifier: id) ||
        PrometheusMetric.new(common: true, identifier: id)
    end

    # Maps a group title from the YAML file to the integer value of the
    # `group` enum.
    #
    # An unknown title used to resolve to nil and only failed later, when the
    # record was saved without a group. Fail here instead, naming the title.
    def find_group_title_key(title)
      group_name = find_group_title(title)
      raise UnknownGroupTitle, "unknown metric group title: #{title.inspect}" unless group_name

      PrometheusMetric.groups[group_name]
    end

    # Returns the GROUP_TITLES key whose title equals the given one, or nil.
    def find_group_title(title)
      PrometheusMetric::GROUP_TITLES.invert[title]
    end
  end
end
# Creates the `prometheus_metrics` table.
class CreatePrometheusMetrics < ActiveRecord::Migration
DOWNTIME = false
def change
create_table :prometheus_metrics do |t|
# At this point every metric must belong to a project (NOT NULL); the foreign
# key removes a project's metrics when the project row is deleted.
t.references :project, index: true, foreign_key: { on_delete: :cascade }, null: false
# Only title, query and group are mandatory at the database level.
t.string :title, null: false
t.string :query, null: false
t.string :y_label
t.string :unit
t.string :legend
# Integer value of the metric's group; indexed.
t.integer :group, null: false, index: true
t.timestamps_with_timezone null: false
end
end
end
# Adds the boolean `common` column to `prometheus_metrics`, defaulting to false.
class AddCommonToPrometheusMetrics < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
# Run outside of a DDL transaction.
# NOTE(review): presumably required by add_column_with_default — confirm.
disable_ddl_transaction!
def up
add_column_with_default(:prometheus_metrics, :common, :boolean, default: false)
end
def down
remove_column(:prometheus_metrics, :common)
end
end
# Makes `prometheus_metrics.project_id` nullable.
# The PrometheusMetric model validation rejects a project on common metrics,
# so those rows need a NULL project_id.
class ChangeProjectIdForPrometheusMetrics < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
disable_ddl_transaction!
def change
# The third argument `true` means NULL is allowed.
change_column_null :prometheus_metrics, :project_id, true
end
end
# Adds an index on `prometheus_metrics.common`.
class AddIndexOnDefaultPrometheusMetrics < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  # Run outside of a DDL transaction.
  # NOTE(review): presumably required by the concurrent index helpers — confirm.
  disable_ddl_transaction!

  def up
    add_concurrent_index :prometheus_metrics, :common
  end

  # Removes the index created by `up`. This used to target :project_id, which
  # would have dropped the unrelated project index and left the `common`
  # index in place on rollback.
  def down
    remove_concurrent_index :prometheus_metrics, :common
  end
end
# frozen_string_literal: true
# Adds the `identifier` column to `prometheus_metrics`, with a unique index.
#
# `unique: true` is not an `add_column` option, so passing it there enforced
# nothing. Uniqueness is enforced by a real unique index instead.
# NOTE(review): db/schema.rb has to be regenerated to include the new index.
class AddIdentifierToPrometheusMetric < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  # The concurrent index helpers have to run outside of a DDL transaction.
  disable_ddl_transaction!

  def up
    add_column :prometheus_metrics, :identifier, :string
    add_concurrent_index :prometheus_metrics, :identifier, unique: true
  end

  def down
    remove_concurrent_index :prometheus_metrics, :identifier
    remove_column :prometheus_metrics, :identifier
  end
end
# Data migration: runs the common metrics importer with its default arguments.
class ImportCommonMetrics < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Load the importer explicitly by its path relative to this file.
require_relative '../importers/common_metrics_importer.rb'
DOWNTIME = false
def up
Importers::CommonMetricsImporter.new.execute
end
# Rolling back does nothing: imported rows are left in place.
def down
# no-op
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180826111825) do
ActiveRecord::Schema.define(version: 20180831164909) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -1696,6 +1696,24 @@
add_index "projects", ["star_count"], name: "index_projects_on_star_count", using: :btree
add_index "projects", ["visibility_level"], name: "index_projects_on_visibility_level", using: :btree
create_table "prometheus_metrics", force: :cascade do |t|
t.integer "project_id"
t.string "title", null: false
t.string "query", null: false
t.string "y_label"
t.string "unit"
t.string "legend"
t.integer "group", null: false
t.datetime_with_timezone "created_at", null: false
t.datetime_with_timezone "updated_at", null: false
t.boolean "common", default: false, null: false
t.string "identifier"
end
add_index "prometheus_metrics", ["common"], name: "index_prometheus_metrics_on_common", using: :btree
add_index "prometheus_metrics", ["group"], name: "index_prometheus_metrics_on_group", using: :btree
add_index "prometheus_metrics", ["project_id"], name: "index_prometheus_metrics_on_project_id", using: :btree
create_table "protected_branch_merge_access_levels", force: :cascade do |t|
t.integer "protected_branch_id", null: false
t.integer "access_level", default: 40, null: false
......@@ -2375,6 +2393,7 @@
add_foreign_key "project_import_data", "projects", name: "fk_ffb9ee3a10", on_delete: :cascade
add_foreign_key "project_mirror_data", "projects", on_delete: :cascade
add_foreign_key "project_statistics", "projects", on_delete: :cascade
add_foreign_key "prometheus_metrics", "projects", on_delete: :cascade
add_foreign_key "protected_branch_merge_access_levels", "protected_branches", name: "fk_8a3072ccb3", on_delete: :cascade
add_foreign_key "protected_branch_push_access_levels", "protected_branches", name: "fk_9ffc86a3d9", on_delete: :cascade
add_foreign_key "protected_branches", "projects", name: "fk_7a9c6d93e7", on_delete: :cascade
......
......@@ -20,6 +20,45 @@ GitLab uses the defined queries and fills in the environment specific variables.
## Adding to the library
We strive to support the 2-4 most important metrics for each common system service that supports Prometheus. If you are looking for support for a particular exporter which has not yet been added to the library, additions can be made [to the `additional_metrics.yml`](https://gitlab.com/gitlab-org/gitlab-ce/blob/master/config/prometheus/additional_metrics.yml) file.
We strive to support the 2-4 most important metrics for each common system service that supports Prometheus. If you are looking for support for a particular exporter which has not yet been added to the library, additions can be made [to the `common_metrics.yml`](https://gitlab.com/gitlab-org/gitlab-ce/blob/master/config/prometheus/common_metrics.yml) file.
> Note: The library is only for monitoring public, common, system services which all customers can benefit from. Support for monitoring [customer proprietary metrics](https://gitlab.com/gitlab-org/gitlab-ee/issues/2273) will be added in a subsequent release.
### Query identifier
Every query added to the library must have a unique identifier (`id:`).
The identifier is used to find and update the metric later, when its definition changes.
```yaml
- group: Response metrics (NGINX Ingress)
metrics:
- title: "Throughput"
y_label: "Requests / Sec"
queries:
- id: response_metrics_nginx_ingress_throughput_status_code
query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
unit: req / sec
label: Status Code
```
### Update existing metrics
After you add or change an existing _common_ metric, you have to create a new database migration that will query and update all existing metrics.
**Note: If a query metric (which is identified by `id:`) is removed, it will not be removed from the database by default.**
**You might want to add an additional database migration that decides what to do with the removed one.**
**For example: you might be interested in migrating all dependent data to a different metric.**
```ruby
class ImportCommonMetrics < ActiveRecord::Migration
require_relative '../importers/common_metrics_importer.rb'
DOWNTIME = false
def up
Importers::CommonMetricsImporter.new.execute
end
def down
# no-op
end
end
```
......@@ -5,7 +5,7 @@ module AdditionalMetricsParser
MUTEX = Mutex.new
extend self
def load_groups_from_yaml(file_name = 'additional_metrics.yml')
def load_groups_from_yaml(file_name)
yaml_metrics_raw(file_name).map(&method(:group_from_entry))
end
......
......@@ -4,10 +4,13 @@ class MetricGroup
include ActiveModel::Model
attr_accessor :name, :priority, :metrics
validates :name, :priority, :metrics, presence: true
def self.common_metrics
AdditionalMetricsParser.load_groups_from_yaml
::PrometheusMetric.common.group_by(&:group_title).map do |name, metrics|
MetricGroup.new(name: name, priority: 0, metrics: metrics.map(&:to_query_metric))
end
end
# EE only
......
require 'rails_helper'
require Rails.root.join("db", "importers", "common_metrics_importer.rb")
# Checks that the importer's own copy of the model has not drifted from the
# application model.
describe Importers::PrometheusMetric do
  it 'group enum equals ::PrometheusMetric' do
    application_groups = ::PrometheusMetric.groups

    expect(described_class.groups).to eq(application_groups)
  end

  it 'GROUP_TITLES equals ::PrometheusMetric' do
    application_titles = ::PrometheusMetric::GROUP_TITLES

    expect(described_class::GROUP_TITLES).to eq(application_titles)
  end
end
describe Importers::CommonMetricsImporter do
subject { described_class.new }
context "does import common_metrics.yml" do
let(:groups) { subject.content }
let(:metrics) { groups.map { |group| group['metrics'] }.flatten }
let(:queries) { metrics.map { |group| group['queries'] }.flatten }
let(:query_ids) { queries.map { |query| query['id'] } }