Commit 0a9d771b authored by Kamil Trzciński's avatar Kamil Trzciński

Import common metrics into database.

This MR backports PrometheusMetric model to CE
and adds: common, identifier to figure out what kind of metric is used.
parent 05ee94be
......@@ -232,6 +232,8 @@ class Project < ActiveRecord::Base
has_many :clusters, through: :cluster_project, class_name: 'Clusters::Cluster'
has_many :cluster_ingresses, through: :clusters, source: :application_ingress, class_name: 'Clusters::Applications::Ingress'
has_many :prometheus_metrics
# Container repositories need to remove data from the container registry,
# which is not managed by the DB. Hence we're still using dependent: :destroy
# here.
......
class PrometheusMetric < ActiveRecord::Base
belongs_to :project, validate: true, inverse_of: :prometheus_metrics
enum group: {
# built-in groups
nginx_ingress: -1,
ha_proxy: -2,
aws_elb: -3,
nginx: -4,
kubernetes: -5,
# custom/user groups
business: 0,
response: 1,
system: 2
}
validates :title, presence: true
validates :query, presence: true
validates :group, presence: true
validates :y_label, presence: true
validates :unit, presence: true
validate :require_project
scope :common, -> { where(common: true) }
GROUP_TITLES = {
# built-in groups
nginx_ingress: _('Response metrics (NGINX Ingress)'),
ha_proxy: _('Response metrics (HA Proxy)'),
aws_elb: _('Response metrics (AWS ELB)'),
nginx: _('Response metrics (NGINX)'),
kubernetes: _('System metrics (Kubernetes)'),
# custom/user groups
business: _('Business metrics (Custom)'),
response: _('Response metrics (Custom)'),
system: _('System metrics (Custom)')
}.freeze
REQUIRED_METRICS = {
nginx_ingress: %w(nginx_upstream_responses_total nginx_upstream_response_msecs_avg),
ha_proxy: %w(haproxy_frontend_http_requests_total haproxy_frontend_http_responses_total),
aws_elb: %w(aws_elb_request_count_sum aws_elb_latency_average aws_elb_httpcode_backend_5_xx_sum),
nginx: %w(nginx_server_requests nginx_server_requestMsec),
kubernetes: %w(container_memory_usage_bytes container_cpu_usage_seconds_total)
}.freeze
def group_title
GROUP_TITLES[group.to_sym]
end
def required_metrics
(REQUIRED_METRICS[group.to_sym] || []).map(&:to_s)
end
def to_query_metric
Gitlab::Prometheus::Metric.new(id: id, title: title, required_metrics: required_metrics, weight: 0, y_label: y_label, queries: queries)
end
def queries
[
{
query_range: query,
unit: unit,
label: legend,
series: query_series
}
]
end
def query_series
case legend
when 'Status Code'
{
label: 'status_code',
when: [
{ value: '2xx', color: 'green' },
{ value: '4xx', color: 'orange' },
{ value: '5xx', color: 'red' }
]
}
end
end
private
def require_project
if project
errors.add(:project, "cannot be set if this is common metric") if common?
else
errors.add(:project, "has to be set when this is project-specific metric") unless common?
end
end
end
---
title: Import all common metrics into database
merge_request:
author:
type: changed
......@@ -7,7 +7,8 @@
- nginx_upstream_responses_total
weight: 1
queries:
- query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
- id: response_metrics_nginx_ingress_throughput_status_code
query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
unit: req / sec
label: Status Code
series:
......@@ -25,7 +26,8 @@
- nginx_upstream_response_msecs_avg
weight: 1
queries:
- query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})'
- id: response_metrics_nginx_ingress_latency_pod_average
query_range: 'avg(nginx_upstream_response_msecs_avg{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"})'
label: Pod average
unit: ms
- title: "HTTP Error Rate"
......@@ -34,7 +36,8 @@
- nginx_upstream_responses_total
weight: 1
queries:
- query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100'
- id: response_metrics_nginx_ingress_http_error_rate
query_range: 'sum(rate(nginx_upstream_responses_total{status_code="5xx", upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) / sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) * 100'
label: 5xx Errors
unit: "%"
- group: Response metrics (HA Proxy)
......@@ -46,10 +49,12 @@
- haproxy_frontend_http_requests_total
weight: 1
queries:
- query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)'
- id: response_metrics_ha_proxy_throughput_status_code
query_range: 'sum(rate(haproxy_frontend_http_requests_total{%{environment_filter}}[2m])) by (code)'
unit: req / sec
label: Status Code
series:
- label: code
- label: status_code
when:
- value: 2xx
color: green
......@@ -63,7 +68,8 @@
- haproxy_frontend_http_responses_total
weight: 1
queries:
- query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m]))'
- id: response_metrics_ha_proxy_http_error_rate
query_range: 'sum(rate(haproxy_frontend_http_responses_total{code="5xx",%{environment_filter}}[2m])) / sum(rate(haproxy_frontend_http_responses_total{%{environment_filter}}[2m]))'
label: HTTP Errors
unit: "%"
- group: Response metrics (AWS ELB)
......@@ -75,7 +81,8 @@
- aws_elb_request_count_sum
weight: 1
queries:
- query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60'
- id: response_metrics_aws_elb_throughput_requests
query_range: 'sum(aws_elb_request_count_sum{%{environment_filter}}) / 60'
label: Total
unit: req / sec
- title: "Latency"
......@@ -84,7 +91,8 @@
- aws_elb_latency_average
weight: 1
queries:
- query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000'
- id: response_metrics_aws_elb_latency_average
query_range: 'avg(aws_elb_latency_average{%{environment_filter}}) * 1000'
label: Average
unit: ms
- title: "HTTP Error Rate"
......@@ -94,7 +102,8 @@
- aws_elb_httpcode_backend_5_xx_sum
weight: 1
queries:
- query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}})'
- id: response_metrics_aws_elb_http_error_rate
query_range: 'sum(aws_elb_httpcode_backend_5_xx_sum{%{environment_filter}}) / sum(aws_elb_request_count_sum{%{environment_filter}})'
label: HTTP Errors
unit: "%"
- group: Response metrics (NGINX)
......@@ -106,7 +115,8 @@
- nginx_server_requests
weight: 1
queries:
- query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)'
- id: response_metrics_nginx_throughput_status_code
query_range: 'sum(rate(nginx_server_requests{server_zone!="*", server_zone!="_", %{environment_filter}}[2m])) by (code)'
unit: req / sec
label: Status Code
series:
......@@ -124,7 +134,8 @@
- nginx_server_requestMsec
weight: 1
queries:
- query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})'
- id: response_metrics_nginx_latency
query_range: 'avg(nginx_server_requestMsec{%{environment_filter}})'
label: Upstream
unit: ms
- title: "HTTP Error Rate"
......@@ -133,7 +144,8 @@
- nginx_server_requests
weight: 1
queries:
- query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))'
- id: response_metrics_nginx_http_error_rate
query_range: 'sum(rate(nginx_server_requests{code="5xx", %{environment_filter}}[2m]))'
label: HTTP Errors
unit: "errors / sec"
- group: System metrics (Kubernetes)
......@@ -145,7 +157,8 @@
- container_memory_usage_bytes
weight: 4
queries:
- query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024'
- id: system_metrics_kubernetes_container_memory_total
query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) /1024/1024/1024'
label: Total
unit: GB
- title: "Core Usage (Total)"
......@@ -154,7 +167,8 @@
- container_cpu_usage_seconds_total
weight: 3
queries:
- query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)'
- id: system_metrics_kubernetes_container_cores_total
query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job)'
label: Total
unit: "cores"
- title: "Memory Usage (Pod average)"
......@@ -163,7 +177,8 @@
- container_memory_usage_bytes
weight: 2
queries:
- query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
- id: system_metrics_kubernetes_container_memory_average
query_range: 'avg(sum(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) by (job)) without (job) / count(avg(container_memory_usage_bytes{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}) without (job)) /1024/1024'
label: Pod average
unit: MB
- title: "Core Usage (Pod average)"
......@@ -172,6 +187,12 @@
- container_cpu_usage_seconds_total
weight: 1
queries:
- query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
- id: system_metrics_kubernetes_container_core_usage
query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
label: Pod average
unit: "cores"
\ No newline at end of file
unit: "cores"
- id: system_metrics_kubernetes_container_core_usage_canary
query_range: 'avg(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (job)) without (job) / count(sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name=~"^%{ci_environment_slug}-canary-(.*)",namespace="%{kube_namespace}"}[15m])) by (pod_name))'
label: Pod average
unit: "cores"
track: canary
require_relative '../importers/common_metrics_importer.rb'
::Importers::CommonMetricsImporter.new.execute
require_relative '../importers/common_metrics_importer.rb'
::Importers::CommonMetricsImporter.new.execute
# frozen_string_literal: true
module Importers
class PrometheusMetric < ActiveRecord::Base
enum group: {
# built-in groups
nginx_ingress: -1,
ha_proxy: -2,
aws_elb: -3,
nginx: -4,
kubernetes: -5,
# custom groups
business: 0,
response: 1,
system: 2,
}
scope :common, -> { where(common: true) }
GROUP_TITLES = {
business: _('Business metrics (Custom)'),
response: _('Response metrics (Custom)'),
system: _('System metrics (Custom)'),
nginx_ingress: _('Response metrics (NGINX Ingress)'),
ha_proxy: _('Response metrics (HA Proxy)'),
aws_elb: _('Response metrics (AWS ELB)'),
nginx: _('Response metrics (NGINX)'),
kubernetes: _('System metrics (Kubernetes)')
}.freeze
end
class CommonMetricsImporter
MissingQueryId = Class.new(StandardError)
attr_reader :content
def initialize(file = 'config/prometheus/common_metrics.yml')
@content = YAML.load_file(file)
end
def execute
process_content do |id, attributes|
find_or_build_metric!(id)
.update!(**attributes)
end
end
private
def process_content(&blk)
content.map do |group|
process_group(group, &blk)
end
end
def process_group(group, &blk)
attributes = {
group: find_group_title_key(group['group'])
}
group['metrics'].map do |metric|
process_metric(metric, attributes, &blk)
end
end
def process_metric(metric, attributes, &blk)
attributes = attributes.merge(
title: metric['title'],
y_label: metric['y_label'])
metric['queries'].map do |query|
process_metric_query(query, attributes, &blk)
end
end
def process_metric_query(query, attributes, &blk)
attributes = attributes.merge(
legend: query['label'],
query: query['query_range'],
unit: query['unit'])
blk.call(query['id'], attributes)
end
def find_or_build_metric!(id)
raise MissingQueryId unless id
PrometheusMetric.common.find_by(identifier: id) ||
PrometheusMetric.new(common: true, identifier: id)
end
def find_group_title_key(title)
PrometheusMetric.groups[find_group_title(title)]
end
def find_group_title(title)
PrometheusMetric::GROUP_TITLES.invert[title]
end
end
end
class CreatePrometheusMetrics < ActiveRecord::Migration
DOWNTIME = false
def change
create_table :prometheus_metrics do |t|
t.references :project, index: true, foreign_key: { on_delete: :cascade }, null: false
t.string :title, null: false
t.string :query, null: false
t.string :y_label
t.string :unit
t.string :legend
t.integer :group, null: false, index: true
t.timestamps_with_timezone null: false
end
end
end
class AddCommonToPrometheusMetrics < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
disable_ddl_transaction!
def up
add_column_with_default(:prometheus_metrics, :common, :boolean, default: false)
end
def down
remove_column(:prometheus_metrics, :common)
end
end
class ChangeProjectIdForPrometheusMetrics < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
disable_ddl_transaction!
def change
change_column_null :prometheus_metrics, :project_id, true
end
end
class AddIndexOnDefaultPrometheusMetrics < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
disable_ddl_transaction!
def up
add_concurrent_index :prometheus_metrics, :common
end
def down
remove_concurrent_index :prometheus_metrics, :project_id
end
end
# frozen_string_literal: true
class AddIdentifierToPrometheusMetric < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def change
add_column :prometheus_metrics, :identifier, :string, unique: true
end
end
class ImportCommonMetrics < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
require_relative '../importers/common_metrics_importer.rb'
DOWNTIME = false
def up
Importers::CommonMetricsImporter.new.execute
end
def down
# no-op
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180826111825) do
ActiveRecord::Schema.define(version: 20180831164909) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -1696,6 +1696,24 @@ ActiveRecord::Schema.define(version: 20180826111825) do
add_index "projects", ["star_count"], name: "index_projects_on_star_count", using: :btree
add_index "projects", ["visibility_level"], name: "index_projects_on_visibility_level", using: :btree
create_table "prometheus_metrics", force: :cascade do |t|
t.integer "project_id"
t.string "title", null: false
t.string "query", null: false
t.string "y_label"
t.string "unit"
t.string "legend"
t.integer "group", null: false
t.datetime_with_timezone "created_at", null: false
t.datetime_with_timezone "updated_at", null: false
t.boolean "common", default: false, null: false
t.string "identifier"
end
add_index "prometheus_metrics", ["common"], name: "index_prometheus_metrics_on_common", using: :btree
add_index "prometheus_metrics", ["group"], name: "index_prometheus_metrics_on_group", using: :btree
add_index "prometheus_metrics", ["project_id"], name: "index_prometheus_metrics_on_project_id", using: :btree
create_table "protected_branch_merge_access_levels", force: :cascade do |t|
t.integer "protected_branch_id", null: false
t.integer "access_level", default: 40, null: false
......@@ -2375,6 +2393,7 @@ ActiveRecord::Schema.define(version: 20180826111825) do
add_foreign_key "project_import_data", "projects", name: "fk_ffb9ee3a10", on_delete: :cascade
add_foreign_key "project_mirror_data", "projects", on_delete: :cascade
add_foreign_key "project_statistics", "projects", on_delete: :cascade
add_foreign_key "prometheus_metrics", "projects", on_delete: :cascade
add_foreign_key "protected_branch_merge_access_levels", "protected_branches", name: "fk_8a3072ccb3", on_delete: :cascade
add_foreign_key "protected_branch_push_access_levels", "protected_branches", name: "fk_9ffc86a3d9", on_delete: :cascade
add_foreign_key "protected_branches", "projects", name: "fk_7a9c6d93e7", on_delete: :cascade
......
......@@ -20,6 +20,45 @@ GitLab uses the defined queries and fills in the environment specific variables.
## Adding to the library
We strive to support the 2-4 most important metrics for each common system service that supports Prometheus. If you are looking for support for a particular exporter which has not yet been added to the library, additions can be made [to the `additional_metrics.yml`](https://gitlab.com/gitlab-org/gitlab-ce/blob/master/config/prometheus/additional_metrics.yml) file.
We strive to support the 2-4 most important metrics for each common system service that supports Prometheus. If you are looking for support for a particular exporter which has not yet been added to the library, additions can be made [to the `common_metrics.yml`](https://gitlab.com/gitlab-org/gitlab-ce/blob/master/config/prometheus/common_metrics.yml) file.
> Note: The library is only for monitoring public, common, system services which all customers can benefit from. Support for monitoring [customer proprietary metrics](https://gitlab.com/gitlab-org/gitlab-ee/issues/2273) will be added in a subsequent release.
### Query identifier
The requirement for adding metrics is to have each query to have unique identifier.
Identifier is used to update the metric later when changed.
```yaml
- group: Response metrics (NGINX Ingress)
metrics:
- title: "Throughput"
y_label: "Requests / Sec"
queries:
- id: response_metrics_nginx_ingress_throughput_status_code
query_range: 'sum(rate(nginx_upstream_responses_total{upstream=~"%{kube_namespace}-%{ci_environment_slug}-.*"}[2m])) by (status_code)'
unit: req / sec
label: Status Code
```
### Update existing metrics
After you add or change existing _common_ metric you have to create a new database migration that will query and update all existing metrics.
**Note: If a query metric (which is identified by `id:`) is removed it will not be removed from database by default.**
**You might want to add additional database migration that makes a decision what to do with removed one.**
**For example: you might be interested in migrating all dependent data to a different metric.**
```ruby
class ImportCommonMetrics < ActiveRecord::Migration
require_relative '../importers/common_metrics_importer.rb'
DOWNTIME = false
def up
Importers::CommonMetricsImporter.new.execute
end
def down
# no-op
end
end
```
......@@ -5,7 +5,7 @@ module Gitlab
MUTEX = Mutex.new
extend self
def load_groups_from_yaml(file_name = 'additional_metrics.yml')
def load_groups_from_yaml(file_name)
yaml_metrics_raw(file_name).map(&method(:group_from_entry))
end
......
......@@ -4,10 +4,13 @@ module Gitlab
include ActiveModel::Model
attr_accessor :name, :priority, :metrics
validates :name, :priority, :metrics, presence: true
def self.common_metrics
AdditionalMetricsParser.load_groups_from_yaml
::PrometheusMetric.common.group_by(&:group_title).map do |name, metrics|
MetricGroup.new(name: name, priority: 0, metrics: metrics.map(&:to_query_metric))
end
end
# EE only
......
require 'rails_helper'
require Rails.root.join("db", "importers", "common_metrics_importer.rb")
describe Importers::PrometheusMetric do
it 'group enum equals ::PrometheusMetric' do
expect(described_class.groups).to eq(::PrometheusMetric.groups)
end
it 'GROUP_TITLES equals ::PrometheusMetric' do
expect(described_class::GROUP_TITLES).to eq(::PrometheusMetric::GROUP_TITLES)
end
end
describe Importers::CommonMetricsImporter do
subject { described_class.new }
context "does import common_metrics.yml" do
let(:groups) { subject.content }
let(:metrics) { groups.map { |group| group['metrics'] }.flatten }
let(:queries) { metrics.map { |group| group['queries'] }.flatten }
let(:query_ids) { queries.map { |query| query['id'] } }
before do
subject.execute
end
it "has the same amount of groups" do
expect(PrometheusMetric.common.group(:group).count.count).to eq(groups.count)
end
it "has the same amount of metrics" do
expect(PrometheusMetric.common.group(:group, :title).count.count).to eq(metrics.count)
end
it "has the same amount of queries" do
expect(PrometheusMetric.common.count).to eq(queries.count)
end
it "does not have duplicate IDs" do
expect(query_ids).to eq(query_ids.uniq)
end
it "imports all IDs" do
expect(PrometheusMetric.common.pluck(:identifier)).to eq(query_ids)
end
end