
Commit 86c193e

Label alerts destined for PagerDuty
Parent: 175204a

2 files changed: 60 additions, 2 deletions
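The new flag is opt-in per service: a stack definition requests paging by setting page_on_critical alongside its alerts channel. A condensed sketch of such a definition (service and channel names are illustrative, adapted from the spec below):

app_service "exampleapp", :kubernetes => true do
  self.alerts_channel = 'example-team'  # alerts always carry the owning channel
  self.page_on_critical = true          # additionally label critical alerts for PagerDuty
end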

lib/stackbuilder/stacks/services/app_service.rb (+6 -2)
@@ -33,6 +33,7 @@ module Stacks::Services::AppService
   attr_accessor :alerts_channel
   attr_accessor :startup_alert_threshold
   attr_accessor :monitor_readiness_probe
+  attr_accessor :page_on_critical

   alias_method :database_application_name, :application

@@ -60,6 +61,7 @@ def configure
     @cpu_request = false
     @cpu_limit = false
     @monitor_readiness_probe = true
+    @page_on_critical = false
   end

   def enable_ehcache
@@ -597,6 +599,8 @@ def generate_k8s_pod_disruption_budget(app_service_labels)
   def generate_k8s_alerting(site, app_service_labels)
     fail("app_service '#{name}' in '#{@environment.name}' requires alerts_channel (set self.alerts_channel)") if @alerts_channel.nil?

+    pagerduty = page_on_critical ? { 'pagerduty' => 'true' } : {}
+
     rules = []

     rules << {
@@ -606,7 +610,7 @@ def generate_k8s_alerting(site, app_service_labels)
         'severity' => 'critical',
         'alertname' => "#{k8s_app_resources_name} CRITICAL",
         'alert_owner_channel' => alerts_channel
-      },
+      }.merge(pagerduty),
       'annotations' => {
         'message' => '{{ $value }} components are critical on {{ $labels.namespace }}/{{ $labels.pod }}',
         'status_page_url' => "https://go.timgroup.com/insight/#{site}/proxy/{{ $labels.namespace }}/{{ $labels.pod }}/info/status"
@@ -637,7 +641,7 @@ def generate_k8s_alerting(site, app_service_labels)
         'severity' => 'critical',
         'alertname' => "#{k8s_app_resources_name} is stuck in a crash loop",
         'alert_owner_channel' => alerts_channel
-      },
+      }.merge(pagerduty),
       'annotations' => {
         'message' => 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting ' \
                      '{{ printf "%.2f" $value }} times / 5 minutes.'
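The mechanism is a plain Hash#merge: when page_on_critical is true, each critical-severity rule gains a 'pagerduty' => 'true' label; when it is false, merging the empty hash leaves the rule's labels untouched. A standalone sketch of the pattern (variable contents are illustrative):

page_on_critical = true
pagerduty = page_on_critical ? { 'pagerduty' => 'true' } : {}

labels = {
  'severity' => 'critical',
  'alert_owner_channel' => 'example-team'
}.merge(pagerduty)
# => {"severity"=>"critical", "alert_owner_channel"=>"example-team", "pagerduty"=>"true"}

Downstream routing (for example an Alertmanager route matching pagerduty="true") can then deliver just these alerts to a PagerDuty receiver; that configuration lives outside this diff.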

spec/app_service_spec.rb (+54 -0)
@@ -1343,6 +1343,60 @@ def k8s_resource(set, kind)
       })
     end

+    it 'creates a paging alert rule for critical status components' do
+      factory = eval_stacks do
+        stack "mystack" do
+          app_service "x", :kubernetes => true do
+            self.maintainers = [person('Testers')]
+            self.description = 'Testing'
+            self.alerts_channel = 'test'
+            self.page_on_critical = true
+
+            self.application = 'MyApplication'
+            self.startup_alert_threshold = '1h'
+          end
+        end
+        env "e1", :primary_site => 'space' do
+          instantiate_stack "mystack"
+        end
+      end
+      set = factory.inventory.find_environment('e1').definitions['mystack'].k8s_machinesets['x']
+      prometheus_rule = k8s_resource(set, 'PrometheusRule')
+
+      expect(prometheus_rule['apiVersion']).to eql('monitoring.coreos.com/v1')
+      expect(prometheus_rule['metadata']).to eql('labels' => {
+                                                   'prometheus' => 'main',
+                                                   'role' => 'alert-rules',
+                                                   'app.kubernetes.io/managed-by' => 'stacks',
+                                                   'stack' => 'mystack',
+                                                   'machineset' => 'x',
+                                                   'group' => 'blue',
+                                                   'app.kubernetes.io/instance' => 'blue',
+                                                   'app.kubernetes.io/part-of' => 'x',
+                                                   'app.kubernetes.io/component' => 'app_service'
+                                                 },
+                                                 'name' => 'x-blue-app',
+                                                 'namespace' => 'e1')
+
+      expect(prometheus_rule['spec']['groups'].first['name']).to eql('stacks-alerts')
+      status_critical_rule = prometheus_rule['spec']['groups'].first['rules'].find do |r|
+        r['alert'] == 'StatusCritical'
+      end
+      expected_status_page_url = "https://go.timgroup.com/insight/space/proxy/{{ $labels.namespace }}/{{ $labels.pod }}/info/status"
+      expect(status_critical_rule).to eql('alert' => 'StatusCritical',
+                                          'expr' => 'sum(tucker_component_status{job="x-blue-app",status="critical"}) by (pod, namespace) > 0',
+                                          'labels' => {
+                                            'severity' => 'critical',
+                                            'alertname' => 'x-blue-app CRITICAL',
+                                            'alert_owner_channel' => 'test',
+                                            'pagerduty' => 'true'
+                                          },
+                                          'annotations' => {
+                                            'message' => '{{ $value }} components are critical on {{ $labels.namespace }}/{{ $labels.pod }}',
+                                            'status_page_url' => expected_status_page_url
+                                          })
+    end
+
     it 'creates an alert rule for pods stuck in a crash loop' do
       factory = eval_stacks do
         stack "mystack" do
