diff --git a/CHANGELOG.md b/CHANGELOG.md index cabbd2fe011..c98df74ed02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ * [BUGFIX] Query Frontend: If 'LogQueriesLongerThan' is set to < 0, log all queries as described in the docs. #4633 * [BUGFIX] Distributor: update defaultReplicationStrategy to not fail with extend-write when a single instance is unhealthy. #4636 * [BUGFIX] Distributor: Fix race condition on `/series` introduced by #4683. #4716 +* [BUGFIX] Ruler: Fixed leaking notifiers after users are removed. #4718 * [BUGFIX] Distributor: Fix a memory leak in distributor due to the cluster label. #4739 * [ENHANCEMENT] Compactor: uploading blocks no compaction marks to the global location and introduce a new metric #4729 * `cortex_bucket_blocks_marked_for_no_compaction_count`: Total number of blocks marked for no compaction in the bucket. diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index d5b2782ac79..4527c562cbb 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -107,6 +107,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou go mngr.Stop() delete(r.userManagers, userID) + r.removeNotifier(userID) r.mapper.cleanupUser(userID) r.lastReloadSuccessful.DeleteLabelValues(userID) r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) @@ -163,33 +164,44 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user // newManager creates a prometheus rule manager wrapped with a user id // configured storage, appendable, notifier, and instrumentation func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) { - notifier, err := r.getOrCreateNotifier(userID) - if err != nil { - return nil, err - } - // Create a new Prometheus registry and register it within - // our metrics struct for the provided user. + // our metrics struct for the provided user if it doesn't already exist. 
reg := prometheus.NewRegistry() r.userManagerMetrics.AddUserRegistry(userID, reg) + notifier, err := r.getOrCreateNotifier(userID, reg) + if err != nil { + return nil, err + } + return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) removeNotifier(userID string) { + r.notifiersMtx.Lock() + defer r.notifiersMtx.Unlock() + + if n, ok := r.notifiers[userID]; ok { + n.stop() + } + + delete(r.notifiers, userID) +} + +func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() n, ok := r.notifiers[userID] if ok { + // When there is a stale user, we stop the notifier but do not remove it + n.run() return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) - reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, - Registerer: reg, + Registerer: userManagerRegistry, Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) { // Note: The passed-in context comes from the Prometheus notifier // and does *not* contain the userID. 
So it needs to be added to the context diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index 79cd26f7632..7bb3d43c90a 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -23,6 +23,14 @@ type ManagerMetrics struct { GroupLastDuration *prometheus.Desc GroupRules *prometheus.Desc GroupLastEvalSamples *prometheus.Desc + + NotificationLatency *prometheus.Desc + NotificationErrors *prometheus.Desc + NotificationSent *prometheus.Desc + NotificationDropped *prometheus.Desc + NotificationQueueLength *prometheus.Desc + NotificationQueueCapacity *prometheus.Desc + AlertmanagersDiscovered *prometheus.Desc } // NewManagerMetrics returns a ManagerMetrics struct @@ -101,6 +109,51 @@ func NewManagerMetrics(disableRuleGroupLabel bool) *ManagerMetrics { commonLabels, nil, ), + + // Prometheus' ruler's notification metrics + NotificationLatency: prometheus.NewDesc( + "cortex_prometheus_notifications_latency_seconds", + "Latency quantiles for sending alert notifications.", + []string{"user"}, + nil, + ), + + NotificationErrors: prometheus.NewDesc( + "cortex_prometheus_notifications_errors_total", + "Total number of errors sending alert notifications.", + []string{"user", "alertmanager"}, + nil, + ), + NotificationSent: prometheus.NewDesc( + "cortex_prometheus_notifications_sent_total", + "Total number of alerts sent.", + []string{"user", "alertmanager"}, + nil, + ), + NotificationDropped: prometheus.NewDesc( + "cortex_prometheus_notifications_dropped_total", + "Total number of alerts dropped due to errors when sending to Alertmanager.", + []string{"user"}, + nil, + ), + NotificationQueueLength: prometheus.NewDesc( + "cortex_prometheus_notifications_queue_length", + "The number of alert notifications in the queue.", + []string{"user"}, + nil, + ), + NotificationQueueCapacity: prometheus.NewDesc( + "cortex_prometheus_notifications_queue_capacity", + "The capacity of the alert notifications queue.", + []string{"user"}, + nil, + ), + 
AlertmanagersDiscovered: prometheus.NewDesc( + "cortex_prometheus_notifications_alertmanagers_discovered", + "The number of alertmanagers discovered and active.", + []string{"user"}, + nil, + ), } } @@ -127,6 +180,14 @@ func (m *ManagerMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.GroupLastDuration out <- m.GroupRules out <- m.GroupLastEvalSamples + + out <- m.NotificationLatency + out <- m.NotificationErrors + out <- m.NotificationSent + out <- m.NotificationDropped + out <- m.NotificationQueueLength + out <- m.NotificationQueueCapacity + out <- m.AlertmanagersDiscovered } // Collect implements the Collector interface @@ -152,4 +213,12 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", labels...) data.SendSumOfGaugesPerUserWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", labels...) data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", labels...) 
+ + data.SendSumOfSummariesPerUser(out, m.NotificationLatency, "prometheus_notifications_latency_seconds") + data.SendSumOfCountersPerUserWithLabels(out, m.NotificationErrors, "prometheus_notifications_errors_total", "alertmanager") + data.SendSumOfCountersPerUserWithLabels(out, m.NotificationSent, "prometheus_notifications_sent_total", "alertmanager") + data.SendSumOfCountersPerUser(out, m.NotificationDropped, "prometheus_notifications_dropped_total") + data.SendSumOfGaugesPerUser(out, m.NotificationQueueLength, "prometheus_notifications_queue_length") + data.SendSumOfGaugesPerUser(out, m.NotificationQueueCapacity, "prometheus_notifications_queue_capacity") + data.SendSumOfGaugesPerUser(out, m.AlertmanagersDiscovered, "prometheus_notifications_alertmanagers_discovered") } diff --git a/pkg/ruler/manager_metrics_test.go b/pkg/ruler/manager_metrics_test.go index 99f2b096bb7..d0ac38c81a9 100644 --- a/pkg/ruler/manager_metrics_test.go +++ b/pkg/ruler/manager_metrics_test.go @@ -34,6 +34,53 @@ cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 1 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000 +# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active. +# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge +cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1 +cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10 +cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100 +# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. 
+# TYPE cortex_prometheus_notifications_dropped_total counter +cortex_prometheus_notifications_dropped_total{user="user1"} 1 +cortex_prometheus_notifications_dropped_total{user="user2"} 10 +cortex_prometheus_notifications_dropped_total{user="user3"} 100 +# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications. +# TYPE cortex_prometheus_notifications_errors_total counter +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100 +# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications. +# TYPE cortex_prometheus_notifications_latency_seconds summary +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1 +cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10 +cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10 +cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100 +cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100 
+cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1 +# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue. +# TYPE cortex_prometheus_notifications_queue_capacity gauge +cortex_prometheus_notifications_queue_capacity{user="user1"} 1 +cortex_prometheus_notifications_queue_capacity{user="user2"} 10 +cortex_prometheus_notifications_queue_capacity{user="user3"} 100 +# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue. +# TYPE cortex_prometheus_notifications_queue_length gauge +cortex_prometheus_notifications_queue_length{user="user1"} 1 +cortex_prometheus_notifications_queue_length{user="user2"} 10 +cortex_prometheus_notifications_queue_length{user="user3"} 100 +# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent. +# TYPE cortex_prometheus_notifications_sent_total counter +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute. # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1 @@ -153,6 +200,53 @@ func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) { cortex_prometheus_last_evaluation_samples{user="user1"} 2000 cortex_prometheus_last_evaluation_samples{user="user2"} 20000 cortex_prometheus_last_evaluation_samples{user="user3"} 200000 +# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active. 
+# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge +cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1 +cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10 +cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100 +# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. +# TYPE cortex_prometheus_notifications_dropped_total counter +cortex_prometheus_notifications_dropped_total{user="user1"} 1 +cortex_prometheus_notifications_dropped_total{user="user2"} 10 +cortex_prometheus_notifications_dropped_total{user="user3"} 100 +# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications. +# TYPE cortex_prometheus_notifications_errors_total counter +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100 +# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications. 
+# TYPE cortex_prometheus_notifications_latency_seconds summary +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1 +cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10 +cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10 +cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100 +cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100 +cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1 +# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue. +# TYPE cortex_prometheus_notifications_queue_capacity gauge +cortex_prometheus_notifications_queue_capacity{user="user1"} 1 +cortex_prometheus_notifications_queue_capacity{user="user2"} 10 +cortex_prometheus_notifications_queue_capacity{user="user3"} 100 +# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue. 
+# TYPE cortex_prometheus_notifications_queue_length gauge +cortex_prometheus_notifications_queue_length{user="user1"} 1 +cortex_prometheus_notifications_queue_length{user="user2"} 10 +cortex_prometheus_notifications_queue_length{user="user3"} 100 +# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent. +# TYPE cortex_prometheus_notifications_sent_total counter +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute. # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1 @@ -261,22 +355,37 @@ func populateManager(base float64) *prometheus.Registry { metrics.groupLastEvalSamples.WithLabelValues("group_one").Add(base * 1000) metrics.groupLastEvalSamples.WithLabelValues("group_two").Add(base * 1000) + metrics.notificationsLatency.WithLabelValues("alertmanager_1").Observe(base) + metrics.notificationsErrors.WithLabelValues("alertmanager_1").Add(base) + metrics.notificationsSent.WithLabelValues("alertmanager_1").Add(base) + metrics.notificationsDropped.Add(base) + metrics.notificationsQueueLength.Set(base) + metrics.notificationsQueueCapacity.Set(base) + metrics.notificationsAlertmanagersDiscovered.Set(base) return r } // Copied from github.com/prometheus/rules/manager.go +// and github.com/prometheus/notifier/notifier.go type groupMetrics struct { - evalDuration prometheus.Summary - iterationDuration prometheus.Summary - iterationsMissed *prometheus.CounterVec - iterationsScheduled *prometheus.CounterVec - evalTotal *prometheus.CounterVec - evalFailures *prometheus.CounterVec - groupInterval *prometheus.GaugeVec - groupLastEvalTime 
*prometheus.GaugeVec - groupLastDuration *prometheus.GaugeVec - groupRules *prometheus.GaugeVec - groupLastEvalSamples *prometheus.GaugeVec + evalDuration prometheus.Summary + iterationDuration prometheus.Summary + iterationsMissed *prometheus.CounterVec + iterationsScheduled *prometheus.CounterVec + evalTotal *prometheus.CounterVec + evalFailures *prometheus.CounterVec + groupInterval *prometheus.GaugeVec + groupLastEvalTime *prometheus.GaugeVec + groupLastDuration *prometheus.GaugeVec + groupRules *prometheus.GaugeVec + groupLastEvalSamples *prometheus.GaugeVec + notificationsLatency *prometheus.SummaryVec + notificationsErrors *prometheus.CounterVec + notificationsSent *prometheus.CounterVec + notificationsDropped prometheus.Counter + notificationsQueueLength prometheus.Gauge + notificationsQueueCapacity prometheus.Gauge + notificationsAlertmanagersDiscovered prometheus.Gauge } func newGroupMetrics(r prometheus.Registerer) *groupMetrics { @@ -355,8 +464,53 @@ func newGroupMetrics(r prometheus.Registerer) *groupMetrics { }, []string{"rule_group"}, ), + notificationsLatency: promauto.With(r).NewSummaryVec( + prometheus.SummaryOpts{ + Name: "prometheus_notifications_latency_seconds", + Help: "Latency quantiles for sending alert notifications.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }, + []string{"alertmanager"}, + ), + notificationsErrors: promauto.With(r).NewCounterVec( + prometheus.CounterOpts{ + Name: "prometheus_notifications_errors_total", + Help: "Total number of errors sending alert notifications.", + }, + []string{"alertmanager"}, + ), + notificationsSent: promauto.With(r).NewCounterVec( + prometheus.CounterOpts{ + Name: "prometheus_notifications_sent_total", + Help: "Total number of alerts sent.", + }, + []string{"alertmanager"}, + ), + notificationsDropped: promauto.With(r).NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_notifications_dropped_total", + Help: "Total number of alerts dropped 
due to errors when sending to Alertmanager.", + }, + ), + notificationsQueueLength: promauto.With(r).NewGauge( + prometheus.GaugeOpts{ + Name: "prometheus_notifications_queue_length", + Help: "The number of alert notifications in the queue.", + }, + ), + notificationsQueueCapacity: promauto.With(r).NewGauge( + prometheus.GaugeOpts{ + Name: "prometheus_notifications_queue_capacity", + Help: "The capacity of the alert notifications queue.", + }, + ), + notificationsAlertmanagersDiscovered: promauto.With(r).NewGauge( + prometheus.GaugeOpts{ + Name: "prometheus_notifications_alertmanagers_discovered", + Help: "The number of alertmanagers discovered and active.", + }, + ), } - return m } diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index 39f45bdcdb4..141b8d60d7a 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -44,11 +44,13 @@ func TestSyncRuleGroups(t *testing.T) { return mgr.(*mockRulesManager).running.Load() }) - // Verify that user rule groups are now cached locally. + // Verify that user rule groups are now cached locally and notifiers are created. { users, err := m.mapper.users() + _, ok := m.notifiers[user] require.NoError(t, err) require.Equal(t, []string{user}, users) + require.True(t, ok) } // Passing empty map / nil stops all managers. @@ -63,8 +65,10 @@ func TestSyncRuleGroups(t *testing.T) { // Verify that local rule groups were removed. { users, err := m.mapper.users() + _, ok := m.notifiers[user] require.NoError(t, err) require.Equal(t, []string(nil), users) + require.False(t, ok) } // Resync same rules as before. Previously this didn't restart the manager. 
diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 10fe947710a..df9310b1dd9 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -260,7 +260,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { manager := newManager(t, cfg) defer manager.Stop() - n, err := manager.getOrCreateNotifier("1") + n, err := manager.getOrCreateNotifier("1", manager.registry) require.NoError(t, err) // Loop until notifier discovery syncs up @@ -275,10 +275,10 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { // Ensure we have metrics in the notifier. assert.NoError(t, prom_testutil.GatherAndCompare(manager.registry.(*prometheus.Registry), strings.NewReader(` - # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. - # TYPE cortex_prometheus_notifications_dropped_total counter - cortex_prometheus_notifications_dropped_total{user="1"} 0 - `), "cortex_prometheus_notifications_dropped_total")) + # HELP prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. + # TYPE prometheus_notifications_dropped_total counter + prometheus_notifications_dropped_total 0 + `), "prometheus_notifications_dropped_total")) } func TestRuler_Rules(t *testing.T) {