From c2d894b03283622dc978a72aa8b9bf2ceb25182f Mon Sep 17 00:00:00 2001 From: Roy Chiang Date: Wed, 13 Apr 2022 19:05:56 -0700 Subject: [PATCH 01/14] fix leaking notifier in ruler when user is removed Signed-off-by: Roy Chiang --- CHANGELOG.md | 10 ++++++++++ pkg/ruler/manager.go | 15 +++++++++++++++ pkg/ruler/manager_test.go | 6 +++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d46774c7e93..e5b4874f34b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,16 @@ * [FEATURE] Compactor: Add `-compactor.skip-blocks-with-out-of-order-chunks-enabled` configuration to mark blocks containing index with out-of-order chunks for no compact instead of halting the compaction * [FEATURE] Querier/Query-Frontend: Add `-querier.per-step-stats-enabled` and `-frontend.cache-queryable-samples-stats` configurations to enable query sample statistics * [FEATURE] Add shuffle sharding for the compactor #4433 +* [BUGFIX] Distributor: Fix race condition on `/series` introduced by #4683. #4716 +* [BUGFIX] Ruler: Fixed leaking notifiers after users are removed #4718 + +## 1.12.0 in progress + +* [CHANGE] Changed default for `-ingester.min-ready-duration` from 1 minute to 15 seconds. #4539 +* [CHANGE] query-frontend: Do not print anything in the logs of `query-frontend` if a in-progress query has been canceled (context canceled) to avoid spam. #4562 +* [CHANGE] Compactor block deletion mark migration, needed when upgrading from v1.7, is now disabled by default. #4597 +* [CHANGE] The `status_code` label on gRPC client metrics has changed from '200' and '500' to '2xx', '5xx', '4xx', 'cancel' or 'error'. 4601 +* [CHANGE] Memberlist: changed probe interval from `1s` to `5s` and probe timeout from `500ms` to `2s`. #4601 * [ENHANCEMENT] Update Go version to 1.17.8. #4602 #4604 #4658 * [ENHANCEMENT] Keep track of discarded samples due to bad relabel configuration in `cortex_discarded_samples_total`. 
#4503 * [ENHANCEMENT] Ruler: Add `-ruler.disable-rule-group-label` to disable the `rule_group` label on exported metrics. #4571 diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index d5b2782ac79..f6b2b42dd39 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -107,6 +107,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou go mngr.Stop() delete(r.userManagers, userID) + if n := r.removeNotifier(userID); n != nil { + n.stop() + } r.mapper.cleanupUser(userID) r.lastReloadSuccessful.DeleteLabelValues(userID) r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) @@ -176,6 +179,18 @@ func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID strin return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } +func (r *DefaultMultiTenantManager) removeNotifier(userID string) *rulerNotifier { + r.notifiersMtx.Lock() + defer r.notifiersMtx.Unlock() + + n, ok := r.notifiers[userID] + if !ok { + return nil + } + delete(r.notifiers, userID) + return n +} + func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index 7c91f3b88b7..e55cb2c1986 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -50,11 +50,13 @@ func TestSyncRuleGroups(t *testing.T) { return mgr.(*mockRulesManager).running.Load() }) - // Verify that user rule groups are now cached locally. + // Verify that user rule groups are now cached locally, and notifiers are created { users, err := m.mapper.users() + _, ok := m.notifiers[user] require.NoError(t, err) require.Equal(t, []string{user}, users) + require.True(t, ok) } // Passing empty map / nil stops all managers. @@ -69,8 +71,10 @@ func TestSyncRuleGroups(t *testing.T) { // Verify that local rule groups were removed. 
{ users, err := m.mapper.users() + _, ok := m.notifiers[user] require.NoError(t, err) require.Equal(t, []string(nil), users) + require.False(t, ok) } // Resync same rules as before. Previously this didn't restart the manager. From dbaeac25a9088db1435c81a9f5b4fece40d9afe2 Mon Sep 17 00:00:00 2001 From: Roy Chiang Date: Wed, 13 Apr 2022 21:34:34 -0700 Subject: [PATCH 02/14] register notifier metrics under per-user metric registry Signed-off-by: Roy Chiang --- pkg/ruler/manager.go | 14 +++++++------- pkg/ruler/ruler_test.go | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index f6b2b42dd39..bdb7d8c3495 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -166,16 +166,16 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user // newManager creates a prometheus rule manager wrapped with a user id // configured storage, appendable, notifier, and instrumentation func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) { - notifier, err := r.getOrCreateNotifier(userID) - if err != nil { - return nil, err - } - // Create a new Prometheus registry and register it within // our metrics struct for the provided user. 
reg := prometheus.NewRegistry() r.userManagerMetrics.AddUserRegistry(userID, reg) + notifier, err := r.getOrCreateNotifier(userID, reg) + if err != nil { + return nil, err + } + return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } @@ -191,7 +191,7 @@ func (r *DefaultMultiTenantManager) removeNotifier(userID string) *rulerNotifier return n } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() @@ -200,7 +200,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index a05d24a2f24..fd7dbb78e17 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -274,7 +274,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { defer rcleanup() defer manager.Stop() - n, err := manager.getOrCreateNotifier("1") + n, err := manager.getOrCreateNotifier("1", manager.registry) require.NoError(t, err) // Loop until notifier discovery syncs up From 0fbbecbff5c66f136756e0837fe23ca18c24f536 Mon Sep 17 00:00:00 2001 From: Rohan Gupta Date: Tue, 26 Apr 2022 14:22:01 -0400 Subject: [PATCH 03/14] Fixed changelog Signed-off-by: Rohan Gupta --- CHANGELOG.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5b4874f34b..890b719c3e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,16 +14,6 @@ * [FEATURE] Compactor: Add 
`-compactor.skip-blocks-with-out-of-order-chunks-enabled` configuration to mark blocks containing index with out-of-order chunks for no compact instead of halting the compaction * [FEATURE] Querier/Query-Frontend: Add `-querier.per-step-stats-enabled` and `-frontend.cache-queryable-samples-stats` configurations to enable query sample statistics * [FEATURE] Add shuffle sharding for the compactor #4433 -* [BUGFIX] Distributor: Fix race condition on `/series` introduced by #4683. #4716 -* [BUGFIX] Ruler: Fixed leaking notifiers after users are removed #4718 - -## 1.12.0 in progress - -* [CHANGE] Changed default for `-ingester.min-ready-duration` from 1 minute to 15 seconds. #4539 -* [CHANGE] query-frontend: Do not print anything in the logs of `query-frontend` if a in-progress query has been canceled (context canceled) to avoid spam. #4562 -* [CHANGE] Compactor block deletion mark migration, needed when upgrading from v1.7, is now disabled by default. #4597 -* [CHANGE] The `status_code` label on gRPC client metrics has changed from '200' and '500' to '2xx', '5xx', '4xx', 'cancel' or 'error'. 4601 -* [CHANGE] Memberlist: changed probe interval from `1s` to `5s` and probe timeout from `500ms` to `2s`. #4601 * [ENHANCEMENT] Update Go version to 1.17.8. #4602 #4604 #4658 * [ENHANCEMENT] Keep track of discarded samples due to bad relabel configuration in `cortex_discarded_samples_total`. #4503 * [ENHANCEMENT] Ruler: Add `-ruler.disable-rule-group-label` to disable the `rule_group` label on exported metrics. #4571 @@ -45,6 +35,7 @@ * [BUGFIX] Query Frontend: If 'LogQueriesLongerThan' is set to < 0, log all queries as described in the docs. #4633 * [BUGFIX] Distributor: update defaultReplicationStrategy to not fail with extend-write when a single instance is unhealthy. #4636 * [BUGFIX] Distributor: Fix race condition on `/series` introduced by #4683. 
#4716 +* [BUGFIX] Ruler: Fixed leaking notifiers after users are removed #4718 ## 1.11.0 2021-11-25 From 15c7978c69d61b366b26da478ae44c5e20da5779 Mon Sep 17 00:00:00 2001 From: Rohan Gupta Date: Tue, 26 Apr 2022 18:34:08 -0400 Subject: [PATCH 04/14] Fixed integ tests Signed-off-by: Rohan Gupta --- pkg/ruler/manager.go | 19 ++++++++----------- pkg/ruler/ruler_test.go | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index bdb7d8c3495..0aa22c7e837 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -107,9 +107,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou go mngr.Stop() delete(r.userManagers, userID) - if n := r.removeNotifier(userID); n != nil { - n.stop() - } + r.removeNotifier(userID) r.mapper.cleanupUser(userID) r.lastReloadSuccessful.DeleteLabelValues(userID) r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) @@ -171,7 +169,7 @@ func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID strin reg := prometheus.NewRegistry() r.userManagerMetrics.AddUserRegistry(userID, reg) - notifier, err := r.getOrCreateNotifier(userID, reg) + notifier, err := r.getOrCreateNotifier(userID) if err != nil { return nil, err } @@ -179,19 +177,18 @@ func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID strin return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } -func (r *DefaultMultiTenantManager) removeNotifier(userID string) *rulerNotifier { +func (r *DefaultMultiTenantManager) removeNotifier(userID string) error { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() - n, ok := r.notifiers[userID] - if !ok { - return nil + if n, ok := r.notifiers[userID]; ok { + n.stop() } delete(r.notifiers, userID) - return n + return nil } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) 
getOrCreateNotifier(userID string) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() @@ -200,7 +197,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManag return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index fd7dbb78e17..a05d24a2f24 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -274,7 +274,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { defer rcleanup() defer manager.Stop() - n, err := manager.getOrCreateNotifier("1", manager.registry) + n, err := manager.getOrCreateNotifier("1") require.NoError(t, err) // Loop until notifier discovery syncs up From 8cc5d8b2aea88ddbbd69344034566ed96d789dfc Mon Sep 17 00:00:00 2001 From: Rohan Gupta Date: Tue, 26 Apr 2022 18:43:49 -0400 Subject: [PATCH 05/14] Fixed lint Signed-off-by: Rohan Gupta --- pkg/ruler/manager.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 0aa22c7e837..32cc9cd4364 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -177,7 +177,7 @@ func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID strin return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } -func (r *DefaultMultiTenantManager) removeNotifier(userID string) error { +func (r *DefaultMultiTenantManager) removeNotifier(userID string) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() @@ -185,7 +185,6 @@ func (r *DefaultMultiTenantManager) removeNotifier(userID string) error { n.stop() } delete(r.notifiers, userID) - return nil } func (r *DefaultMultiTenantManager) 
getOrCreateNotifier(userID string) (*notifier.Manager, error) { From 5e70f27cebca2be72b523bdaf1b94a95c8548e3f Mon Sep 17 00:00:00 2001 From: Rohan Gupta Date: Tue, 26 Apr 2022 20:17:06 -0400 Subject: [PATCH 06/14] Fixed integ tests again for metrics register error Signed-off-by: Rohan Gupta --- pkg/ruler/manager.go | 6 +++--- pkg/ruler/manager_test.go | 2 +- pkg/ruler/ruler_test.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 32cc9cd4364..afedfc7bfea 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -169,7 +169,7 @@ func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID strin reg := prometheus.NewRegistry() r.userManagerMetrics.AddUserRegistry(userID, reg) - notifier, err := r.getOrCreateNotifier(userID) + notifier, err := r.getOrCreateNotifier(userID, r.registry) if err != nil { return nil, err } @@ -187,7 +187,7 @@ func (r *DefaultMultiTenantManager) removeNotifier(userID string) { delete(r.notifiers, userID) } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() @@ -196,7 +196,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, diff --git a/pkg/ruler/manager_test.go b/pkg/ruler/manager_test.go index e55cb2c1986..765de745b97 100644 --- a/pkg/ruler/manager_test.go +++ b/pkg/ruler/manager_test.go @@ -50,7 +50,7 @@ func TestSyncRuleGroups(t 
*testing.T) { return mgr.(*mockRulesManager).running.Load() }) - // Verify that user rule groups are now cached locally, and notifiers are created + // Verify that user rule groups are now cached locally and notifiers are created. { users, err := m.mapper.users() _, ok := m.notifiers[user] diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index a05d24a2f24..fd7dbb78e17 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -274,7 +274,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { defer rcleanup() defer manager.Stop() - n, err := manager.getOrCreateNotifier("1") + n, err := manager.getOrCreateNotifier("1", manager.registry) require.NoError(t, err) // Loop until notifier discovery syncs up From e4ee8091ba6030d8299239bbd1843e50ba240f34 Mon Sep 17 00:00:00 2001 From: Rohan Gupta Date: Wed, 27 Apr 2022 16:23:11 -0400 Subject: [PATCH 07/14] Stop notifier but do not remove it Signed-off-by: Rohan Gupta --- pkg/ruler/manager.go | 23 ++++++++++++----------- pkg/ruler/ruler_test.go | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index afedfc7bfea..85a0cdcc591 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -107,7 +107,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou go mngr.Stop() delete(r.userManagers, userID) - r.removeNotifier(userID) + r.stopNotifier(userID) r.mapper.cleanupUser(userID) r.lastReloadSuccessful.DeleteLabelValues(userID) r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) @@ -164,39 +164,40 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user // newManager creates a prometheus rule manager wrapped with a user id // configured storage, appendable, notifier, and instrumentation func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) { - // Create a new Prometheus registry and register it within - // our metrics struct 
for the provided user. - reg := prometheus.NewRegistry() - r.userManagerMetrics.AddUserRegistry(userID, reg) - - notifier, err := r.getOrCreateNotifier(userID, r.registry) + notifier, err := r.getOrCreateNotifier(userID) if err != nil { return nil, err } + // Create a new Prometheus registry and register it within + // our metrics struct for the provided user if it doesn't already exist. + reg := prometheus.NewRegistry() + r.userManagerMetrics.AddUserRegistry(userID, reg) + return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } -func (r *DefaultMultiTenantManager) removeNotifier(userID string) { +func (r *DefaultMultiTenantManager) stopNotifier(userID string) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() if n, ok := r.notifiers[userID]; ok { n.stop() } - delete(r.notifiers, userID) } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() n, ok := r.notifiers[userID] if ok { + // When there is a stale user, we stop the notifier but do not remove it + n.run() return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index fd7dbb78e17..a05d24a2f24 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -274,7 +274,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { defer rcleanup() defer manager.Stop() - n, err := manager.getOrCreateNotifier("1", manager.registry) + n, err := manager.getOrCreateNotifier("1") require.NoError(t, err) // Loop 
until notifier discovery syncs up From c2d135a3d442f532d7ac9cdaa47025caf02ba784 Mon Sep 17 00:00:00 2001 From: Rohan Gupta Date: Mon, 9 May 2022 09:54:41 -0400 Subject: [PATCH 08/14] Add delete back to remove notifier as memory leak will still occur Signed-off-by: Rohan Gupta --- pkg/ruler/manager.go | 20 +++++++++++--------- pkg/ruler/ruler_test.go | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 85a0cdcc591..9152b8ce4e2 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -107,7 +107,7 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou go mngr.Stop() delete(r.userManagers, userID) - r.stopNotifier(userID) + r.removeNotifier(userID) r.mapper.cleanupUser(userID) r.lastReloadSuccessful.DeleteLabelValues(userID) r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) @@ -164,29 +164,31 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user // newManager creates a prometheus rule manager wrapped with a user id // configured storage, appendable, notifier, and instrumentation func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) { - notifier, err := r.getOrCreateNotifier(userID) - if err != nil { - return nil, err - } - // Create a new Prometheus registry and register it within // our metrics struct for the provided user if it doesn't already exist. 
reg := prometheus.NewRegistry() r.userManagerMetrics.AddUserRegistry(userID, reg) + notifier, err := r.getOrCreateNotifier(userID, reg) + if err != nil { + return nil, err + } + return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } -func (r *DefaultMultiTenantManager) stopNotifier(userID string) { +func (r *DefaultMultiTenantManager) removeNotifier(userID string) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() if n, ok := r.notifiers[userID]; ok { n.stop() } + + delete(r.notifiers, userID) } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() @@ -197,7 +199,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index a05d24a2f24..fd7dbb78e17 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -274,7 +274,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { defer rcleanup() defer manager.Stop() - n, err := manager.getOrCreateNotifier("1") + n, err := manager.getOrCreateNotifier("1", manager.registry) require.NoError(t, err) // Loop until notifier discovery syncs up From 7387f2db0a28ffb6e3a3da37afd7bcf47b82525e Mon Sep 17 00:00:00 2001 From: Alvin Lin Date: Mon, 30 May 2022 23:37:47 -0700 Subject: [PATCH 09/14] fix integration test failure Signed-off-by: Alvin Lin --- pkg/ruler/manager.go | 16 ++++++++-------- pkg/ruler/ruler_test.go | 2 +- 2 files 
changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 9152b8ce4e2..06defb13fd5 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -164,16 +164,16 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user // newManager creates a prometheus rule manager wrapped with a user id // configured storage, appendable, notifier, and instrumentation func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) { - // Create a new Prometheus registry and register it within - // our metrics struct for the provided user if it doesn't already exist. - reg := prometheus.NewRegistry() - r.userManagerMetrics.AddUserRegistry(userID, reg) - - notifier, err := r.getOrCreateNotifier(userID, reg) + notifier, err := r.getOrCreateNotifier(userID) if err != nil { return nil, err } + // Create a new Prometheus registry and register it within + // our metrics struct for the provided user. 
+ reg := prometheus.NewRegistry() + r.userManagerMetrics.AddUserRegistry(userID, reg) + return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } @@ -188,7 +188,7 @@ func (r *DefaultMultiTenantManager) removeNotifier(userID string) { delete(r.notifiers, userID) } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() @@ -199,7 +199,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManag return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index fd7dbb78e17..a05d24a2f24 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -274,7 +274,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { defer rcleanup() defer manager.Stop() - n, err := manager.getOrCreateNotifier("1", manager.registry) + n, err := manager.getOrCreateNotifier("1") require.NoError(t, err) // Loop until notifier discovery syncs up From 1c85ef2e4fc36d1d344cb4fb86979455bb71dd05 Mon Sep 17 00:00:00 2001 From: Alvin Lin Date: Tue, 31 May 2022 09:45:39 -0700 Subject: [PATCH 10/14] Revert "fix integration test failure" This reverts commit 7387f2db0a28ffb6e3a3da37afd7bcf47b82525e. 
Signed-off-by: Alvin Lin --- pkg/ruler/manager.go | 16 ++++++++-------- pkg/ruler/ruler_test.go | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 06defb13fd5..9152b8ce4e2 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -164,16 +164,16 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user // newManager creates a prometheus rule manager wrapped with a user id // configured storage, appendable, notifier, and instrumentation func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) { - notifier, err := r.getOrCreateNotifier(userID) - if err != nil { - return nil, err - } - // Create a new Prometheus registry and register it within - // our metrics struct for the provided user. + // our metrics struct for the provided user if it doesn't already exist. reg := prometheus.NewRegistry() r.userManagerMetrics.AddUserRegistry(userID, reg) + notifier, err := r.getOrCreateNotifier(userID, reg) + if err != nil { + return nil, err + } + return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil } @@ -188,7 +188,7 @@ func (r *DefaultMultiTenantManager) removeNotifier(userID string) { delete(r.notifiers, userID) } -func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { +func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManagerRegistry prometheus.Registerer) (*notifier.Manager, error) { r.notifiersMtx.Lock() defer r.notifiersMtx.Unlock() @@ -199,7 +199,7 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifie return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) + reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: 
r.cfg.NotificationQueueCapacity, diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 10fe947710a..49b43dfbd0f 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -260,7 +260,7 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { manager := newManager(t, cfg) defer manager.Stop() - n, err := manager.getOrCreateNotifier("1") + n, err := manager.getOrCreateNotifier("1", manager.registry) require.NoError(t, err) // Loop until notifier discovery syncs up From b96f9894cb15e0dd09a0442c4b8b9d58b388b421 Mon Sep 17 00:00:00 2001 From: Alvin Lin Date: Tue, 31 May 2022 19:15:07 -0700 Subject: [PATCH 11/14] Fix integration test TestRulerAlertmanager by exposing Prometheus' ruler notifier metrics Signed-off-by: Alvin Lin --- pkg/ruler/manager.go | 4 +-- pkg/ruler/manager_metrics.go | 69 ++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/pkg/ruler/manager.go b/pkg/ruler/manager.go index 9152b8ce4e2..4527c562cbb 100644 --- a/pkg/ruler/manager.go +++ b/pkg/ruler/manager.go @@ -199,11 +199,9 @@ func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string, userManag return n.notifier, nil } - reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, userManagerRegistry) - reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) n = newRulerNotifier(&notifier.Options{ QueueCapacity: r.cfg.NotificationQueueCapacity, - Registerer: reg, + Registerer: userManagerRegistry, Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) { // Note: The passed-in context comes from the Prometheus notifier // and does *not* contain the userID. 
So it needs to be added to the context diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index 79cd26f7632..64eff4ade4e 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -23,6 +23,14 @@ type ManagerMetrics struct { GroupLastDuration *prometheus.Desc GroupRules *prometheus.Desc GroupLastEvalSamples *prometheus.Desc + + NotificationLatency *prometheus.Desc + NotificationErrors *prometheus.Desc + NotificationSent *prometheus.Desc + NotificationDropped *prometheus.Desc + NotificationQueueLength *prometheus.Desc + NotificationQueueCapacity *prometheus.Desc + AlertmanagersDiscovered *prometheus.Desc } // NewManagerMetrics returns a ManagerMetrics struct @@ -101,6 +109,51 @@ func NewManagerMetrics(disableRuleGroupLabel bool) *ManagerMetrics { commonLabels, nil, ), + + // Prometheus' ruler's notification metrics + NotificationLatency: prometheus.NewDesc( + "q", + "Latency quantiles for sending alert notifications.", + []string{"user"}, + nil, + ), + + NotificationErrors: prometheus.NewDesc( + "cortex_prometheus_notifications_errors_total", + "Total number of errors sending alert notifications.", + []string{"user", "alertmanager"}, + nil, + ), + NotificationSent: prometheus.NewDesc( + "cortex_prometheus_notifications_sent_total", + "Total number of alerts sent.", + []string{"user", "alertmanager"}, + nil, + ), + NotificationDropped: prometheus.NewDesc( + "cortex_prometheus_notifications_dropped_total", + "Total number of alerts dropped due to errors when sending to Alertmanager.", + []string{"user"}, + nil, + ), + NotificationQueueLength: prometheus.NewDesc( + "cortex_prometheus_notifications_queue_length", + "The number of alert notifications in the queue.", + []string{"user"}, + nil, + ), + NotificationQueueCapacity: prometheus.NewDesc( + "cortex_prometheus_notifications_queue_capacity", + "The capacity of the alert notifications queue.", + []string{"user"}, + nil, + ), + AlertmanagersDiscovered: prometheus.NewDesc( + 
"cortex_prometheus_notifications_alertmanagers_discovered", + "The number of alertmanagers discovered and active.", + []string{"user"}, + nil, + ), } } @@ -127,6 +180,14 @@ func (m *ManagerMetrics) Describe(out chan<- *prometheus.Desc) { out <- m.GroupLastDuration out <- m.GroupRules out <- m.GroupLastEvalSamples + + out <- m.NotificationLatency + out <- m.NotificationErrors + out <- m.NotificationSent + out <- m.NotificationDropped + out <- m.NotificationQueueLength + out <- m.NotificationQueueCapacity + out <- m.AlertmanagersDiscovered } // Collect implements the Collector interface @@ -152,4 +213,12 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", labels...) data.SendSumOfGaugesPerUserWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", labels...) data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", labels...) 
+ + data.SendSumOfSummariesPerUser(out, m.NotificationLatency, "prometheus_notifications_latency_seconds") + data.SendSumOfCountersPerUserWithLabels(out, m.NotificationErrors, "prometheus_notifications_errors_total", "alertmanager") + data.SendSumOfCountersPerUserWithLabels(out, m.NotificationSent, "prometheus_notifications_sent_total", "alertmanager") + data.SendSumOfGaugesPerUser(out, m.NotificationDropped, "prometheus_notifications_dropped_total") + data.SendSumOfGaugesPerUser(out, m.NotificationQueueLength, "prometheus_notifications_queue_length") + data.SendSumOfGaugesPerUser(out, m.NotificationQueueCapacity, "prometheus_notifications_queue_capacity") + data.SendSumOfGaugesPerUser(out, m.AlertmanagersDiscovered, "prometheus_notifications_alertmanagers_discovered") } From 0fc5769a9921409f4616963991660c8d847c7ea2 Mon Sep 17 00:00:00 2001 From: Alvin Lin Date: Tue, 31 May 2022 19:55:56 -0700 Subject: [PATCH 12/14] Fix manager_metrics_test Signed-off-by: Alvin Lin --- pkg/ruler/manager_metrics.go | 2 +- pkg/ruler/manager_metrics_test.go | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index 64eff4ade4e..0e6e0def937 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -112,7 +112,7 @@ func NewManagerMetrics(disableRuleGroupLabel bool) *ManagerMetrics { // Prometheus' ruler's notification metrics NotificationLatency: prometheus.NewDesc( - "q", + "cortex_prometheus_notifications_latency_seconds", "Latency quantiles for sending alert notifications.", []string{"user"}, nil, diff --git a/pkg/ruler/manager_metrics_test.go b/pkg/ruler/manager_metrics_test.go index 99f2b096bb7..76799d38925 100644 --- a/pkg/ruler/manager_metrics_test.go +++ b/pkg/ruler/manager_metrics_test.go @@ -34,6 +34,14 @@ cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 1 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 
1000 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000 +# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications. +# TYPE cortex_prometheus_notifications_latency_seconds summary +cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 0 +cortex_prometheus_notifications_latency_seconds_count{user="user1"} 0 +cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 0 +cortex_prometheus_notifications_latency_seconds_count{user="user2"} 0 +cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 0 +cortex_prometheus_notifications_latency_seconds_count{user="user3"} 0 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute. # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1 @@ -153,6 +161,14 @@ func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) { cortex_prometheus_last_evaluation_samples{user="user1"} 2000 cortex_prometheus_last_evaluation_samples{user="user2"} 20000 cortex_prometheus_last_evaluation_samples{user="user3"} 200000 +# HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications. +# TYPE cortex_prometheus_notifications_latency_seconds summary +cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 0 +cortex_prometheus_notifications_latency_seconds_count{user="user1"} 0 +cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 0 +cortex_prometheus_notifications_latency_seconds_count{user="user2"} 0 +cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 0 +cortex_prometheus_notifications_latency_seconds_count{user="user3"} 0 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute. 
# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1 From 36657f5ee954aeb5d06cbfc673b26cf784d5c946 Mon Sep 17 00:00:00 2001 From: Alvin Lin Date: Tue, 31 May 2022 20:07:28 -0700 Subject: [PATCH 13/14] Fix TestNotifierSendsUserIDHeader Signed-off-by: Alvin Lin --- pkg/ruler/ruler_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 49b43dfbd0f..df9310b1dd9 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -275,10 +275,10 @@ func TestNotifierSendsUserIDHeader(t *testing.T) { // Ensure we have metrics in the notifier. assert.NoError(t, prom_testutil.GatherAndCompare(manager.registry.(*prometheus.Registry), strings.NewReader(` - # HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. - # TYPE cortex_prometheus_notifications_dropped_total counter - cortex_prometheus_notifications_dropped_total{user="1"} 0 - `), "cortex_prometheus_notifications_dropped_total")) + # HELP prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. 
+ # TYPE prometheus_notifications_dropped_total counter + prometheus_notifications_dropped_total 0 + `), "prometheus_notifications_dropped_total")) } func TestRuler_Rules(t *testing.T) { From 6795b07d53aa56289128d4843e0d7bbccfef0fac Mon Sep 17 00:00:00 2001 From: Roy Chiang Date: Thu, 9 Jun 2022 13:10:31 -0700 Subject: [PATCH 14/14] update unit tests Signed-off-by: Roy Chiang --- pkg/ruler/manager_metrics.go | 2 +- pkg/ruler/manager_metrics_test.go | 186 ++++++++++++++++++++++++++---- 2 files changed, 163 insertions(+), 25 deletions(-) diff --git a/pkg/ruler/manager_metrics.go b/pkg/ruler/manager_metrics.go index 0e6e0def937..7bb3d43c90a 100644 --- a/pkg/ruler/manager_metrics.go +++ b/pkg/ruler/manager_metrics.go @@ -217,7 +217,7 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfSummariesPerUser(out, m.NotificationLatency, "prometheus_notifications_latency_seconds") data.SendSumOfCountersPerUserWithLabels(out, m.NotificationErrors, "prometheus_notifications_errors_total", "alertmanager") data.SendSumOfCountersPerUserWithLabels(out, m.NotificationSent, "prometheus_notifications_sent_total", "alertmanager") - data.SendSumOfGaugesPerUser(out, m.NotificationDropped, "prometheus_notifications_dropped_total") + data.SendSumOfCountersPerUser(out, m.NotificationDropped, "prometheus_notifications_dropped_total") data.SendSumOfGaugesPerUser(out, m.NotificationQueueLength, "prometheus_notifications_queue_length") data.SendSumOfGaugesPerUser(out, m.NotificationQueueCapacity, "prometheus_notifications_queue_capacity") data.SendSumOfGaugesPerUser(out, m.AlertmanagersDiscovered, "prometheus_notifications_alertmanagers_discovered") diff --git a/pkg/ruler/manager_metrics_test.go b/pkg/ruler/manager_metrics_test.go index 76799d38925..d0ac38c81a9 100644 --- a/pkg/ruler/manager_metrics_test.go +++ b/pkg/ruler/manager_metrics_test.go @@ -34,14 +34,53 @@ cortex_prometheus_last_evaluation_samples{rule_group="group_one",user="user3"} 1 
cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user1"} 1000 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user2"} 10000 cortex_prometheus_last_evaluation_samples{rule_group="group_two",user="user3"} 100000 +# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active. +# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge +cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1 +cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10 +cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100 +# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. +# TYPE cortex_prometheus_notifications_dropped_total counter +cortex_prometheus_notifications_dropped_total{user="user1"} 1 +cortex_prometheus_notifications_dropped_total{user="user2"} 10 +cortex_prometheus_notifications_dropped_total{user="user3"} 100 +# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications. +# TYPE cortex_prometheus_notifications_errors_total counter +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100 # HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications. 
# TYPE cortex_prometheus_notifications_latency_seconds summary -cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 0 -cortex_prometheus_notifications_latency_seconds_count{user="user1"} 0 -cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 0 -cortex_prometheus_notifications_latency_seconds_count{user="user2"} 0 -cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 0 -cortex_prometheus_notifications_latency_seconds_count{user="user3"} 0 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1 +cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10 +cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10 +cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100 +cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100 +cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1 +# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue. 
+# TYPE cortex_prometheus_notifications_queue_capacity gauge +cortex_prometheus_notifications_queue_capacity{user="user1"} 1 +cortex_prometheus_notifications_queue_capacity{user="user2"} 10 +cortex_prometheus_notifications_queue_capacity{user="user3"} 100 +# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue. +# TYPE cortex_prometheus_notifications_queue_length gauge +cortex_prometheus_notifications_queue_length{user="user1"} 1 +cortex_prometheus_notifications_queue_length{user="user2"} 10 +cortex_prometheus_notifications_queue_length{user="user3"} 100 +# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent. +# TYPE cortex_prometheus_notifications_sent_total counter +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute. # TYPE cortex_prometheus_rule_evaluation_duration_seconds summary cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1 @@ -161,14 +200,53 @@ func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) { cortex_prometheus_last_evaluation_samples{user="user1"} 2000 cortex_prometheus_last_evaluation_samples{user="user2"} 20000 cortex_prometheus_last_evaluation_samples{user="user3"} 200000 +# HELP cortex_prometheus_notifications_alertmanagers_discovered The number of alertmanagers discovered and active. 
+# TYPE cortex_prometheus_notifications_alertmanagers_discovered gauge +cortex_prometheus_notifications_alertmanagers_discovered{user="user1"} 1 +cortex_prometheus_notifications_alertmanagers_discovered{user="user2"} 10 +cortex_prometheus_notifications_alertmanagers_discovered{user="user3"} 100 +# HELP cortex_prometheus_notifications_dropped_total Total number of alerts dropped due to errors when sending to Alertmanager. +# TYPE cortex_prometheus_notifications_dropped_total counter +cortex_prometheus_notifications_dropped_total{user="user1"} 1 +cortex_prometheus_notifications_dropped_total{user="user2"} 10 +cortex_prometheus_notifications_dropped_total{user="user3"} 100 +# HELP cortex_prometheus_notifications_errors_total Total number of errors sending alert notifications. +# TYPE cortex_prometheus_notifications_errors_total counter +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_errors_total{alertmanager="alertmanager_1",user="user3"} 100 # HELP cortex_prometheus_notifications_latency_seconds Latency quantiles for sending alert notifications. 
# TYPE cortex_prometheus_notifications_latency_seconds summary -cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 0 -cortex_prometheus_notifications_latency_seconds_count{user="user1"} 0 -cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 0 -cortex_prometheus_notifications_latency_seconds_count{user="user2"} 0 -cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 0 -cortex_prometheus_notifications_latency_seconds_count{user="user3"} 0 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.5"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.9"} 1 +cortex_prometheus_notifications_latency_seconds{user="user1",quantile="0.99"} 1 +cortex_prometheus_notifications_latency_seconds_sum{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds_count{user="user1"} 1 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.5"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.9"} 10 +cortex_prometheus_notifications_latency_seconds{user="user2",quantile="0.99"} 10 +cortex_prometheus_notifications_latency_seconds_sum{user="user2"} 10 +cortex_prometheus_notifications_latency_seconds_count{user="user2"} 1 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.5"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.9"} 100 +cortex_prometheus_notifications_latency_seconds{user="user3",quantile="0.99"} 100 +cortex_prometheus_notifications_latency_seconds_sum{user="user3"} 100 +cortex_prometheus_notifications_latency_seconds_count{user="user3"} 1 +# HELP cortex_prometheus_notifications_queue_capacity The capacity of the alert notifications queue. 
+# TYPE cortex_prometheus_notifications_queue_capacity gauge +cortex_prometheus_notifications_queue_capacity{user="user1"} 1 +cortex_prometheus_notifications_queue_capacity{user="user2"} 10 +cortex_prometheus_notifications_queue_capacity{user="user3"} 100 +# HELP cortex_prometheus_notifications_queue_length The number of alert notifications in the queue. +# TYPE cortex_prometheus_notifications_queue_length gauge +cortex_prometheus_notifications_queue_length{user="user1"} 1 +cortex_prometheus_notifications_queue_length{user="user2"} 10 +cortex_prometheus_notifications_queue_length{user="user3"} 100 +# HELP cortex_prometheus_notifications_sent_total Total number of alerts sent. +# TYPE cortex_prometheus_notifications_sent_total counter +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user1"} 1 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user2"} 10 +cortex_prometheus_notifications_sent_total{alertmanager="alertmanager_1",user="user3"} 100 # HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute. 
# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1 @@ -277,22 +355,37 @@ func populateManager(base float64) *prometheus.Registry { metrics.groupLastEvalSamples.WithLabelValues("group_one").Add(base * 1000) metrics.groupLastEvalSamples.WithLabelValues("group_two").Add(base * 1000) + metrics.notificationsLatency.WithLabelValues("alertmanager_1").Observe(base) + metrics.notificationsErrors.WithLabelValues("alertmanager_1").Add(base) + metrics.notificationsSent.WithLabelValues("alertmanager_1").Add(base) + metrics.notificationsDropped.Add(base) + metrics.notificationsQueueLength.Set(base) + metrics.notificationsQueueCapacity.Set(base) + metrics.notificationsAlertmanagersDiscovered.Set(base) return r } // Copied from github.com/prometheus/rules/manager.go +// and github.com/prometheus/notifier/notifier.go type groupMetrics struct { - evalDuration prometheus.Summary - iterationDuration prometheus.Summary - iterationsMissed *prometheus.CounterVec - iterationsScheduled *prometheus.CounterVec - evalTotal *prometheus.CounterVec - evalFailures *prometheus.CounterVec - groupInterval *prometheus.GaugeVec - groupLastEvalTime *prometheus.GaugeVec - groupLastDuration *prometheus.GaugeVec - groupRules *prometheus.GaugeVec - groupLastEvalSamples *prometheus.GaugeVec + evalDuration prometheus.Summary + iterationDuration prometheus.Summary + iterationsMissed *prometheus.CounterVec + iterationsScheduled *prometheus.CounterVec + evalTotal *prometheus.CounterVec + evalFailures *prometheus.CounterVec + groupInterval *prometheus.GaugeVec + groupLastEvalTime *prometheus.GaugeVec + groupLastDuration *prometheus.GaugeVec + groupRules *prometheus.GaugeVec + groupLastEvalSamples *prometheus.GaugeVec + notificationsLatency *prometheus.SummaryVec + notificationsErrors *prometheus.CounterVec + notificationsSent *prometheus.CounterVec + notificationsDropped prometheus.Counter + notificationsQueueLength 
prometheus.Gauge + notificationsQueueCapacity prometheus.Gauge + notificationsAlertmanagersDiscovered prometheus.Gauge } func newGroupMetrics(r prometheus.Registerer) *groupMetrics { @@ -371,8 +464,53 @@ func newGroupMetrics(r prometheus.Registerer) *groupMetrics { }, []string{"rule_group"}, ), + notificationsLatency: promauto.With(r).NewSummaryVec( + prometheus.SummaryOpts{ + Name: "prometheus_notifications_latency_seconds", + Help: "Latency quantiles for sending alert notifications.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }, + []string{"alertmanager"}, + ), + notificationsErrors: promauto.With(r).NewCounterVec( + prometheus.CounterOpts{ + Name: "prometheus_notifications_errors_total", + Help: "Total number of errors sending alert notifications.", + }, + []string{"alertmanager"}, + ), + notificationsSent: promauto.With(r).NewCounterVec( + prometheus.CounterOpts{ + Name: "prometheus_notifications_sent_total", + Help: "Total number of alerts sent.", + }, + []string{"alertmanager"}, + ), + notificationsDropped: promauto.With(r).NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_notifications_dropped_total", + Help: "Total number of alerts dropped due to errors when sending to Alertmanager.", + }, + ), + notificationsQueueLength: promauto.With(r).NewGauge( + prometheus.GaugeOpts{ + Name: "prometheus_notifications_queue_length", + Help: "The number of alert notifications in the queue.", + }, + ), + notificationsQueueCapacity: promauto.With(r).NewGauge( + prometheus.GaugeOpts{ + Name: "prometheus_notifications_queue_capacity", + Help: "The capacity of the alert notifications queue.", + }, + ), + notificationsAlertmanagersDiscovered: promauto.With(r).NewGauge( + prometheus.GaugeOpts{ + Name: "prometheus_notifications_alertmanagers_discovered", + Help: "The number of alertmanagers discovered and active.", + }, + ), } - return m }