From 1ccc308d84cd1042aa9d992ab84532ddbb21db87 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 01:31:44 -0500
Subject: [PATCH 01/38] feat(network): IPv6 disable_ipv6 sysctl detection
 monitor (Task #17205)

Detection-only monitor (type network-ipv6-sysctl) reading
/proc/sys/net/ipv6/conf/{all,default,<iface>}/disable_ipv6 and emitting
IPv6SysctlMisconfigured when IPv6 is disabled but expected enabled.
Self-registers via init(); cmd wiring owned by #17209.

Adds table-driven unit tests (94.5% coverage) with t.TempDir() /proc
fixtures plus static testdata fixtures.
---
 pkg/monitors/network/ipv6_sysctl.go           | 349 ++++++++
 pkg/monitors/network/ipv6_sysctl_test.go      | 749 ++++++++++++++++++
 .../proc/sys/net/ipv6/conf/all/disable_ipv6   |   1 +
 .../sys/net/ipv6/conf/default/disable_ipv6    |   1 +
 4 files changed, 1100 insertions(+)
 create mode 100644 pkg/monitors/network/ipv6_sysctl.go
 create mode 100644 pkg/monitors/network/ipv6_sysctl_test.go
 create mode 100644 pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/all/disable_ipv6
 create mode 100644 pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/default/disable_ipv6
diff --git a/pkg/monitors/network/ipv6_sysctl.go b/pkg/monitors/network/ipv6_sysctl.go
new file mode 100644
index 0000000..92ce817
--- /dev/null
+++ b/pkg/monitors/network/ipv6_sysctl.go
@@ -0,0 +1,349 @@
+// Package network provides network health monitoring capabilities.
+package network
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"slices"
+	"strings"
+
+	"github.com/supporttools/node-doctor/pkg/monitors"
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+const (
+	// Default configuration values for IPv6 sysctl monitor.
+	defaultIPv6ExpectEnabled       = true
+	defaultIPv6CheckPerInterface   = false
+	defaultIPv6SysctlProcPath      = "/proc"
+	ipv6AllDisableSysctlPath       = "sys/net/ipv6/conf/all/disable_ipv6"
+	ipv6DefaultDisableSysctlPath   = "sys/net/ipv6/conf/default/disable_ipv6"
+	ipv6PerIfaceDisableGlobPattern = "sys/net/ipv6/conf/*/disable_ipv6"
+)
+
+// defaultIPv6SkipInterfaces are interfaces that are excluded from per-interface
+// disable_ipv6 checks. "all" and "default" are the global pseudo-interfaces and
+// are checked separately; "lo" is the loopback and intentionally has IPv6
+// disabled on some hardened images.
+var defaultIPv6SkipInterfaces = []string{"all", "default", "lo"}
+
+// IPv6SysctlConfig holds configuration for the IPv6 sysctl monitor.
+type IPv6SysctlConfig struct {
+	// ExpectIPv6Enabled controls severity. When true, disable_ipv6=1 is treated
+	// as a misconfiguration (warning). When false, the value is recorded but
+	// not flagged.
+	ExpectIPv6Enabled bool
+	// CheckPerInterface enables scanning per-interface disable_ipv6 settings.
+	CheckPerInterface bool
+	// Interfaces, when non-empty, restricts per-interface checks to these
+	// interface names. Empty means check every interface discovered via glob.
+	Interfaces []string
+	// SkipInterfaces lists interface names to exclude from per-interface
+	// checks. Defaults to {"all", "default", "lo"}.
+	SkipInterfaces []string
+	// ProcPath is the base path for the proc filesystem. Defaults to "/proc";
+	// override with "/host/proc" for containerized deployments.
+	ProcPath string
+}
+
+// IPv6SysctlMonitor monitors IPv6 sysctls relevant to Kubernetes networking.
+// This monitor is detection-only and does not modify any sysctls.
+type IPv6SysctlMonitor struct {
+	name   string
+	config *IPv6SysctlConfig
+
+	*monitors.BaseMonitor
+}
+
+// init registers the IPv6 sysctl monitor with the monitor registry.
+func init() {
+	monitors.MustRegister(monitors.MonitorInfo{
+		Type:        "network-ipv6-sysctl",
+		Factory:     NewIPv6SysctlMonitor,
+		Validator:   ValidateIPv6SysctlConfig,
+		Description: "Detection-only monitor for IPv6 disable_ipv6 sysctls (does not modify settings)",
+		DefaultConfig: &types.MonitorConfig{
+			Name:           "ipv6-sysctl-check",
+			Type:           "network-ipv6-sysctl",
+			Enabled:        true,
+			IntervalString: "60s",
+			TimeoutString:  "5s",
+			Config: map[string]any{
+				"expectIPv6Enabled": true,
+				"checkPerInterface": false,
+				"procPath":          "/proc",
+			},
+		},
+	})
+}
+
+// NewIPv6SysctlMonitor creates a new IPv6 sysctl monitor instance.
+func NewIPv6SysctlMonitor(ctx context.Context, config types.MonitorConfig) (types.Monitor, error) {
+	cfg, err := parseIPv6SysctlConfig(config.Config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse ipv6 sysctl config: %w", err)
+	}
+
+	baseMonitor, err := monitors.NewBaseMonitor(config.Name, config.Interval, config.Timeout)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create base monitor: %w", err)
+	}
+
+	monitor := &IPv6SysctlMonitor{
+		name:        config.Name,
+		config:      cfg,
+		BaseMonitor: baseMonitor,
+	}
+
+	if err := baseMonitor.SetCheckFunc(monitor.checkIPv6Sysctl); err != nil {
+		return nil, fmt.Errorf("failed to set check function: %w", err)
+	}
+
+	return monitor, nil
+}
+
+// parseIPv6SysctlConfig parses configuration from a generic map.
+func parseIPv6SysctlConfig(configMap map[string]any) (*IPv6SysctlConfig, error) {
+	config := &IPv6SysctlConfig{
+		ExpectIPv6Enabled: defaultIPv6ExpectEnabled,
+		CheckPerInterface: defaultIPv6CheckPerInterface,
+		ProcPath:          defaultIPv6SysctlProcPath,
+		SkipInterfaces:    append([]string(nil), defaultIPv6SkipInterfaces...),
+	}
+
+	if configMap == nil {
+		return config, nil
+	}
+
+	if v, ok := configMap["expectIPv6Enabled"]; ok {
+		boolVal, ok := v.(bool)
+		if !ok {
+			return nil, fmt.Errorf("expectIPv6Enabled must be a boolean, got %T", v)
+		}
+		config.ExpectIPv6Enabled = boolVal
+	}
+
+	if v, ok := configMap["checkPerInterface"]; ok {
+		boolVal, ok := v.(bool)
+		if !ok {
+			return nil, fmt.Errorf("checkPerInterface must be a boolean, got %T", v)
+		}
+		config.CheckPerInterface = boolVal
+	}
+
+	if v, ok := configMap["interfaces"]; ok {
+		ifaces, err := parseStringList(v, "interfaces")
+		if err != nil {
+			return nil, err
+		}
+		config.Interfaces = ifaces
+	}
+
+	if v, ok := configMap["skipInterfaces"]; ok {
+		ifaces, err := parseStringList(v, "skipInterfaces")
+		if err != nil {
+			return nil, err
+		}
+		// Explicit override replaces the defaults so operators can opt back
+		// into checking lo if desired.
+		config.SkipInterfaces = ifaces
+	}
+
+	if v, ok := configMap["procPath"]; ok {
+		strVal, ok := v.(string)
+		if !ok {
+			return nil, fmt.Errorf("procPath must be a string, got %T", v)
+		}
+		config.ProcPath = strVal
+	}
+
+	return config, nil
+}
+
+// parseStringList accepts either []string or []any (where each element is a
+// string) from a config map. The fieldName is used for error messages.
+func parseStringList(v any, fieldName string) ([]string, error) {
+	switch val := v.(type) {
+	case []string:
+		return val, nil
+	case []any:
+		out := make([]string, 0, len(val))
+		for _, item := range val {
+			strVal, ok := item.(string)
+			if !ok {
+				return nil, fmt.Errorf("%s must be a list of strings, got %T element", fieldName, item)
+			}
+			out = append(out, strVal)
+		}
+		return out, nil
+	default:
+		return nil, fmt.Errorf("%s must be a list of strings, got %T", fieldName, v)
+	}
+}
+
+// ValidateIPv6SysctlConfig validates the IPv6 sysctl monitor configuration.
+func ValidateIPv6SysctlConfig(config types.MonitorConfig) error {
+	_, err := parseIPv6SysctlConfig(config.Config)
+	return err
+}
+
+// checkIPv6Sysctl performs the IPv6 sysctl health check.
+func (m *IPv6SysctlMonitor) checkIPv6Sysctl(ctx context.Context) (*types.Status, error) {
+	status := types.NewStatus(m.name)
+
+	var findings []string
+
+	allPath := filepath.Join(m.config.ProcPath, ipv6AllDisableSysctlPath)
+	defaultPath := filepath.Join(m.config.ProcPath, ipv6DefaultDisableSysctlPath)
+
+	m.checkScopedDisableIPv6(status, "all", allPath, &findings)
+	m.checkScopedDisableIPv6(status, "default", defaultPath, &findings)
+
+	if m.config.CheckPerInterface {
+		ifaceFindings := m.checkPerInterfaceDisableIPv6(status)
+		findings = append(findings, ifaceFindings...)
+	}
+
+	if len(findings) > 0 {
+		status.AddCondition(types.NewCondition(
+			"IPv6SysctlMisconfigured",
+			types.ConditionTrue,
+			"DisableIPv6Set",
+			fmt.Sprintf("IPv6 sysctls flagged: %s", strings.Join(findings, ", ")),
+		))
+	} else {
+		status.AddCondition(types.NewCondition(
+			"IPv6SysctlMisconfigured",
+			types.ConditionFalse,
+			"IPv6SysctlsHealthy",
+			"All checked IPv6 disable_ipv6 sysctls match expectations",
+		))
+		status.AddEvent(types.NewEvent(
+			types.EventInfo,
+			"IPv6SysctlsHealthy",
+			"IPv6 disable_ipv6 sysctls are configured as expected",
+		))
+	}
+
+	return status, nil
+}
+
+// checkScopedDisableIPv6 reads a single all/default disable_ipv6 file. Read
+// errors are reported as warnings (the IPv6 stack may legitimately be absent
+// on hardened kernels), and findings are appended to the supplied slice when
+// the value is set and ExpectIPv6Enabled is true.
+func (m *IPv6SysctlMonitor) checkScopedDisableIPv6(status *types.Status, scope, path string, findings *[]string) {
+	disabled, err := readSysctlBool(path)
+	if err != nil {
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6SysctlReadError",
+			fmt.Sprintf("Failed to read net.ipv6.conf.%s.disable_ipv6 from %s: %v", scope, path, err),
+		))
+		*findings = append(*findings, fmt.Sprintf("net.ipv6.conf.%s.disable_ipv6 (unreadable)", scope))
+		return
+	}
+
+	if !disabled {
+		return
+	}
+
+	setting := fmt.Sprintf("net.ipv6.conf.%s.disable_ipv6=1", scope)
+	if m.config.ExpectIPv6Enabled {
+		*findings = append(*findings, setting)
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6Disabled",
+			fmt.Sprintf("IPv6 is disabled (%s) on scope %q. "+
+				"If this cluster expects IPv6 connectivity, this monitor would block "+
+				"IPv6 pod networking. This monitor is detection-only and does not modify "+
+				"sysctls. To enable: sysctl -w %s", setting, scope, strings.Replace(setting, "=1", "=0", 1)),
+		))
+	} else {
+		status.AddEvent(types.NewEvent(
+			types.EventInfo,
+			"IPv6DisabledExpected",
+			fmt.Sprintf("IPv6 disabled on scope %q (%s); expectIPv6Enabled=false so no action required", scope, setting),
+		))
+	}
+}
+
+// checkPerInterfaceDisableIPv6 globs per-interface disable_ipv6 sysctls and
+// returns descriptions of interfaces with disable_ipv6=1 (when expected to be
+// enabled).
+func (m *IPv6SysctlMonitor) checkPerInterfaceDisableIPv6(status *types.Status) []string {
+	var disabled []string
+
+	pattern := filepath.Join(m.config.ProcPath, ipv6PerIfaceDisableGlobPattern)
+	matches, err := filepath.Glob(pattern)
+	if err != nil {
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6SysctlGlobError",
+			fmt.Sprintf("Failed to glob per-interface disable_ipv6 files: %v", err),
+		))
+		return nil
+	}
+
+	skip := m.config.SkipInterfaces
+	if skip == nil {
+		skip = defaultIPv6SkipInterfaces
+	}
+
+	for _, match := range matches {
+		ifaceName := extractInterfaceName(match)
+		if ifaceName == "" {
+			continue
+		}
+		if slices.Contains(skip, ifaceName) {
+			continue
+		}
+		if len(m.config.Interfaces) > 0 && !slices.Contains(m.config.Interfaces, ifaceName) {
+			continue
+		}
+
+		isDisabled, err := readSysctlBool(match)
+		if err != nil {
+			// Skip unreadable interfaces silently — per-interface files race
+			// with link teardown and noisy errors are not actionable.
+			continue
+		}
+
+		if !isDisabled {
+			continue
+		}
+
+		setting := fmt.Sprintf("net.ipv6.conf.%s.disable_ipv6=1", ifaceName)
+		if m.config.ExpectIPv6Enabled {
+			disabled = append(disabled, setting)
+			status.AddEvent(types.NewEvent(
+				types.EventWarning,
+				"InterfaceIPv6Disabled",
+				fmt.Sprintf("IPv6 disabled on interface %s (%s). Detection-only — no sysctl change applied.",
+					ifaceName, setting),
+			))
+		} else {
+			status.AddEvent(types.NewEvent(
+				types.EventInfo,
+				"InterfaceIPv6DisabledExpected",
+				fmt.Sprintf("IPv6 disabled on interface %s (%s); expectIPv6Enabled=false so no action required",
+					ifaceName, setting),
+			))
+		}
+	}
+
+	return disabled
+}
+
+// readSysctlBool reads a sysctl-style file and returns true when its content
+// (trimmed of whitespace) is "1". Any other value is treated as false. Errors
+// are propagated.
+func readSysctlBool(path string) (bool, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return false, fmt.Errorf("failed to read %s: %w", path, err)
+	}
+	return strings.TrimSpace(string(data)) == "1", nil
+}
diff --git a/pkg/monitors/network/ipv6_sysctl_test.go b/pkg/monitors/network/ipv6_sysctl_test.go
new file mode 100644
index 0000000..9219e77
--- /dev/null
+++ b/pkg/monitors/network/ipv6_sysctl_test.go
@@ -0,0 +1,749 @@
+package network
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+func TestParseIPv6SysctlConfig(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  map[string]any
+		want    *IPv6SysctlConfig
+		wantErr bool
+	}{
+		{
+			name:   "nil config - use defaults",
+			config: nil,
+			want: &IPv6SysctlConfig{
+				ExpectIPv6Enabled: defaultIPv6ExpectEnabled,
+				CheckPerInterface: defaultIPv6CheckPerInterface,
+				ProcPath:          defaultIPv6SysctlProcPath,
+				SkipInterfaces:    defaultIPv6SkipInterfaces,
+			},
+			wantErr: false,
+		},
+		{
+			name:   "empty config - use defaults",
+			config: map[string]any{},
+			want: &IPv6SysctlConfig{
+				ExpectIPv6Enabled: defaultIPv6ExpectEnabled,
+				CheckPerInterface: defaultIPv6CheckPerInterface,
+				ProcPath:          defaultIPv6SysctlProcPath,
+				SkipInterfaces:    defaultIPv6SkipInterfaces,
+			},
+			wantErr: false,
+		},
+		{
+			name: "custom values",
+			config: map[string]any{
+				"expectIPv6Enabled": false,
+				"checkPerInterface": true,
+				"procPath":          "/host/proc",
+			},
+			want: &IPv6SysctlConfig{
+				ExpectIPv6Enabled: false,
+				CheckPerInterface: true,
+				ProcPath:          "/host/proc",
+				SkipInterfaces:    defaultIPv6SkipInterfaces,
+			},
+			wantErr: false,
+		},
+		{
+			name: "with interfaces list",
+			config: map[string]any{
+				"interfaces": []any{"eth0", "eth1"},
+			},
+			want: &IPv6SysctlConfig{
+				ExpectIPv6Enabled: defaultIPv6ExpectEnabled,
+				CheckPerInterface: defaultIPv6CheckPerInterface,
+				ProcPath:          defaultIPv6SysctlProcPath,
+				SkipInterfaces:    defaultIPv6SkipInterfaces,
+				Interfaces:        []string{"eth0", "eth1"},
+			},
+			wantErr: false,
+		},
+		{
+			name: "with interfaces list as []string",
+			config: map[string]any{
+				"interfaces": []string{"eth0"},
+			},
+			want: &IPv6SysctlConfig{
+				ExpectIPv6Enabled: defaultIPv6ExpectEnabled,
+				CheckPerInterface: defaultIPv6CheckPerInterface,
+				ProcPath:          defaultIPv6SysctlProcPath,
+				SkipInterfaces:    defaultIPv6SkipInterfaces,
+				Interfaces:        []string{"eth0"},
+			},
+			wantErr: false,
+		},
+		{
+			name: "skipInterfaces overrides defaults",
+			config: map[string]any{
+				"skipInterfaces": []any{"all", "default"},
+			},
+			want: &IPv6SysctlConfig{
+				ExpectIPv6Enabled: defaultIPv6ExpectEnabled,
+				CheckPerInterface: defaultIPv6CheckPerInterface,
+				ProcPath:          defaultIPv6SysctlProcPath,
+				SkipInterfaces:    []string{"all", "default"},
+			},
+			wantErr: false,
+		},
+		{
+			name: "invalid expectIPv6Enabled type",
+			config: map[string]any{
+				"expectIPv6Enabled": "yes",
+			},
+			wantErr: true,
+		},
+		{
+			name: "invalid checkPerInterface type",
+			config: map[string]any{
+				"checkPerInterface": 1,
+			},
+			wantErr: true,
+		},
+		{
+			name: "invalid procPath type",
+			config: map[string]any{
+				"procPath": 123,
+			},
+			wantErr: true,
+		},
+		{
+			name: "invalid interfaces type",
+			config: map[string]any{
+				"interfaces": "eth0",
+			},
+			wantErr: true,
+		},
+		{
+			name: "invalid interfaces element type",
+			config: map[string]any{
+				"interfaces": []any{123},
+			},
+			wantErr: true,
+		},
+		{
+			name: "invalid skipInterfaces element type",
+			config: map[string]any{
+				"skipInterfaces": []any{true},
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := parseIPv6SysctlConfig(tt.config)
+
+			if (err != nil) != tt.wantErr {
+				t.Errorf("parseIPv6SysctlConfig() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+
+			if tt.wantErr {
+				return
+			}
+
+			if got.ExpectIPv6Enabled != tt.want.ExpectIPv6Enabled {
+				t.Errorf("ExpectIPv6Enabled = %v, want %v", got.ExpectIPv6Enabled, tt.want.ExpectIPv6Enabled)
+			}
+			if got.CheckPerInterface != tt.want.CheckPerInterface {
+				t.Errorf("CheckPerInterface = %v, want %v", got.CheckPerInterface, tt.want.CheckPerInterface)
+			}
+			if got.ProcPath != tt.want.ProcPath {
+				t.Errorf("ProcPath = %v, want %v", got.ProcPath, tt.want.ProcPath)
+			}
+			if !equalStringSlice(got.Interfaces, tt.want.Interfaces) {
+				t.Errorf("Interfaces = %v, want %v", got.Interfaces, tt.want.Interfaces)
+			}
+			if !equalStringSlice(got.SkipInterfaces, tt.want.SkipInterfaces) {
+				t.Errorf("SkipInterfaces = %v, want %v", got.SkipInterfaces, tt.want.SkipInterfaces)
+			}
+		})
+	}
+}
+
+func equalStringSlice(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i := range a {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func TestValidateIPv6SysctlConfig(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  map[string]any
+		wantErr bool
+	}{
+		{
+			name:    "valid config",
+			config:  map[string]any{"expectIPv6Enabled": true},
+			wantErr: false,
+		},
+		{
+			name:    "invalid config",
+			config:  map[string]any{"expectIPv6Enabled": "yes"},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			monitorConfig := types.MonitorConfig{
+				Name:     "test-ipv6-sysctl",
+				Type:     "network-ipv6-sysctl",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config:   tt.config,
+			}
+			err := ValidateIPv6SysctlConfig(monitorConfig)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("ValidateIPv6SysctlConfig() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+func TestNewIPv6SysctlMonitor(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  types.MonitorConfig
+		wantErr bool
+	}{
+		{
+			name: "valid config",
+			config: types.MonitorConfig{
+				Name:     "test-ipv6-sysctl",
+				Type:     "network-ipv6-sysctl",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config: map[string]any{
+					"expectIPv6Enabled": true,
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name: "invalid config - bad type",
+			config: types.MonitorConfig{
+				Name:     "test-ipv6-sysctl",
+				Type:     "network-ipv6-sysctl",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config: map[string]any{
+					"expectIPv6Enabled": "invalid",
+				},
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			monitor, err := NewIPv6SysctlMonitor(context.Background(), tt.config)
+
+			if (err != nil) != tt.wantErr {
+				t.Errorf("NewIPv6SysctlMonitor() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+
+			if !tt.wantErr && monitor == nil {
+				t.Error("NewIPv6SysctlMonitor() returned nil monitor")
+			}
+		})
+	}
+}
+
+// createMockIPv6ProcFS creates a mock /proc/sys/net/ipv6/conf directory tree for
+// testing. The allValue/defaultValue strings populate all/disable_ipv6 and
+// default/disable_ipv6 respectively (empty string skips the file). The
+// interfaces map populates per-interface disable_ipv6 files.
+func createMockIPv6ProcFS(t *testing.T, allValue, defaultValue string, interfaces map[string]string) string {
+	t.Helper()
+
+	procDir := t.TempDir()
+
+	writeScope := func(scope, value string) {
+		if value == "" {
+			return
+		}
+		dir := filepath.Join(procDir, "sys", "net", "ipv6", "conf", scope)
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			t.Fatalf("Failed to create %s dir: %v", scope, err)
+		}
+		if err := os.WriteFile(filepath.Join(dir, "disable_ipv6"), []byte(value+"\n"), 0644); err != nil {
+			t.Fatalf("Failed to write disable_ipv6 for %s: %v", scope, err)
+		}
+	}
+
+	writeScope("all", allValue)
+	writeScope("default", defaultValue)
+
+	for ifaceName, value := range interfaces {
+		writeScope(ifaceName, value)
+	}
+
+	return procDir
+}
+
+// findCondition returns the IPv6SysctlMisconfigured condition, or nil if absent.
+func findIPv6Condition(status *types.Status) *types.Condition {
+	for i := range status.Conditions {
+		if status.Conditions[i].Type == "IPv6SysctlMisconfigured" {
+			return &status.Conditions[i]
+		}
+	}
+	return nil
+}
+
+func hasEventReason(status *types.Status, reason string) bool {
+	for _, event := range status.Events {
+		if event.Reason == reason {
+			return true
+		}
+	}
+	return false
+}
+
+func TestCheckIPv6Sysctl_AllEnabled(t *testing.T) {
+	procDir := createMockIPv6ProcFS(t, "0", "0", nil)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6SysctlMisconfigured=False, got %s", cond.Status)
+	}
+	if !hasEventReason(status, "IPv6SysctlsHealthy") {
+		t.Error("Expected IPv6SysctlsHealthy event, but not found")
+	}
+}
+
+func TestCheckIPv6Sysctl_AllDisabled(t *testing.T) {
+	procDir := createMockIPv6ProcFS(t, "1", "0", nil)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6SysctlMisconfigured=True, got %s", cond.Status)
+	}
+	if cond.Reason != "DisableIPv6Set" {
+		t.Errorf("Expected reason DisableIPv6Set, got %s", cond.Reason)
+	}
+	if !hasEventReason(status, "IPv6Disabled") {
+		t.Error("Expected IPv6Disabled warning event, but not found")
+	}
+	for _, event := range status.Events {
+		if event.Reason == "IPv6Disabled" && event.Severity != types.EventWarning {
+			t.Errorf("Expected Warning severity for IPv6Disabled, got %s", event.Severity)
+		}
+	}
+}
+
+func TestCheckIPv6Sysctl_DefaultDisabled(t *testing.T) {
+	procDir := createMockIPv6ProcFS(t, "0", "1", nil)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6SysctlMisconfigured=True (default disabled), got %s", cond.Status)
+	}
+	if !hasEventReason(status, "IPv6Disabled") {
+		t.Error("Expected IPv6Disabled warning event, but not found")
+	}
+}
+
+func TestCheckIPv6Sysctl_ExpectDisabledSuppressesSeverity(t *testing.T) {
+	procDir := createMockIPv6ProcFS(t, "1", "1", nil)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: false,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	// With expectIPv6Enabled=false, disable_ipv6=1 is not a finding.
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6SysctlMisconfigured=False (expect disabled), got %s", cond.Status)
+	}
+	// Should emit informational events, not warnings.
+	if !hasEventReason(status, "IPv6DisabledExpected") {
+		t.Error("Expected IPv6DisabledExpected info event, but not found")
+	}
+	if hasEventReason(status, "IPv6Disabled") {
+		t.Error("Did not expect IPv6Disabled warning event when expectIPv6Enabled=false")
+	}
+}
+
+func TestCheckIPv6Sysctl_PerInterfaceMix(t *testing.T) {
+	interfaces := map[string]string{
+		"eth0": "0",
+		"eth1": "1",
+		"lo":   "1", // skipped by default
+	}
+	procDir := createMockIPv6ProcFS(t, "0", "0", interfaces)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			CheckPerInterface: true,
+			SkipInterfaces:    defaultIPv6SkipInterfaces,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6SysctlMisconfigured=True (eth1 disabled), got %s", cond.Status)
+	}
+	if !hasEventReason(status, "InterfaceIPv6Disabled") {
+		t.Error("Expected InterfaceIPv6Disabled event for eth1, but not found")
+	}
+}
+
+func TestCheckIPv6Sysctl_InterfacesFilter(t *testing.T) {
+	interfaces := map[string]string{
+		"eth0": "1", // disabled but not in filter
+		"eth1": "0", // enabled, in filter
+	}
+	procDir := createMockIPv6ProcFS(t, "0", "0", interfaces)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			CheckPerInterface: true,
+			Interfaces:        []string{"eth1"},
+			SkipInterfaces:    defaultIPv6SkipInterfaces,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6SysctlMisconfigured=False (only eth1 checked, enabled), got %s", cond.Status)
+	}
+	if hasEventReason(status, "InterfaceIPv6Disabled") {
+		t.Error("Did not expect InterfaceIPv6Disabled event when eth0 filtered out")
+	}
+}
+
+func TestCheckIPv6Sysctl_SkipInterfacesRespected(t *testing.T) {
+	// all/default/lo all disabled but should be skipped in per-interface scan.
+	// The scoped all/default checks run separately, so set them enabled here and
+	// verify lo (disabled) is skipped by the per-interface scan.
+	interfaces := map[string]string{
+		"lo": "1",
+	}
+	procDir := createMockIPv6ProcFS(t, "0", "0", interfaces)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			CheckPerInterface: true,
+			SkipInterfaces:    defaultIPv6SkipInterfaces,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6SysctlMisconfigured=False (lo skipped), got %s", cond.Status)
+	}
+	if hasEventReason(status, "InterfaceIPv6Disabled") {
+		t.Error("Did not expect InterfaceIPv6Disabled event for skipped lo interface")
+	}
+}
+
+func TestCheckIPv6Sysctl_SkipInterfacesNilFallsBackToDefault(t *testing.T) {
+	// SkipInterfaces is nil (not set), so checkPerInterfaceDisableIPv6 must fall
+	// back to defaultIPv6SkipInterfaces and skip lo.
+	interfaces := map[string]string{
+		"lo": "1",
+	}
+	procDir := createMockIPv6ProcFS(t, "0", "0", interfaces)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			CheckPerInterface: true,
+			SkipInterfaces:    nil,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6SysctlMisconfigured=False (lo skipped via default), got %s", cond.Status)
+	}
+}
+
+func TestCheckIPv6Sysctl_PerInterfaceExpectDisabled(t *testing.T) {
+	interfaces := map[string]string{
+		"eth1": "1",
+	}
+	procDir := createMockIPv6ProcFS(t, "0", "0", interfaces)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: false,
+			CheckPerInterface: true,
+			SkipInterfaces:    defaultIPv6SkipInterfaces,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6SysctlMisconfigured=False (expect disabled), got %s", cond.Status)
+	}
+	if !hasEventReason(status, "InterfaceIPv6DisabledExpected") {
+		t.Error("Expected InterfaceIPv6DisabledExpected info event, but not found")
+	}
+}
+
+func TestCheckIPv6Sysctl_MissingFiles(t *testing.T) {
+	// procDir exists but has no disable_ipv6 files -> read errors become
+	// warnings + findings (not a hard error).
+	procDir := createMockIPv6ProcFS(t, "", "", nil)
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error (should not hard error): %v", err)
+	}
+
+	if !hasEventReason(status, "IPv6SysctlReadError") {
+		t.Error("Expected IPv6SysctlReadError warning event for missing files, but not found")
+	}
+	for _, event := range status.Events {
+		if event.Reason == "IPv6SysctlReadError" && event.Severity != types.EventWarning {
+			t.Errorf("Expected Warning severity for IPv6SysctlReadError, got %s", event.Severity)
+		}
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6SysctlMisconfigured=True (unreadable files flagged), got %s", cond.Status)
+	}
+}
+
+func TestCheckIPv6Sysctl_NonexistentProcPath(t *testing.T) {
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			CheckPerInterface: true,
+			SkipInterfaces:    defaultIPv6SkipInterfaces,
+			ProcPath:          "/nonexistent/proc",
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	if !hasEventReason(status, "IPv6SysctlReadError") {
+		t.Error("Expected IPv6SysctlReadError event for nonexistent procPath, but not found")
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6SysctlMisconfigured=True for nonexistent procPath, got %s", cond.Status)
+	}
+}
+
+func TestCheckIPv6Sysctl_TestdataFixture(t *testing.T) {
+	procDir := filepath.Join("testdata", "proc")
+
+	monitor := &IPv6SysctlMonitor{
+		name: "test-ipv6-sysctl",
+		config: &IPv6SysctlConfig{
+			ExpectIPv6Enabled: true,
+			ProcPath:          procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Sysctl(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Sysctl() unexpected error: %v", err)
+	}
+
+	cond := findIPv6Condition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6SysctlMisconfigured condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6SysctlMisconfigured=False from healthy fixture, got %s", cond.Status)
+	}
+	if !hasEventReason(status, "IPv6SysctlsHealthy") {
+		t.Error("Expected IPv6SysctlsHealthy event from healthy fixture, but not found")
+	}
+}
+
+func TestReadSysctlBool(t *testing.T) {
+	tests := []struct {
+		name    string
+		content string
+		want    bool
+		wantErr bool
+	}{
+		{name: "disabled", content: "1\n", want: true},
+		{name: "enabled", content: "0\n", want: false},
+		{name: "disabled no newline", content: "1", want: true},
+		{name: "enabled with whitespace", content: " 0 \n", want: false},
+		{name: "unexpected value", content: "2\n", want: false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpFile := filepath.Join(t.TempDir(), "disable_ipv6")
+			if err := os.WriteFile(tmpFile, []byte(tt.content), 0644); err != nil {
+				t.Fatalf("Failed to write test file: %v", err)
+			}
+
+			got, err := readSysctlBool(tmpFile)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("readSysctlBool() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if got != tt.want {
+				t.Errorf("readSysctlBool() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+
+	t.Run("non-existent file", func(t *testing.T) {
+		_, err := readSysctlBool("/nonexistent/file")
+		if err == nil {
+			t.Error("Expected error for non-existent file, got nil")
+		}
+	})
+}
diff --git a/pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/all/disable_ipv6 b/pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/all/disable_ipv6
new file mode 100644
index 0000000..573541a
--- /dev/null
+++ b/pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/all/disable_ipv6
@@ -0,0 +1 @@
+0
diff --git a/pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/default/disable_ipv6 b/pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/default/disable_ipv6
new file mode 100644
index 0000000..573541a
--- /dev/null
+++ b/pkg/monitors/network/testdata/proc/sys/net/ipv6/conf/default/disable_ipv6
@@ -0,0 +1 @@
+0

From f2b98f423430d1ffc8ece9a6d6ffb7b7f93c0512 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 01:40:42 -0500
Subject: [PATCH 02/38] feat(network): dual-stack gateway selection via
 addressFamily config (Task #17239)

Wire the landed detectDefaultIPv6Gateway parser into the gateway monitor.
New config key addressFamily (ipv4|ipv6|auto, default ipv4 preserves prior
behavior); getGatewayIP now returns the probed family and records it on
types.GatewayLatency.AddressFamily. auto prefers IPv4, falls back to IPv6.
Prometheus address_family label deferred to #17216.

Note: task's stated filesAffected (pkg/metrics/latency.go, pkg/events/
network.go) do not exist; GatewayLatency lives in pkg/types/types.go.
---
 pkg/monitors/network/gateway.go      | 146 ++++++++++++++---
 pkg/monitors/network/gateway_test.go | 226 ++++++++++++++++++++++++++-
 pkg/types/types.go                   |   5 +
 3 files changed, 352 insertions(+), 25 deletions(-)

diff --git a/pkg/monitors/network/gateway.go b/pkg/monitors/network/gateway.go
index bc4b8cb..d99312a 100644
--- a/pkg/monitors/network/gateway.go
+++ b/pkg/monitors/network/gateway.go
@@ -22,6 +22,9 @@ const (
 	// procNetRoute is the path to the Linux IPv4 routing table.
 	procNetRoute = "/proc/net/route"
 
+	// procNetIPv6Route is the path to the Linux IPv6 routing table.
+	procNetIPv6Route = "/proc/net/ipv6_route"
+
 	// ipv6RouteHexLen is the number of hex chars representing a 16-byte
 	// IPv6 address as written by the kernel in /proc/net/ipv6_route.
 	ipv6RouteHexLen = 32
@@ -37,6 +40,21 @@ const (
 	defaultLatencyThreshold      = 100 * time.Millisecond
 	defaultFailureCountThreshold = 3
 	defaultAutoDetectGateway     = true
+
+	// Address family selection modes for the gateway monitor.
+	//
+	//	familyIPv4 selects the IPv4 default route only (default; preserves the
+	//	           historical behavior of probing /proc/net/route).
+	//	familyIPv6 selects the IPv6 default route only (/proc/net/ipv6_route).
+	//	familyAuto prefers the IPv4 default route and falls back to the IPv6
+	//	           default route when no IPv4 default route exists.
+	familyIPv4 = FamilyIPv4 // "ipv4"
+	familyIPv6 = FamilyIPv6 // "ipv6"
+	familyAuto = "auto"
+
+	// defaultAddressFamily preserves the pre-dual-stack behavior: probe the
+	// IPv4 default gateway only.
+	defaultAddressFamily = familyIPv4
 )
 
 // GatewayMonitorConfig holds the configuration for the gateway monitor.
@@ -53,6 +71,16 @@ type GatewayMonitorConfig struct {
 	ManualGateway string
 	// FailureCountThreshold is the number of consecutive failures before reporting NetworkUnreachable.
 	FailureCountThreshold int
+	// AddressFamily selects which IP family's default route to probe when
+	// auto-detecting the gateway. Accepted values are "ipv4" (default),
+	// "ipv6", and "auto" (prefer IPv4, fall back to IPv6).
+	AddressFamily string
+
+	// procRoutePath and procIPv6RoutePath override the kernel route-table
+	// paths for testing. When empty the canonical /proc paths are used.
+	// They are unexported so they are never settable from user config.
+	procRoutePath     string
+	procIPv6RoutePath string
 }
 
 // GatewayMonitor monitors the default gateway's reachability and latency.
@@ -86,6 +114,7 @@ func init() {
 				"latencyThreshold":      "100ms",
 				"autoDetectGateway":     true,
 				"failureCountThreshold": 3,
+				"addressFamily":         "ipv4",
 			},
 		},
 	})
@@ -130,6 +159,7 @@ func parseGatewayConfig(configMap map[string]interface{}) (*GatewayMonitorConfig
 		AutoDetectGateway:     defaultAutoDetectGateway,
 		ManualGateway:         "",
 		FailureCountThreshold: defaultFailureCountThreshold,
+		AddressFamily:         defaultAddressFamily,
 	}
 
 	if configMap == nil {
@@ -206,6 +236,25 @@ func parseGatewayConfig(configMap map[string]interface{}) (*GatewayMonitorConfig
 		}
 	}
 
+	// Parse address family selection (ipv4 | ipv6 | auto).
+	if v, ok := configMap["addressFamily"]; ok {
+		strVal, ok := v.(string)
+		if !ok {
+			return nil, fmt.Errorf("addressFamily must be a string, got %T", v)
+		}
+		switch strings.ToLower(strings.TrimSpace(strVal)) {
+		case familyIPv4:
+			config.AddressFamily = familyIPv4
+		case familyIPv6:
+			config.AddressFamily = familyIPv6
+		case familyAuto:
+			config.AddressFamily = familyAuto
+		default:
+			return nil, fmt.Errorf("addressFamily must be one of %q, %q, or %q, got %q",
+				familyIPv4, familyIPv6, familyAuto, strVal)
+		}
+	}
+
 	return config, nil
 }
 
@@ -233,8 +282,8 @@ func ValidateGatewayConfig(config types.MonitorConfig) error {
 func (m *GatewayMonitor) checkGateway(ctx context.Context) (*types.Status, error) {
 	status := types.NewStatus(m.name)
 
-	// Determine gateway IP
-	gatewayIP, err := m.getGatewayIP()
+	// Determine gateway IP and the address family it belongs to.
+	gatewayIP, family, err := m.getGatewayIP()
 	if err != nil {
 		m.updateFailureTracking(false, status)
 		status.AddEvent(types.NewEvent(
@@ -297,13 +346,14 @@ func (m *GatewayMonitor) checkGateway(ctx context.Context) (*types.Status, error
 	// Set latency metrics for Prometheus export
 	status.SetLatencyMetrics(&types.LatencyMetrics{
 		Gateway: &types.GatewayLatency{
-			GatewayIP:    gatewayIP,
-			LatencyMs:    float64(avgLatency.Microseconds()) / 1000.0,
-			AvgLatencyMs: float64(avgLatency.Microseconds()) / 1000.0,
-			MaxLatencyMs: float64(maxRTT.Microseconds()) / 1000.0,
-			Reachable:    true,
-			PingCount:    len(results),
-			SuccessCount: successCount,
+			GatewayIP:     gatewayIP,
+			LatencyMs:     float64(avgLatency.Microseconds()) / 1000.0,
+			AvgLatencyMs:  float64(avgLatency.Microseconds()) / 1000.0,
+			MaxLatencyMs:  float64(maxRTT.Microseconds()) / 1000.0,
+			Reachable:     true,
+			PingCount:     len(results),
+			SuccessCount:  successCount,
+			AddressFamily: family,
 		},
 	})
 
@@ -328,24 +378,82 @@ func (m *GatewayMonitor) checkGateway(ctx context.Context) (*types.Status, error
 	return status, nil
 }
 
-// getGatewayIP determines the gateway IP to ping.
-func (m *GatewayMonitor) getGatewayIP() (string, error) {
-	// Use manual gateway if configured
+// getGatewayIP determines the gateway IP to ping along with the address family
+// ("ipv4" or "ipv6") it belongs to. The family is empty only when it cannot be
+// classified (e.g. a malformed manual gateway, which should already have been
+// rejected during config parsing).
+func (m *GatewayMonitor) getGatewayIP() (string, string, error) {
+	// Use manual gateway if configured. Classify its family from the literal.
 	if m.config.ManualGateway != "" {
-		return m.config.ManualGateway, nil
+		return m.config.ManualGateway, classifyIPFamily(m.config.ManualGateway), nil
 	}
 
-	// Auto-detect gateway if enabled
+	// Auto-detect gateway if enabled.
 	if m.config.AutoDetectGateway {
-		return detectDefaultGateway()
+		return m.detectGatewayForFamily()
 	}
 
-	return "", fmt.Errorf("no gateway configured and auto-detection is disabled")
+	return "", "", fmt.Errorf("no gateway configured and auto-detection is disabled")
 }
 
-// detectDefaultGateway detects the default IPv4 gateway from /proc/net/route.
-func detectDefaultGateway() (string, error) {
-	return detectDefaultGatewayFromFile(procNetRoute)
+// detectGatewayForFamily resolves the default gateway according to the
+// configured address family selection mode.
+func (m *GatewayMonitor) detectGatewayForFamily() (string, string, error) {
+	switch m.config.AddressFamily {
+	case familyIPv6:
+		ip, err := detectDefaultIPv6GatewayFromFile(m.ipv6RoutePath())
+		if err != nil {
+			return "", "", err
+		}
+		return ip, familyIPv6, nil
+
+	case familyAuto:
+		// Prefer IPv4; fall back to IPv6 when no IPv4 default route exists.
+		if ip, err := detectDefaultGatewayFromFile(m.routePath()); err == nil {
+			return ip, familyIPv4, nil
+		}
+		ip, err := detectDefaultIPv6GatewayFromFile(m.ipv6RoutePath())
+		if err != nil {
+			return "", "", fmt.Errorf("no default gateway found for either address family: %w", err)
+		}
+		return ip, familyIPv6, nil
+
+	default: // familyIPv4 (also the zero value / unset case)
+		ip, err := detectDefaultGatewayFromFile(m.routePath())
+		if err != nil {
+			return "", "", err
+		}
+		return ip, familyIPv4, nil
+	}
+}
+
+// routePath returns the IPv4 route-table path, honoring the test override.
+func (m *GatewayMonitor) routePath() string {
+	if m.config.procRoutePath != "" {
+		return m.config.procRoutePath
+	}
+	return procNetRoute
+}
+
+// ipv6RoutePath returns the IPv6 route-table path, honoring the test override.
+func (m *GatewayMonitor) ipv6RoutePath() string {
+	if m.config.procIPv6RoutePath != "" {
+		return m.config.procIPv6RoutePath
+	}
+	return procNetIPv6Route
+}
+
+// classifyIPFamily returns FamilyIPv4 / FamilyIPv6 for a literal IP string, or
+// "" when the string is not a valid IP address.
+func classifyIPFamily(ip string) string {
+	parsed := net.ParseIP(ip)
+	if parsed == nil {
+		return ""
+	}
+	if parsed.To4() != nil {
+		return FamilyIPv4
+	}
+	return FamilyIPv6
 }
 
 // detectDefaultGatewayFromFile opens the given path and parses it as a Linux
diff --git a/pkg/monitors/network/gateway_test.go b/pkg/monitors/network/gateway_test.go
index 85b24f0..4b86ecd 100644
--- a/pkg/monitors/network/gateway_test.go
+++ b/pkg/monitors/network/gateway_test.go
@@ -914,6 +914,7 @@ func TestGatewayMonitor_getGatewayIP(t *testing.T) {
 		name       string
 		config     *GatewayMonitorConfig
 		wantIP     string
+		wantFamily string
 		wantErr    bool
 		errContain string
 	}{
@@ -923,8 +924,9 @@ func TestGatewayMonitor_getGatewayIP(t *testing.T) {
 				ManualGateway:     "192.168.1.1",
 				AutoDetectGateway: false,
 			},
-			wantIP:  "192.168.1.1",
-			wantErr: false,
+			wantIP:     "192.168.1.1",
+			wantFamily: FamilyIPv4,
+			wantErr:    false,
 		},
 		{
 			name: "manual gateway takes precedence over auto-detect",
@@ -932,8 +934,19 @@ func TestGatewayMonitor_getGatewayIP(t *testing.T) {
 				ManualGateway:     "10.0.0.1",
 				AutoDetectGateway: true, // should be ignored when manual is set
 			},
-			wantIP:  "10.0.0.1",
-			wantErr: false,
+			wantIP:     "10.0.0.1",
+			wantFamily: FamilyIPv4,
+			wantErr:    false,
+		},
+		{
+			name: "manual IPv6 gateway classified as ipv6",
+			config: &GatewayMonitorConfig{
+				ManualGateway:     "fe80::1",
+				AutoDetectGateway: false,
+			},
+			wantIP:     "fe80::1",
+			wantFamily: FamilyIPv6,
+			wantErr:    false,
 		},
 		{
 			name: "no gateway and auto-detect disabled",
@@ -953,7 +966,7 @@ func TestGatewayMonitor_getGatewayIP(t *testing.T) {
 				config: tt.config,
 			}
 
-			ip, err := monitor.getGatewayIP()
+			ip, family, err := monitor.getGatewayIP()
 
 			if tt.wantErr {
 				if err == nil {
@@ -972,8 +985,209 @@ func TestGatewayMonitor_getGatewayIP(t *testing.T) {
 			}
 
 			if ip != tt.wantIP {
-				t.Errorf("getGatewayIP() = %q, want %q", ip, tt.wantIP)
+				t.Errorf("getGatewayIP() ip = %q, want %q", ip, tt.wantIP)
+			}
+			if family != tt.wantFamily {
+				t.Errorf("getGatewayIP() family = %q, want %q", family, tt.wantFamily)
+			}
+		})
+	}
+}
+
+// TestParseGatewayConfig_AddressFamily covers parsing/validation of the
+// addressFamily config key (ipv4 | ipv6 | auto | invalid) and the default.
+func TestParseGatewayConfig_AddressFamily(t *testing.T) {
+	tests := []struct {
+		name       string
+		config     map[string]interface{}
+		wantFamily string
+		wantErr    bool
+	}{
+		{
+			name:       "unset defaults to ipv4",
+			config:     map[string]interface{}{},
+			wantFamily: familyIPv4,
+		},
+		{
+			name:       "nil config defaults to ipv4",
+			config:     nil,
+			wantFamily: familyIPv4,
+		},
+		{
+			name:       "explicit ipv4",
+			config:     map[string]interface{}{"addressFamily": "ipv4"},
+			wantFamily: familyIPv4,
+		},
+		{
+			name:       "explicit ipv6",
+			config:     map[string]interface{}{"addressFamily": "ipv6"},
+			wantFamily: familyIPv6,
+		},
+		{
+			name:       "auto",
+			config:     map[string]interface{}{"addressFamily": "auto"},
+			wantFamily: familyAuto,
+		},
+		{
+			name:       "case-insensitive and whitespace tolerant",
+			config:     map[string]interface{}{"addressFamily": "  IPv6 "},
+			wantFamily: familyIPv6,
+		},
+		{
+			name:    "invalid value rejected",
+			config:  map[string]interface{}{"addressFamily": "ipv7"},
+			wantErr: true,
+		},
+		{
+			name:    "non-string type rejected",
+			config:  map[string]interface{}{"addressFamily": 6},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := parseGatewayConfig(tt.config)
+			if (err != nil) != tt.wantErr {
+				t.Fatalf("parseGatewayConfig() error = %v, wantErr %v", err, tt.wantErr)
+			}
+			if tt.wantErr {
+				return
+			}
+			if got.AddressFamily != tt.wantFamily {
+				t.Errorf("AddressFamily = %q, want %q", got.AddressFamily, tt.wantFamily)
+			}
+		})
+	}
+}
+
+// TestGatewayMonitor_getGatewayIP_DualStack exercises the auto-detection path
+// across address families using fixture route tables instead of /proc.
+func TestGatewayMonitor_getGatewayIP_DualStack(t *testing.T) {
+	tmpDir := t.TempDir()
+
+	// IPv4 route table with a default gateway 192.168.1.1 (hex 0101A8C0).
+	ipv4Route := "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n" +
+		"eth0\t00000000\t0101A8C0\t0003\t0\t0\t100\t00000000\t0\t0\t0\n"
+	ipv4RoutePath := filepath.Join(tmpDir, "route")
+	if err := os.WriteFile(ipv4RoutePath, []byte(ipv4Route), 0o644); err != nil {
+		t.Fatalf("write ipv4 route fixture: %v", err)
+	}
+
+	// IPv4 route table WITHOUT any default route (only an on-link subnet).
+	ipv4NoDefault := "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n" +
+		"eth0\t0000A8C0\t00000000\t0001\t0\t0\t100\t00FFFFFF\t0\t0\t0\n"
+	ipv4NoDefaultPath := filepath.Join(tmpDir, "route_nodefault")
+	if err := os.WriteFile(ipv4NoDefaultPath, []byte(ipv4NoDefault), 0o644); err != nil {
+		t.Fatalf("write ipv4 no-default fixture: %v", err)
+	}
+
+	// Reuse the committed IPv6 fixture (default route via fe80::1).
+	const ipv6FixturePath = "testdata/proc/net/ipv6_route"
+
+	tests := []struct {
+		name              string
+		addressFamily     string
+		procRoutePath     string
+		procIPv6RoutePath string
+		wantIP            string
+		wantFamily        string
+		wantErr           bool
+	}{
+		{
+			name:          "family ipv4 selects IPv4 default route",
+			addressFamily: familyIPv4,
+			procRoutePath: ipv4RoutePath,
+			wantIP:        "192.168.1.1",
+			wantFamily:    FamilyIPv4,
+		},
+		{
+			name:              "family ipv6 selects IPv6 default route",
+			addressFamily:     familyIPv6,
+			procIPv6RoutePath: ipv6FixturePath,
+			wantIP:            "fe80::1",
+			wantFamily:        FamilyIPv6,
+		},
+		{
+			name:              "auto prefers IPv4 when present",
+			addressFamily:     familyAuto,
+			procRoutePath:     ipv4RoutePath,
+			procIPv6RoutePath: ipv6FixturePath,
+			wantIP:            "192.168.1.1",
+			wantFamily:        FamilyIPv4,
+		},
+		{
+			name:              "auto falls back to IPv6 when no IPv4 default route",
+			addressFamily:     familyAuto,
+			procRoutePath:     ipv4NoDefaultPath,
+			procIPv6RoutePath: ipv6FixturePath,
+			wantIP:            "fe80::1",
+			wantFamily:        FamilyIPv6,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			monitor := &GatewayMonitor{
+				name: "test-gateway",
+				config: &GatewayMonitorConfig{
+					AutoDetectGateway: true,
+					AddressFamily:     tt.addressFamily,
+					procRoutePath:     tt.procRoutePath,
+					procIPv6RoutePath: tt.procIPv6RoutePath,
+				},
+			}
+
+			ip, family, err := monitor.getGatewayIP()
+			if (err != nil) != tt.wantErr {
+				t.Fatalf("getGatewayIP() error = %v, wantErr %v", err, tt.wantErr)
+			}
+			if tt.wantErr {
+				return
+			}
+			if ip != tt.wantIP {
+				t.Errorf("getGatewayIP() ip = %q, want %q", ip, tt.wantIP)
+			}
+			if family != tt.wantFamily {
+				t.Errorf("getGatewayIP() family = %q, want %q", family, tt.wantFamily)
 			}
 		})
 	}
 }
+
+// TestGatewayMonitor_DefaultFamilyUnchanged verifies that with no addressFamily
+// configured, auto-detection probes IPv4 only (preserving historical behavior).
+func TestGatewayMonitor_DefaultFamilyUnchanged(t *testing.T) {
+	tmpDir := t.TempDir()
+	ipv4Route := "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n" +
+		"eth0\t00000000\t0101A8C0\t0003\t0\t0\t100\t00000000\t0\t0\t0\n"
+	ipv4RoutePath := filepath.Join(tmpDir, "route")
+	if err := os.WriteFile(ipv4RoutePath, []byte(ipv4Route), 0o644); err != nil {
+		t.Fatalf("write ipv4 route fixture: %v", err)
+	}
+
+	// Parse an empty config so AddressFamily picks up the package default.
+	cfg, err := parseGatewayConfig(map[string]interface{}{})
+	if err != nil {
+		t.Fatalf("parseGatewayConfig() error: %v", err)
+	}
+	if cfg.AddressFamily != familyIPv4 {
+		t.Fatalf("default AddressFamily = %q, want %q", cfg.AddressFamily, familyIPv4)
+	}
+
+	cfg.AutoDetectGateway = true
+	cfg.procRoutePath = ipv4RoutePath
+	// Deliberately do NOT set procIPv6RoutePath; IPv4-only must not touch it.
+
+	monitor := &GatewayMonitor{name: "test-gateway", config: cfg}
+	ip, family, err := monitor.getGatewayIP()
+	if err != nil {
+		t.Fatalf("getGatewayIP() unexpected error: %v", err)
+	}
+	if ip != "192.168.1.1" {
+		t.Errorf("getGatewayIP() ip = %q, want %q", ip, "192.168.1.1")
+	}
+	if family != FamilyIPv4 {
+		t.Errorf("getGatewayIP() family = %q, want %q", family, FamilyIPv4)
+	}
+}
diff --git a/pkg/types/types.go b/pkg/types/types.go
index ef0bb8c..923b33c 100644
--- a/pkg/types/types.go
+++ b/pkg/types/types.go
@@ -451,6 +451,11 @@ type GatewayLatency struct {
 	Reachable    bool    `json:"reachable"`
 	PingCount    int     `json:"ping_count"`
 	SuccessCount int     `json:"success_count"`
+	// AddressFamily records which IP family the probed gateway belongs to
+	// ("ipv4" or "ipv6"). It lets downstream consumers distinguish dual-stack
+	// gateway probes. Empty when the family is unknown (e.g. a manually
+	// configured gateway whose family could not be classified).
+	AddressFamily string `json:"address_family,omitempty"`
 }
 
 // PeerLatency represents latency to a peer node.

From 8f69d7039d7870fbe4d36a40b7d1bf8a58079e2c Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 01:45:26 -0500
Subject: [PATCH 03/38] feat(network): standalone IPv6 default-route detection
 monitor (Task #17206)

New detection-only monitor (type network-ipv6-route) that reuses the
landed detectDefaultIPv6GatewayFromFile parser to verify an IPv6 default
route is present when expected, emitting IPv6DefaultRouteMissing.
Distinguishes unreadable route table (warning, IPv6 may be absent) from
genuinely-absent default route. Self-registers via init(); cmd wiring
owned by #17209. Unit tests at 95.9% coverage.
---
 pkg/monitors/network/ipv6_route.go      | 222 +++++++++++++
 pkg/monitors/network/ipv6_route_test.go | 420 ++++++++++++++++++++++++
 2 files changed, 642 insertions(+)
 create mode 100644 pkg/monitors/network/ipv6_route.go
 create mode 100644 pkg/monitors/network/ipv6_route_test.go

diff --git a/pkg/monitors/network/ipv6_route.go b/pkg/monitors/network/ipv6_route.go
new file mode 100644
index 0000000..4e3f236
--- /dev/null
+++ b/pkg/monitors/network/ipv6_route.go
@@ -0,0 +1,222 @@
+// Package network provides network health monitoring capabilities.
+package network
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io/fs"
+	"path/filepath"
+
+	"github.com/supporttools/node-doctor/pkg/monitors"
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+const (
+	// Default configuration values for the IPv6 default-route monitor.
+	defaultIPv6RouteExpectDefault = true
+	defaultIPv6RouteProcPath      = "/proc"
+
+	// ipv6RouteRelPath is the route-table path relative to the proc mount.
+	// The monitor reads <procPath>/net/ipv6_route.
+	ipv6RouteRelPath = "net/ipv6_route"
+)
+
+// IPv6RouteConfig holds configuration for the IPv6 default-route monitor.
+type IPv6RouteConfig struct {
+	// ExpectDefaultRoute controls severity. When true, the absence of an IPv6
+	// default route is treated as a problem (condition True). When false, the
+	// absence is recorded but not flagged.
+	ExpectDefaultRoute bool
+	// ProcPath is the base path for the proc filesystem. Defaults to "/proc";
+	// override with "/host/proc" for containerized deployments. The monitor
+	// reads <ProcPath>/net/ipv6_route.
+	ProcPath string
+}
+
+// IPv6RouteMonitor checks whether an IPv6 default route is present on the node.
+// This monitor is detection-only and never modifies routes.
+type IPv6RouteMonitor struct {
+	name   string
+	config *IPv6RouteConfig
+
+	*monitors.BaseMonitor
+}
+
+// init registers the IPv6 default-route monitor with the monitor registry.
+func init() {
+	monitors.MustRegister(monitors.MonitorInfo{
+		Type:        "network-ipv6-route",
+		Factory:     NewIPv6RouteMonitor,
+		Validator:   ValidateIPv6RouteConfig,
+		Description: "Detection-only monitor for the IPv6 default route (does not modify routes)",
+		DefaultConfig: &types.MonitorConfig{
+			Name:           "ipv6-route-check",
+			Type:           "network-ipv6-route",
+			Enabled:        true,
+			IntervalString: "60s",
+			TimeoutString:  "5s",
+			Config: map[string]any{
+				"expectDefaultRoute": true,
+				"procPath":           "/proc",
+			},
+		},
+	})
+}
+
+// NewIPv6RouteMonitor creates a new IPv6 default-route monitor instance.
+func NewIPv6RouteMonitor(ctx context.Context, config types.MonitorConfig) (types.Monitor, error) {
+	cfg, err := parseIPv6RouteConfig(config.Config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse ipv6 route config: %w", err)
+	}
+
+	baseMonitor, err := monitors.NewBaseMonitor(config.Name, config.Interval, config.Timeout)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create base monitor: %w", err)
+	}
+
+	monitor := &IPv6RouteMonitor{
+		name:        config.Name,
+		config:      cfg,
+		BaseMonitor: baseMonitor,
+	}
+
+	if err := baseMonitor.SetCheckFunc(monitor.checkIPv6Route); err != nil {
+		return nil, fmt.Errorf("failed to set check function: %w", err)
+	}
+
+	return monitor, nil
+}
+
+// parseIPv6RouteConfig parses configuration from a generic map.
+func parseIPv6RouteConfig(configMap map[string]any) (*IPv6RouteConfig, error) {
+	config := &IPv6RouteConfig{
+		ExpectDefaultRoute: defaultIPv6RouteExpectDefault,
+		ProcPath:           defaultIPv6RouteProcPath,
+	}
+
+	if configMap == nil {
+		return config, nil
+	}
+
+	if v, ok := configMap["expectDefaultRoute"]; ok {
+		boolVal, ok := v.(bool)
+		if !ok {
+			return nil, fmt.Errorf("expectDefaultRoute must be a boolean, got %T", v)
+		}
+		config.ExpectDefaultRoute = boolVal
+	}
+
+	if v, ok := configMap["procPath"]; ok {
+		strVal, ok := v.(string)
+		if !ok {
+			return nil, fmt.Errorf("procPath must be a string, got %T", v)
+		}
+		config.ProcPath = strVal
+	}
+
+	return config, nil
+}
+
+// ValidateIPv6RouteConfig validates the IPv6 default-route monitor configuration.
+func ValidateIPv6RouteConfig(config types.MonitorConfig) error {
+	_, err := parseIPv6RouteConfig(config.Config)
+	return err
+}
+
+// ipv6RoutePath returns the full path to the IPv6 route table for this monitor.
+func (m *IPv6RouteMonitor) ipv6RoutePath() string {
+	return filepath.Join(m.config.ProcPath, ipv6RouteRelPath)
+}
+
+// checkIPv6Route performs the IPv6 default-route health check. It reuses the
+// IPv6 route-table parser from the gateway monitor
+// (detectDefaultIPv6GatewayFromFile) rather than re-parsing the route table.
+func (m *IPv6RouteMonitor) checkIPv6Route(ctx context.Context) (*types.Status, error) {
+	status := types.NewStatus(m.name)
+
+	path := m.ipv6RoutePath()
+
+	gateway, err := detectDefaultIPv6GatewayFromFile(path)
+	if err != nil {
+		// A missing or unreadable route table means the IPv6 stack may be
+		// legitimately absent (e.g. a hardened or IPv4-only node). Treat this
+		// as a warning rather than a hard error, consistent with the IPv6
+		// sysctl monitor.
+		if isIPv6RouteUnreadable(err) {
+			status.AddEvent(types.NewEvent(
+				types.EventWarning,
+				"IPv6RouteReadError",
+				fmt.Sprintf("Failed to read IPv6 route table from %s: %v. "+
+					"The IPv6 stack may be absent on this node.", path, err),
+			))
+			m.recordDefaultRouteAbsent(status, "IPv6RouteTableUnreadable",
+				fmt.Sprintf("IPv6 route table %s is unreadable; cannot confirm an IPv6 default route", path))
+			return status, nil
+		}
+
+		// The route table was readable but contains no IPv6 default route
+		// (only on-link/link-scoped routes, or no routes at all).
+		m.recordDefaultRouteAbsent(status, "NoIPv6DefaultRoute",
+			"No IPv6 default route is present in the IPv6 route table")
+		return status, nil
+	}
+
+	// A default route exists.
+	status.AddCondition(types.NewCondition(
+		"IPv6DefaultRouteMissing",
+		types.ConditionFalse,
+		"IPv6DefaultRoutePresent",
+		fmt.Sprintf("IPv6 default route present via gateway %s", gateway),
+	))
+	status.AddEvent(types.NewEvent(
+		types.EventInfo,
+		"IPv6DefaultRoutePresent",
+		fmt.Sprintf("IPv6 default route is present (next-hop %s)", gateway),
+	))
+
+	return status, nil
+}
+
+// recordDefaultRouteAbsent records the condition and event for an absent IPv6
+// default route. Severity depends on ExpectDefaultRoute: when a default route
+// is expected the condition is True (a problem); otherwise it is False and the
+// absence is reported informationally.
+func (m *IPv6RouteMonitor) recordDefaultRouteAbsent(status *types.Status, reason, message string) {
+	if m.config.ExpectDefaultRoute {
+		status.AddCondition(types.NewCondition(
+			"IPv6DefaultRouteMissing",
+			types.ConditionTrue,
+			reason,
+			message,
+		))
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			reason,
+			fmt.Sprintf("%s. This monitor is detection-only and does not modify routes.", message),
+		))
+		return
+	}
+
+	status.AddCondition(types.NewCondition(
+		"IPv6DefaultRouteMissing",
+		types.ConditionFalse,
+		"IPv6DefaultRouteNotExpected",
+		fmt.Sprintf("%s; expectDefaultRoute=false so no action required", message),
+	))
+	status.AddEvent(types.NewEvent(
+		types.EventInfo,
+		"IPv6DefaultRouteNotExpected",
+		fmt.Sprintf("%s; expectDefaultRoute=false so no action required", message),
+	))
+}
+
+// isIPv6RouteUnreadable reports whether the error from
+// detectDefaultIPv6GatewayFromFile indicates the route table could not be read
+// (as opposed to being read successfully but containing no default route). The
+// parser wraps os.Open failures, so we match against fs path errors.
+func isIPv6RouteUnreadable(err error) bool {
+	var pathErr *fs.PathError
+	return errors.As(err, &pathErr)
+}
diff --git a/pkg/monitors/network/ipv6_route_test.go b/pkg/monitors/network/ipv6_route_test.go
new file mode 100644
index 0000000..aa06e65
--- /dev/null
+++ b/pkg/monitors/network/ipv6_route_test.go
@@ -0,0 +1,420 @@
+package network
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+func TestParseIPv6RouteConfig(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  map[string]any
+		want    *IPv6RouteConfig
+		wantErr bool
+	}{
+		{
+			name:   "nil config - use defaults",
+			config: nil,
+			want: &IPv6RouteConfig{
+				ExpectDefaultRoute: defaultIPv6RouteExpectDefault,
+				ProcPath:           defaultIPv6RouteProcPath,
+			},
+		},
+		{
+			name:   "empty config - use defaults",
+			config: map[string]any{},
+			want: &IPv6RouteConfig{
+				ExpectDefaultRoute: defaultIPv6RouteExpectDefault,
+				ProcPath:           defaultIPv6RouteProcPath,
+			},
+		},
+		{
+			name: "custom values",
+			config: map[string]any{
+				"expectDefaultRoute": false,
+				"procPath":           "/host/proc",
+			},
+			want: &IPv6RouteConfig{
+				ExpectDefaultRoute: false,
+				ProcPath:           "/host/proc",
+			},
+		},
+		{
+			name:    "invalid expectDefaultRoute type",
+			config:  map[string]any{"expectDefaultRoute": "yes"},
+			wantErr: true,
+		},
+		{
+			name:    "invalid procPath type",
+			config:  map[string]any{"procPath": 123},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := parseIPv6RouteConfig(tt.config)
+
+			if (err != nil) != tt.wantErr {
+				t.Errorf("parseIPv6RouteConfig() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if tt.wantErr {
+				return
+			}
+
+			if got.ExpectDefaultRoute != tt.want.ExpectDefaultRoute {
+				t.Errorf("ExpectDefaultRoute = %v, want %v", got.ExpectDefaultRoute, tt.want.ExpectDefaultRoute)
+			}
+			if got.ProcPath != tt.want.ProcPath {
+				t.Errorf("ProcPath = %v, want %v", got.ProcPath, tt.want.ProcPath)
+			}
+		})
+	}
+}
+
+func TestValidateIPv6RouteConfig(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  map[string]any
+		wantErr bool
+	}{
+		{
+			name:    "valid config",
+			config:  map[string]any{"expectDefaultRoute": true},
+			wantErr: false,
+		},
+		{
+			name:    "invalid config",
+			config:  map[string]any{"expectDefaultRoute": "yes"},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			monitorConfig := types.MonitorConfig{
+				Name:     "test-ipv6-route",
+				Type:     "network-ipv6-route",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config:   tt.config,
+			}
+			err := ValidateIPv6RouteConfig(monitorConfig)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("ValidateIPv6RouteConfig() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+func TestNewIPv6RouteMonitor(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  types.MonitorConfig
+		wantErr bool
+	}{
+		{
+			name: "valid config",
+			config: types.MonitorConfig{
+				Name:     "test-ipv6-route",
+				Type:     "network-ipv6-route",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config:   map[string]any{"expectDefaultRoute": true},
+			},
+			wantErr: false,
+		},
+		{
+			name: "invalid config - bad type",
+			config: types.MonitorConfig{
+				Name:     "test-ipv6-route",
+				Type:     "network-ipv6-route",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config:   map[string]any{"expectDefaultRoute": "invalid"},
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			monitor, err := NewIPv6RouteMonitor(context.Background(), tt.config)
+
+			if (err != nil) != tt.wantErr {
+				t.Errorf("NewIPv6RouteMonitor() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !tt.wantErr && monitor == nil {
+				t.Error("NewIPv6RouteMonitor() returned nil monitor")
+			}
+		})
+	}
+}
+
+// writeMockIPv6Route creates <procDir>/net/ipv6_route with the given content
+// and returns the proc directory root.
+func writeMockIPv6Route(t *testing.T, content string) string {
+	t.Helper()
+
+	procDir := t.TempDir()
+	netDir := filepath.Join(procDir, "net")
+	if err := os.MkdirAll(netDir, 0755); err != nil {
+		t.Fatalf("Failed to create net dir: %v", err)
+	}
+	if err := os.WriteFile(filepath.Join(netDir, "ipv6_route"), []byte(content), 0644); err != nil {
+		t.Fatalf("Failed to write ipv6_route: %v", err)
+	}
+	return procDir
+}
+
+// findIPv6RouteCondition returns the IPv6DefaultRouteMissing condition, or nil.
+func findIPv6RouteCondition(status *types.Status) *types.Condition {
+	for i := range status.Conditions {
+		if status.Conditions[i].Type == "IPv6DefaultRouteMissing" {
+			return &status.Conditions[i]
+		}
+	}
+	return nil
+}
+
+// ipv6RouteFixture is a route table that contains a default route (line 1) via
+// fe80::1 plus several non-default/on-link routes. Mirrors the format of the
+// committed testdata fixture.
+const ipv6RouteWithDefault = "00000000000000000000000000000000 00 00000000000000000000000000000000 00 fe800000000000000000000000000001 00000400 00000003 00000000 00000003     eth0\n" +
+	"2001000000000000000000000000abcd 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001     eth0\n" +
+	"fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000007 00000000 00000001     eth0\n"
+
+// ipv6RouteNoDefault contains only on-link / prefix routes (no default route).
+const ipv6RouteNoDefault = "2001000000000000000000000000abcd 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001     eth0\n" +
+	"fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000007 00000000 00000001     eth0\n"
+
+func TestCheckIPv6Route_DefaultPresent(t *testing.T) {
+	procDir := writeMockIPv6Route(t, ipv6RouteWithDefault)
+
+	monitor := &IPv6RouteMonitor{
+		name: "test-ipv6-route",
+		config: &IPv6RouteConfig{
+			ExpectDefaultRoute: true,
+			ProcPath:           procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Route(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Route() unexpected error: %v", err)
+	}
+
+	cond := findIPv6RouteCondition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6DefaultRouteMissing condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6DefaultRouteMissing=False, got %s", cond.Status)
+	}
+	if !hasEventReason(status, "IPv6DefaultRoutePresent") {
+		t.Error("Expected IPv6DefaultRoutePresent event, but not found")
+	}
+}
+
+func TestCheckIPv6Route_DefaultAbsentExpected(t *testing.T) {
+	procDir := writeMockIPv6Route(t, ipv6RouteNoDefault)
+
+	monitor := &IPv6RouteMonitor{
+		name: "test-ipv6-route",
+		config: &IPv6RouteConfig{
+			ExpectDefaultRoute: true,
+			ProcPath:           procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Route(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Route() unexpected error: %v", err)
+	}
+
+	cond := findIPv6RouteCondition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6DefaultRouteMissing condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6DefaultRouteMissing=True, got %s", cond.Status)
+	}
+	if cond.Reason != "NoIPv6DefaultRoute" {
+		t.Errorf("Expected reason NoIPv6DefaultRoute, got %s", cond.Reason)
+	}
+	if !hasEventReason(status, "NoIPv6DefaultRoute") {
+		t.Error("Expected NoIPv6DefaultRoute event, but not found")
+	}
+	for _, event := range status.Events {
+		if event.Reason == "NoIPv6DefaultRoute" && event.Severity != types.EventWarning {
+			t.Errorf("Expected Warning severity for NoIPv6DefaultRoute, got %s", event.Severity)
+		}
+	}
+}
+
+func TestCheckIPv6Route_DefaultAbsentNotExpected(t *testing.T) {
+	procDir := writeMockIPv6Route(t, ipv6RouteNoDefault)
+
+	monitor := &IPv6RouteMonitor{
+		name: "test-ipv6-route",
+		config: &IPv6RouteConfig{
+			ExpectDefaultRoute: false,
+			ProcPath:           procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Route(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Route() unexpected error: %v", err)
+	}
+
+	cond := findIPv6RouteCondition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6DefaultRouteMissing condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6DefaultRouteMissing=False (not expected), got %s", cond.Status)
+	}
+	if !hasEventReason(status, "IPv6DefaultRouteNotExpected") {
+		t.Error("Expected IPv6DefaultRouteNotExpected info event, but not found")
+	}
+	if hasEventReason(status, "NoIPv6DefaultRoute") {
+		t.Error("Did not expect NoIPv6DefaultRoute warning event when expectDefaultRoute=false")
+	}
+	for _, event := range status.Events {
+		if event.Reason == "IPv6DefaultRouteNotExpected" && event.Severity != types.EventInfo {
+			t.Errorf("Expected Info severity for IPv6DefaultRouteNotExpected, got %s", event.Severity)
+		}
+	}
+}
+
+func TestCheckIPv6Route_MissingFileExpected(t *testing.T) {
+	// procDir exists but has no net/ipv6_route file -> read error becomes a
+	// warning, not a hard error.
+	procDir := t.TempDir()
+
+	monitor := &IPv6RouteMonitor{
+		name: "test-ipv6-route",
+		config: &IPv6RouteConfig{
+			ExpectDefaultRoute: true,
+			ProcPath:           procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Route(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Route() unexpected error (should not hard error): %v", err)
+	}
+
+	if !hasEventReason(status, "IPv6RouteReadError") {
+		t.Error("Expected IPv6RouteReadError warning event for missing file, but not found")
+	}
+	for _, event := range status.Events {
+		if event.Reason == "IPv6RouteReadError" && event.Severity != types.EventWarning {
+			t.Errorf("Expected Warning severity for IPv6RouteReadError, got %s", event.Severity)
+		}
+	}
+
+	cond := findIPv6RouteCondition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6DefaultRouteMissing condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6DefaultRouteMissing=True (unreadable, expected), got %s", cond.Status)
+	}
+	if cond.Reason != "IPv6RouteTableUnreadable" {
+		t.Errorf("Expected reason IPv6RouteTableUnreadable, got %s", cond.Reason)
+	}
+}
+
+func TestCheckIPv6Route_MissingFileNotExpected(t *testing.T) {
+	procDir := t.TempDir()
+
+	monitor := &IPv6RouteMonitor{
+		name: "test-ipv6-route",
+		config: &IPv6RouteConfig{
+			ExpectDefaultRoute: false,
+			ProcPath:           procDir,
+		},
+	}
+
+	status, err := monitor.checkIPv6Route(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Route() unexpected error: %v", err)
+	}
+
+	// The read error is still surfaced as a warning event.
+	if !hasEventReason(status, "IPv6RouteReadError") {
+		t.Error("Expected IPv6RouteReadError warning event for missing file, but not found")
+	}
+
+	cond := findIPv6RouteCondition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6DefaultRouteMissing condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6DefaultRouteMissing=False (unreadable, not expected), got %s", cond.Status)
+	}
+}
+
+func TestCheckIPv6Route_NonexistentProcPath(t *testing.T) {
+	monitor := &IPv6RouteMonitor{
+		name: "test-ipv6-route",
+		config: &IPv6RouteConfig{
+			ExpectDefaultRoute: true,
+			ProcPath:           "/nonexistent/proc",
+		},
+	}
+
+	status, err := monitor.checkIPv6Route(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Route() unexpected error: %v", err)
+	}
+
+	if !hasEventReason(status, "IPv6RouteReadError") {
+		t.Error("Expected IPv6RouteReadError event for nonexistent procPath, but not found")
+	}
+
+	cond := findIPv6RouteCondition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6DefaultRouteMissing condition, but not found")
+	}
+	if cond.Status != types.ConditionTrue {
+		t.Errorf("Expected IPv6DefaultRouteMissing=True for nonexistent procPath, got %s", cond.Status)
+	}
+}
+
+func TestCheckIPv6Route_TestdataFixture(t *testing.T) {
+	// The committed fixture testdata/proc/net/ipv6_route contains a default
+	// route via fe80::1, so the monitor reports the route present.
+	monitor := &IPv6RouteMonitor{
+		name: "test-ipv6-route",
+		config: &IPv6RouteConfig{
+			ExpectDefaultRoute: true,
+			ProcPath:           filepath.Join("testdata", "proc"),
+		},
+	}
+
+	status, err := monitor.checkIPv6Route(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Route() unexpected error: %v", err)
+	}
+
+	cond := findIPv6RouteCondition(status)
+	if cond == nil {
+		t.Fatal("Expected IPv6DefaultRouteMissing condition, but not found")
+	}
+	if cond.Status != types.ConditionFalse {
+		t.Errorf("Expected IPv6DefaultRouteMissing=False from fixture with default route, got %s", cond.Status)
+	}
+	if !hasEventReason(status, "IPv6DefaultRoutePresent") {
+		t.Error("Expected IPv6DefaultRoutePresent event from fixture, but not found")
+	}
+}

From dc665d36f357b6d7971485750afc3cc4e8ad05f6 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 01:54:23 -0500
Subject: [PATCH 04/38] feat(network): IPv6 RA/SLAAC + address-presence
 detection monitor (Task #17207)

New /proc-based detection-only monitor (type network-ipv6-neighbor)
parsing /proc/net/if_inet6 (link-local/global address presence) and
accept_ra/autoconf sysctls. Emits IPv6LinkLocalMissing,
IPv6GlobalAddressMissing, IPv6RouterAdvertisementDisabled. No new deps;
netlink NDP neighbor-cache documented as out-of-scope follow-up.
Self-registers via init() (#17209 owns cmd wiring). 93.2% coverage.
---
 pkg/monitors/network/ipv6_neighbor.go      | 644 ++++++++++++++++++
 pkg/monitors/network/ipv6_neighbor_test.go | 752 +++++++++++++++++++++
 2 files changed, 1396 insertions(+)
 create mode 100644 pkg/monitors/network/ipv6_neighbor.go
 create mode 100644 pkg/monitors/network/ipv6_neighbor_test.go

diff --git a/pkg/monitors/network/ipv6_neighbor.go b/pkg/monitors/network/ipv6_neighbor.go
new file mode 100644
index 0000000..a6dc0ff
--- /dev/null
+++ b/pkg/monitors/network/ipv6_neighbor.go
@@ -0,0 +1,644 @@
+// Package network provides network health monitoring capabilities.
+package network
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"slices"
+	"strconv"
+	"strings"
+
+	"github.com/supporttools/node-doctor/pkg/monitors"
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+// NOTE (out of scope): True NDP neighbor-cache reachability (i.e. inspecting the
+// kernel neighbor table for REACHABLE/STALE/FAILED states of on-link IPv6
+// neighbors and the default router) requires a netlink RTM_GETNEIGH dump.
+// node-doctor does not vendor a netlink library and this task explicitly forbids
+// adding a new dependency or shelling out to `ip`. Reading the neighbor cache via
+// /proc is not possible (there is no stable /proc representation of the IPv6
+// neighbor table). This monitor therefore assesses RA/SLAAC *configuration and
+// outcome* using readable /proc sources (configured addresses + accept_ra /
+// autoconf sysctls) rather than live neighbor reachability. A follow-up that adds
+// netlink-based NDP reachability is tracked as a separate task (#17207 follow-up).
+
+const (
+	// Default configuration values for the IPv6 neighbor / RA / SLAAC monitor.
+	defaultIPv6NeighborExpectEnabled        = true
+	defaultIPv6NeighborCheckPerIface        = true
+	defaultIPv6NeighborRequireGlobal        = false
+	defaultIPv6NeighborProcPath             = "/proc"
+	ipv6IfInet6RelPath                      = "net/if_inet6"
+	ipv6AcceptRARelGlob                     = "sys/net/ipv6/conf/*/accept_ra"
+	ipv6ConfDirRelPath                      = "sys/net/ipv6/conf"
+	ipv6NeighborAutoconfFileName            = "autoconf"
+	ipv6LinkLocalScopeHex            uint64 = 0x20
+)
+
+// Condition types emitted by the IPv6 neighbor monitor.
+const (
+	conditionIPv6LinkLocalMissing = "IPv6LinkLocalMissing"
+	conditionIPv6GlobalMissing    = "IPv6GlobalAddressMissing"
+	conditionIPv6RADisabled       = "IPv6RouterAdvertisementDisabled"
+)
+
+// defaultIPv6NeighborSkipInterfaces are interfaces excluded from per-interface
+// address and RA/autoconf checks. "all"/"default" are global pseudo-interfaces
+// (no entries in if_inet6) and "lo" is the loopback, which carries only ::1 and
+// never participates in RA/SLAAC.
+var defaultIPv6NeighborSkipInterfaces = []string{"all", "default", "lo"}
+
+// IPv6NeighborConfig holds configuration for the IPv6 neighbor / RA / SLAAC monitor.
+type IPv6NeighborConfig struct {
+	// ExpectIPv6Enabled controls severity. When true, missing link-local
+	// addresses and disabled RA where IPv6 is expected are treated as problems.
+	// When false the findings are recorded informationally only.
+	ExpectIPv6Enabled bool
+	// CheckPerInterface enables scanning per-interface accept_ra / autoconf
+	// sysctls. Address checks (if_inet6) always run.
+	CheckPerInterface bool
+	// RequireGlobalAddress controls whether a non-loopback interface that has a
+	// link-local address but no global/SLAAC address is flagged as a warning.
+	RequireGlobalAddress bool
+	// Interfaces, when non-empty, restricts checks to these interface names.
+	// Empty means check every interface discovered via if_inet6 / glob.
+	Interfaces []string
+	// SkipInterfaces lists interface names to exclude. Defaults to
+	// {"all", "default", "lo"}.
+	SkipInterfaces []string
+	// ProcPath is the base path for the proc filesystem. Defaults to "/proc";
+	// override with "/host/proc" for containerized deployments.
+	ProcPath string
+}
+
+// IPv6NeighborMonitor assesses IPv6 RA/SLAAC health from /proc. It is
+// detection-only and never modifies addresses, routes, or sysctls.
+type IPv6NeighborMonitor struct {
+	name   string
+	config *IPv6NeighborConfig
+
+	*monitors.BaseMonitor
+}
+
+// init registers the IPv6 neighbor / RA / SLAAC monitor with the registry.
+func init() {
+	monitors.MustRegister(monitors.MonitorInfo{
+		Type:        "network-ipv6-neighbor",
+		Factory:     NewIPv6NeighborMonitor,
+		Validator:   ValidateIPv6NeighborConfig,
+		Description: "Detection-only monitor for IPv6 RA/SLAAC health via configured addresses and accept_ra/autoconf sysctls (does not modify state)",
+		DefaultConfig: &types.MonitorConfig{
+			Name:           "ipv6-neighbor-check",
+			Type:           "network-ipv6-neighbor",
+			Enabled:        true,
+			IntervalString: "60s",
+			TimeoutString:  "5s",
+			Config: map[string]any{
+				"expectIPv6Enabled":    true,
+				"checkPerInterface":    true,
+				"requireGlobalAddress": false,
+				"procPath":             "/proc",
+			},
+		},
+	})
+}
+
+// NewIPv6NeighborMonitor creates a new IPv6 neighbor / RA / SLAAC monitor instance.
+func NewIPv6NeighborMonitor(ctx context.Context, config types.MonitorConfig) (types.Monitor, error) {
+	cfg, err := parseIPv6NeighborConfig(config.Config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse ipv6 neighbor config: %w", err)
+	}
+
+	baseMonitor, err := monitors.NewBaseMonitor(config.Name, config.Interval, config.Timeout)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create base monitor: %w", err)
+	}
+
+	monitor := &IPv6NeighborMonitor{
+		name:        config.Name,
+		config:      cfg,
+		BaseMonitor: baseMonitor,
+	}
+
+	if err := baseMonitor.SetCheckFunc(monitor.checkIPv6Neighbor); err != nil {
+		return nil, fmt.Errorf("failed to set check function: %w", err)
+	}
+
+	return monitor, nil
+}
+
+// parseIPv6NeighborConfig parses configuration from a generic map.
+func parseIPv6NeighborConfig(configMap map[string]any) (*IPv6NeighborConfig, error) {
+	config := &IPv6NeighborConfig{
+		ExpectIPv6Enabled:    defaultIPv6NeighborExpectEnabled,
+		CheckPerInterface:    defaultIPv6NeighborCheckPerIface,
+		RequireGlobalAddress: defaultIPv6NeighborRequireGlobal,
+		ProcPath:             defaultIPv6NeighborProcPath,
+		SkipInterfaces:       append([]string(nil), defaultIPv6NeighborSkipInterfaces...),
+	}
+
+	if configMap == nil {
+		return config, nil
+	}
+
+	if v, ok := configMap["expectIPv6Enabled"]; ok {
+		boolVal, ok := v.(bool)
+		if !ok {
+			return nil, fmt.Errorf("expectIPv6Enabled must be a boolean, got %T", v)
+		}
+		config.ExpectIPv6Enabled = boolVal
+	}
+
+	if v, ok := configMap["checkPerInterface"]; ok {
+		boolVal, ok := v.(bool)
+		if !ok {
+			return nil, fmt.Errorf("checkPerInterface must be a boolean, got %T", v)
+		}
+		config.CheckPerInterface = boolVal
+	}
+
+	if v, ok := configMap["requireGlobalAddress"]; ok {
+		boolVal, ok := v.(bool)
+		if !ok {
+			return nil, fmt.Errorf("requireGlobalAddress must be a boolean, got %T", v)
+		}
+		config.RequireGlobalAddress = boolVal
+	}
+
+	if v, ok := configMap["interfaces"]; ok {
+		ifaces, err := parseStringList(v, "interfaces")
+		if err != nil {
+			return nil, err
+		}
+		config.Interfaces = ifaces
+	}
+
+	if v, ok := configMap["skipInterfaces"]; ok {
+		ifaces, err := parseStringList(v, "skipInterfaces")
+		if err != nil {
+			return nil, err
+		}
+		// Explicit override replaces the defaults so operators can opt back into
+		// checking lo if desired.
+		config.SkipInterfaces = ifaces
+	}
+
+	if v, ok := configMap["procPath"]; ok {
+		strVal, ok := v.(string)
+		if !ok {
+			return nil, fmt.Errorf("procPath must be a string, got %T", v)
+		}
+		config.ProcPath = strVal
+	}
+
+	return config, nil
+}
+
+// ValidateIPv6NeighborConfig validates the IPv6 neighbor monitor configuration.
+func ValidateIPv6NeighborConfig(config types.MonitorConfig) error {
+	_, err := parseIPv6NeighborConfig(config.Config)
+	return err
+}
+
+// ipv6IfInet6Path returns the full path to the if_inet6 address table.
+func (m *IPv6NeighborMonitor) ipv6IfInet6Path() string {
+	return filepath.Join(m.config.ProcPath, ipv6IfInet6RelPath)
+}
+
+// ipv6Address describes a single IPv6 address parsed from /proc/net/if_inet6.
+type ipv6Address struct {
+	// IfaceName is the device name (e.g. "eth0").
+	IfaceName string
+	// Scope is the raw scope value from if_inet6 (0x20 = link-local, 0x00 =
+	// global).
+	Scope uint64
+	// IsLinkLocal reports whether the scope indicates a link-local address.
+	IsLinkLocal bool
+	// IsGlobal reports whether the scope indicates a global address (scope 0).
+	IsGlobal bool
+}
+
+// ifaceAddrSummary aggregates per-interface address presence.
+type ifaceAddrSummary struct {
+	hasLinkLocal bool
+	hasGlobal    bool
+}
+
+// checkIPv6Neighbor performs the IPv6 RA/SLAAC health check.
+func (m *IPv6NeighborMonitor) checkIPv6Neighbor(ctx context.Context) (*types.Status, error) {
+	status := types.NewStatus(m.name)
+
+	skip := m.config.SkipInterfaces
+	if skip == nil {
+		skip = defaultIPv6NeighborSkipInterfaces
+	}
+
+	addrs, readErr := parseIfInet6File(m.ipv6IfInet6Path())
+	if readErr != nil {
+		// A missing/unreadable if_inet6 means the IPv6 stack may legitimately be
+		// absent (hardened or IPv4-only node). Treat as a warning, consistent
+		// with the IPv6 sysctl monitor, and report the conditions as unknown
+		// outcomes rather than hard-failing.
+		if ipv6IfInet6Unreadable(readErr) {
+			status.AddEvent(types.NewEvent(
+				types.EventWarning,
+				"IPv6IfInet6ReadError",
+				fmt.Sprintf("Failed to read IPv6 address table from %s: %v. "+
+					"The IPv6 stack may be absent on this node.", m.ipv6IfInet6Path(), readErr),
+			))
+			m.recordAddressTableUnreadable(status)
+			// Still attempt the RA/autoconf sysctl scan below; those files may be
+			// present even when if_inet6 is not.
+		} else {
+			status.AddEvent(types.NewEvent(
+				types.EventWarning,
+				"IPv6IfInet6ParseError",
+				fmt.Sprintf("Failed to parse IPv6 address table from %s: %v", m.ipv6IfInet6Path(), readErr),
+			))
+			m.recordAddressTableUnreadable(status)
+		}
+	} else {
+		m.evaluateAddresses(status, addrs, skip)
+	}
+
+	if m.config.CheckPerInterface {
+		m.checkRouterAdvertisement(status, skip)
+	} else {
+		// RA scan disabled: record the condition as healthy so consumers always
+		// see a definitive state.
+		status.AddCondition(types.NewCondition(
+			conditionIPv6RADisabled,
+			types.ConditionFalse,
+			"IPv6RADisabledCheckSkipped",
+			"Per-interface RA/autoconf check disabled (checkPerInterface=false)",
+		))
+	}
+
+	return status, nil
+}
+
+// evaluateAddresses inspects the parsed if_inet6 addresses and records the
+// link-local and global address conditions.
+func (m *IPv6NeighborMonitor) evaluateAddresses(status *types.Status, addrs []ipv6Address, skip []string) {
+	summaries := make(map[string]*ifaceAddrSummary)
+	order := make([]string, 0)
+
+	for _, addr := range addrs {
+		if slices.Contains(skip, addr.IfaceName) {
+			continue
+		}
+		if len(m.config.Interfaces) > 0 && !slices.Contains(m.config.Interfaces, addr.IfaceName) {
+			continue
+		}
+		s, ok := summaries[addr.IfaceName]
+		if !ok {
+			s = &ifaceAddrSummary{}
+			summaries[addr.IfaceName] = s
+			order = append(order, addr.IfaceName)
+		}
+		if addr.IsLinkLocal {
+			s.hasLinkLocal = true
+		}
+		if addr.IsGlobal {
+			s.hasGlobal = true
+		}
+	}
+
+	var linkLocalMissing []string
+	var globalMissing []string
+
+	for _, iface := range order {
+		s := summaries[iface]
+		if !s.hasLinkLocal {
+			linkLocalMissing = append(linkLocalMissing, iface)
+			status.AddEvent(types.NewEvent(
+				m.severity(),
+				"IPv6LinkLocalMissing",
+				fmt.Sprintf("Interface %s has no IPv6 link-local address; "+
+					"RA/SLAAC and on-link neighbor discovery cannot operate without one. "+
+					"This monitor is detection-only.", iface),
+			))
+		}
+		if !s.hasGlobal {
+			globalMissing = append(globalMissing, iface)
+			if m.config.RequireGlobalAddress {
+				status.AddEvent(types.NewEvent(
+					m.severity(),
+					"IPv6GlobalAddressMissing",
+					fmt.Sprintf("Interface %s has no global/SLAAC IPv6 address; "+
+						"the node may lack IPv6 connectivity. This monitor is detection-only.", iface),
+				))
+			}
+		}
+	}
+
+	m.recordLinkLocalCondition(status, linkLocalMissing, len(order))
+	m.recordGlobalCondition(status, globalMissing, len(order))
+}
+
+// recordLinkLocalCondition records the IPv6LinkLocalMissing condition.
+func (m *IPv6NeighborMonitor) recordLinkLocalCondition(status *types.Status, missing []string, ifaceCount int) {
+	if len(missing) > 0 && m.config.ExpectIPv6Enabled {
+		status.AddCondition(types.NewCondition(
+			conditionIPv6LinkLocalMissing,
+			types.ConditionTrue,
+			"IPv6LinkLocalMissing",
+			fmt.Sprintf("Interfaces missing an IPv6 link-local address: %s", strings.Join(missing, ", ")),
+		))
+		return
+	}
+
+	if len(missing) > 0 {
+		// expectIPv6Enabled=false: record but do not flag.
+		status.AddCondition(types.NewCondition(
+			conditionIPv6LinkLocalMissing,
+			types.ConditionFalse,
+			"IPv6LinkLocalMissingNotExpected",
+			fmt.Sprintf("Interfaces without an IPv6 link-local address (%s); expectIPv6Enabled=false so no action required",
+				strings.Join(missing, ", ")),
+		))
+		return
+	}
+
+	reason := "IPv6LinkLocalPresent"
+	msg := "All checked interfaces have an IPv6 link-local address"
+	if ifaceCount == 0 {
+		reason = "IPv6NoInterfacesObserved"
+		msg = "No non-skipped IPv6 interfaces observed in if_inet6"
+	}
+	status.AddCondition(types.NewCondition(
+		conditionIPv6LinkLocalMissing,
+		types.ConditionFalse,
+		reason,
+		msg,
+	))
+}
+
+// recordGlobalCondition records the IPv6GlobalAddressMissing condition. The
+// condition is only flagged True when RequireGlobalAddress is set (and IPv6 is
+// expected).
+func (m *IPv6NeighborMonitor) recordGlobalCondition(status *types.Status, missing []string, ifaceCount int) {
+	if len(missing) > 0 && m.config.RequireGlobalAddress && m.config.ExpectIPv6Enabled {
+		status.AddCondition(types.NewCondition(
+			conditionIPv6GlobalMissing,
+			types.ConditionTrue,
+			"IPv6GlobalAddressMissing",
+			fmt.Sprintf("Interfaces missing a global/SLAAC IPv6 address: %s", strings.Join(missing, ", ")),
+		))
+		return
+	}
+
+	if len(missing) > 0 {
+		reason := "IPv6GlobalAddressMissingNotRequired"
+		msg := fmt.Sprintf("Interfaces without a global/SLAAC IPv6 address (%s); requireGlobalAddress=false so no action required",
+			strings.Join(missing, ", "))
+		if !m.config.ExpectIPv6Enabled {
+			reason = "IPv6GlobalAddressMissingNotExpected"
+			msg = fmt.Sprintf("Interfaces without a global/SLAAC IPv6 address (%s); expectIPv6Enabled=false so no action required",
+				strings.Join(missing, ", "))
+		}
+		status.AddCondition(types.NewCondition(
+			conditionIPv6GlobalMissing,
+			types.ConditionFalse,
+			reason,
+			msg,
+		))
+		return
+	}
+
+	reason := "IPv6GlobalAddressPresent"
+	msg := "All checked interfaces have a global/SLAAC IPv6 address"
+	if ifaceCount == 0 {
+		reason = "IPv6NoInterfacesObserved"
+		msg = "No non-skipped IPv6 interfaces observed in if_inet6"
+	}
+	status.AddCondition(types.NewCondition(
+		conditionIPv6GlobalMissing,
+		types.ConditionFalse,
+		reason,
+		msg,
+	))
+}
+
+// recordAddressTableUnreadable records both address conditions as False (cannot
+// confirm a problem) when if_inet6 could not be read.
+func (m *IPv6NeighborMonitor) recordAddressTableUnreadable(status *types.Status) {
+	status.AddCondition(types.NewCondition(
+		conditionIPv6LinkLocalMissing,
+		types.ConditionFalse,
+		"IPv6AddressTableUnreadable",
+		"IPv6 address table (if_inet6) is unreadable; cannot confirm link-local addresses",
+	))
+	status.AddCondition(types.NewCondition(
+		conditionIPv6GlobalMissing,
+		types.ConditionFalse,
+		"IPv6AddressTableUnreadable",
+		"IPv6 address table (if_inet6) is unreadable; cannot confirm global addresses",
+	))
+}
+
+// checkRouterAdvertisement scans per-interface accept_ra (and the companion
+// autoconf) sysctls and records the IPv6RouterAdvertisementDisabled condition.
+func (m *IPv6NeighborMonitor) checkRouterAdvertisement(status *types.Status, skip []string) {
+	pattern := filepath.Join(m.config.ProcPath, ipv6AcceptRARelGlob)
+	matches, err := filepath.Glob(pattern)
+	if err != nil {
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6AcceptRAGlobError",
+			fmt.Sprintf("Failed to glob per-interface accept_ra files: %v", err),
+		))
+		status.AddCondition(types.NewCondition(
+			conditionIPv6RADisabled,
+			types.ConditionFalse,
+			"IPv6AcceptRAUnreadable",
+			"Per-interface accept_ra files could not be enumerated; cannot confirm RA acceptance",
+		))
+		return
+	}
+
+	if len(matches) == 0 {
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6AcceptRAReadError",
+			fmt.Sprintf("No per-interface accept_ra sysctls found under %s; the IPv6 stack may be absent",
+				filepath.Join(m.config.ProcPath, ipv6ConfDirRelPath)),
+		))
+		status.AddCondition(types.NewCondition(
+			conditionIPv6RADisabled,
+			types.ConditionFalse,
+			"IPv6AcceptRAUnreadable",
+			"No per-interface accept_ra sysctls found; cannot confirm RA acceptance",
+		))
+		return
+	}
+
+	var disabled []string
+
+	for _, match := range matches {
+		ifaceName := extractInterfaceName(match)
+		if ifaceName == "" {
+			continue
+		}
+		if slices.Contains(skip, ifaceName) {
+			continue
+		}
+		if len(m.config.Interfaces) > 0 && !slices.Contains(m.config.Interfaces, ifaceName) {
+			continue
+		}
+
+		raVal, err := readSysctlInt(match)
+		if err != nil {
+			// Per-interface files race with link teardown; skip silently.
+			continue
+		}
+
+		autoconfPath := filepath.Join(filepath.Dir(match), ipv6NeighborAutoconfFileName)
+		autoconfVal, autoconfErr := readSysctlInt(autoconfPath)
+
+		raOff := raVal == 0
+		autoconfOff := autoconfErr == nil && autoconfVal == 0
+
+		if !raOff && !autoconfOff {
+			continue
+		}
+
+		var parts []string
+		if raOff {
+			parts = append(parts, fmt.Sprintf("net.ipv6.conf.%s.accept_ra=0", ifaceName))
+		}
+		if autoconfOff {
+			parts = append(parts, fmt.Sprintf("net.ipv6.conf.%s.autoconf=0", ifaceName))
+		}
+		finding := strings.Join(parts, ", ")
+
+		if m.config.ExpectIPv6Enabled {
+			disabled = append(disabled, finding)
+			status.AddEvent(types.NewEvent(
+				types.EventWarning,
+				"IPv6RouterAdvertisementDisabled",
+				fmt.Sprintf("Router advertisement / SLAAC disabled on interface %s (%s); "+
+					"the interface will not auto-configure an IPv6 address from RAs. "+
+					"This monitor is detection-only and does not modify sysctls.", ifaceName, finding),
+			))
+		} else {
+			status.AddEvent(types.NewEvent(
+				types.EventInfo,
+				"IPv6RouterAdvertisementDisabledExpected",
+				fmt.Sprintf("Router advertisement / SLAAC disabled on interface %s (%s); expectIPv6Enabled=false so no action required",
+					ifaceName, finding),
+			))
+		}
+	}
+
+	if len(disabled) > 0 {
+		status.AddCondition(types.NewCondition(
+			conditionIPv6RADisabled,
+			types.ConditionTrue,
+			"IPv6RouterAdvertisementDisabled",
+			fmt.Sprintf("RA/SLAAC disabled: %s", strings.Join(disabled, "; ")),
+		))
+		return
+	}
+
+	status.AddCondition(types.NewCondition(
+		conditionIPv6RADisabled,
+		types.ConditionFalse,
+		"IPv6RouterAdvertisementEnabled",
+		"All checked interfaces accept router advertisements (accept_ra/autoconf not disabled)",
+	))
+}
+
+// severity returns the event severity that corresponds to ExpectIPv6Enabled:
+// warnings when IPv6 is expected, informational otherwise.
+func (m *IPv6NeighborMonitor) severity() types.EventSeverity {
+	if m.config.ExpectIPv6Enabled {
+		return types.EventWarning
+	}
+	return types.EventInfo
+}
+
+// parseIfInet6File reads and parses <procPath>/net/if_inet6. Each line is
+// whitespace-separated:
+//
+//	<32-hex-addr> <ifindex_hex> <prefixlen_hex> <scope_hex> <flags_hex> <devname>
+//
+// Scope 0x20 = link-local, 0x00 = global. Parse errors on individual lines are
+// skipped; a read/open failure is returned to the caller.
+func parseIfInet6File(path string) ([]ipv6Address, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open %s: %w", path, err)
+	}
+	defer f.Close()
+
+	var addrs []ipv6Address
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if line == "" {
+			continue
+		}
+		fields := strings.Fields(line)
+		// Need at least scope (field index 3) and devname (last field).
+		if len(fields) < 6 {
+			continue
+		}
+		scope, err := parseHexScope(fields[3])
+		if err != nil {
+			continue
+		}
+		devName := fields[len(fields)-1]
+		addrs = append(addrs, ipv6Address{
+			IfaceName:   devName,
+			Scope:       scope,
+			IsLinkLocal: scope == ipv6LinkLocalScopeHex,
+			IsGlobal:    scope == 0,
+		})
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, fmt.Errorf("failed to read %s: %w", path, err)
+	}
+	return addrs, nil
+}
+
+// parseHexScope parses a scope field from if_inet6, tolerating an optional "0x"
+// prefix (the kernel writes a bare two-hex-digit value, e.g. "20", but we accept
+// "0x20" for robustness).
+func parseHexScope(s string) (uint64, error) {
+	s = strings.TrimSpace(s)
+	s = strings.TrimPrefix(s, "0x")
+	s = strings.TrimPrefix(s, "0X")
+	if s == "" {
+		return 0, errors.New("empty scope field")
+	}
+	return strconv.ParseUint(s, 16, 64)
+}
+
+// readSysctlInt reads a sysctl-style file and returns its integer value (after
+// trimming whitespace). Used for accept_ra / autoconf, which take values 0/1/2.
+func readSysctlInt(path string) (int, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return 0, fmt.Errorf("failed to read %s: %w", path, err)
+	}
+	val, err := strconv.Atoi(strings.TrimSpace(string(data)))
+	if err != nil {
+		return 0, fmt.Errorf("failed to parse %s: %w", path, err)
+	}
+	return val, nil
+}
+
+// ipv6IfInet6Unreadable reports whether the error from parseIfInet6File
+// indicates the file could not be opened/read (as opposed to a parse failure).
+func ipv6IfInet6Unreadable(err error) bool {
+	var pathErr *fs.PathError
+	return errors.As(err, &pathErr)
+}
diff --git a/pkg/monitors/network/ipv6_neighbor_test.go b/pkg/monitors/network/ipv6_neighbor_test.go
new file mode 100644
index 0000000..77d7eea
--- /dev/null
+++ b/pkg/monitors/network/ipv6_neighbor_test.go
@@ -0,0 +1,752 @@
+package network
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+func TestParseIPv6NeighborConfig(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  map[string]any
+		want    *IPv6NeighborConfig
+		wantErr bool
+	}{
+		{
+			name:   "nil config - use defaults",
+			config: nil,
+			want: &IPv6NeighborConfig{
+				ExpectIPv6Enabled:    defaultIPv6NeighborExpectEnabled,
+				CheckPerInterface:    defaultIPv6NeighborCheckPerIface,
+				RequireGlobalAddress: defaultIPv6NeighborRequireGlobal,
+				ProcPath:             defaultIPv6NeighborProcPath,
+				SkipInterfaces:       defaultIPv6NeighborSkipInterfaces,
+			},
+		},
+		{
+			name:   "empty config - use defaults",
+			config: map[string]any{},
+			want: &IPv6NeighborConfig{
+				ExpectIPv6Enabled:    defaultIPv6NeighborExpectEnabled,
+				CheckPerInterface:    defaultIPv6NeighborCheckPerIface,
+				RequireGlobalAddress: defaultIPv6NeighborRequireGlobal,
+				ProcPath:             defaultIPv6NeighborProcPath,
+				SkipInterfaces:       defaultIPv6NeighborSkipInterfaces,
+			},
+		},
+		{
+			name: "custom values",
+			config: map[string]any{
+				"expectIPv6Enabled":    false,
+				"checkPerInterface":    false,
+				"requireGlobalAddress": true,
+				"procPath":             "/host/proc",
+			},
+			want: &IPv6NeighborConfig{
+				ExpectIPv6Enabled:    false,
+				CheckPerInterface:    false,
+				RequireGlobalAddress: true,
+				ProcPath:             "/host/proc",
+				SkipInterfaces:       defaultIPv6NeighborSkipInterfaces,
+			},
+		},
+		{
+			name: "interfaces and skipInterfaces",
+			config: map[string]any{
+				"interfaces":     []any{"eth0", "eth1"},
+				"skipInterfaces": []string{"lo"},
+			},
+			want: &IPv6NeighborConfig{
+				ExpectIPv6Enabled:    defaultIPv6NeighborExpectEnabled,
+				CheckPerInterface:    defaultIPv6NeighborCheckPerIface,
+				RequireGlobalAddress: defaultIPv6NeighborRequireGlobal,
+				ProcPath:             defaultIPv6NeighborProcPath,
+				Interfaces:           []string{"eth0", "eth1"},
+				SkipInterfaces:       []string{"lo"},
+			},
+		},
+		{name: "invalid expectIPv6Enabled", config: map[string]any{"expectIPv6Enabled": "yes"}, wantErr: true},
+		{name: "invalid checkPerInterface", config: map[string]any{"checkPerInterface": 1}, wantErr: true},
+		{name: "invalid requireGlobalAddress", config: map[string]any{"requireGlobalAddress": "no"}, wantErr: true},
+		{name: "invalid procPath", config: map[string]any{"procPath": 123}, wantErr: true},
+		{name: "invalid interfaces type", config: map[string]any{"interfaces": "eth0"}, wantErr: true},
+		{name: "invalid interfaces element", config: map[string]any{"interfaces": []any{123}}, wantErr: true},
+		{name: "invalid skipInterfaces element", config: map[string]any{"skipInterfaces": []any{true}}, wantErr: true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := parseIPv6NeighborConfig(tt.config)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("parseIPv6NeighborConfig() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if tt.wantErr {
+				return
+			}
+			if got.ExpectIPv6Enabled != tt.want.ExpectIPv6Enabled {
+				t.Errorf("ExpectIPv6Enabled = %v, want %v", got.ExpectIPv6Enabled, tt.want.ExpectIPv6Enabled)
+			}
+			if got.CheckPerInterface != tt.want.CheckPerInterface {
+				t.Errorf("CheckPerInterface = %v, want %v", got.CheckPerInterface, tt.want.CheckPerInterface)
+			}
+			if got.RequireGlobalAddress != tt.want.RequireGlobalAddress {
+				t.Errorf("RequireGlobalAddress = %v, want %v", got.RequireGlobalAddress, tt.want.RequireGlobalAddress)
+			}
+			if got.ProcPath != tt.want.ProcPath {
+				t.Errorf("ProcPath = %v, want %v", got.ProcPath, tt.want.ProcPath)
+			}
+			if !equalStringSlice(got.Interfaces, tt.want.Interfaces) {
+				t.Errorf("Interfaces = %v, want %v", got.Interfaces, tt.want.Interfaces)
+			}
+			if !equalStringSlice(got.SkipInterfaces, tt.want.SkipInterfaces) {
+				t.Errorf("SkipInterfaces = %v, want %v", got.SkipInterfaces, tt.want.SkipInterfaces)
+			}
+		})
+	}
+}
+
+func TestValidateIPv6NeighborConfig(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  map[string]any
+		wantErr bool
+	}{
+		{name: "valid config", config: map[string]any{"expectIPv6Enabled": true}, wantErr: false},
+		{name: "invalid config", config: map[string]any{"requireGlobalAddress": "yes"}, wantErr: true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg := types.MonitorConfig{
+				Name:     "test-ipv6-neighbor",
+				Type:     "network-ipv6-neighbor",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config:   tt.config,
+			}
+			if err := ValidateIPv6NeighborConfig(cfg); (err != nil) != tt.wantErr {
+				t.Errorf("ValidateIPv6NeighborConfig() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+func TestNewIPv6NeighborMonitor(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  types.MonitorConfig
+		wantErr bool
+	}{
+		{
+			name: "valid config",
+			config: types.MonitorConfig{
+				Name:     "test-ipv6-neighbor",
+				Type:     "network-ipv6-neighbor",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config:   map[string]any{"expectIPv6Enabled": true},
+			},
+			wantErr: false,
+		},
+		{
+			name: "invalid config - bad type",
+			config: types.MonitorConfig{
+				Name:     "test-ipv6-neighbor",
+				Type:     "network-ipv6-neighbor",
+				Interval: 60 * time.Second,
+				Timeout:  5 * time.Second,
+				Config:   map[string]any{"expectIPv6Enabled": "invalid"},
+			},
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			monitor, err := NewIPv6NeighborMonitor(context.Background(), tt.config)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("NewIPv6NeighborMonitor() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !tt.wantErr && monitor == nil {
+				t.Error("NewIPv6NeighborMonitor() returned nil monitor")
+			}
+		})
+	}
+}
+
+// mockIfInet6Addr describes one address to write into a mock if_inet6 file.
+type mockIfInet6Addr struct {
+	addr      string // 32 hex chars; defaults to a filler if empty
+	ifindex   string // hex, defaults to "01"
+	prefixlen string // hex, defaults to "40"
+	scope     string // hex scope, e.g. "20" (link-local) or "00" (global)
+	flags     string // hex, defaults to "80"
+	dev       string // device name
+}
+
+// writeMockNeighborProcFS builds a mock proc tree:
+//   - <proc>/net/if_inet6 from the supplied addresses (skipped if addrs is nil)
+//   - <proc>/sys/net/ipv6/conf/<iface>/{accept_ra,autoconf} from raConf
+//
+// raConf maps interface name -> {accept_ra, autoconf} string values; an empty
+// value skips that file.
+func writeMockNeighborProcFS(t *testing.T, addrs []mockIfInet6Addr, writeIfInet6 bool, raConf map[string][2]string) string {
+	t.Helper()
+	procDir := t.TempDir()
+
+	if writeIfInet6 {
+		netDir := filepath.Join(procDir, "net")
+		if err := os.MkdirAll(netDir, 0755); err != nil {
+			t.Fatalf("mkdir net: %v", err)
+		}
+		var b []byte
+		for _, a := range addrs {
+			addr := a.addr
+			if addr == "" {
+				addr = "fe800000000000000000000000000001"
+			}
+			ifindex := a.ifindex
+			if ifindex == "" {
+				ifindex = "01"
+			}
+			prefixlen := a.prefixlen
+			if prefixlen == "" {
+				prefixlen = "40"
+			}
+			flags := a.flags
+			if flags == "" {
+				flags = "80"
+			}
+			line := addr + " " + ifindex + " " + prefixlen + " " + a.scope + " " + flags + " " + a.dev + "\n"
+			b = append(b, []byte(line)...)
+		}
+		if err := os.WriteFile(filepath.Join(netDir, "if_inet6"), b, 0644); err != nil {
+			t.Fatalf("write if_inet6: %v", err)
+		}
+	}
+
+	for iface, vals := range raConf {
+		dir := filepath.Join(procDir, "sys", "net", "ipv6", "conf", iface)
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			t.Fatalf("mkdir conf/%s: %v", iface, err)
+		}
+		if vals[0] != "" {
+			if err := os.WriteFile(filepath.Join(dir, "accept_ra"), []byte(vals[0]+"\n"), 0644); err != nil {
+				t.Fatalf("write accept_ra: %v", err)
+			}
+		}
+		if vals[1] != "" {
+			if err := os.WriteFile(filepath.Join(dir, "autoconf"), []byte(vals[1]+"\n"), 0644); err != nil {
+				t.Fatalf("write autoconf: %v", err)
+			}
+		}
+	}
+
+	return procDir
+}
+
+// findNeighborCondition returns the named condition, or nil if absent.
+func findNeighborCondition(status *types.Status, condType string) *types.Condition {
+	for i := range status.Conditions {
+		if status.Conditions[i].Type == condType {
+			return &status.Conditions[i]
+		}
+	}
+	return nil
+}
+
+func TestCheckIPv6Neighbor_Healthy(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{scope: "20", dev: "eth0"}, // link-local
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"}, // global
+	}
+	raConf := map[string][2]string{"eth0": {"1", "1"}}
+	procDir := writeMockNeighborProcFS(t, addrs, true, raConf)
+
+	m := &IPv6NeighborMonitor{
+		name: "test",
+		config: &IPv6NeighborConfig{
+			ExpectIPv6Enabled:    true,
+			CheckPerInterface:    true,
+			RequireGlobalAddress: true,
+			SkipInterfaces:       defaultIPv6NeighborSkipInterfaces,
+			ProcPath:             procDir,
+		},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	for _, ct := range []string{conditionIPv6LinkLocalMissing, conditionIPv6GlobalMissing, conditionIPv6RADisabled} {
+		cond := findNeighborCondition(status, ct)
+		if cond == nil {
+			t.Fatalf("missing condition %s", ct)
+		}
+		if cond.Status != types.ConditionFalse {
+			t.Errorf("condition %s = %s, want False", ct, cond.Status)
+		}
+	}
+}
+
+func TestCheckIPv6Neighbor_LinkLocalMissing(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"}, // global only
+	}
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"eth0": {"1", "1"}})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: true, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6LinkLocalMissing)
+	if cond == nil || cond.Status != types.ConditionTrue {
+		t.Fatalf("expected IPv6LinkLocalMissing=True, got %+v", cond)
+	}
+	if !hasEventReason(status, "IPv6LinkLocalMissing") {
+		t.Error("expected IPv6LinkLocalMissing event")
+	}
+}
+
+func TestCheckIPv6Neighbor_GlobalMissingRequired(t *testing.T) {
+	addrs := []mockIfInet6Addr{{scope: "20", dev: "eth0"}} // link-local only
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"eth0": {"1", "1"}})
+
+	m := &IPv6NeighborMonitor{
+		name: "test",
+		config: &IPv6NeighborConfig{
+			ExpectIPv6Enabled: true, CheckPerInterface: true, RequireGlobalAddress: true,
+			SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir,
+		},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6GlobalMissing)
+	if cond == nil || cond.Status != types.ConditionTrue {
+		t.Fatalf("expected IPv6GlobalAddressMissing=True, got %+v", cond)
+	}
+	if !hasEventReason(status, "IPv6GlobalAddressMissing") {
+		t.Error("expected IPv6GlobalAddressMissing event")
+	}
+	// Link-local present, so that condition stays False.
+	if ll := findNeighborCondition(status, conditionIPv6LinkLocalMissing); ll == nil || ll.Status != types.ConditionFalse {
+		t.Errorf("expected IPv6LinkLocalMissing=False, got %+v", ll)
+	}
+}
+
+func TestCheckIPv6Neighbor_GlobalMissingNotRequiredSuppressed(t *testing.T) {
+	addrs := []mockIfInet6Addr{{scope: "20", dev: "eth0"}} // link-local only
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"eth0": {"1", "1"}})
+
+	m := &IPv6NeighborMonitor{
+		name: "test",
+		config: &IPv6NeighborConfig{
+			ExpectIPv6Enabled: true, CheckPerInterface: true, RequireGlobalAddress: false,
+			SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir,
+		},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6GlobalMissing)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6GlobalAddressMissing=False (not required), got %+v", cond)
+	}
+	if hasEventReason(status, "IPv6GlobalAddressMissing") {
+		t.Error("did not expect IPv6GlobalAddressMissing event when requireGlobalAddress=false")
+	}
+}
+
+func TestCheckIPv6Neighbor_AcceptRADisabled(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{scope: "20", dev: "eth0"},
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"},
+	}
+	// accept_ra=0 on eth0 with autoconf enabled.
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"eth0": {"0", "1"}})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: true, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6RADisabled)
+	if cond == nil || cond.Status != types.ConditionTrue {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=True, got %+v", cond)
+	}
+	if !hasEventReason(status, "IPv6RouterAdvertisementDisabled") {
+		t.Error("expected IPv6RouterAdvertisementDisabled event")
+	}
+}
+
+func TestCheckIPv6Neighbor_AutoconfDisabled(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{scope: "20", dev: "eth0"},
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"},
+	}
+	// accept_ra=1 but autoconf=0.
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"eth0": {"1", "0"}})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: true, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6RADisabled)
+	if cond == nil || cond.Status != types.ConditionTrue {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=True (autoconf=0), got %+v", cond)
+	}
+}
+
+func TestCheckIPv6Neighbor_AcceptRADisabledExpectFalse(t *testing.T) {
+	addrs := []mockIfInet6Addr{{scope: "20", dev: "eth0"}}
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"eth0": {"0", "0"}})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: false, CheckPerInterface: true, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6RADisabled)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=False (expectIPv6Enabled=false), got %+v", cond)
+	}
+	if !hasEventReason(status, "IPv6RouterAdvertisementDisabledExpected") {
+		t.Error("expected IPv6RouterAdvertisementDisabledExpected info event")
+	}
+	if hasEventReason(status, "IPv6RouterAdvertisementDisabled") {
+		t.Error("did not expect warning IPv6RouterAdvertisementDisabled when expectIPv6Enabled=false")
+	}
+}
+
+func TestCheckIPv6Neighbor_SkipInterfacesRespected(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{addr: "00000000000000000000000000000001", scope: "10", dev: "lo"}, // lo, skipped
+		{scope: "20", dev: "eth0"},
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"},
+	}
+	// lo has accept_ra=0 but is skipped; eth0 healthy.
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{
+		"lo":   {"0", "0"},
+		"eth0": {"1", "1"},
+	})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: true, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if cond := findNeighborCondition(status, conditionIPv6RADisabled); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=False (lo skipped), got %+v", cond)
+	}
+	if cond := findNeighborCondition(status, conditionIPv6LinkLocalMissing); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6LinkLocalMissing=False (lo skipped), got %+v", cond)
+	}
+}
+
+func TestCheckIPv6Neighbor_InterfacesFilter(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{addr: "20010db8000000000000000000000099", scope: "00", dev: "eth0"}, // global only (would fail link-local) but filtered out
+		{scope: "20", dev: "eth1"},
+	}
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{
+		"eth0": {"0", "0"}, // disabled but filtered out
+		"eth1": {"1", "1"},
+	})
+
+	m := &IPv6NeighborMonitor{
+		name: "test",
+		config: &IPv6NeighborConfig{
+			ExpectIPv6Enabled: true, CheckPerInterface: true,
+			Interfaces:     []string{"eth1"},
+			SkipInterfaces: defaultIPv6NeighborSkipInterfaces,
+			ProcPath:       procDir,
+		},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if cond := findNeighborCondition(status, conditionIPv6LinkLocalMissing); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6LinkLocalMissing=False (only eth1 checked), got %+v", cond)
+	}
+	if cond := findNeighborCondition(status, conditionIPv6RADisabled); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=False (eth0 filtered out), got %+v", cond)
+	}
+}
+
+func TestCheckIPv6Neighbor_MissingIfInet6(t *testing.T) {
+	// No if_inet6 written; accept_ra files present and healthy.
+	procDir := writeMockNeighborProcFS(t, nil, false, map[string][2]string{"eth0": {"1", "1"}})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: true, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error (should not hard error): %v", err)
+	}
+
+	if !hasEventReason(status, "IPv6IfInet6ReadError") {
+		t.Error("expected IPv6IfInet6ReadError warning event")
+	}
+	for _, ev := range status.Events {
+		if ev.Reason == "IPv6IfInet6ReadError" && ev.Severity != types.EventWarning {
+			t.Errorf("expected Warning severity for IPv6IfInet6ReadError, got %s", ev.Severity)
+		}
+	}
+	// Address conditions reported False (cannot confirm), RA condition healthy.
+	if cond := findNeighborCondition(status, conditionIPv6LinkLocalMissing); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6LinkLocalMissing=False (unreadable), got %+v", cond)
+	}
+	if cond := findNeighborCondition(status, conditionIPv6RADisabled); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=False, got %+v", cond)
+	}
+}
+
+func TestCheckIPv6Neighbor_NonexistentProcPath(t *testing.T) {
+	m := &IPv6NeighborMonitor{
+		name: "test",
+		config: &IPv6NeighborConfig{
+			ExpectIPv6Enabled: true, CheckPerInterface: true,
+			SkipInterfaces: defaultIPv6NeighborSkipInterfaces,
+			ProcPath:       "/nonexistent/proc",
+		},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	if !hasEventReason(status, "IPv6IfInet6ReadError") {
+		t.Error("expected IPv6IfInet6ReadError event for nonexistent procPath")
+	}
+	// Glob over nonexistent path yields no matches -> RA condition reported
+	// unreadable (False) with a warning.
+	if cond := findNeighborCondition(status, conditionIPv6RADisabled); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=False, got %+v", cond)
+	}
+	if !hasEventReason(status, "IPv6AcceptRAReadError") {
+		t.Error("expected IPv6AcceptRAReadError event for nonexistent procPath")
+	}
+}
+
+func TestCheckIPv6Neighbor_PerInterfaceCheckDisabled(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{scope: "20", dev: "eth0"},
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"},
+	}
+	// accept_ra=0 but checkPerInterface=false means RA scan is skipped entirely.
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"eth0": {"0", "0"}})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: false, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6RADisabled)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=False (check skipped), got %+v", cond)
+	}
+	if cond.Reason != "IPv6RADisabledCheckSkipped" {
+		t.Errorf("expected reason IPv6RADisabledCheckSkipped, got %s", cond.Reason)
+	}
+}
+
+func TestCheckIPv6Neighbor_NoInterfacesObserved(t *testing.T) {
+	// if_inet6 present but only contains lo (skipped) -> no observed interfaces.
+	addrs := []mockIfInet6Addr{
+		{addr: "00000000000000000000000000000001", scope: "10", dev: "lo"},
+	}
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{"lo": {"1", "1"}})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: true, SkipInterfaces: defaultIPv6NeighborSkipInterfaces, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	cond := findNeighborCondition(status, conditionIPv6LinkLocalMissing)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6LinkLocalMissing=False, got %+v", cond)
+	}
+	if cond.Reason != "IPv6NoInterfacesObserved" {
+		t.Errorf("expected reason IPv6NoInterfacesObserved, got %s", cond.Reason)
+	}
+}
+
+func TestCheckIPv6Neighbor_SkipInterfacesNilFallsBack(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{addr: "00000000000000000000000000000001", scope: "10", dev: "lo"},
+		{scope: "20", dev: "eth0"},
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"},
+	}
+	procDir := writeMockNeighborProcFS(t, addrs, true, map[string][2]string{
+		"lo":   {"0", "0"},
+		"eth0": {"1", "1"},
+	})
+
+	m := &IPv6NeighborMonitor{
+		name:   "test",
+		config: &IPv6NeighborConfig{ExpectIPv6Enabled: true, CheckPerInterface: true, SkipInterfaces: nil, ProcPath: procDir},
+	}
+
+	status, err := m.checkIPv6Neighbor(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// lo skipped via default fallback -> conditions healthy.
+	if cond := findNeighborCondition(status, conditionIPv6LinkLocalMissing); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6LinkLocalMissing=False, got %+v", cond)
+	}
+	if cond := findNeighborCondition(status, conditionIPv6RADisabled); cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected IPv6RouterAdvertisementDisabled=False, got %+v", cond)
+	}
+}
+
+func TestParseIfInet6File(t *testing.T) {
+	addrs := []mockIfInet6Addr{
+		{addr: "fe800000000000000000000000000abc", scope: "20", dev: "eth0"},
+		{addr: "20010db8000000000000000000000001", scope: "00", dev: "eth0"},
+	}
+	procDir := writeMockNeighborProcFS(t, addrs, true, nil)
+
+	got, err := parseIfInet6File(filepath.Join(procDir, "net", "if_inet6"))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(got) != 2 {
+		t.Fatalf("expected 2 addresses, got %d", len(got))
+	}
+	if !got[0].IsLinkLocal || got[0].IsGlobal {
+		t.Errorf("addr[0] expected link-local, got %+v", got[0])
+	}
+	if got[1].IsLinkLocal || !got[1].IsGlobal {
+		t.Errorf("addr[1] expected global, got %+v", got[1])
+	}
+
+	t.Run("0x prefixed scope", func(t *testing.T) {
+		dir := t.TempDir()
+		netDir := filepath.Join(dir, "net")
+		if err := os.MkdirAll(netDir, 0755); err != nil {
+			t.Fatalf("mkdir: %v", err)
+		}
+		content := "fe800000000000000000000000000abc 02 40 0x20 80 wlan0\n" +
+			"short line skipped\n" +
+			"\n" +
+			"badscope0000000000000000000000ff 02 40 zz 80 bad0\n"
+		if err := os.WriteFile(filepath.Join(netDir, "if_inet6"), []byte(content), 0644); err != nil {
+			t.Fatalf("write: %v", err)
+		}
+		got, err := parseIfInet6File(filepath.Join(netDir, "if_inet6"))
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(got) != 1 {
+			t.Fatalf("expected 1 valid address (others skipped), got %d", len(got))
+		}
+		if !got[0].IsLinkLocal || got[0].IfaceName != "wlan0" {
+			t.Errorf("unexpected parsed addr: %+v", got[0])
+		}
+	})
+
+	t.Run("missing file", func(t *testing.T) {
+		_, err := parseIfInet6File(filepath.Join(t.TempDir(), "nope"))
+		if err == nil {
+			t.Error("expected error for missing file")
+		}
+		if !ipv6IfInet6Unreadable(err) {
+			t.Error("expected ipv6IfInet6Unreadable to report true for missing file")
+		}
+	})
+}
+
+func TestReadSysctlInt(t *testing.T) {
+	tests := []struct {
+		name    string
+		content string
+		want    int
+		wantErr bool
+	}{
+		{name: "zero", content: "0\n", want: 0},
+		{name: "one", content: "1\n", want: 1},
+		{name: "two with whitespace", content: " 2 \n", want: 2},
+		{name: "non-numeric", content: "abc\n", wantErr: true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			f := filepath.Join(t.TempDir(), "accept_ra")
+			if err := os.WriteFile(f, []byte(tt.content), 0644); err != nil {
+				t.Fatalf("write: %v", err)
+			}
+			got, err := readSysctlInt(f)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("readSysctlInt() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !tt.wantErr && got != tt.want {
+				t.Errorf("readSysctlInt() = %d, want %d", got, tt.want)
+			}
+		})
+	}
+
+	t.Run("missing file", func(t *testing.T) {
+		if _, err := readSysctlInt("/nonexistent/accept_ra"); err == nil {
+			t.Error("expected error for missing file")
+		}
+	})
+}

From 09e1ae3a3e9f40b7ebd7d1d57cbe6c6e7872f3f5 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:03:20 -0500
Subject: [PATCH 05/38] feat(network): IPv6 firewall sanity detection monitor
 (Task #17208)

DETECTION-ONLY monitor (type network-ipv6-firewall) that lists ip6tables
(-S) or nft (list ruleset) read-only and flags IPv6FirewallBlackhole when
all built-in filter chains default-DROP with no ACCEPT anywhere. backend
config auto|ip6tables|nft. Missing binary / read error -> warning, never
an error; never issues a mutating firewall verb. Injectable command
executor for tests; 94.9% coverage. Self-registers (#17209 owns cmd).
---
 pkg/monitors/network/ipv6_firewall.go      | 527 +++++++++++++++++++++
 pkg/monitors/network/ipv6_firewall_test.go | 507 ++++++++++++++++++++
 2 files changed, 1034 insertions(+)
 create mode 100644 pkg/monitors/network/ipv6_firewall.go
 create mode 100644 pkg/monitors/network/ipv6_firewall_test.go

diff --git a/pkg/monitors/network/ipv6_firewall.go b/pkg/monitors/network/ipv6_firewall.go
new file mode 100644
index 0000000..b504b6d
--- /dev/null
+++ b/pkg/monitors/network/ipv6_firewall.go
@@ -0,0 +1,527 @@
+// Package network provides network health monitoring capabilities.
+package network
+
+import (
+	"context"
+	"fmt"
+	"os/exec"
+	"strings"
+
+	"github.com/supporttools/node-doctor/pkg/monitors"
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+// Backend identifiers for the IPv6 firewall monitor.
+const (
+	// ipv6FirewallBackendAuto selects nft when the nft binary is present,
+	// otherwise falls back to ip6tables.
+	ipv6FirewallBackendAuto = "auto"
+	// ipv6FirewallBackendIP6Tables forces the legacy ip6tables backend.
+	ipv6FirewallBackendIP6Tables = "ip6tables"
+	// ipv6FirewallBackendNFT forces the nftables backend.
+	ipv6FirewallBackendNFT = "nft"
+)
+
+const (
+	// Default configuration values for the IPv6 firewall sanity monitor.
+	defaultIPv6FirewallExpectEnabled = true
+	defaultIPv6FirewallBackend       = ipv6FirewallBackendAuto
+
+	// ip6tablesBinary / nftBinary are the firewall tools this monitor reads.
+	ip6tablesBinary = "ip6tables"
+	nftBinary       = "nft"
+
+	// conditionIPv6FirewallBlackhole is the condition reported by this monitor.
+	conditionIPv6FirewallBlackhole = "IPv6FirewallBlackhole"
+
+	// ipv6FilterChains are the built-in filter-table chains whose default
+	// policy this monitor inspects for an obvious black-hole.
+	chainInput   = "INPUT"
+	chainForward = "FORWARD"
+	chainOutput  = "OUTPUT"
+)
+
+// ipv6FilterChains is the set of built-in filter-table chains checked for a
+// DROP/REJECT default policy with no ACCEPT rules.
+var ipv6FilterChains = []string{chainInput, chainForward, chainOutput}
+
+// CommandExecutor abstracts read-only command execution so tests can inject
+// canned ip6tables / nft output. This mirrors the executor pattern used by the
+// custom log-pattern monitor and the network remediator.
+type CommandExecutor interface {
+	// LookPath reports whether the named binary is resolvable in PATH.
+	LookPath(name string) (string, error)
+	// Run executes name with args and returns combined output. It is only ever
+	// invoked with read-only listing verbs by this monitor.
+	Run(ctx context.Context, name string, args ...string) ([]byte, error)
+}
+
+// defaultCommandExecutor implements CommandExecutor using os/exec.
+type defaultCommandExecutor struct{}
+
+func (e *defaultCommandExecutor) LookPath(name string) (string, error) {
+	return exec.LookPath(name)
+}
+
+func (e *defaultCommandExecutor) Run(ctx context.Context, name string, args ...string) ([]byte, error) {
+	cmd := exec.CommandContext(ctx, name, args...)
+	return cmd.CombinedOutput()
+}
+
+// IPv6FirewallConfig holds configuration for the IPv6 firewall sanity monitor.
+type IPv6FirewallConfig struct {
+	// ExpectIPv6Enabled controls severity. When true, an obviously black-holed
+	// IPv6 firewall (default DROP with no ACCEPT rules) is treated as a problem
+	// (condition True, warning events). When false, the same observation is
+	// recorded informationally and the condition is reported False.
+	ExpectIPv6Enabled bool
+	// Backend forces a firewall backend: "auto" (default), "ip6tables", or
+	// "nft". In auto mode the monitor prefers nft when the nft binary is present
+	// and falls back to ip6tables.
+	Backend string
+}
+
+// IPv6FirewallMonitor performs read-only sanity checks of the IPv6 firewall.
+//
+// DETECTION ONLY: this monitor never adds, deletes, or modifies firewall rules.
+// It issues only read-only listing commands (`nft list ruleset`,
+// `ip6tables -S`) and reports findings; it applies no remediation.
+//
+// The heuristic is intentionally conservative to avoid false positives: it only
+// flags a node when every built-in filter chain (INPUT/FORWARD/OUTPUT) has a
+// default policy of DROP (or REJECT) and the ruleset contains no ACCEPT rule at
+// all — i.e. IPv6 traffic is effectively black-holed. It does not attempt to
+// validate rule correctness.
+type IPv6FirewallMonitor struct {
+	name     string
+	config   *IPv6FirewallConfig
+	executor CommandExecutor
+
+	*monitors.BaseMonitor
+}
+
+// init registers the IPv6 firewall sanity monitor with the registry.
+func init() {
+	monitors.MustRegister(monitors.MonitorInfo{
+		Type:        "network-ipv6-firewall",
+		Factory:     NewIPv6FirewallMonitor,
+		Validator:   ValidateIPv6FirewallConfig,
+		Description: "Detection-only sanity monitor for the IPv6 firewall (ip6tables/nft); reads ruleset state but never modifies rules",
+		DefaultConfig: &types.MonitorConfig{
+			Name:           "ipv6-firewall-check",
+			Type:           "network-ipv6-firewall",
+			Enabled:        true,
+			IntervalString: "60s",
+			TimeoutString:  "5s",
+			Config: map[string]any{
+				"expectIPv6Enabled": true,
+				"backend":           ipv6FirewallBackendAuto,
+			},
+		},
+	})
+}
+
+// NewIPv6FirewallMonitor creates a new IPv6 firewall sanity monitor instance.
+func NewIPv6FirewallMonitor(ctx context.Context, config types.MonitorConfig) (types.Monitor, error) {
+	cfg, err := parseIPv6FirewallConfig(config.Config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse ipv6 firewall config: %w", err)
+	}
+
+	baseMonitor, err := monitors.NewBaseMonitor(config.Name, config.Interval, config.Timeout)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create base monitor: %w", err)
+	}
+
+	monitor := &IPv6FirewallMonitor{
+		name:        config.Name,
+		config:      cfg,
+		executor:    &defaultCommandExecutor{},
+		BaseMonitor: baseMonitor,
+	}
+
+	if err := baseMonitor.SetCheckFunc(monitor.checkIPv6Firewall); err != nil {
+		return nil, fmt.Errorf("failed to set check function: %w", err)
+	}
+
+	return monitor, nil
+}
+
+// SetCommandExecutor overrides the command executor (used in tests to inject
+// canned ip6tables / nft output).
+func (m *IPv6FirewallMonitor) SetCommandExecutor(executor CommandExecutor) {
+	m.executor = executor
+}
+
+// parseIPv6FirewallConfig parses configuration from a generic map.
+func parseIPv6FirewallConfig(configMap map[string]any) (*IPv6FirewallConfig, error) {
+	config := &IPv6FirewallConfig{
+		ExpectIPv6Enabled: defaultIPv6FirewallExpectEnabled,
+		Backend:           defaultIPv6FirewallBackend,
+	}
+
+	if configMap == nil {
+		return config, nil
+	}
+
+	if v, ok := configMap["expectIPv6Enabled"]; ok {
+		boolVal, ok := v.(bool)
+		if !ok {
+			return nil, fmt.Errorf("expectIPv6Enabled must be a boolean, got %T", v)
+		}
+		config.ExpectIPv6Enabled = boolVal
+	}
+
+	if v, ok := configMap["backend"]; ok {
+		strVal, ok := v.(string)
+		if !ok {
+			return nil, fmt.Errorf("backend must be a string, got %T", v)
+		}
+		switch strVal {
+		case ipv6FirewallBackendAuto, ipv6FirewallBackendIP6Tables, ipv6FirewallBackendNFT:
+			config.Backend = strVal
+		default:
+			return nil, fmt.Errorf("backend must be one of %q, %q, or %q, got %q",
+				ipv6FirewallBackendAuto, ipv6FirewallBackendIP6Tables, ipv6FirewallBackendNFT, strVal)
+		}
+	}
+
+	return config, nil
+}
+
+// ValidateIPv6FirewallConfig validates the IPv6 firewall monitor configuration.
+func ValidateIPv6FirewallConfig(config types.MonitorConfig) error {
+	_, err := parseIPv6FirewallConfig(config.Config)
+	return err
+}
+
+// checkIPv6Firewall performs the IPv6 firewall sanity check.
+func (m *IPv6FirewallMonitor) checkIPv6Firewall(ctx context.Context) (*types.Status, error) {
+	status := types.NewStatus(m.name)
+
+	backend := m.resolveBackend()
+	if backend == "" {
+		// Neither tool is present. The node may legitimately lack a firewall
+		// tool; report as a warning, not an error, and leave the condition
+		// False (we cannot confirm a problem).
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6FirewallToolNotFound",
+			fmt.Sprintf("Neither %q nor %q was found in PATH; cannot assess the IPv6 firewall. "+
+				"This may be expected on a node without a host firewall.", nftBinary, ip6tablesBinary),
+		))
+		m.recordBlackholeAbsent(status, "IPv6FirewallToolUnavailable",
+			"No IPv6 firewall tool available; cannot confirm an IPv6 firewall black-hole")
+		return status, nil
+	}
+
+	if backend == ipv6FirewallBackendNFT {
+		m.checkNFT(ctx, status)
+		return status, nil
+	}
+	m.checkIP6Tables(ctx, status)
+	return status, nil
+}
+
+// resolveBackend determines which firewall backend to read. In auto mode it
+// prefers nft when present and falls back to ip6tables. A forced backend is
+// returned even if its binary is missing so the missing-binary path can report
+// it explicitly.
+func (m *IPv6FirewallMonitor) resolveBackend() string {
+	switch m.config.Backend {
+	case ipv6FirewallBackendNFT:
+		return ipv6FirewallBackendNFT
+	case ipv6FirewallBackendIP6Tables:
+		return ipv6FirewallBackendIP6Tables
+	default: // auto
+		if _, err := m.executor.LookPath(nftBinary); err == nil {
+			return ipv6FirewallBackendNFT
+		}
+		if _, err := m.executor.LookPath(ip6tablesBinary); err == nil {
+			return ipv6FirewallBackendIP6Tables
+		}
+		return ""
+	}
+}
+
+// checkNFT reads the nft ruleset (`nft list ruleset`) and evaluates it.
+func (m *IPv6FirewallMonitor) checkNFT(ctx context.Context, status *types.Status) {
+	if _, err := m.executor.LookPath(nftBinary); err != nil {
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6FirewallToolNotFound",
+			fmt.Sprintf("%q not found in PATH; cannot assess the IPv6 firewall via nft. "+
+				"This may be expected on a node without nftables.", nftBinary),
+		))
+		m.recordBlackholeAbsent(status, "IPv6FirewallToolUnavailable",
+			"nft is not available; cannot confirm an IPv6 firewall black-hole")
+		return
+	}
+
+	// Read-only: `nft list ruleset` only lists the current ruleset.
+	out, err := m.executor.Run(ctx, nftBinary, "list", "ruleset")
+	if err != nil {
+		m.recordReadError(status, nftBinary, err)
+		return
+	}
+
+	blackholed, chains := evaluateNFTRuleset(string(out))
+	m.recordBlackholeFinding(status, ipv6FirewallBackendNFT, blackholed, chains)
+}
+
+// checkIP6Tables reads the ip6tables filter table (`ip6tables -S`) and
+// evaluates it.
+func (m *IPv6FirewallMonitor) checkIP6Tables(ctx context.Context, status *types.Status) {
+	if _, err := m.executor.LookPath(ip6tablesBinary); err != nil {
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6FirewallToolNotFound",
+			fmt.Sprintf("%q not found in PATH; cannot assess the IPv6 firewall via ip6tables. "+
+				"This may be expected on a node without ip6tables.", ip6tablesBinary),
+		))
+		m.recordBlackholeAbsent(status, "IPv6FirewallToolUnavailable",
+			"ip6tables is not available; cannot confirm an IPv6 firewall black-hole")
+		return
+	}
+
+	// Read-only: `ip6tables -S` only prints (saves) the current rules.
+	out, err := m.executor.Run(ctx, ip6tablesBinary, "-S")
+	if err != nil {
+		m.recordReadError(status, ip6tablesBinary, err)
+		return
+	}
+
+	blackholed, chains := evaluateIP6TablesRuleset(string(out))
+	m.recordBlackholeFinding(status, ipv6FirewallBackendIP6Tables, blackholed, chains)
+}
+
+// recordReadError records a warning + False condition when the ruleset command
+// fails (e.g. permission denied). We cannot confirm a problem, so the condition
+// is reported False.
+func (m *IPv6FirewallMonitor) recordReadError(status *types.Status, tool string, err error) {
+	status.AddEvent(types.NewEvent(
+		types.EventWarning,
+		"IPv6FirewallReadError",
+		fmt.Sprintf("Failed to read the IPv6 firewall ruleset via %q: %v. "+
+			"This may indicate missing privileges (CAP_NET_ADMIN). "+
+			"This monitor is detection-only and does not modify rules.", tool, err),
+	))
+	m.recordBlackholeAbsent(status, "IPv6FirewallRulesetUnreadable",
+		fmt.Sprintf("IPv6 firewall ruleset could not be read via %q; cannot confirm a black-hole", tool))
+}
+
+// recordBlackholeFinding records the black-hole condition based on the
+// evaluation result and the ExpectIPv6Enabled gate.
+func (m *IPv6FirewallMonitor) recordBlackholeFinding(status *types.Status, backend string, blackholed bool, droppedChains []string) {
+	if !blackholed {
+		status.AddCondition(types.NewCondition(
+			conditionIPv6FirewallBlackhole,
+			types.ConditionFalse,
+			"IPv6FirewallHealthy",
+			fmt.Sprintf("IPv6 firewall (%s backend) is not black-holing traffic", backend),
+		))
+		status.AddEvent(types.NewEvent(
+			types.EventInfo,
+			"IPv6FirewallHealthy",
+			fmt.Sprintf("IPv6 firewall (%s backend) sanity check passed", backend),
+		))
+		return
+	}
+
+	finding := fmt.Sprintf("default policy DROP/REJECT with no ACCEPT rules on chains %s",
+		strings.Join(droppedChains, ", "))
+
+	if m.config.ExpectIPv6Enabled {
+		status.AddCondition(types.NewCondition(
+			conditionIPv6FirewallBlackhole,
+			types.ConditionTrue,
+			"IPv6FirewallBlackhole",
+			fmt.Sprintf("IPv6 firewall (%s backend) appears to black-hole IPv6 traffic: %s", backend, finding),
+		))
+		status.AddEvent(types.NewEvent(
+			types.EventWarning,
+			"IPv6FirewallBlackhole",
+			fmt.Sprintf("IPv6 firewall (%s backend) appears to black-hole IPv6 traffic: %s. "+
+				"If this cluster expects IPv6 connectivity, IPv6 pod networking may be broken. "+
+				"This monitor is detection-only and does not modify firewall rules.", backend, finding),
+		))
+		return
+	}
+
+	status.AddCondition(types.NewCondition(
+		conditionIPv6FirewallBlackhole,
+		types.ConditionFalse,
+		"IPv6FirewallBlackholeNotExpected",
+		fmt.Sprintf("IPv6 firewall (%s backend) black-holes IPv6 traffic (%s); expectIPv6Enabled=false so no action required",
+			backend, finding),
+	))
+	status.AddEvent(types.NewEvent(
+		types.EventInfo,
+		"IPv6FirewallBlackholeNotExpected",
+		fmt.Sprintf("IPv6 firewall (%s backend) black-holes IPv6 traffic (%s); expectIPv6Enabled=false so no action required",
+			backend, finding),
+	))
+}
+
+// recordBlackholeAbsent records the black-hole condition as False with the
+// supplied reason/message. Used when the monitor cannot confirm a problem
+// (tool missing, ruleset unreadable).
+func (m *IPv6FirewallMonitor) recordBlackholeAbsent(status *types.Status, reason, message string) {
+	status.AddCondition(types.NewCondition(
+		conditionIPv6FirewallBlackhole,
+		types.ConditionFalse,
+		reason,
+		message,
+	))
+}
+
+// evaluateNFTRuleset applies the black-hole heuristic to `nft list ruleset`
+// output. It returns true (with the offending chain names) only when every
+// inet/ip6 base chain of type filter with hook input/forward/output has a
+// "policy drop" (or reject) and the ruleset contains no "accept" verdict.
+//
+// The heuristic is conservative: presence of any accept rule anywhere clears
+// the finding, and chains are matched on hook name so this works for the common
+// `table inet filter` and `table ip6 filter` layouts.
+func evaluateNFTRuleset(ruleset string) (blackholed bool, droppedChains []string) {
+	hooksSeen := map[string]bool{}
+	hooksDropped := map[string]bool{}
+	hasAccept := false
+
+	var (
+		inChain      bool
+		chainHook    string
+		chainDropped bool
+	)
+
+	flush := func() {
+		if inChain && chainHook != "" {
+			hooksSeen[chainHook] = true
+			if chainDropped {
+				hooksDropped[chainHook] = true
+			}
+		}
+		inChain = false
+		chainHook = ""
+		chainDropped = false
+	}
+
+	for _, raw := range strings.Split(ruleset, "\n") {
+		line := strings.TrimSpace(raw)
+		if line == "" {
+			continue
+		}
+
+		// A new chain block begins with "chain <name> {".
+		if strings.HasPrefix(line, "chain ") && strings.HasSuffix(line, "{") {
+			flush()
+			inChain = true
+			continue
+		}
+		if line == "}" {
+			flush()
+			continue
+		}
+
+		// Any accept verdict (rule or policy) clears the black-hole finding.
+		if strings.Contains(line, "accept") {
+			hasAccept = true
+		}
+
+		if !inChain {
+			continue
+		}
+
+		// Base chain declaration: "type filter hook input priority 0; policy drop;"
+		if strings.Contains(line, "hook ") {
+			for _, hook := range []string{"input", "forward", "output"} {
+				if strings.Contains(line, "hook "+hook) {
+					chainHook = hook
+				}
+			}
+		}
+		if strings.Contains(line, "policy drop") || strings.Contains(line, "policy reject") {
+			chainDropped = true
+		}
+	}
+	flush()
+
+	return blackholeFromChainMap(hooksSeen, hooksDropped, hasAccept, []string{"input", "forward", "output"})
+}
+
+// evaluateIP6TablesRuleset applies the black-hole heuristic to `ip6tables -S`
+// output (the filter table). It returns true only when the default policy for
+// INPUT, FORWARD and OUTPUT is all DROP/REJECT and no "-A <chain> ... -j ACCEPT"
+// rule exists.
+func evaluateIP6TablesRuleset(ruleset string) (blackholed bool, droppedChains []string) {
+	policy := map[string]string{}
+	hasAccept := false
+
+	for _, raw := range strings.Split(ruleset, "\n") {
+		line := strings.TrimSpace(raw)
+		if line == "" {
+			continue
+		}
+
+		fields := strings.Fields(line)
+
+		// Policy line: "-P INPUT DROP"
+		if len(fields) >= 3 && fields[0] == "-P" {
+			policy[fields[1]] = strings.ToUpper(fields[2])
+			continue
+		}
+
+		// Append rule: "-A INPUT ... -j ACCEPT"
+		if len(fields) >= 2 && fields[0] == "-A" {
+			if strings.Contains(line, "-j ACCEPT") || strings.Contains(line, "--jump ACCEPT") {
+				hasAccept = true
+			}
+		}
+	}
+
+	seen := map[string]bool{}
+	dropped := map[string]bool{}
+	for _, chain := range ipv6FilterChains {
+		if pol, ok := policy[chain]; ok {
+			seen[chain] = true
+			if pol == "DROP" || pol == "REJECT" {
+				dropped[chain] = true
+			}
+		}
+	}
+
+	return blackholeFromChainMap(seen, dropped, hasAccept, ipv6FilterChains)
+}
+
+// blackholeFromChainMap returns the conservative black-hole verdict: true only
+// when at least one of the target chains was observed, every observed target
+// chain has a DROP/REJECT policy, all target chains were observed, and the
+// ruleset contains no ACCEPT verdict. droppedChains lists the offending chains
+// in canonical order.
+func blackholeFromChainMap(seen, dropped map[string]bool, hasAccept bool, order []string) (bool, []string) {
+	if hasAccept {
+		return false, nil
+	}
+
+	var droppedChains []string
+	allSeenAndDropped := true
+	for _, chain := range order {
+		if !seen[chain] {
+			allSeenAndDropped = false
+			continue
+		}
+		if dropped[chain] {
+			droppedChains = append(droppedChains, chain)
+		} else {
+			allSeenAndDropped = false
+		}
+	}
+
+	// Require every target chain to be present and dropping; a partially
+	// observed ruleset is treated as inconclusive to avoid false positives.
+	if !allSeenAndDropped || len(droppedChains) != len(order) {
+		return false, nil
+	}
+
+	return true, droppedChains
+}
diff --git a/pkg/monitors/network/ipv6_firewall_test.go b/pkg/monitors/network/ipv6_firewall_test.go
new file mode 100644
index 0000000..f47fb1d
--- /dev/null
+++ b/pkg/monitors/network/ipv6_firewall_test.go
@@ -0,0 +1,507 @@
+package network
+
+import (
+	"context"
+	"errors"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+// fakeFirewallExecutor is a test double for CommandExecutor. It returns canned
+// LookPath results and command output so tests never exec real ip6tables/nft.
+type fakeFirewallExecutor struct {
+	// present maps binary name -> whether LookPath should succeed.
+	present map[string]bool
+	// output maps "name args..." -> canned combined output.
+	output map[string]string
+	// runErr maps "name args..." -> error to return from Run.
+	runErr map[string]error
+	// calls records every Run invocation as "name args...".
+	calls []string
+}
+
+func newFakeFirewallExecutor() *fakeFirewallExecutor {
+	return &fakeFirewallExecutor{
+		present: map[string]bool{},
+		output:  map[string]string{},
+		runErr:  map[string]error{},
+	}
+}
+
+func (f *fakeFirewallExecutor) LookPath(name string) (string, error) {
+	if f.present[name] {
+		return "/usr/sbin/" + name, nil
+	}
+	return "", errors.New("exec: \"" + name + "\": executable file not found in $PATH")
+}
+
+func (f *fakeFirewallExecutor) Run(ctx context.Context, name string, args ...string) ([]byte, error) {
+	key := name
+	if len(args) > 0 {
+		key = name + " " + strings.Join(args, " ")
+	}
+	f.calls = append(f.calls, key)
+	if err, ok := f.runErr[key]; ok {
+		return nil, err
+	}
+	return []byte(f.output[key]), nil
+}
+
+// newTestFirewallMonitor builds a monitor with the supplied config and fake
+// executor for direct check-function invocation.
+func newTestFirewallMonitor(t *testing.T, cfg *IPv6FirewallConfig, exec CommandExecutor) *IPv6FirewallMonitor {
+	t.Helper()
+	monitor, err := NewIPv6FirewallMonitor(context.Background(), types.MonitorConfig{
+		Name:     "test-ipv6-firewall",
+		Type:     "network-ipv6-firewall",
+		Interval: 60 * time.Second,
+		Timeout:  5 * time.Second,
+	})
+	if err != nil {
+		t.Fatalf("NewIPv6FirewallMonitor() unexpected error: %v", err)
+	}
+	m := monitor.(*IPv6FirewallMonitor)
+	if cfg != nil {
+		m.config = cfg
+	}
+	if exec != nil {
+		m.SetCommandExecutor(exec)
+	}
+	return m
+}
+
+// findFirewallCondition returns the black-hole condition or nil.
+func findFirewallCondition(status *types.Status) *types.Condition {
+	for i := range status.Conditions {
+		if status.Conditions[i].Type == conditionIPv6FirewallBlackhole {
+			return &status.Conditions[i]
+		}
+	}
+	return nil
+}
+
+func hasEventSeverity(status *types.Status, sev types.EventSeverity) bool {
+	for i := range status.Events {
+		if status.Events[i].Severity == sev {
+			return true
+		}
+	}
+	return false
+}
+
+func TestParseIPv6FirewallConfigDefaults(t *testing.T) {
+	cfg, err := parseIPv6FirewallConfig(nil)
+	if err != nil {
+		t.Fatalf("parseIPv6FirewallConfig(nil) error: %v", err)
+	}
+	if !cfg.ExpectIPv6Enabled {
+		t.Errorf("ExpectIPv6Enabled default = false, want true")
+	}
+	if cfg.Backend != ipv6FirewallBackendAuto {
+		t.Errorf("Backend default = %q, want %q", cfg.Backend, ipv6FirewallBackendAuto)
+	}
+}
+
+func TestParseIPv6FirewallConfigValues(t *testing.T) {
+	cfg, err := parseIPv6FirewallConfig(map[string]any{
+		"expectIPv6Enabled": false,
+		"backend":           "nft",
+	})
+	if err != nil {
+		t.Fatalf("parseIPv6FirewallConfig() error: %v", err)
+	}
+	if cfg.ExpectIPv6Enabled {
+		t.Errorf("ExpectIPv6Enabled = true, want false")
+	}
+	if cfg.Backend != ipv6FirewallBackendNFT {
+		t.Errorf("Backend = %q, want %q", cfg.Backend, ipv6FirewallBackendNFT)
+	}
+}
+
+func TestParseIPv6FirewallConfigInvalid(t *testing.T) {
+	tests := []struct {
+		name      string
+		configMap map[string]any
+	}{
+		{"invalid backend", map[string]any{"backend": "iptables"}},
+		{"backend wrong type", map[string]any{"backend": 5}},
+		{"expect wrong type", map[string]any{"expectIPv6Enabled": "yes"}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if _, err := parseIPv6FirewallConfig(tt.configMap); err == nil {
+				t.Errorf("parseIPv6FirewallConfig(%v) expected error, got nil", tt.configMap)
+			}
+		})
+	}
+}
+
+func TestValidateIPv6FirewallConfig(t *testing.T) {
+	if err := ValidateIPv6FirewallConfig(types.MonitorConfig{
+		Config: map[string]any{"backend": "ip6tables"},
+	}); err != nil {
+		t.Errorf("ValidateIPv6FirewallConfig() valid config error: %v", err)
+	}
+	if err := ValidateIPv6FirewallConfig(types.MonitorConfig{
+		Config: map[string]any{"backend": "bogus"},
+	}); err == nil {
+		t.Errorf("ValidateIPv6FirewallConfig() invalid backend expected error")
+	}
+}
+
+func TestNewIPv6FirewallMonitor(t *testing.T) {
+	monitor, err := NewIPv6FirewallMonitor(context.Background(), types.MonitorConfig{
+		Name:     "fw",
+		Type:     "network-ipv6-firewall",
+		Interval: 60 * time.Second,
+		Timeout:  5 * time.Second,
+	})
+	if err != nil {
+		t.Fatalf("NewIPv6FirewallMonitor() error: %v", err)
+	}
+	fw, ok := monitor.(*IPv6FirewallMonitor)
+	if !ok {
+		t.Fatalf("NewIPv6FirewallMonitor returned wrong type")
+	}
+	if fw.GetName() != "fw" {
+		t.Errorf("GetName() = %q, want %q", fw.GetName(), "fw")
+	}
+}
+
+func TestNewIPv6FirewallMonitorInvalidConfig(t *testing.T) {
+	_, err := NewIPv6FirewallMonitor(context.Background(), types.MonitorConfig{
+		Name:     "fw",
+		Type:     "network-ipv6-firewall",
+		Interval: 60 * time.Second,
+		Timeout:  5 * time.Second,
+		Config:   map[string]any{"backend": "nope"},
+	})
+	if err == nil {
+		t.Fatalf("NewIPv6FirewallMonitor() expected error for invalid backend")
+	}
+}
+
+const healthyIP6TablesOutput = `-P INPUT ACCEPT
+-P FORWARD ACCEPT
+-P OUTPUT ACCEPT
+-A INPUT -p ipv6-icmp -j ACCEPT`
+
+const blackholeIP6TablesOutput = `-P INPUT DROP
+-P FORWARD DROP
+-P OUTPUT DROP`
+
+const partialDropIP6TablesOutput = `-P INPUT DROP
+-P FORWARD ACCEPT
+-P OUTPUT DROP`
+
+const healthyNFTOutput = `table inet filter {
+	chain input {
+		type filter hook input priority 0; policy drop;
+		ct state established,related accept
+	}
+	chain forward {
+		type filter hook forward priority 0; policy drop;
+	}
+	chain output {
+		type filter hook output priority 0; policy accept;
+	}
+}`
+
+const blackholeNFTOutput = `table inet filter {
+	chain input {
+		type filter hook input priority 0; policy drop;
+	}
+	chain forward {
+		type filter hook forward priority 0; policy drop;
+	}
+	chain output {
+		type filter hook output priority 0; policy drop;
+	}
+}`
+
+func TestCheckIPv6FirewallHealthyIP6Tables(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[ip6tablesBinary] = true
+	exec.output["ip6tables -S"] = healthyIP6TablesOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendIP6Tables}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected condition False, got %+v", cond)
+	}
+	// Confirm only the read-only -S verb was issued.
+	for _, c := range exec.calls {
+		if !strings.HasPrefix(c, "ip6tables -S") {
+			t.Errorf("unexpected command issued: %q", c)
+		}
+	}
+}
+
+func TestCheckIPv6FirewallBlackholeIP6Tables(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[ip6tablesBinary] = true
+	exec.output["ip6tables -S"] = blackholeIP6TablesOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendIP6Tables}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionTrue {
+		t.Fatalf("expected condition True (blackhole), got %+v", cond)
+	}
+	if !hasEventReason(status, "IPv6FirewallBlackhole") {
+		t.Errorf("expected IPv6FirewallBlackhole event")
+	}
+	if !hasEventSeverity(status, types.EventWarning) {
+		t.Errorf("expected a warning event")
+	}
+}
+
+func TestCheckIPv6FirewallBlackholeSuppressedWhenNotExpected(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[ip6tablesBinary] = true
+	exec.output["ip6tables -S"] = blackholeIP6TablesOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: false, Backend: ipv6FirewallBackendIP6Tables}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected condition False (suppressed), got %+v", cond)
+	}
+	if hasEventSeverity(status, types.EventWarning) {
+		t.Errorf("expected no warning event when expectIPv6Enabled=false")
+	}
+	if !hasEventReason(status, "IPv6FirewallBlackholeNotExpected") {
+		t.Errorf("expected IPv6FirewallBlackholeNotExpected event")
+	}
+}
+
+func TestCheckIPv6FirewallPartialDropNotBlackhole(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[ip6tablesBinary] = true
+	exec.output["ip6tables -S"] = partialDropIP6TablesOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendIP6Tables}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected condition False (partial drop is not a blackhole), got %+v", cond)
+	}
+}
+
+func TestCheckIPv6FirewallHealthyNFT(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[nftBinary] = true
+	exec.output["nft list ruleset"] = healthyNFTOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendNFT}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected condition False, got %+v", cond)
+	}
+	for _, c := range exec.calls {
+		if c != "nft list ruleset" {
+			t.Errorf("unexpected command issued: %q", c)
+		}
+	}
+}
+
+func TestCheckIPv6FirewallBlackholeNFT(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[nftBinary] = true
+	exec.output["nft list ruleset"] = blackholeNFTOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendNFT}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionTrue {
+		t.Fatalf("expected condition True (nft blackhole), got %+v", cond)
+	}
+	if !hasEventReason(status, "IPv6FirewallBlackhole") {
+		t.Errorf("expected IPv6FirewallBlackhole event")
+	}
+}
+
+func TestCheckIPv6FirewallAutoPrefersNFT(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[nftBinary] = true
+	exec.present[ip6tablesBinary] = true
+	exec.output["nft list ruleset"] = healthyNFTOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendAuto}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	if findFirewallCondition(status) == nil {
+		t.Fatalf("expected a condition")
+	}
+	for _, c := range exec.calls {
+		if strings.HasPrefix(c, "ip6tables") {
+			t.Errorf("auto mode should prefer nft, but ip6tables was invoked: %q", c)
+		}
+	}
+}
+
+func TestCheckIPv6FirewallAutoFallsBackToIP6Tables(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[ip6tablesBinary] = true // nft absent
+	exec.output["ip6tables -S"] = healthyIP6TablesOutput
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendAuto}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	if findFirewallCondition(status) == nil {
+		t.Fatalf("expected a condition")
+	}
+	ranIP6Tables := false
+	for _, c := range exec.calls {
+		if strings.HasPrefix(c, "ip6tables -S") {
+			ranIP6Tables = true
+		}
+	}
+	if !ranIP6Tables {
+		t.Errorf("auto mode should fall back to ip6tables -S; calls=%v", exec.calls)
+	}
+}
+
+func TestCheckIPv6FirewallToolNotFound(t *testing.T) {
+	exec := newFakeFirewallExecutor() // nothing present
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendAuto}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() should not hard error when tools absent: %v", err)
+	}
+	if !hasEventReason(status, "IPv6FirewallToolNotFound") {
+		t.Errorf("expected IPv6FirewallToolNotFound event")
+	}
+	if !hasEventSeverity(status, types.EventWarning) {
+		t.Errorf("expected warning severity for missing tool")
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected condition False when tools absent, got %+v", cond)
+	}
+}
+
+func TestCheckIPv6FirewallForcedBackendMissingBinary(t *testing.T) {
+	exec := newFakeFirewallExecutor() // nft forced but absent
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendNFT}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() error: %v", err)
+	}
+	if !hasEventReason(status, "IPv6FirewallToolNotFound") {
+		t.Errorf("expected IPv6FirewallToolNotFound event for forced-but-missing nft")
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected condition False, got %+v", cond)
+	}
+}
+
+func TestCheckIPv6FirewallPermissionDenied(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[ip6tablesBinary] = true
+	exec.runErr["ip6tables -S"] = errors.New("Permission denied (you must be root)")
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendIP6Tables}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() should not hard error on read failure: %v", err)
+	}
+	if !hasEventReason(status, "IPv6FirewallReadError") {
+		t.Errorf("expected IPv6FirewallReadError event")
+	}
+	if !hasEventSeverity(status, types.EventWarning) {
+		t.Errorf("expected warning severity for read error")
+	}
+	cond := findFirewallCondition(status)
+	if cond == nil || cond.Status != types.ConditionFalse {
+		t.Fatalf("expected condition False on read error, got %+v", cond)
+	}
+}
+
+func TestCheckIPv6FirewallNFTReadError(t *testing.T) {
+	exec := newFakeFirewallExecutor()
+	exec.present[nftBinary] = true
+	exec.runErr["nft list ruleset"] = errors.New("Operation not permitted")
+
+	m := newTestFirewallMonitor(t, &IPv6FirewallConfig{ExpectIPv6Enabled: true, Backend: ipv6FirewallBackendNFT}, exec)
+	status, err := m.checkIPv6Firewall(context.Background())
+	if err != nil {
+		t.Fatalf("checkIPv6Firewall() should not hard error: %v", err)
+	}
+	if !hasEventReason(status, "IPv6FirewallReadError") {
+		t.Errorf("expected IPv6FirewallReadError event")
+	}
+}
+
+func TestEvaluateIP6TablesRuleset(t *testing.T) {
+	tests := []struct {
+		name       string
+		ruleset    string
+		blackholed bool
+	}{
+		{"all drop no accept", blackholeIP6TablesOutput, true},
+		{"has accept rule", healthyIP6TablesOutput, false},
+		{"partial drop", partialDropIP6TablesOutput, false},
+		{"empty ruleset", "", false},
+		{"reject policy", "-P INPUT REJECT\n-P FORWARD REJECT\n-P OUTPUT REJECT", true},
+		{"only input observed", "-P INPUT DROP", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, _ := evaluateIP6TablesRuleset(tt.ruleset)
+			if got != tt.blackholed {
+				t.Errorf("evaluateIP6TablesRuleset() = %v, want %v", got, tt.blackholed)
+			}
+		})
+	}
+}
+
+func TestEvaluateNFTRuleset(t *testing.T) {
+	tests := []struct {
+		name       string
+		ruleset    string
+		blackholed bool
+	}{
+		{"all drop no accept", blackholeNFTOutput, true},
+		{"has accept somewhere", healthyNFTOutput, false},
+		{"empty", "", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, _ := evaluateNFTRuleset(tt.ruleset)
+			if got != tt.blackholed {
+				t.Errorf("evaluateNFTRuleset() = %v, want %v", got, tt.blackholed)
+			}
+		})
+	}
+}

From b8d6a793c8430137d66814e556d0053bdc36b231 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:05:27 -0500
Subject: [PATCH 06/38] test(cmd): verify new IPv6 monitors register +
 auto-enable (Task #17209)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The four IPv6 monitors (network-ipv6-{sysctl,route,neighbor,firewall})
self-register via the blank import of pkg/monitors/network and are
auto-enabled by ApplyDefaultMonitors since each provides a DefaultConfig
— no per-monitor wiring needed in cmd. This test pins that contract
(registration + default-application) so a future regression fails loudly.
Shipped-config additions are owned by #17228.
---
 .../main_ipv6_registration_test.go            | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 cmd/node-doctor/main_ipv6_registration_test.go

diff --git a/cmd/node-doctor/main_ipv6_registration_test.go b/cmd/node-doctor/main_ipv6_registration_test.go
new file mode 100644
index 0000000..b3670b5
--- /dev/null
+++ b/cmd/node-doctor/main_ipv6_registration_test.go
@@ -0,0 +1,73 @@
+package main
+
+import (
+	"testing"
+
+	"github.com/supporttools/node-doctor/pkg/monitors"
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+// newIPv6MonitorTypes are the IPv6/dual-stack monitors added under feature 1125.
+// They self-register via their package init() (reached through the blank import
+// of pkg/monitors/network in main.go) and are auto-enabled at startup by
+// monitors.ApplyDefaultMonitors because each provides a DefaultConfig.
+//
+// This test is the deliverable for task #17209 "Register new IPv6 monitors in
+// cmd/node-doctor": it pins the contract that these types are reachable from the
+// command binary and applied by default, so a future change to the blank import
+// or a monitor's init()/DefaultConfig fails loudly here.
+var newIPv6MonitorTypes = []string{
+	"network-ipv6-sysctl",
+	"network-ipv6-route",
+	"network-ipv6-neighbor",
+	"network-ipv6-firewall",
+}
+
+func TestIPv6Monitors_RegisteredInCommand(t *testing.T) {
+	for _, monitorType := range newIPv6MonitorTypes {
+		t.Run(monitorType, func(t *testing.T) {
+			if !monitors.IsRegistered(monitorType) {
+				t.Fatalf("monitor type %q is not registered; check the blank import of pkg/monitors/network in main.go and the monitor's init()", monitorType)
+			}
+
+			info := monitors.GetMonitorInfo(monitorType)
+			if info == nil {
+				t.Fatalf("GetMonitorInfo(%q) returned nil despite IsRegistered=true", monitorType)
+			}
+			if info.Factory == nil {
+				t.Errorf("monitor %q has a nil Factory", monitorType)
+			}
+			if info.DefaultConfig == nil {
+				t.Errorf("monitor %q has a nil DefaultConfig and so will NOT be auto-enabled by ApplyDefaultMonitors", monitorType)
+			}
+		})
+	}
+}
+
+func TestIPv6Monitors_AutoAppliedAsDefaults(t *testing.T) {
+	// Start from an empty configuration: ApplyDefaultMonitors should inject a
+	// default config for every registered monitor type that has a DefaultConfig,
+	// including the four new IPv6 monitors.
+	cfg := &types.NodeDoctorConfig{}
+
+	added := monitors.ApplyDefaultMonitors(cfg)
+
+	addedSet := make(map[string]bool, len(added))
+	for _, monitorType := range added {
+		addedSet[monitorType] = true
+	}
+
+	configuredSet := make(map[string]bool, len(cfg.Monitors))
+	for _, m := range cfg.Monitors {
+		configuredSet[m.Type] = true
+	}
+
+	for _, monitorType := range newIPv6MonitorTypes {
+		if !addedSet[monitorType] {
+			t.Errorf("ApplyDefaultMonitors did not add %q to a fresh config", monitorType)
+		}
+		if !configuredSet[monitorType] {
+			t.Errorf("after ApplyDefaultMonitors, config.Monitors is missing %q", monitorType)
+		}
+	}
+}

From 6074ae72f360c010a0c59fb534af650c913fdd7a Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:15:49 -0500
Subject: [PATCH 07/38] feat(net): dual-stack (::) default bind for exporter +
 health server (Task #17203)

Bind Prometheus exporter and health server to :: by default (accepts
IPv4+IPv6 when bindv6only=0), with graceful fallback to 0.0.0.0 when the
IPv6/dual-stack bind fails (nodes may have IPv6 disabled). Exporter bind
address is now configurable via prometheus.bindAddress (default ::);
replaced hardcoded 0.0.0.0 sprintf with net.JoinHostPort. Extracted
listenWithFallback/isDualStackHost helpers (unit-tested). Server inherits
caller ctx via BaseContext. cmd/main.go health server default -> ::.
---
 cmd/node-doctor/main.go                   |   6 +-
 pkg/exporters/prometheus/exporter.go      |  20 +++-
 pkg/exporters/prometheus/exporter_test.go | 130 ++++++++++++++++++++++
 pkg/exporters/prometheus/server.go        |  49 +++++++-
 pkg/health/server.go                      |  50 ++++++++-
 pkg/health/server_test.go                 | 122 ++++++++++++++++++++
 pkg/types/config.go                       |  44 +++++---
 pkg/types/config_test.go                  |  33 ++++++
 8 files changed, 420 insertions(+), 34 deletions(-)

diff --git a/cmd/node-doctor/main.go b/cmd/node-doctor/main.go
index 12106b4..d612a3d 100644
--- a/cmd/node-doctor/main.go
+++ b/cmd/node-doctor/main.go
@@ -375,8 +375,10 @@ func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remedi
 	// Create Health Server (always enabled for Kubernetes probes)
 	log.Printf("[INFO] Creating health server...")
 	healthServer, err := health.NewServer(&health.Config{
-		Enabled:      true,
-		BindAddress:  "0.0.0.0",
+		Enabled: true,
+		// "::" binds dual-stack (IPv4 + IPv6) with graceful fallback to
+		// "0.0.0.0" when IPv6 is disabled on the node (handled in Start()).
+		BindAddress:  "::",
 		Port:         8080,
 		ReadTimeout:  5 * time.Second,
 		WriteTimeout: 10 * time.Second,
diff --git a/pkg/exporters/prometheus/exporter.go b/pkg/exporters/prometheus/exporter.go
index a0cb31b..8cd47db 100644
--- a/pkg/exporters/prometheus/exporter.go
+++ b/pkg/exporters/prometheus/exporter.go
@@ -53,6 +53,9 @@ func NewPrometheusExporter(config *types.PrometheusExporterConfig, settings *typ
 	if config.Port == 0 {
 		config.Port = 9100
 	}
+	if config.BindAddress == "" {
+		config.BindAddress = types.DefaultHTTPBindAddress
+	}
 	if config.Path == "" {
 		config.Path = "/metrics"
 	}
@@ -110,9 +113,9 @@ func (e *PrometheusExporter) Start(ctx context.Context) error {
 	// Initialize static metrics
 	e.initializeStaticMetrics()
 
-	// Start HTTP server
-	addr := fmt.Sprintf("0.0.0.0:%d", e.config.Port)
-	server, err := startHTTPServer(ctx, addr, e.config.Path, e.registry)
+	// Start HTTP server. Binds to the configured BindAddress ("::" by default
+	// for dual-stack), with graceful IPv4 fallback handled by startHTTPServer.
+	server, err := startHTTPServer(ctx, e.config.BindAddress, e.config.Port, e.config.Path, e.registry)
 	if err != nil {
 		return fmt.Errorf("failed to start HTTP server: %w", err)
 	}
@@ -406,6 +409,9 @@ func (e *PrometheusExporter) Reload(config interface{}) error {
 	if prometheusConfig.Port == 0 {
 		prometheusConfig.Port = 9100
 	}
+	if prometheusConfig.BindAddress == "" {
+		prometheusConfig.BindAddress = types.DefaultHTTPBindAddress
+	}
 	if prometheusConfig.Path == "" {
 		prometheusConfig.Path = "/metrics"
 	}
@@ -462,8 +468,7 @@ func (e *PrometheusExporter) Reload(config interface{}) error {
 			ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 			defer cancel()
 
-			addr := fmt.Sprintf("0.0.0.0:%d", prometheusConfig.Port)
-			server, err := startHTTPServer(ctx, addr, prometheusConfig.Path, e.registry)
+			server, err := startHTTPServer(ctx, prometheusConfig.BindAddress, prometheusConfig.Port, prometheusConfig.Path, e.registry)
 			if err != nil {
 				return fmt.Errorf("failed to start new HTTP server: %w", err)
 			}
@@ -503,6 +508,11 @@ func (e *PrometheusExporter) needsServerRestart(oldConfig, newConfig *types.Prom
 		return true
 	}
 
+	// Check if bind address changed
+	if oldConfig.BindAddress != newConfig.BindAddress {
+		return true
+	}
+
 	// Check if path changed
 	if oldConfig.Path != newConfig.Path {
 		return true
diff --git a/pkg/exporters/prometheus/exporter_test.go b/pkg/exporters/prometheus/exporter_test.go
index 3c8d556..4f4d814 100644
--- a/pkg/exporters/prometheus/exporter_test.go
+++ b/pkg/exporters/prometheus/exporter_test.go
@@ -1180,3 +1180,133 @@ func TestPrometheusExporter_StartBindFailure(t *testing.T) {
 		t.Error("exporter.started should be false after a bind failure")
 	}
 }
+
+// TestNewPrometheusExporter_DualStackDefault verifies an empty BindAddress
+// defaults to "::" (dual-stack) in the constructor.
+func TestNewPrometheusExporter_DualStackDefault(t *testing.T) {
+	config := &types.PrometheusExporterConfig{
+		Enabled:   true,
+		Port:      freePort(t),
+		Namespace: "test",
+	}
+	settings := &types.GlobalSettings{NodeName: "test-node"}
+
+	exporter, err := NewPrometheusExporter(config, settings)
+	if err != nil {
+		t.Fatalf("failed to create exporter: %v", err)
+	}
+	if exporter.config.BindAddress != "::" {
+		t.Errorf("default BindAddress = %q, want %q", exporter.config.BindAddress, "::")
+	}
+}
+
+// TestPrometheusExporter_DualStackServesRequest verifies the exporter binds with
+// the default "::" (dual-stack) BindAddress and serves /metrics. The bind has an
+// automatic IPv4 fallback, so this passes whether or not IPv6 is available.
+func TestPrometheusExporter_DualStackServesRequest(t *testing.T) {
+	port := freePort(t)
+	config := &types.PrometheusExporterConfig{
+		Enabled:   true,
+		Port:      port,
+		Path:      "/metrics",
+		Namespace: "test",
+		// BindAddress intentionally left empty -> defaults to "::".
+	}
+	settings := &types.GlobalSettings{NodeName: "test-node"}
+
+	exporter, err := NewPrometheusExporter(config, settings)
+	if err != nil {
+		t.Fatalf("failed to create exporter: %v", err)
+	}
+	if err := exporter.Start(context.Background()); err != nil {
+		t.Fatalf("failed to start exporter: %v", err)
+	}
+	defer func() { _ = exporter.Stop() }()
+
+	addr := fmt.Sprintf("localhost:%d", port)
+	if err := waitForServerReady(addr, 5*time.Second); err != nil {
+		t.Fatalf("server never became ready: %v", err)
+	}
+	resp, err := newTestHTTPClient().Get(fmt.Sprintf("http://localhost:%d%s", port, config.Path))
+	if err != nil {
+		t.Fatalf("failed to connect to metrics server: %v", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("expected status 200, got %d", resp.StatusCode)
+	}
+}
+
+// TestPrometheusExporter_ExplicitBindAddressHonored verifies an explicit
+// BindAddress is used as-is and serves a request.
+func TestPrometheusExporter_ExplicitBindAddressHonored(t *testing.T) {
+	port := freePort(t)
+	config := &types.PrometheusExporterConfig{
+		Enabled:     true,
+		BindAddress: "127.0.0.1",
+		Port:        port,
+		Path:        "/metrics",
+		Namespace:   "test",
+	}
+	settings := &types.GlobalSettings{NodeName: "test-node"}
+
+	exporter, err := NewPrometheusExporter(config, settings)
+	if err != nil {
+		t.Fatalf("failed to create exporter: %v", err)
+	}
+	if err := exporter.Start(context.Background()); err != nil {
+		t.Fatalf("failed to start exporter: %v", err)
+	}
+	defer func() { _ = exporter.Stop() }()
+
+	host, _, err := net.SplitHostPort(exporter.server.Addr)
+	if err != nil {
+		t.Fatalf("SplitHostPort(%q) error = %v", exporter.server.Addr, err)
+	}
+	if host != "127.0.0.1" {
+		t.Errorf("bound host = %q, want 127.0.0.1", host)
+	}
+
+	addr := fmt.Sprintf("localhost:%d", port)
+	if err := waitForServerReady(addr, 5*time.Second); err != nil {
+		t.Fatalf("server never became ready: %v", err)
+	}
+	resp, err := newTestHTTPClient().Get(fmt.Sprintf("http://localhost:%d%s", port, config.Path))
+	if err != nil {
+		t.Fatalf("failed to connect to metrics server: %v", err)
+	}
+	resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("expected status 200, got %d", resp.StatusCode)
+	}
+}
+
+func TestIsDualStackHost(t *testing.T) {
+	tests := []struct {
+		host string
+		want bool
+	}{
+		{"", true},
+		{"::", true},
+		{"::1", true},
+		{"fe80::1", true},
+		{"0.0.0.0", false},
+		{"127.0.0.1", false},
+	}
+	for _, tt := range tests {
+		if got := isDualStackHost(tt.host); got != tt.want {
+			t.Errorf("isDualStackHost(%q) = %v, want %v", tt.host, got, tt.want)
+		}
+	}
+}
+
+func TestListenWithFallback_Success(t *testing.T) {
+	ln, err := listenWithFallback("127.0.0.1", 0)
+	if err != nil {
+		t.Fatalf("listenWithFallback() error = %v", err)
+	}
+	defer ln.Close()
+	if ln.Addr() == nil {
+		t.Fatal("listenWithFallback() returned nil Addr")
+	}
+}
diff --git a/pkg/exporters/prometheus/server.go b/pkg/exporters/prometheus/server.go
index 5e0557f..75b1a33 100644
--- a/pkg/exporters/prometheus/server.go
+++ b/pkg/exporters/prometheus/server.go
@@ -6,14 +6,50 @@ import (
 	"log"
 	"net"
 	"net/http"
+	"strconv"
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 )
 
-// startHTTPServer starts an HTTP server to serve the /metrics endpoint
-func startHTTPServer(ctx context.Context, addr, path string, registry *prometheus.Registry) (*http.Server, error) {
+// isDualStackHost reports whether host represents a dual-stack/IPv6 wildcard
+// bind that may fail on nodes where IPv6 is disabled. This covers the empty
+// host, the IPv6 unspecified address "::", and any other IPv6 literal.
+// For these we attempt a graceful fallback to the IPv4 wildcard "0.0.0.0".
+func isDualStackHost(host string) bool {
+	if host == "" || host == "::" {
+		return true
+	}
+	ip := net.ParseIP(host)
+	return ip != nil && ip.To4() == nil
+}
+
+// listenWithFallback opens a TCP listener on host:port. When the host is a
+// dual-stack/IPv6 wildcard (e.g. "::" or empty) and the bind fails — typically
+// because IPv6 is disabled on the node (net.ipv6.bindv6only / missing kernel
+// module) — it logs a warning and retries on the IPv4 wildcard "0.0.0.0".
+func listenWithFallback(host string, port int) (net.Listener, error) {
+	addr := net.JoinHostPort(host, strconv.Itoa(port))
+	ln, err := net.Listen("tcp", addr)
+	if err == nil {
+		return ln, nil
+	}
+	if isDualStackHost(host) {
+		fallbackAddr := net.JoinHostPort("0.0.0.0", strconv.Itoa(port))
+		log.Printf("[WARN] failed to bind %s (%v); falling back to IPv4 %s", addr, err, fallbackAddr)
+		fln, ferr := net.Listen("tcp", fallbackAddr)
+		if ferr != nil {
+			return nil, fmt.Errorf("bind failed on %s (%v) and IPv4 fallback %s (%w)", addr, err, fallbackAddr, ferr)
+		}
+		return fln, nil
+	}
+	return nil, fmt.Errorf("failed to bind %s: %w", addr, err)
+}
+
+// startHTTPServer starts an HTTP server to serve the /metrics endpoint.
+// It binds host:port with a graceful IPv4 fallback for dual-stack hosts.
+func startHTTPServer(ctx context.Context, host string, port int, path string, registry *prometheus.Registry) (*http.Server, error) {
 	if registry == nil {
 		return nil, fmt.Errorf("registry cannot be nil")
 	}
@@ -37,18 +73,21 @@ func startHTTPServer(ctx context.Context, addr, path string, registry *prometheu
 	})
 
 	// Eagerly bind the listener so bind failures propagate synchronously.
-	ln, err := net.Listen("tcp", addr)
+	// listenWithFallback handles IPv6/dual-stack -> IPv4 fallback.
+	ln, err := listenWithFallback(host, port)
 	if err != nil {
-		return nil, fmt.Errorf("prometheus server failed to bind %s: %w", addr, err)
+		return nil, fmt.Errorf("prometheus server failed to bind: %w", err)
 	}
 
-	// Create HTTP server
+	// Create HTTP server. BaseContext propagates the caller's context to every
+	// in-flight request so they observe shutdown/cancellation.
 	server := &http.Server{
 		Addr:         ln.Addr().String(),
 		Handler:      mux,
 		ReadTimeout:  30 * time.Second,
 		WriteTimeout: 30 * time.Second,
 		IdleTimeout:  60 * time.Second,
+		BaseContext:  func(net.Listener) context.Context { return ctx },
 	}
 
 	// Start server using the already-bound listener
diff --git a/pkg/health/server.go b/pkg/health/server.go
index 5300d6e..4d3eb9b 100644
--- a/pkg/health/server.go
+++ b/pkg/health/server.go
@@ -36,7 +36,9 @@ type Config struct {
 	// Enabled controls whether the health server is running
 	Enabled bool
 
-	// BindAddress is the address to bind to (default: 0.0.0.0)
+	// BindAddress is the address to bind to (default: "::" for dual-stack,
+	// which accepts both IPv4 and IPv6 on Linux when net.ipv6.bindv6only=0).
+	// Falls back to "0.0.0.0" when the IPv6/dual-stack bind fails.
 	BindAddress string
 
 	// Port is the port to listen on (default: 8080)
@@ -100,6 +102,39 @@ type StatusResponse struct {
 	Metadata      map[string]string `json:"metadata,omitempty"`
 }
 
+// isDualStackHost reports whether host is a dual-stack/IPv6 wildcard bind that
+// may fail on nodes where IPv6 is disabled. Covers the empty host, the IPv6
+// unspecified address "::", and any other IPv6 literal.
+func isDualStackHost(host string) bool {
+	if host == "" || host == "::" {
+		return true
+	}
+	ip := net.ParseIP(host)
+	return ip != nil && ip.To4() == nil
+}
+
+// listenWithFallback opens a TCP listener on host:port using net.JoinHostPort
+// for correct IPv6 bracketing. When the host is a dual-stack/IPv6 wildcard and
+// the bind fails (typically because IPv6 is disabled on the node), it logs a
+// warning and retries on the IPv4 wildcard "0.0.0.0".
+func listenWithFallback(host string, port int) (net.Listener, error) {
+	addr := net.JoinHostPort(host, strconv.Itoa(port))
+	ln, err := net.Listen("tcp", addr)
+	if err == nil {
+		return ln, nil
+	}
+	if isDualStackHost(host) {
+		fallbackAddr := net.JoinHostPort("0.0.0.0", strconv.Itoa(port))
+		log.Printf("[WARN] failed to bind %s (%v); falling back to IPv4 %s", addr, err, fallbackAddr)
+		fln, ferr := net.Listen("tcp", fallbackAddr)
+		if ferr != nil {
+			return nil, fmt.Errorf("bind failed on %s (%v) and IPv4 fallback %s (%w)", addr, err, fallbackAddr, ferr)
+		}
+		return fln, nil
+	}
+	return nil, fmt.Errorf("failed to bind %s: %w", addr, err)
+}
+
 // NewServer creates a new health server with the given configuration.
 func NewServer(config *Config) (*Server, error) {
 	if config == nil {
@@ -108,7 +143,10 @@ func NewServer(config *Config) (*Server, error) {
 
 	// Apply defaults
 	if config.BindAddress == "" {
-		config.BindAddress = "0.0.0.0"
+		// "::" binds dual-stack (both IPv4 and IPv6) on Linux when
+		// net.ipv6.bindv6only=0; Start() falls back to "0.0.0.0" if IPv6
+		// is disabled on the node.
+		config.BindAddress = "::"
 	}
 	// Port 0 is intentionally allowed — net.Listen("tcp", "host:0") lets the OS
 	// pick a free port atomically. Tests use Port: 0 to avoid TOCTOU port-grab races.
@@ -148,14 +186,14 @@ func (s *Server) Start(ctx context.Context) error {
 	mux.HandleFunc("/status", s.handleStatus)
 	mux.HandleFunc("/remediation/history", s.handleRemediationHistory)
 
-	addr := fmt.Sprintf("%s:%d", s.config.BindAddress, s.config.Port)
-
 	// Eagerly bind the listener so bind failures propagate synchronously.
 	// Using net.Listen + Serve instead of ListenAndServe avoids the race where
 	// a goroutine fails silently after Start() returns success.
-	ln, err := net.Listen("tcp", addr)
+	// listenWithFallback uses net.JoinHostPort (correct IPv6 bracketing) and
+	// retries on "0.0.0.0" when a dual-stack/IPv6 bind fails.
+	ln, err := listenWithFallback(s.config.BindAddress, s.config.Port)
 	if err != nil {
-		return fmt.Errorf("health server failed to bind %s: %w", addr, err)
+		return fmt.Errorf("health server failed to bind: %w", err)
 	}
 
 	s.httpServer = &http.Server{
diff --git a/pkg/health/server_test.go b/pkg/health/server_test.go
index b2592d1..64f8908 100644
--- a/pkg/health/server_test.go
+++ b/pkg/health/server_test.go
@@ -647,3 +647,125 @@ func TestServer_Stop_NoDeadlockWithInFlightHandler(t *testing.T) {
 		t.Fatal("Stop() did not return within 1 s after handler released — likely deadlock")
 	}
 }
+
+// TestNewServer_DualStackDefault verifies that an empty BindAddress defaults to
+// "::" (dual-stack), not the legacy "0.0.0.0".
+func TestNewServer_DualStackDefault(t *testing.T) {
+	server, err := NewServer(&Config{Enabled: true})
+	if err != nil {
+		t.Fatalf("NewServer() error = %v", err)
+	}
+	if server.config.BindAddress != "::" {
+		t.Errorf("default BindAddress = %q, want %q", server.config.BindAddress, "::")
+	}
+}
+
+// TestServer_StartDualStackServesRequest verifies the health server binds with
+// the default "::" (dual-stack) BindAddress and serves a request. The bind has
+// an automatic IPv4 fallback, so this passes whether or not IPv6 is available.
+func TestServer_StartDualStackServesRequest(t *testing.T) {
+	// Empty BindAddress -> defaults to "::"; Port 0 -> ephemeral port.
+	server, err := NewServer(&Config{Enabled: true, Port: 0})
+	if err != nil {
+		t.Fatalf("NewServer() error = %v", err)
+	}
+	if err := server.Start(context.Background()); err != nil {
+		t.Fatalf("Start() error = %v", err)
+	}
+	defer func() { _ = server.Stop() }()
+
+	// httpServer.Addr is the actual bound address (host:port). Dial it via the
+	// loopback to confirm the listener is serving requests.
+	_, port, err := net.SplitHostPort(server.httpServer.Addr)
+	if err != nil {
+		t.Fatalf("SplitHostPort(%q) error = %v", server.httpServer.Addr, err)
+	}
+	resp, err := http.Get("http://127.0.0.1:" + port + "/healthz") //nolint:noctx
+	if err != nil {
+		t.Fatalf("GET /healthz error = %v", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("GET /healthz status = %d, want %d", resp.StatusCode, http.StatusOK)
+	}
+}
+
+// TestServer_StartExplicitBindAddressHonored verifies an explicit BindAddress is
+// used as-is (no fallback) and serves a request.
+func TestServer_StartExplicitBindAddressHonored(t *testing.T) {
+	server, err := NewServer(&Config{Enabled: true, BindAddress: "127.0.0.1", Port: 0})
+	if err != nil {
+		t.Fatalf("NewServer() error = %v", err)
+	}
+	if err := server.Start(context.Background()); err != nil {
+		t.Fatalf("Start() error = %v", err)
+	}
+	defer func() { _ = server.Stop() }()
+
+	host, port, err := net.SplitHostPort(server.httpServer.Addr)
+	if err != nil {
+		t.Fatalf("SplitHostPort(%q) error = %v", server.httpServer.Addr, err)
+	}
+	if host != "127.0.0.1" {
+		t.Errorf("bound host = %q, want 127.0.0.1", host)
+	}
+	resp, err := http.Get("http://127.0.0.1:" + port + "/healthz") //nolint:noctx
+	if err != nil {
+		t.Fatalf("GET /healthz error = %v", err)
+	}
+	resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("GET /healthz status = %d, want %d", resp.StatusCode, http.StatusOK)
+	}
+}
+
+func TestIsDualStackHost(t *testing.T) {
+	tests := []struct {
+		host string
+		want bool
+	}{
+		{"", true},
+		{"::", true},
+		{"::1", true},
+		{"fe80::1", true},
+		{"0.0.0.0", false},
+		{"127.0.0.1", false},
+		{"192.168.1.1", false},
+	}
+	for _, tt := range tests {
+		if got := isDualStackHost(tt.host); got != tt.want {
+			t.Errorf("isDualStackHost(%q) = %v, want %v", tt.host, got, tt.want)
+		}
+	}
+}
+
+// TestListenWithFallback_Success confirms a bindable host returns a listener.
+func TestListenWithFallback_Success(t *testing.T) {
+	ln, err := listenWithFallback("127.0.0.1", 0)
+	if err != nil {
+		t.Fatalf("listenWithFallback() error = %v", err)
+	}
+	defer ln.Close()
+	if ln.Addr() == nil {
+		t.Fatal("listenWithFallback() returned nil Addr")
+	}
+}
+
+// TestListenWithFallback_NonDualStackNoFallback confirms that a bind failure on
+// a non-dual-stack host (e.g. 127.0.0.1) is returned as an error WITHOUT
+// retrying on 0.0.0.0 — the fallback only applies to dual-stack/IPv6 hosts.
+func TestListenWithFallback_NonDualStackNoFallback(t *testing.T) {
+	// Occupy a port on 127.0.0.1 so a second bind on the same host:port fails.
+	occupied, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("failed to grab a free port: %v", err)
+	}
+	defer occupied.Close()
+	port := occupied.Addr().(*net.TCPAddr).Port
+
+	ln, err := listenWithFallback("127.0.0.1", port)
+	if err == nil {
+		ln.Close()
+		t.Fatal("expected bind error for occupied 127.0.0.1 port, got nil")
+	}
+}
diff --git a/pkg/types/config.go b/pkg/types/config.go
index 5913dfb..e31394a 100644
--- a/pkg/types/config.go
+++ b/pkg/types/config.go
@@ -11,16 +11,19 @@ import (
 
 // Package-level defaults
 const (
-	DefaultLogLevel                 = "info"
-	DefaultLogFormat                = "json"
-	DefaultLogOutput                = "stdout"
-	DefaultUpdateInterval           = "10s"
-	DefaultResyncInterval           = "60s"
-	DefaultHeartbeatInterval        = "5m"
-	DefaultQPS                      = 50
-	DefaultBurst                    = 100
-	DefaultHTTPPort                 = 8080
-	DefaultHTTPBindAddress          = "0.0.0.0"
+	DefaultLogLevel          = "info"
+	DefaultLogFormat         = "json"
+	DefaultLogOutput         = "stdout"
+	DefaultUpdateInterval    = "10s"
+	DefaultResyncInterval    = "60s"
+	DefaultHeartbeatInterval = "5m"
+	DefaultQPS               = 50
+	DefaultBurst             = 100
+	DefaultHTTPPort          = 8080
+	// DefaultHTTPBindAddress binds all interfaces dual-stack. On Linux with the
+	// default net.ipv6.bindv6only=0, binding to "::" accepts BOTH IPv4 and IPv6
+	// connections. Callers fall back to "0.0.0.0" when IPv6 is unavailable.
+	DefaultHTTPBindAddress          = "::"
 	DefaultPrometheusPort           = 9100
 	DefaultPrometheusPath           = "/metrics"
 	DefaultMonitorInterval          = "30s"
@@ -381,12 +384,17 @@ type RetryConfig struct {
 
 // PrometheusExporterConfig configures the Prometheus exporter.
 type PrometheusExporterConfig struct {
-	Enabled   bool              `json:"enabled" yaml:"enabled"`
-	Port      int               `json:"port,omitempty" yaml:"port,omitempty"`
-	Path      string            `json:"path,omitempty" yaml:"path,omitempty"`
-	Namespace string            `json:"namespace,omitempty" yaml:"namespace,omitempty"`
-	Subsystem string            `json:"subsystem,omitempty" yaml:"subsystem,omitempty"`
-	Labels    map[string]string `json:"labels,omitempty" yaml:"labels,omitempty"`
+	Enabled bool `json:"enabled" yaml:"enabled"`
+	Port    int  `json:"port,omitempty" yaml:"port,omitempty"`
+	// BindAddress is the address the metrics HTTP server binds to.
+	// Defaults to "::" (dual-stack: accepts both IPv4 and IPv6 on Linux when
+	// net.ipv6.bindv6only=0). The exporter falls back to "0.0.0.0" if the
+	// IPv6/dual-stack bind fails (e.g. IPv6 disabled on the node).
+	BindAddress string            `json:"bindAddress,omitempty" yaml:"bindAddress,omitempty"`
+	Path        string            `json:"path,omitempty" yaml:"path,omitempty"`
+	Namespace   string            `json:"namespace,omitempty" yaml:"namespace,omitempty"`
+	Subsystem   string            `json:"subsystem,omitempty" yaml:"subsystem,omitempty"`
+	Labels      map[string]string `json:"labels,omitempty" yaml:"labels,omitempty"`
 }
 
 // RemediationConfig contains global remediation settings.
@@ -928,6 +936,9 @@ func (p *PrometheusExporterConfig) ApplyDefaults() error {
 	if p.Port == 0 {
 		p.Port = DefaultPrometheusPort
 	}
+	if p.BindAddress == "" {
+		p.BindAddress = DefaultHTTPBindAddress
+	}
 	if p.Path == "" {
 		p.Path = DefaultPrometheusPath
 	}
@@ -1742,6 +1753,7 @@ func (w *WebhookEndpoint) SubstituteEnvVars() {
 
 // SubstituteEnvVars performs environment variable substitution on PrometheusExporterConfig.
 func (p *PrometheusExporterConfig) SubstituteEnvVars() {
+	p.BindAddress = os.ExpandEnv(p.BindAddress)
 	p.Namespace = os.ExpandEnv(p.Namespace)
 	p.Subsystem = os.ExpandEnv(p.Subsystem)
 
diff --git a/pkg/types/config_test.go b/pkg/types/config_test.go
index 558e601..5d1e47c 100644
--- a/pkg/types/config_test.go
+++ b/pkg/types/config_test.go
@@ -3101,3 +3101,36 @@ func TestValidateWithRegistry_DependsOnCrossValidation(t *testing.T) {
 		}
 	})
 }
+
+// TestPrometheusExporterConfigApplyDefaults verifies defaults, including the
+// dual-stack BindAddress ("::") applied when the field is empty.
+func TestPrometheusExporterConfigApplyDefaults(t *testing.T) {
+	t.Run("empty config gets dual-stack defaults", func(t *testing.T) {
+		p := &PrometheusExporterConfig{Enabled: true}
+		if err := p.ApplyDefaults(); err != nil {
+			t.Fatalf("ApplyDefaults() error = %v", err)
+		}
+		if p.BindAddress != "::" {
+			t.Errorf("BindAddress = %q, want %q", p.BindAddress, "::")
+		}
+		if p.BindAddress != DefaultHTTPBindAddress {
+			t.Errorf("BindAddress = %q, want DefaultHTTPBindAddress %q", p.BindAddress, DefaultHTTPBindAddress)
+		}
+		if p.Port != DefaultPrometheusPort {
+			t.Errorf("Port = %d, want %d", p.Port, DefaultPrometheusPort)
+		}
+		if p.Path != DefaultPrometheusPath {
+			t.Errorf("Path = %q, want %q", p.Path, DefaultPrometheusPath)
+		}
+	})
+
+	t.Run("explicit BindAddress is preserved", func(t *testing.T) {
+		p := &PrometheusExporterConfig{Enabled: true, BindAddress: "127.0.0.1"}
+		if err := p.ApplyDefaults(); err != nil {
+			t.Fatalf("ApplyDefaults() error = %v", err)
+		}
+		if p.BindAddress != "127.0.0.1" {
+			t.Errorf("BindAddress = %q, want 127.0.0.1 (should not be overridden)", p.BindAddress)
+		}
+	})
+}

From 6bea04566723473c46024e17bc359354d27ec911 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:20:16 -0500
Subject: [PATCH 08/38] feat(kubernetes): IPv6 loopback fallback for kubelet
 probes (Task #17204)

When a kubelet healthz/metrics probe to a loopback host fails with a
connection-level (dial) error, retry once against the opposite-family
loopback ([::1] <-> 127.0.0.1) preserving scheme/port/path/query/auth.
Only loopback hosts are rewritten; HTTP 4xx/5xx and deadline timeouts do
not trigger fallback. Shared seam doRequestWithLoopbackFallback covers
both healthz and metrics. Helpers unit-tested; ::1 bind test skips when
IPv6 loopback unavailable.
---
 pkg/monitors/kubernetes/kubelet.go            | 166 +++++++++-
 .../kubernetes/kubelet_loopback_test.go       | 294 ++++++++++++++++++
 2 files changed, 458 insertions(+), 2 deletions(-)
 create mode 100644 pkg/monitors/kubernetes/kubelet_loopback_test.go

diff --git a/pkg/monitors/kubernetes/kubelet.go b/pkg/monitors/kubernetes/kubelet.go
index 65f0642..9e2c07b 100644
--- a/pkg/monitors/kubernetes/kubelet.go
+++ b/pkg/monitors/kubernetes/kubelet.go
@@ -9,6 +9,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"log"
 	"net"
 	"net/http"
 	"net/url"
@@ -310,6 +311,167 @@ func (c *defaultKubeletClient) addAuthHeader(req *http.Request) error {
 	return nil
 }
 
+// isLoopbackHost reports whether host (which may be a bare host or include a
+// port, e.g. "127.0.0.1:10248" or "[::1]:10250") refers to a recognized
+// loopback address: the literal "localhost", any address in 127.0.0.0/8, or
+// the IPv6 loopback "::1". Non-loopback hosts (including hostnames that merely
+// resolve to a loopback at runtime) return false so they are never rewritten.
+func isLoopbackHost(host string) bool {
+	if host == "" {
+		return false
+	}
+
+	// Strip a port if present. SplitHostPort fails for bare hosts, in which
+	// case we fall back to the original value.
+	h := host
+	if hostOnly, _, err := net.SplitHostPort(host); err == nil {
+		h = hostOnly
+	}
+
+	// "localhost" is a recognized loopback name; resolution may yield either
+	// or both families, which is exactly the ambiguity this fallback handles.
+	if strings.EqualFold(h, "localhost") {
+		return true
+	}
+
+	if ip := net.ParseIP(h); ip != nil {
+		return ip.IsLoopback()
+	}
+
+	return false
+}
+
+// loopbackFallbackURL inspects rawURL and, when its host is a recognized
+// loopback (localhost, 127.0.0.0/8, or ::1), returns an equivalent URL whose
+// host has been rewritten to the IPv6 loopback "[::1]" (or, when the original
+// host was already the IPv6 loopback, to the IPv4 loopback "127.0.0.1").
+// Scheme, port, path, query, and userinfo are preserved. The boolean result is
+// true only when a rewrite was performed; for non-loopback hosts (or parse
+// failures) it returns ("", false) so callers never rewrite a non-loopback
+// host.
+func loopbackFallbackURL(rawURL string) (string, bool) {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return "", false
+	}
+
+	host := parsed.Hostname()
+	if host == "" || !isLoopbackHost(host) {
+		return "", false
+	}
+
+	// Determine the opposite-family loopback target.
+	var target string
+	if ip := net.ParseIP(host); ip != nil && ip.To4() == nil {
+		// Original host is the IPv6 loopback (::1) -> fall back to IPv4.
+		target = "127.0.0.1"
+	} else {
+		// Original host is localhost or an IPv4 loopback -> fall back to IPv6.
+		target = "::1"
+	}
+
+	// Preserve the port if one was specified. net.JoinHostPort correctly
+	// brackets IPv6 literals (e.g. "[::1]:10248").
+	if port := parsed.Port(); port != "" {
+		parsed.Host = net.JoinHostPort(target, port)
+	} else if target == "::1" {
+		parsed.Host = "[::1]"
+	} else {
+		parsed.Host = target
+	}
+
+	return parsed.String(), true
+}
+
+// isConnectionLevelError reports whether err represents a transport/dial-level
+// failure (connection refused, no route to host, dial failure, etc.) as
+// opposed to a successful HTTP response carrying an error status code. Only
+// connection-level failures should trigger the IPv6 loopback fallback: an HTTP
+// 4xx/5xx means kubelet answered and is therefore reachable on the probed
+// loopback.
+//
+// This is intentionally conservative. It treats *net.OpError (dial/read/write
+// transport failures, which wrap syscall errors like ECONNREFUSED and
+// EHOSTUNREACH) as connection-level. It explicitly does NOT treat context
+// cancellation or deadline-driven timeouts as connection-level, since those
+// usually indicate the request as a whole ran out of time rather than a
+// wrong-family loopback.
+func isConnectionLevelError(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	// Deadline/cancellation are not connection-level: a slow-but-reachable
+	// kubelet should not cause us to silently probe the other family.
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		return false
+	}
+
+	// A *net.OpError on the "dial" op is the canonical signal for
+	// connection-refused / no-route style failures. errors.As unwraps the
+	// *url.Error that http.Client.Do returns.
+	var opErr *net.OpError
+	if errors.As(err, &opErr) {
+		// A dial-phase failure is a connection-level error. Read/write phase
+		// OpErrors also indicate the transport could not complete, which is a
+		// reasonable signal to retry the alternate loopback.
+		return true
+	}
+
+	return false
+}
+
+// doRequestWithLoopbackFallback executes req against c.client. If the request
+// fails with a connection-level error (see isConnectionLevelError) and the
+// request targets a recognized loopback host, it rebuilds the request against
+// the opposite-family loopback and retries exactly once. It returns the
+// response from whichever attempt succeeded and a boolean indicating whether
+// the fallback path was taken.
+//
+// On a successful primary attempt no second request is made and usedFallback
+// is false. The label argument ("healthz" or "metrics") is used only for
+// logging.
+func (c *defaultKubeletClient) doRequestWithLoopbackFallback(req *http.Request, label string) (resp *http.Response, usedFallback bool, err error) {
+	resp, err = c.client.Do(req)
+	if err == nil {
+		return resp, false, nil
+	}
+
+	// Only fall back on connection-level (dial) failures, and only when the
+	// original host is a recognized loopback we are allowed to rewrite.
+	if !isConnectionLevelError(err) {
+		return nil, false, err
+	}
+
+	fallbackURL, ok := loopbackFallbackURL(req.URL.String())
+	if !ok {
+		return nil, false, err
+	}
+
+	primaryErr := err
+
+	// Rebuild the request against the alternate loopback, preserving method,
+	// context, and headers (including any auth header already applied).
+	fbReq, buildErr := http.NewRequestWithContext(req.Context(), req.Method, fallbackURL, nil)
+	if buildErr != nil {
+		// Could not build the fallback request; surface the original error.
+		return nil, false, primaryErr
+	}
+	fbReq.Header = req.Header.Clone()
+
+	log.Printf("[INFO] kubelet %s: loopback probe to %s failed (%v), retrying %s",
+		label, req.URL.Host, primaryErr, fbReq.URL.Host)
+
+	resp, err = c.client.Do(fbReq)
+	if err != nil {
+		// Both families failed. Return the fallback error so the message
+		// reflects the most recent (alternate-loopback) attempt.
+		return nil, true, err
+	}
+
+	return resp, true, nil
+}
+
 // CheckHealth performs a health check against the kubelet healthz endpoint.
 func (c *defaultKubeletClient) CheckHealth(ctx context.Context) error {
 	req, err := http.NewRequestWithContext(ctx, "GET", c.healthzURL, nil)
@@ -322,7 +484,7 @@ func (c *defaultKubeletClient) CheckHealth(ctx context.Context) error {
 		return fmt.Errorf("failed to add authentication header: %w", err)
 	}
 
-	resp, err := c.client.Do(req)
+	resp, _, err := c.doRequestWithLoopbackFallback(req, "healthz")
 	if err != nil {
 		return fmt.Errorf("health check request failed: %w", err)
 	}
@@ -350,7 +512,7 @@ func (c *defaultKubeletClient) GetMetrics(ctx context.Context) (*KubeletMetrics,
 		return nil, fmt.Errorf("failed to add authentication header: %w", err)
 	}
 
-	resp, err := c.client.Do(req)
+	resp, _, err := c.doRequestWithLoopbackFallback(req, "metrics")
 	if err != nil {
 		return nil, fmt.Errorf("metrics request failed: %w", err)
 	}
diff --git a/pkg/monitors/kubernetes/kubelet_loopback_test.go b/pkg/monitors/kubernetes/kubelet_loopback_test.go
new file mode 100644
index 0000000..4c0315e
--- /dev/null
+++ b/pkg/monitors/kubernetes/kubelet_loopback_test.go
@@ -0,0 +1,294 @@
+package kubernetes
+
+import (
+	"context"
+	"errors"
+	"net"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+	"time"
+)
+
+// newLoopbackTestClient builds a defaultKubeletClient whose healthz/metrics
+// URLs point at the supplied addresses, with a short timeout and no auth.
+func newLoopbackTestClient(healthzURL, metricsURL string) *defaultKubeletClient {
+	cfg := &KubeletMonitorConfig{
+		HealthzURL:  healthzURL,
+		MetricsURL:  metricsURL,
+		HTTPTimeout: 2 * time.Second,
+	}
+	return newDefaultKubeletClient(cfg).(*defaultKubeletClient)
+}
+
+// TestLoopbackFallbackURL_Rewrite verifies the URL-rewrite helper for the
+// loopback families and that non-loopback hosts are never rewritten.
+func TestLoopbackFallbackURL_Rewrite(t *testing.T) {
+	tests := []struct {
+		name     string
+		in       string
+		wantURL  string
+		wantBool bool
+	}{
+		{
+			name:     "ipv4 loopback rewrites to ipv6",
+			in:       "http://127.0.0.1:10248/healthz",
+			wantURL:  "http://[::1]:10248/healthz",
+			wantBool: true,
+		},
+		{
+			name:     "ipv4 loopback metrics with query preserved",
+			in:       "http://127.0.0.1:10250/metrics?foo=bar",
+			wantURL:  "http://[::1]:10250/metrics?foo=bar",
+			wantBool: true,
+		},
+		{
+			name:     "ipv6 loopback rewrites to ipv4",
+			in:       "http://[::1]:10248/healthz",
+			wantURL:  "http://127.0.0.1:10248/healthz",
+			wantBool: true,
+		},
+		{
+			name:     "localhost rewrites to ipv6",
+			in:       "https://localhost:10250/metrics",
+			wantURL:  "https://[::1]:10250/metrics",
+			wantBool: true,
+		},
+		{
+			name:     "127.0.0.0/8 loopback rewrites to ipv6",
+			in:       "http://127.0.0.53:10248/healthz",
+			wantURL:  "http://[::1]:10248/healthz",
+			wantBool: true,
+		},
+		{
+			name:     "no port preserved",
+			in:       "http://127.0.0.1/healthz",
+			wantURL:  "http://[::1]/healthz",
+			wantBool: true,
+		},
+		{
+			name:     "non-loopback host not rewritten",
+			in:       "http://10.0.0.5:10248/healthz",
+			wantBool: false,
+		},
+		{
+			name:     "public hostname not rewritten",
+			in:       "http://kubelet.example.com:10250/metrics",
+			wantBool: false,
+		},
+		{
+			name:     "invalid url not rewritten",
+			in:       "://not a url",
+			wantBool: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			gotURL, gotBool := loopbackFallbackURL(tt.in)
+			if gotBool != tt.wantBool {
+				t.Fatalf("loopbackFallbackURL(%q) bool = %v, want %v", tt.in, gotBool, tt.wantBool)
+			}
+			if tt.wantBool && gotURL != tt.wantURL {
+				t.Fatalf("loopbackFallbackURL(%q) = %q, want %q", tt.in, gotURL, tt.wantURL)
+			}
+			if !tt.wantBool && gotURL != "" {
+				t.Fatalf("loopbackFallbackURL(%q) returned url %q with bool=false, want empty", tt.in, gotURL)
+			}
+		})
+	}
+}
+
+// TestIsLoopbackHost covers host classification including host:port forms.
+func TestIsLoopbackHost(t *testing.T) {
+	tests := []struct {
+		host string
+		want bool
+	}{
+		{"127.0.0.1", true},
+		{"127.0.0.1:10248", true},
+		{"127.0.0.53", true},
+		{"::1", true},
+		{"[::1]:10250", true},
+		{"localhost", true},
+		{"LOCALHOST", true},
+		{"localhost:10248", true},
+		{"10.0.0.5", false},
+		{"10.0.0.5:10248", false},
+		{"example.com", false},
+		{"", false},
+	}
+	for _, tt := range tests {
+		if got := isLoopbackHost(tt.host); got != tt.want {
+			t.Errorf("isLoopbackHost(%q) = %v, want %v", tt.host, got, tt.want)
+		}
+	}
+}
+
+// TestIsConnectionLevelError verifies the error classification used to decide
+// whether the loopback fallback should fire.
+func TestIsConnectionLevelError(t *testing.T) {
+	t.Run("nil is not connection-level", func(t *testing.T) {
+		if isConnectionLevelError(nil) {
+			t.Fatal("nil should not be connection-level")
+		}
+	})
+
+	t.Run("context canceled is not connection-level", func(t *testing.T) {
+		if isConnectionLevelError(context.Canceled) {
+			t.Fatal("context.Canceled should not be connection-level")
+		}
+	})
+
+	t.Run("deadline exceeded is not connection-level", func(t *testing.T) {
+		if isConnectionLevelError(context.DeadlineExceeded) {
+			t.Fatal("context.DeadlineExceeded should not be connection-level")
+		}
+	})
+
+	t.Run("dial OpError is connection-level", func(t *testing.T) {
+		// A real connection-refused error: dial a closed port.
+		ln, err := net.Listen("tcp", "127.0.0.1:0")
+		if err != nil {
+			t.Fatalf("listen: %v", err)
+		}
+		addr := ln.Addr().String()
+		_ = ln.Close() // close so the port is refused
+
+		_, dialErr := net.Dial("tcp", addr)
+		if dialErr == nil {
+			t.Skip("expected dial to fail against closed port; environment reused port")
+		}
+		if !isConnectionLevelError(dialErr) {
+			t.Fatalf("dial error %v should be connection-level", dialErr)
+		}
+	})
+
+	t.Run("plain error is not connection-level", func(t *testing.T) {
+		if isConnectionLevelError(errors.New("boom")) {
+			t.Fatal("plain error should not be connection-level")
+		}
+	})
+}
+
+// TestLoopbackFallback_PrimarySucceeds ensures no fallback request is made when
+// the primary loopback probe succeeds. We count requests on the server.
+func TestLoopbackFallback_PrimarySucceeds(t *testing.T) {
+	var count int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		count++
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer srv.Close()
+
+	// httptest binds to 127.0.0.1 by default, a recognized loopback.
+	c := newLoopbackTestClient(srv.URL+"/healthz", srv.URL+"/metrics")
+
+	if err := c.CheckHealth(context.Background()); err != nil {
+		t.Fatalf("CheckHealth returned error: %v", err)
+	}
+	if count != 1 {
+		t.Fatalf("expected exactly 1 request, got %d", count)
+	}
+}
+
+// TestLoopbackFallback_HTTP500NoFallback ensures an HTTP error status (kubelet
+// answered) does NOT trigger a fallback. The server is hit exactly once.
+func TestLoopbackFallback_HTTP500NoFallback(t *testing.T) {
+	var count int
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		count++
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+
+	c := newLoopbackTestClient(srv.URL+"/healthz", srv.URL+"/metrics")
+
+	err := c.CheckHealth(context.Background())
+	if err == nil {
+		t.Fatal("expected CheckHealth to fail on HTTP 500")
+	}
+	if count != 1 {
+		t.Fatalf("HTTP 500 must not trigger fallback; expected 1 request, got %d", count)
+	}
+}
+
+// TestLoopbackFallback_NonLoopbackNoFallback verifies the execution helper does
+// not attempt a fallback for a non-loopback host that fails to dial.
+func TestLoopbackFallback_NonLoopbackNoFallback(t *testing.T) {
+	// 192.0.2.0/24 is TEST-NET-1 (RFC 5737), guaranteed not routable. Use a
+	// port; the dial should fail fast-ish. Keep timeout short via client.
+	c := newLoopbackTestClient("http://192.0.2.1:10248/healthz", "http://192.0.2.1:10250/metrics")
+
+	req, err := http.NewRequestWithContext(context.Background(), "GET", c.healthzURL, nil)
+	if err != nil {
+		t.Fatalf("new request: %v", err)
+	}
+	_, usedFallback, doErr := c.doRequestWithLoopbackFallback(req, "healthz")
+	if doErr == nil {
+		t.Skip("dial unexpectedly succeeded against TEST-NET address")
+	}
+	if usedFallback {
+		t.Fatal("non-loopback host must not trigger loopback fallback")
+	}
+}
+
+// TestLoopbackFallback_IPv4FailsIPv6Succeeds binds a server on the IPv6
+// loopback only, targets the IPv4 loopback (which will refuse), and verifies
+// the fallback retries [::1] and succeeds. Skips if ::1 cannot be bound.
+func TestLoopbackFallback_IPv4FailsIPv6Succeeds(t *testing.T) {
+	ln, err := net.Listen("tcp6", "[::1]:0")
+	if err != nil {
+		t.Skipf("cannot bind [::1] in this environment: %v", err)
+	}
+
+	srv := &httptest.Server{
+		Listener: ln,
+		Config: &http.Server{Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			w.WriteHeader(http.StatusOK)
+		})},
+	}
+	srv.Start()
+	defer srv.Close()
+
+	// Extract the port the IPv6 server is listening on.
+	_, port, err := net.SplitHostPort(ln.Addr().String())
+	if err != nil {
+		t.Fatalf("split host port: %v", err)
+	}
+
+	// Find a 127.0.0.1 port that is NOT listening, so the primary IPv4 probe
+	// gets connection-refused. Reuse the IPv6 port number on 127.0.0.1: it is
+	// very likely closed there since the server bound only to [::1].
+	ipv4Target := "http://" + net.JoinHostPort("127.0.0.1", port) + "/healthz"
+
+	// Sanity: confirm nothing answers on 127.0.0.1:port. If something does,
+	// skip rather than produce a misleading result.
+	if conn, derr := net.DialTimeout("tcp4", net.JoinHostPort("127.0.0.1", port), 200*time.Millisecond); derr == nil {
+		_ = conn.Close()
+		t.Skip("127.0.0.1 port unexpectedly in use; cannot exercise refused-primary path")
+	}
+
+	c := newLoopbackTestClient(ipv4Target, ipv4Target)
+
+	req, err := http.NewRequestWithContext(context.Background(), "GET", c.healthzURL, nil)
+	if err != nil {
+		t.Fatalf("new request: %v", err)
+	}
+	resp, usedFallback, doErr := c.doRequestWithLoopbackFallback(req, "healthz")
+	if doErr != nil {
+		t.Fatalf("expected fallback to [::1] to succeed, got error: %v", doErr)
+	}
+	defer resp.Body.Close()
+
+	if !usedFallback {
+		t.Fatal("expected fallback to be used (IPv4 refused, IPv6 reachable)")
+	}
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("expected 200 from IPv6 server, got %d", resp.StatusCode)
+	}
+	if !strings.Contains(resp.Request.URL.Host, "::1") {
+		t.Fatalf("expected final request host to be [::1], got %q", resp.Request.URL.Host)
+	}
+}

From c11fc21825381fae75ff83627b58632518f202aa Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:25:12 -0500
Subject: [PATCH 09/38] feat(network): IPv6 link-local zone ID support in ICMP
 pinger (Task #17233)

resolveTarget now parses fe80::1%eth0 (split on last %, retains zone) and
returns the zone; singlePing sends via net.IPAddr{IP,Zone} so link-local
pings carry the required scope. Also fix reply matching to compare IP
bytes (IP.Equal) instead of String(), so zoned replies (fe80::1%eth0)
match the zoneless target. Family stays ipv6 for link-local. Tests cover
zone parsing, destAddr threading, and zone-tolerant peer matching.
---
 pkg/monitors/network/pinger.go      | 76 ++++++++++++++++++++------
 pkg/monitors/network/pinger_test.go | 85 +++++++++++++++++++++++++++--
 2 files changed, 138 insertions(+), 23 deletions(-)

diff --git a/pkg/monitors/network/pinger.go b/pkg/monitors/network/pinger.go
index f020cf1..57ce50c 100644
--- a/pkg/monitors/network/pinger.go
+++ b/pkg/monitors/network/pinger.go
@@ -7,6 +7,7 @@ import (
 	"log"
 	"math/rand"
 	"net"
+	"strings"
 	"sync/atomic"
 	"time"
 
@@ -69,35 +70,73 @@ func newDefaultPinger() Pinger {
 	return &defaultPinger{}
 }
 
-// resolveTarget parses the target as an IP literal, or resolves it as a
-// hostname. Returns the chosen IP and family. When the target is a hostname,
-// IPv4 is preferred for backward compatibility; if no IPv4 address is
-// available, the first IPv6 address is used.
-func resolveTarget(target string) (net.IP, string, error) {
-	if ip := net.ParseIP(target); ip != nil {
+// resolveTarget parses the target as an IP literal (optionally carrying an
+// IPv6 zone/scope ID such as "fe80::1%eth0"), or resolves it as a hostname.
+// It returns the chosen IP, the IPv6 zone (empty for IPv4 and zone-less
+// targets), and the address family. When the target is a hostname, IPv4 is
+// preferred for backward compatibility; if no IPv4 address is available, the
+// first IPv6 address is used. Hostname resolution never invents a zone.
+//
+// The zone is parsed by splitting on the LAST "%" in the target and validating
+// that the leading portion is a valid IP literal. We deliberately do not use
+// net.ResolveIPAddr here: ResolveIPAddr would perform a DNS lookup for hostname
+// targets (changing the existing IPv4-preferring LookupIP behavior) and would
+// also issue network lookups for malformed inputs. The manual split keeps IP
+// literal and hostname paths cleanly separated.
+func resolveTarget(target string) (net.IP, string, string, error) {
+	// Separate a possible IPv6 zone suffix (e.g. "fe80::1%eth0"). The address
+	// part is only treated as zoned when it parses as an IP literal; otherwise
+	// the original target is left intact for hostname resolution so that, e.g.,
+	// a hostname containing "%" is not silently mangled.
+	addr, zone := target, ""
+	if i := strings.LastIndex(target, "%"); i >= 0 {
+		if candidate := net.ParseIP(target[:i]); candidate != nil {
+			addr, zone = target[:i], target[i+1:]
+		}
+	}
+
+	if ip := net.ParseIP(addr); ip != nil {
 		if ip.To4() != nil {
-			return ip.To4(), FamilyIPv4, nil
+			// IPv4 addresses do not carry a zone.
+			return ip.To4(), "", FamilyIPv4, nil
 		}
-		return ip, FamilyIPv6, nil
+		return ip, zone, FamilyIPv6, nil
 	}
 
 	ips, err := net.LookupIP(target)
 	if err != nil || len(ips) == 0 {
-		return nil, "", fmt.Errorf("failed to resolve target %s: %w", target, err)
+		return nil, "", "", fmt.Errorf("failed to resolve target %s: %w", target, err)
 	}
 
 	for _, resolvedIP := range ips {
 		if resolvedIP.To4() != nil {
-			return resolvedIP.To4(), FamilyIPv4, nil
+			return resolvedIP.To4(), "", FamilyIPv4, nil
 		}
 	}
 	for _, resolvedIP := range ips {
 		if resolvedIP.To16() != nil {
-			return resolvedIP, FamilyIPv6, nil
+			return resolvedIP, "", FamilyIPv6, nil
 		}
 	}
 
-	return nil, "", fmt.Errorf("no usable IP address found for target %s", target)
+	return nil, "", "", fmt.Errorf("no usable IP address found for target %s", target)
+}
+
+// destAddr builds the destination address for a ping send, carrying the IPv6
+// zone/scope ID when present. Link-local IPv6 destinations (fe80::/10) require
+// the zone for the kernel to select the correct outgoing interface.
+func destAddr(ip net.IP, zone string) *net.IPAddr {
+	return &net.IPAddr{IP: ip, Zone: zone}
+}
+
+// peerMatchesIP reports whether the reply came from the target IP, ignoring any
+// zone/scope ID the kernel may attach to a link-local peer address. It compares
+// the underlying IP bytes so that "fe80::1%eth0" matches the target "fe80::1".
+func peerMatchesIP(peer net.Addr, ip net.IP) bool {
+	if ipAddr, ok := peer.(*net.IPAddr); ok {
+		return ipAddr.IP.Equal(ip)
+	}
+	return peer.String() == ip.String()
 }
 
 // listenICMP opens an ICMP packet connection for the given address family
@@ -141,7 +180,7 @@ func isEchoReply(family string, msgType icmp.Type) bool {
 // IPv4 or IPv6 based on the resolved target. Returns one PingResult per
 // attempt; each result carries the address family used.
 func (p *defaultPinger) Ping(ctx context.Context, target string, count int, timeout time.Duration) ([]PingResult, error) {
-	ip, family, err := resolveTarget(target)
+	ip, zone, family, err := resolveTarget(target)
 	if err != nil {
 		return nil, err
 	}
@@ -162,7 +201,7 @@ func (p *defaultPinger) Ping(ctx context.Context, target string, count int, time
 		default:
 		}
 
-		result := p.singlePing(ctx, conn, ip, family, protocol, echoType, timeout)
+		result := p.singlePing(ctx, conn, ip, zone, family, protocol, echoType, timeout)
 		results = append(results, result)
 
 		// Small delay between pings (100ms)
@@ -183,6 +222,7 @@ func (p *defaultPinger) singlePing(
 	ctx context.Context,
 	conn *icmp.PacketConn,
 	ip net.IP,
+	zone string,
 	family string,
 	protocol int,
 	echoType icmp.Type,
@@ -225,7 +265,7 @@ func (p *defaultPinger) singlePing(
 
 	// Send echo request
 	start := time.Now()
-	_, err = conn.WriteTo(msgBytes, &net.IPAddr{IP: ip})
+	_, err = conn.WriteTo(msgBytes, destAddr(ip, zone))
 	if err != nil {
 		log.Printf("[DEBUG] Ping to %s (%s): failed to send: %v", ip, family, err)
 		return PingResult{
@@ -277,8 +317,10 @@ func (p *defaultPinger) singlePing(
 			continue
 		}
 
-		// Verify it's from the target IP
-		if peer.String() != ip.String() {
+		// Verify it's from the target IP. Compare the address bytes rather than
+		// the string form so a link-local reply carrying a zone suffix
+		// (e.g. "fe80::1%eth0") still matches the zone-less target IP.
+		if !peerMatchesIP(peer, ip) {
 			continue
 		}
 
diff --git a/pkg/monitors/network/pinger_test.go b/pkg/monitors/network/pinger_test.go
index 41d849f..cd1d0c4 100644
--- a/pkg/monitors/network/pinger_test.go
+++ b/pkg/monitors/network/pinger_test.go
@@ -266,18 +266,30 @@ func TestResolveTarget(t *testing.T) {
 	tests := []struct {
 		name       string
 		target     string
+		wantIP     string // expected IP string (empty = skip exact check)
+		wantZone   string
 		wantFamily string
 		wantErr    bool
 	}{
-		{name: "IPv4 literal", target: "192.0.2.1", wantFamily: FamilyIPv4},
-		{name: "IPv6 literal", target: "2001:db8::1", wantFamily: FamilyIPv6},
-		{name: "IPv6 loopback literal", target: "::1", wantFamily: FamilyIPv6},
-		{name: "IPv4-mapped literal collapses to v4", target: "::ffff:192.0.2.1", wantFamily: FamilyIPv4},
+		{name: "IPv4 literal", target: "192.0.2.1", wantIP: "192.0.2.1", wantZone: "", wantFamily: FamilyIPv4},
+		{name: "IPv6 literal", target: "2001:db8::1", wantIP: "2001:db8::1", wantZone: "", wantFamily: FamilyIPv6},
+		{name: "IPv6 loopback literal", target: "::1", wantIP: "::1", wantZone: "", wantFamily: FamilyIPv6},
+		{name: "IPv4-mapped literal collapses to v4", target: "::ffff:192.0.2.1", wantIP: "192.0.2.1", wantZone: "", wantFamily: FamilyIPv4},
+		// Link-local IPv6 with a zone/scope ID.
+		{name: "link-local with zone", target: "fe80::1%eth0", wantIP: "fe80::1", wantZone: "eth0", wantFamily: FamilyIPv6},
+		// Link-local IPv6 without a zone (bare).
+		{name: "link-local without zone", target: "fe80::1", wantIP: "fe80::1", wantZone: "", wantFamily: FamilyIPv6},
+		// Zones are retained for any IPv6, not only link-local.
+		{name: "global IPv6 with zone retained", target: "2001:db8::1%eth1", wantIP: "2001:db8::1", wantZone: "eth1", wantFamily: FamilyIPv6},
+		// Empty zone after '%': manual split leaves the address valid with an
+		// empty zone (the trailing '%' is not treated as a zone). This is the
+		// behavior we implement; assert it explicitly.
+		{name: "empty zone after percent", target: "fe80::1%", wantIP: "fe80::1", wantZone: "", wantFamily: FamilyIPv6},
 		{name: "empty target", target: "", wantErr: true},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			ip, family, err := resolveTarget(tt.target)
+			ip, zone, family, err := resolveTarget(tt.target)
 			if (err != nil) != tt.wantErr {
 				t.Fatalf("resolveTarget(%q) err=%v wantErr=%v", tt.target, err, tt.wantErr)
 			}
@@ -288,7 +300,68 @@ func TestResolveTarget(t *testing.T) {
 				t.Errorf("family = %q, want %q", family, tt.wantFamily)
 			}
 			if ip == nil {
-				t.Errorf("ip is nil for target %q", tt.target)
+				t.Fatalf("ip is nil for target %q", tt.target)
+			}
+			if tt.wantIP != "" && ip.String() != tt.wantIP {
+				t.Errorf("ip = %q, want %q", ip.String(), tt.wantIP)
+			}
+			if zone != tt.wantZone {
+				t.Errorf("zone = %q, want %q", zone, tt.wantZone)
+			}
+		})
+	}
+}
+
+// TestDestAddr verifies the zone reaches the *net.IPAddr used for sending.
+// Actually transmitting link-local ICMP requires privileges and a real
+// interface, so we unit-test the destination builder instead.
+func TestDestAddr(t *testing.T) {
+	tests := []struct {
+		name string
+		ip   string
+		zone string
+	}{
+		{name: "link-local with zone", ip: "fe80::1", zone: "eth0"},
+		{name: "ipv6 no zone", ip: "2001:db8::1", zone: ""},
+		{name: "ipv4 no zone", ip: "192.0.2.1", zone: ""},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ip := net.ParseIP(tt.ip)
+			if ip == nil {
+				t.Fatalf("bad test IP %q", tt.ip)
+			}
+			addr := destAddr(ip, tt.zone)
+			if addr.Zone != tt.zone {
+				t.Errorf("destAddr zone = %q, want %q", addr.Zone, tt.zone)
+			}
+			if !addr.IP.Equal(ip) {
+				t.Errorf("destAddr IP = %v, want %v", addr.IP, ip)
+			}
+		})
+	}
+}
+
+// TestPeerMatchesIP verifies that a link-local reply carrying a zone still
+// matches the zone-less target IP used by the receive loop.
+func TestPeerMatchesIP(t *testing.T) {
+	target := net.ParseIP("fe80::1")
+	if target == nil {
+		t.Fatal("bad target IP")
+	}
+	tests := []struct {
+		name string
+		peer net.Addr
+		want bool
+	}{
+		{name: "zoned peer matches", peer: &net.IPAddr{IP: net.ParseIP("fe80::1"), Zone: "eth0"}, want: true},
+		{name: "zoneless peer matches", peer: &net.IPAddr{IP: net.ParseIP("fe80::1")}, want: true},
+		{name: "different ip does not match", peer: &net.IPAddr{IP: net.ParseIP("fe80::2"), Zone: "eth0"}, want: false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := peerMatchesIP(tt.peer, target); got != tt.want {
+				t.Errorf("peerMatchesIP(%v, %v) = %v, want %v", tt.peer, target, got, tt.want)
 			}
 		})
 	}

From 61170381720d807cbdb8258d525943eb5585802f Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:30:59 -0500
Subject: [PATCH 10/38] feat(network): httpPinger emits Family for hostname
 targets (Task #17234)

Pre-resolve hostname targets via the injectable Resolver (LookupIP),
select an address (IPv4-preferred, IPv6 if no v4) to set PingResult.Family
truthfully, and pin the dial to that IP via a per-target Transport
DialContext (original Host header preserved for TLS/vhost). Resolution
failure falls back to the original unpinned probe with empty family. IP
literals unchanged (resolver not consulted). Tests use a fake resolver.
---
 pkg/monitors/network/http_pinger.go      |  98 +++++++++++++--
 pkg/monitors/network/http_pinger_test.go | 145 +++++++++++++++++++++++
 2 files changed, 236 insertions(+), 7 deletions(-)

diff --git a/pkg/monitors/network/http_pinger.go b/pkg/monitors/network/http_pinger.go
index 0b26cd1..e0a54d4 100644
--- a/pkg/monitors/network/http_pinger.go
+++ b/pkg/monitors/network/http_pinger.go
@@ -18,6 +18,10 @@ type httpPinger struct {
 	port   int
 	path   string
 	client *http.Client
+	// resolver pre-resolves hostname targets so the reported address family is
+	// truthful and the dial is pinned to the resolved address. It is injectable
+	// for testing; production uses the package default (system) resolver.
+	resolver Resolver
 }
 
 // newHTTPPinger creates a new HTTP-based pinger targeting the given port and path.
@@ -29,8 +33,9 @@ func newHTTPPinger(port int, path string) Pinger {
 		path = defaultProbePath
 	}
 	return &httpPinger{
-		port: port,
-		path: path,
+		port:     port,
+		path:     path,
+		resolver: newDefaultResolver(),
 		client: &http.Client{
 			// Per-request timeout is set via context; this is a safety net.
 			Timeout: 30 * time.Second,
@@ -64,9 +69,32 @@ func hostFamily(target string) string {
 func (p *httpPinger) Ping(ctx context.Context, target string, count int, timeout time.Duration) ([]PingResult, error) {
 	results := make([]PingResult, 0, count)
 
-	family := hostFamily(target)
+	// The URL host always remains the original target (hostname or IP literal)
+	// so TLS/vhost routing keeps working; only the dial target IP is pinned.
 	url := "http://" + net.JoinHostPort(target, strconv.Itoa(p.port)) + p.path
 
+	// family is the reported address family. For IP literals it is derived
+	// directly from the literal. For hostnames it is empty until resolution.
+	family := hostFamily(target)
+
+	// client is the HTTP client used for probes. For hostname targets we build
+	// a per-target client whose DialContext is pinned to the resolved address so
+	// the emitted family is accurate and matches the connection actually made.
+	client := p.client
+
+	if family == "" {
+		// target is a hostname: pre-resolve to determine the true address family
+		// and pin the dial to the chosen address.
+		if resolved, resolvedFamily, ok := p.resolveTarget(ctx, target); ok {
+			family = resolvedFamily
+			client = p.pinnedClient(resolved)
+		}
+		// On resolution failure we fall through with the original URL/client and
+		// empty family (graceful fallback): a resolvable-but-unreachable host is
+		// not turned into a resolution error; only the DNS step failing here
+		// leaves family empty and lets the probe surface its own error.
+	}
+
 	for i := 0; i < count; i++ {
 		// Check context before each probe
 		select {
@@ -75,7 +103,7 @@ func (p *httpPinger) Ping(ctx context.Context, target string, count int, timeout
 		default:
 		}
 
-		result := p.singleProbe(ctx, url, family, timeout)
+		result := p.singleProbe(ctx, client, url, family, timeout)
 		results = append(results, result)
 
 		// 100ms delay between probes (same as ICMP pinger)
@@ -91,8 +119,64 @@ func (p *httpPinger) Ping(ctx context.Context, target string, count int, timeout
 	return results, nil
 }
 
-// singleProbe performs a single HTTP GET and measures RTT.
-func (p *httpPinger) singleProbe(ctx context.Context, url, family string, timeout time.Duration) PingResult {
+// resolveTarget resolves a hostname target to a single address and reports the
+// address family to emit on results. Selection policy: prefer the first IPv4
+// address (matching the existing pinger's hostname IPv4-preference); if only
+// IPv6 addresses are returned, use the first IPv6 address. The returned ok is
+// false when resolution fails or yields no usable address, in which case the
+// caller falls back to the original (unpinned) behavior.
+func (p *httpPinger) resolveTarget(ctx context.Context, host string) (ip net.IP, family string, ok bool) {
+	ips, err := p.resolver.LookupIP(ctx, "ip", host)
+	if err != nil || len(ips) == 0 {
+		return nil, "", false
+	}
+
+	var firstV6 net.IP
+	for _, candidate := range ips {
+		if candidate.To4() != nil {
+			// Prefer IPv4: return immediately on the first IPv4 address.
+			return candidate, FamilyIPv4, true
+		}
+		if firstV6 == nil {
+			firstV6 = candidate
+		}
+	}
+
+	if firstV6 != nil {
+		return firstV6, FamilyIPv6, true
+	}
+	return nil, "", false
+}
+
+// pinnedClient returns an HTTP client that dials the given resolved IP for every
+// connection while preserving the requested port. It clones the base client's
+// transport settings (timeouts, keep-alive behavior) and only overrides the
+// dial target, so the URL host header is unaffected.
+func (p *httpPinger) pinnedClient(ip net.IP) *http.Client {
+	dialer := &net.Dialer{Timeout: 5 * time.Second}
+	pinnedAddr := ip.String()
+
+	transport := &http.Transport{
+		DisableKeepAlives: true, // Each probe should be independent.
+		DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
+			// Replace the host with the resolved IP, preserving the original
+			// port. net.JoinHostPort handles IPv6 bracketing correctly.
+			_, port, err := net.SplitHostPort(addr)
+			if err != nil {
+				port = strconv.Itoa(p.port)
+			}
+			return dialer.DialContext(ctx, network, net.JoinHostPort(pinnedAddr, port))
+		},
+	}
+
+	return &http.Client{
+		Timeout:   p.client.Timeout,
+		Transport: transport,
+	}
+}
+
+// singleProbe performs a single HTTP GET using the given client and measures RTT.
+func (p *httpPinger) singleProbe(ctx context.Context, client *http.Client, url, family string, timeout time.Duration) PingResult {
 	reqCtx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
 
@@ -106,7 +190,7 @@ func (p *httpPinger) singleProbe(ctx context.Context, url, family string, timeou
 	}
 
 	start := time.Now()
-	resp, err := p.client.Do(req)
+	resp, err := client.Do(req)
 	rtt := time.Since(start)
 
 	if err != nil {
diff --git a/pkg/monitors/network/http_pinger_test.go b/pkg/monitors/network/http_pinger_test.go
index c8361fc..f8e9981 100644
--- a/pkg/monitors/network/http_pinger_test.go
+++ b/pkg/monitors/network/http_pinger_test.go
@@ -200,6 +200,151 @@ func TestHTTPPinger_ImplementsPingerInterface(t *testing.T) {
 	var _ Pinger = newHTTPPinger(8023, "/healthz")
 }
 
+// fakeResolver is a test Resolver returning canned LookupIP results. It records
+// whether LookupIP was consulted so tests can assert IP-literal targets skip it.
+type fakeResolver struct {
+	ips         []net.IP
+	err         error
+	lookupIPHit atomic.Bool
+}
+
+func (f *fakeResolver) LookupHost(_ context.Context, _ string) ([]string, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (f *fakeResolver) LookupAddr(_ context.Context, _ string) ([]string, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (f *fakeResolver) LookupIP(_ context.Context, _, _ string) ([]net.IP, error) {
+	f.lookupIPHit.Store(true)
+	if f.err != nil {
+		return nil, f.err
+	}
+	return f.ips, nil
+}
+
+func TestHTTPPinger_HostnameIPv4Only(t *testing.T) {
+	pinger := newHTTPPinger(8023, "/healthz").(*httpPinger)
+	pinger.resolver = &fakeResolver{ips: []net.IP{net.ParseIP("203.0.113.10")}}
+
+	// Pin the dial to a dead port so the probe fails fast; we only assert family.
+	pinger.client = unreachableClient()
+
+	results, err := pinger.Ping(context.Background(), "ipv4.example.test", 1, 500*time.Millisecond)
+	if err != nil {
+		t.Fatalf("Ping() unexpected error: %v", err)
+	}
+	if got := results[0].Family; got != FamilyIPv4 {
+		t.Errorf("Family = %q, want %q", got, FamilyIPv4)
+	}
+}
+
+func TestHTTPPinger_HostnameIPv6Only(t *testing.T) {
+	pinger := newHTTPPinger(8023, "/healthz").(*httpPinger)
+	pinger.resolver = &fakeResolver{ips: []net.IP{net.ParseIP("2001:db8::1")}}
+	pinger.client = unreachableClient()
+
+	results, err := pinger.Ping(context.Background(), "ipv6.example.test", 1, 500*time.Millisecond)
+	if err != nil {
+		t.Fatalf("Ping() unexpected error: %v", err)
+	}
+	if got := results[0].Family; got != FamilyIPv6 {
+		t.Errorf("Family = %q, want %q", got, FamilyIPv6)
+	}
+}
+
+func TestHTTPPinger_HostnameDualStackPrefersIPv4(t *testing.T) {
+	// httptest binds 127.0.0.1. A fake resolver maps the hostname to both an
+	// IPv6 address (listed first) and the loopback IPv4 the server listens on.
+	// IPv4 preference must win AND the pinned dial must reach the server.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer server.Close()
+
+	_, port := testServerHostPort(t, server)
+	pinger := newHTTPPinger(port, "/healthz").(*httpPinger)
+	pinger.resolver = &fakeResolver{ips: []net.IP{
+		net.ParseIP("2001:db8::1"), // IPv6 first to prove preference, not order
+		net.ParseIP("127.0.0.1"),   // the address the test server actually serves
+	}}
+
+	results, err := pinger.Ping(context.Background(), "dual.example.test", 1, 5*time.Second)
+	if err != nil {
+		t.Fatalf("Ping() unexpected error: %v", err)
+	}
+	if !results[0].Success {
+		t.Fatalf("expected pinned dial to reach the server, got error: %v", results[0].Error)
+	}
+	if got := results[0].Family; got != FamilyIPv4 {
+		t.Errorf("Family = %q, want %q (IPv4 preference)", got, FamilyIPv4)
+	}
+}
+
+func TestHTTPPinger_ResolutionFailureGraceful(t *testing.T) {
+	pinger := newHTTPPinger(8023, "/healthz").(*httpPinger)
+	pinger.resolver = &fakeResolver{err: errors.New("no such host")}
+	pinger.client = unreachableClient()
+
+	results, err := pinger.Ping(context.Background(), "broken.example.test", 1, 500*time.Millisecond)
+	if err != nil {
+		t.Fatalf("Ping() should not return a top-level error on resolution failure, got: %v", err)
+	}
+	if len(results) != 1 {
+		t.Fatalf("expected 1 result, got %d", len(results))
+	}
+	// Documented fallback: resolution failure leaves family empty and the probe
+	// surfaces its own (unpinned) error rather than a resolution error.
+	if results[0].Success {
+		t.Error("probe against unreachable fallback should not succeed")
+	}
+	if got := results[0].Family; got != "" {
+		t.Errorf("Family = %q, want empty on resolution failure", got)
+	}
+}
+
+func TestHTTPPinger_IPLiteralSkipsResolver(t *testing.T) {
+	for _, tc := range []struct {
+		name   string
+		target string
+		family string
+	}{
+		{"ipv4", "127.0.0.1", FamilyIPv4},
+		{"ipv6", "::1", FamilyIPv6},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			pinger := newHTTPPinger(8023, "/healthz").(*httpPinger)
+			fake := &fakeResolver{ips: []net.IP{net.ParseIP("203.0.113.10")}}
+			pinger.resolver = fake
+			pinger.client = unreachableClient()
+
+			results, err := pinger.Ping(context.Background(), tc.target, 1, 500*time.Millisecond)
+			if err != nil {
+				t.Fatalf("Ping() unexpected error: %v", err)
+			}
+			if got := results[0].Family; got != tc.family {
+				t.Errorf("Family = %q, want %q", got, tc.family)
+			}
+			if fake.lookupIPHit.Load() {
+				t.Error("resolver should NOT be consulted for IP-literal targets")
+			}
+		})
+	}
+}
+
+// unreachableClient returns a client that dials a closed loopback port so probes
+// fail fast without network access; used by family-only assertion tests.
+func unreachableClient() *http.Client {
+	return &http.Client{
+		Timeout: 30 * time.Second,
+		Transport: &http.Transport{
+			DisableKeepAlives: true,
+			DialContext:       (&net.Dialer{Timeout: 200 * time.Millisecond}).DialContext,
+		},
+	}
+}
+
 func TestHTTPPinger_URLConstruction(t *testing.T) {
 	var receivedPath string
 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

From 218988ece547756a9607ae76ab33c45156f1b1c0 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:35:23 -0500
Subject: [PATCH 11/38] =?UTF-8?q?chore(network):=20ICMPv6=20socket=20polis?=
 =?UTF-8?q?h=20=E2=80=94=20filter,=20per-instance=20ID=20(Task=20#17235)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Install ipv6.ICMPFilter accepting only EchoReply on the v6 listener
(non-fatal if unsupported; receive loop still filters). Replace the
process-global ICMP id with a per-pinger-instance id (pid mixed with an
atomic counter, masked to 16 bits, non-zero) so concurrent pingers don't
cross-match replies. Live-socket integration test stays gated by
testing.Short(). Adds id-uniqueness unit test.
---
 pkg/monitors/network/pinger.go      | 50 ++++++++++++++++++++++++-----
 pkg/monitors/network/pinger_test.go | 34 ++++++++++++++++++++
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/pkg/monitors/network/pinger.go b/pkg/monitors/network/pinger.go
index 57ce50c..fef1d80 100644
--- a/pkg/monitors/network/pinger.go
+++ b/pkg/monitors/network/pinger.go
@@ -5,8 +5,8 @@ import (
 	"context"
 	"fmt"
 	"log"
-	"math/rand"
 	"net"
+	"os"
 	"strings"
 	"sync/atomic"
 	"time"
@@ -33,9 +33,25 @@ const (
 // pingSequence is a global counter for ICMP sequence numbers
 var pingSequence uint32
 
-// pingID is a random ID generated at startup to avoid collisions with other processes.
-// Using math/rand is acceptable here - cryptographic randomness is not required for ICMP ping IDs.
-var pingID = uint16(rand.Uint32()) //nolint:gosec // ping ID doesn't require crypto/rand
+// pingerInstanceCounter is an atomically-incremented package counter used to
+// derive a stable, unique 16-bit ICMP echo ID for each defaultPinger instance.
+// Mixing the counter with the process ID keeps IDs distinct both across
+// instances in this process and (best-effort) across processes on the host,
+// avoiding cross-matching of echo replies between concurrent pingers.
+var pingerInstanceCounter uint32
+
+// nextPingerID returns a unique, non-zero 16-bit ICMP echo ID for a new pinger
+// instance. It mixes the process ID with an atomically-incremented counter so
+// the value is deterministic and unique per instance (no randomness required).
+func nextPingerID() uint16 {
+	n := atomic.AddUint32(&pingerInstanceCounter, 1)
+	id := uint16(os.Getpid()) + uint16(n)
+	if id == 0 {
+		// Avoid an all-zero ID, which is a poor discriminator on the wire.
+		id = uint16(n) | 0x8000
+	}
+	return id
+}
 
 // PingResult represents the result of a single ping operation.
 type PingResult struct {
@@ -62,12 +78,15 @@ type Pinger interface {
 // It supports both IPv4 (ICMP) and IPv6 (ICMPv6) probes, dispatching
 // based on the resolved target address family.
 type defaultPinger struct {
-	// No state needed for default implementation
+	// id is this instance's stable 16-bit ICMP echo identifier. Each pinger
+	// gets its own ID so replies destined for one pinger are not accepted by
+	// another pinger running in the same process.
+	id uint16
 }
 
 // newDefaultPinger creates a new default pinger that uses ICMP echo requests.
 func newDefaultPinger() Pinger {
-	return &defaultPinger{}
+	return &defaultPinger{id: nextPingerID()}
 }
 
 // resolveTarget parses the target as an IP literal (optionally carrying an
@@ -155,6 +174,21 @@ func listenICMP(family string) (*icmp.PacketConn, int, icmp.Type, error) {
 		if err != nil {
 			return nil, 0, nil, fmt.Errorf("failed to create IPv6 ICMP listener (may require elevated privileges): %w", err)
 		}
+		// Install a kernel-side ICMPv6 filter so the socket only wakes us for
+		// echo replies, avoiding parsing of unrelated ICMPv6 traffic (router
+		// advertisements, neighbor discovery, MLD, ICMP errors). This is a
+		// best-effort optimization: some environments/sockets do not support
+		// setting the filter, so a failure here is non-fatal and we continue
+		// with an unfiltered socket (the receive loop still discards non-echo
+		// replies via isEchoReply).
+		if pc := conn.IPv6PacketConn(); pc != nil {
+			var f ipv6.ICMPFilter
+			f.SetAll(true)
+			f.Accept(ipv6.ICMPTypeEchoReply)
+			if err := pc.SetICMPFilter(&f); err != nil {
+				log.Printf("[DEBUG] IPv6 ICMP listener: could not set echo-reply filter (continuing unfiltered): %v", err)
+			}
+		}
 		return conn, protocolICMPv6, ipv6.ICMPTypeEchoRequest, nil
 	default:
 		return nil, 0, nil, fmt.Errorf("unsupported address family %q", family)
@@ -236,7 +270,7 @@ func (p *defaultPinger) singlePing(
 		Type: echoType,
 		Code: 0,
 		Body: &icmp.Echo{
-			ID:   int(pingID),
+			ID:   int(p.id),
 			Seq:  int(seq),
 			Data: []byte("node-doctor-ping"),
 		},
@@ -330,7 +364,7 @@ func (p *defaultPinger) singlePing(
 			continue
 		}
 
-		if echoReply.ID != int(pingID) || echoReply.Seq != int(seq) {
+		if echoReply.ID != int(p.id) || echoReply.Seq != int(seq) {
 			continue
 		}
 
diff --git a/pkg/monitors/network/pinger_test.go b/pkg/monitors/network/pinger_test.go
index cd1d0c4..97bd983 100644
--- a/pkg/monitors/network/pinger_test.go
+++ b/pkg/monitors/network/pinger_test.go
@@ -367,10 +367,44 @@ func TestPeerMatchesIP(t *testing.T) {
 	}
 }
 
+// TestNewDefaultPinger_UniqueID verifies that each defaultPinger instance is
+// assigned its own non-zero 16-bit ICMP echo ID, so concurrent pingers in the
+// same process cannot cross-match each other's echo replies. This is a pure
+// unit test and does not open any sockets, so it is safe under -short.
+func TestNewDefaultPinger_UniqueID(t *testing.T) {
+	p1, ok := newDefaultPinger().(*defaultPinger)
+	if !ok {
+		t.Fatal("newDefaultPinger() did not return *defaultPinger")
+	}
+	p2, ok := newDefaultPinger().(*defaultPinger)
+	if !ok {
+		t.Fatal("newDefaultPinger() did not return *defaultPinger")
+	}
+
+	if p1.id == 0 {
+		t.Errorf("first pinger id is zero, want non-zero")
+	}
+	if p2.id == 0 {
+		t.Errorf("second pinger id is zero, want non-zero")
+	}
+	if p1.id == p2.id {
+		t.Errorf("two pingers share id %d, want distinct ids", p1.id)
+	}
+}
+
 // TestDefaultPinger_Integration is an integration test for the real pinger.
 // This test requires ICMP permissions and may not run in all environments.
 // It exercises both IPv4 and IPv6 loopback paths so the dual-stack rewrite
 // is exercised when run in privileged mode.
+//
+// Gating: this lives in the default (non-tagged) test file but opens raw ICMP
+// sockets, so it must NOT run under `go test -short`. The testing.Short() guard
+// below is the gate of record — it keeps `-short` CI from attempting raw
+// sockets while still letting a normal `go test` run exercise the live path
+// where privileges allow. We keep it here (rather than behind the
+// //go:build integration tag used by cni_integration_test.go) because it is a
+// lightweight loopback check, not a cluster-dependent integration test, and the
+// existing Short() guard already satisfies the requirement with the least churn.
 func TestDefaultPinger_Integration(t *testing.T) {
 	if testing.Short() {
 		t.Skip("Skipping integration test in short mode")

From 8e8c080f171a8011234b98c8fc3e3c2c171582f2 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 02:40:43 -0500
Subject: [PATCH 12/38] chore(ci): run ICMP pinger integration test instead of
 silently skipping (Task #17236)

Add NODE_DOCTOR_ICMP_INTEGRATION env gate: when set (CI), socket/
permission failures in TestDefaultPinger_Integration become hard failures
instead of silent t.Skip; -short still skips, unset+non-short still
skips gracefully on unprivileged dev boxes. New Makefile target
test-net-icmp-integration and a separate (non-blocking) CI job compile
the test binary as the runner user and run it under sudo for CAP_NET_RAW.
---
 .github/workflows/ci.yml            | 32 ++++++++++++++++
 Makefile                            | 15 ++++++++
 pkg/monitors/network/pinger_test.go | 57 +++++++++++++++++++++++------
 3 files changed, 93 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6463ca6..72cd277 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -105,6 +105,38 @@ jobs:
           fail_ci_if_error: false
           token: ${{ secrets.CODECOV_TOKEN }}
 
+  # Pinger ICMP integration - runs the real raw-ICMP loopback test under privilege.
+  #
+  # Kept as a SEPARATE job (not a step in `test`) so a privileged-socket flake on
+  # the runner does not block the main unit-test/coverage job. The runner user
+  # compiles the test binary (preserving the Go env / cache), then runs ONLY the
+  # integration test via sudo so it has CAP_NET_RAW. NODE_DOCTOR_ICMP_INTEGRATION=1
+  # makes socket/permission errors HARD failures so a misconfigured runner surfaces
+  # loudly instead of silently passing without exercising real ICMP.
+  pinger-icmp-integration:
+    name: Pinger ICMP Integration
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Setup Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          cache: true
+
+      - name: Download dependencies
+        run: go mod download
+
+      - name: Compile network test binary
+        run: go test -c -o /tmp/nd-network.test ./pkg/monitors/network/
+
+      - name: Run ICMP integration test (privileged)
+        run: |
+          sudo NODE_DOCTOR_ICMP_INTEGRATION=1 /tmp/nd-network.test \
+            -test.run '^TestDefaultPinger_Integration$' -test.v
+
   # Security scan - gosec
   security-gosec:
     name: Security Scan (gosec)
diff --git a/Makefile b/Makefile
index 47b284a..cf78a8f 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@
 	gh-status gh-watch gh-logs gh-builds \
 	check-prerequisites check-docker check-kubectl \
 	build test test-integration test-e2e test-all \
+	test-net-icmp-integration \
 	lint fmt clean install-deps \
 	docker-build docker-push \
 	coverage-check
@@ -232,6 +233,20 @@ test-e2e:
 	fi
 	@$(call print_success,"E2E tests completed")
 
+# Run the real ICMP pinger integration test under privilege.
+#
+# The default pinger opens RAW ICMP sockets (CAP_NET_RAW), so this must run as
+# root. We compile the test binary as the normal user first (preserving the Go
+# environment / module cache) and then run ONLY this test under sudo with the
+# integration env var set, so socket/permission failures are HARD failures
+# instead of silent skips.
+test-net-icmp-integration:
+	@$(call print_status,"Compiling network test binary...")
+	@go test -c -o /tmp/nd-network.test ./pkg/monitors/network/
+	@$(call print_status,"Running ICMP integration test as root (CAP_NET_RAW)...")
+	@sudo NODE_DOCTOR_ICMP_INTEGRATION=1 /tmp/nd-network.test -test.run '^TestDefaultPinger_Integration$$' -test.v
+	@$(call print_success,"ICMP integration test passed")
+
 # Run all tests with coverage
 test-all:
 	@$(call print_status,"Running all tests with coverage...")
diff --git a/pkg/monitors/network/pinger_test.go b/pkg/monitors/network/pinger_test.go
index 97bd983..63638aa 100644
--- a/pkg/monitors/network/pinger_test.go
+++ b/pkg/monitors/network/pinger_test.go
@@ -4,6 +4,8 @@ import (
 	"context"
 	"errors"
 	"net"
+	"os"
+	"strconv"
 	"testing"
 	"time"
 )
@@ -392,24 +394,51 @@ func TestNewDefaultPinger_UniqueID(t *testing.T) {
 	}
 }
 
+// icmpIntegrationRequired reports whether NODE_DOCTOR_ICMP_INTEGRATION is set to
+// a truthy value ("1"/"true"/etc). When true, TestDefaultPinger_Integration must
+// actually exercise the raw ICMP socket path and treats inability to do so as a
+// hard failure rather than a silent skip.
+func icmpIntegrationRequired() bool {
+	v, ok := os.LookupEnv("NODE_DOCTOR_ICMP_INTEGRATION")
+	if !ok {
+		return false
+	}
+	b, err := strconv.ParseBool(v)
+	return err == nil && b
+}
+
 // TestDefaultPinger_Integration is an integration test for the real pinger.
-// This test requires ICMP permissions and may not run in all environments.
-// It exercises both IPv4 and IPv6 loopback paths so the dual-stack rewrite
-// is exercised when run in privileged mode.
+// This test requires raw ICMP socket permissions (CAP_NET_RAW) because the
+// default pinger opens icmp.ListenPacket("ip4:icmp"/"ip6:ipv6-icmp"). It
+// exercises both IPv4 and IPv6 loopback paths so the dual-stack rewrite is
+// exercised when run in privileged mode.
+//
+// This test has three modes, gated in order:
 //
-// Gating: this lives in the default (non-tagged) test file but opens raw ICMP
-// sockets, so it must NOT run under `go test -short`. The testing.Short() guard
-// below is the gate of record — it keeps `-short` CI from attempting raw
-// sockets while still letting a normal `go test` run exercise the live path
-// where privileges allow. We keep it here (rather than behind the
-// //go:build integration tag used by cni_integration_test.go) because it is a
-// lightweight loopback check, not a cluster-dependent integration test, and the
-// existing Short() guard already satisfies the requirement with the least churn.
+//  1. `go test -short`  -> SKIP. The Short() guard keeps fast/local CI from
+//     attempting raw sockets at all. This is the local fast path.
+//  2. NODE_DOCTOR_ICMP_INTEGRATION set/truthy (and not short) -> MUST RUN OR
+//     FAIL. This is the dedicated privileged CI job: inability to open the
+//     socket, a permission error, or a probe timeout is a HARD FAILURE
+//     (t.Fatalf) so a misconfigured runner surfaces loudly instead of silently
+//     passing without exercising real ICMP. A genuinely successful ping passes.
+//  3. Neither short nor the env set (e.g. a dev `make test-all` on an
+//     unprivileged box) -> BEST-EFFORT. Socket/permission errors gracefully
+//     t.Skip so dev machines without CAP_NET_RAW are not broken.
+//
+// We keep this here (rather than behind the //go:build integration tag used by
+// cni_integration_test.go) because it is a lightweight loopback check, not a
+// cluster-dependent integration test.
 func TestDefaultPinger_Integration(t *testing.T) {
 	if testing.Short() {
 		t.Skip("Skipping integration test in short mode")
 	}
 
+	mustRun := icmpIntegrationRequired()
+	if mustRun {
+		t.Log("NODE_DOCTOR_ICMP_INTEGRATION set: ICMP socket/permission errors are HARD FAILURES")
+	}
+
 	cases := []struct {
 		name       string
 		target     string
@@ -429,10 +458,16 @@ func TestDefaultPinger_Integration(t *testing.T) {
 
 			if err != nil && (errors.Is(err, context.DeadlineExceeded) ||
 				errors.Is(err, context.Canceled)) {
+				if mustRun {
+					t.Fatalf("NODE_DOCTOR_ICMP_INTEGRATION set but ping to %s timed out/was canceled (CAP_NET_RAW or connectivity misconfigured): %v", tc.target, err)
+				}
 				t.Skipf("Skipping integration test due to permissions or timeout: %v", err)
 				return
 			}
 			if err != nil {
+				if mustRun {
+					t.Fatalf("NODE_DOCTOR_ICMP_INTEGRATION set but ping to %s failed to open raw ICMP socket / probe (CAP_NET_RAW required): %v", tc.target, err)
+				}
 				t.Logf("Warning: Ping failed (may require elevated privileges): %v", err)
 				t.Skip("Skipping test - ping requires elevated privileges")
 				return

From 5751627e2204b982ff05fb6487087d0da7ee21f2 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 03:37:32 -0500
Subject: [PATCH 13/38] test(network): cover resolveTarget hostname DNS path
 (Task #17237)

Add TestResolveTarget_HostnameDNSPath (localhost via /etc/hosts, no
network) exercising the net.ParseIP-fails -> net.LookupIP branch with
loopback/family/zone assertions, plus TestResolveTarget_HostnameResolution
Failure using an RFC 6761 .invalid name for the error branch. Test-only.
---
 pkg/monitors/network/pinger_test.go | 46 +++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/pkg/monitors/network/pinger_test.go b/pkg/monitors/network/pinger_test.go
index 63638aa..c3a217c 100644
--- a/pkg/monitors/network/pinger_test.go
+++ b/pkg/monitors/network/pinger_test.go
@@ -314,6 +314,52 @@ func TestResolveTarget(t *testing.T) {
 	}
 }
 
+// TestResolveTarget_HostnameDNSPath exercises the hostname-resolution branch of
+// resolveTarget (net.ParseIP fails -> net.LookupIP), which the IP-literal table
+// above does not reach. "localhost" resolves via /etc/hosts (no network
+// dependency, deterministic in CI). We assert a loopback address comes back and
+// that the reported family is consistent with the returned IP and never carries
+// a zone (hostname resolution must not invent one). The IP family of localhost
+// can vary by host (IPv4-preference yields 127.0.0.1 where an A record exists,
+// otherwise ::1), so we avoid pinning the exact address/family.
+func TestResolveTarget_HostnameDNSPath(t *testing.T) {
+	ip, zone, family, err := resolveTarget("localhost")
+	if err != nil {
+		t.Fatalf("resolveTarget(\"localhost\") returned error: %v", err)
+	}
+	if ip == nil {
+		t.Fatal("resolveTarget(\"localhost\") returned nil IP")
+	}
+	if !ip.IsLoopback() {
+		t.Errorf("resolveTarget(\"localhost\") IP = %q, want a loopback address", ip)
+	}
+	if zone != "" {
+		t.Errorf("hostname resolution invented a zone %q, want empty", zone)
+	}
+	// Family must match the actual returned address: IPv4-preference returns a
+	// 4-byte address tagged ipv4; otherwise an IPv6 loopback tagged ipv6.
+	if ip.To4() != nil {
+		if family != FamilyIPv4 {
+			t.Errorf("family = %q for IPv4 loopback, want %q", family, FamilyIPv4)
+		}
+	} else if family != FamilyIPv6 {
+		t.Errorf("family = %q for IPv6 loopback, want %q", family, FamilyIPv6)
+	}
+}
+
+// TestResolveTarget_HostnameResolutionFailure exercises the error branch of the
+// hostname path. The ".invalid" TLD is reserved by RFC 6761 to always fail
+// resolution, so this is deterministic and does not depend on external DNS.
+func TestResolveTarget_HostnameResolutionFailure(t *testing.T) {
+	ip, zone, family, err := resolveTarget("node-doctor-nonexistent.invalid")
+	if err == nil {
+		t.Fatalf("resolveTarget of an unresolvable .invalid name succeeded: ip=%v zone=%q family=%q", ip, zone, family)
+	}
+	if ip != nil {
+		t.Errorf("expected nil IP on resolution failure, got %v", ip)
+	}
+}
+
 // TestDestAddr verifies the zone reaches the *net.IPAddr used for sending.
 // Actually transmitting link-local ICMP requires privileges and a real
 // interface, so we unit-test the destination builder instead.

From c04bfffbaef3280b5c1a26b8103b24628a7aa102 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 03:42:58 -0500
Subject: [PATCH 14/38] feat(network): metric-based default-route selection
 v4+v6 (Task #17240)

Both default-route parsers now scan all default routes and select the
lowest-metric one (kernel behavior) instead of first-seen. IPv4 metric =
/proc/net/route col 6 decimal; IPv6 metric = /proc/net/ipv6_route col 5
hex. Tie -> first-seen; malformed metric -> treated as max (line still
eligible if sole default). New dedicated multi-default fixtures (shared
ipv6_route fixture untouched to protect #17239 tests).
---
 pkg/monitors/network/gateway.go               | 109 +++++++++++++++---
 pkg/monitors/network/gateway_test.go          |  96 ++++++++++++++-
 .../proc/net/ipv6_route_multi_default         |   3 +
 .../testdata/proc/net/route_multi_default     |   4 +
 4 files changed, 189 insertions(+), 23 deletions(-)
 create mode 100644 pkg/monitors/network/testdata/proc/net/ipv6_route_multi_default
 create mode 100644 pkg/monitors/network/testdata/proc/net/route_multi_default

diff --git a/pkg/monitors/network/gateway.go b/pkg/monitors/network/gateway.go
index d99312a..2145951 100644
--- a/pkg/monitors/network/gateway.go
+++ b/pkg/monitors/network/gateway.go
@@ -7,6 +7,7 @@ import (
 	"encoding/hex"
 	"fmt"
 	"io"
+	"math"
 	"net"
 	"os"
 	"strconv"
@@ -470,8 +471,18 @@ func detectDefaultGatewayFromFile(path string) (string, error) {
 }
 
 // detectDefaultGatewayFromReader parses /proc/net/route content and returns
-// the first default gateway it finds. The reader must include the header line
-// the kernel emits first; that line is skipped before route entries are read.
+// the gateway of the default route with the LOWEST metric. The kernel routes
+// traffic through the lowest-metric default route when several exist (multi-NIC,
+// failover), so node-doctor mirrors that selection. The reader must include the
+// header line the kernel emits first; that line is skipped before route entries
+// are read.
+//
+// The Metric column (index 6, 0-based) in /proc/net/route is a plain base-10
+// integer string (the kernel formats it with %d), so it is parsed as decimal.
+// On ties, the first-seen default route wins. A line whose Metric field cannot
+// be parsed is treated as having the maximum metric so it never wins over a
+// well-formed route, but is still eligible if it is the only default route — the
+// gateway hex itself is still validated before the value is returned.
 func detectDefaultGatewayFromReader(r io.Reader) (string, error) {
 	scanner := bufio.NewScanner(r)
 
@@ -480,13 +491,19 @@ func detectDefaultGatewayFromReader(r io.Reader) (string, error) {
 		return "", fmt.Errorf("route table is empty")
 	}
 
+	var (
+		bestGateway string
+		bestMetric  int64
+		found       bool
+	)
+
 	// Parse route entries
 	for scanner.Scan() {
 		line := scanner.Text()
 		fields := strings.Fields(line)
 
 		// Route table format: Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT
-		// We need at least 8 fields
+		// We need at least 7 fields to read the Metric column (index 6).
 		if len(fields) < 8 {
 			continue
 		}
@@ -494,14 +511,30 @@ func detectDefaultGatewayFromReader(r io.Reader) (string, error) {
 		destination := fields[1]
 		gateway := fields[2]
 
-		// Default route has destination 00000000
-		if destination == "00000000" && gateway != "00000000" {
-			// Parse gateway hex string to IP
-			gatewayIP, err := hexToIP(gateway)
-			if err != nil {
-				return "", fmt.Errorf("failed to parse gateway hex %s: %w", gateway, err)
-			}
-			return gatewayIP, nil
+		// Default route has destination 00000000 and a non-zero gateway.
+		if destination != "00000000" || gateway == "00000000" {
+			continue
+		}
+
+		// Parse the gateway hex up front so a malformed gateway is rejected
+		// even when it is the only default route present.
+		gatewayIP, err := hexToIP(gateway)
+		if err != nil {
+			return "", fmt.Errorf("failed to parse gateway hex %s: %w", gateway, err)
+		}
+
+		// Metric column is a base-10 integer. A malformed metric is treated as
+		// the maximum value so a well-formed lower-metric route always wins.
+		metric, err := strconv.ParseInt(fields[6], 10, 64)
+		if err != nil {
+			metric = math.MaxInt64
+		}
+
+		// First-seen wins on equal metric (strict less-than comparison).
+		if !found || metric < bestMetric {
+			bestGateway = gatewayIP
+			bestMetric = metric
+			found = true
 		}
 	}
 
@@ -509,7 +542,11 @@ func detectDefaultGatewayFromReader(r io.Reader) (string, error) {
 		return "", fmt.Errorf("error reading route table: %w", err)
 	}
 
-	return "", fmt.Errorf("no default gateway found in route table")
+	if !found {
+		return "", fmt.Errorf("no default gateway found in route table")
+	}
+
+	return bestGateway, nil
 }
 
 // detectDefaultIPv6GatewayFromFile opens the given path and parses it as a
@@ -526,9 +563,11 @@ func detectDefaultIPv6GatewayFromFile(path string) (string, error) {
 }
 
 // detectDefaultIPv6GatewayFromReader parses /proc/net/ipv6_route content and
-// returns the first default route's next-hop. Unlike /proc/net/route, the
-// IPv6 route table does NOT begin with a header line — every line is a route
-// entry. The kernel format is space-separated:
+// returns the next-hop of the default route with the LOWEST metric. As with the
+// IPv4 table, the kernel routes traffic through the lowest-metric default route
+// when several exist, so node-doctor mirrors that selection. Unlike
+// /proc/net/route, the IPv6 route table does NOT begin with a header line —
+// every line is a route entry. The kernel format is space-separated:
 //
 //	dest(32 hex)  prefix(2)  src(32)  src_prefix(2)  next_hop(32)  metric(8)
 //	ref(8)        use(8)     flags(8)  iface
@@ -536,15 +575,28 @@ func detectDefaultIPv6GatewayFromFile(path string) (string, error) {
 // A default route has destination = all-zero and prefix = 0x00. Lines whose
 // next-hop is all-zero are link-scoped on-link routes (no gateway) and are
 // skipped.
+//
+// The metric column (index 5, 0-based) is an 8-character HEX value (the kernel
+// formats it with %08x), so it is parsed as base 16. On ties, the first-seen
+// default route wins. A line whose metric field cannot be parsed is treated as
+// the maximum metric so a well-formed lower-metric route always wins, but it is
+// still eligible if it is the only default route — the next-hop hex is still
+// validated before the value is returned.
 func detectDefaultIPv6GatewayFromReader(r io.Reader) (string, error) {
 	scanner := bufio.NewScanner(r)
 
+	var (
+		bestGateway string
+		bestMetric  uint64
+		found       bool
+	)
+
 	for scanner.Scan() {
 		line := scanner.Text()
 		fields := strings.Fields(line)
 
-		// Need at least dest, prefix, src, src_prefix, next_hop
-		if len(fields) < 5 {
+		// Need at least dest, prefix, src, src_prefix, next_hop, metric
+		if len(fields) < 6 {
 			continue
 		}
 
@@ -561,18 +613,37 @@ func detectDefaultIPv6GatewayFromReader(r io.Reader) (string, error) {
 			continue
 		}
 
+		// Validate the next-hop hex up front so a malformed gateway is rejected
+		// even when it is the only default route present.
 		gatewayIP, err := hexToIPv6(nextHop)
 		if err != nil {
 			return "", fmt.Errorf("failed to parse IPv6 gateway hex %s: %w", nextHop, err)
 		}
-		return gatewayIP, nil
+
+		// Metric column is an 8-hex value. A malformed metric is treated as the
+		// maximum value so a well-formed lower-metric route always wins.
+		metric, err := strconv.ParseUint(fields[5], 16, 64)
+		if err != nil {
+			metric = math.MaxUint64
+		}
+
+		// First-seen wins on equal metric (strict less-than comparison).
+		if !found || metric < bestMetric {
+			bestGateway = gatewayIP
+			bestMetric = metric
+			found = true
+		}
 	}
 
 	if err := scanner.Err(); err != nil {
 		return "", fmt.Errorf("error reading IPv6 route table: %w", err)
 	}
 
-	return "", fmt.Errorf("no default IPv6 gateway found in IPv6 route table")
+	if !found {
+		return "", fmt.Errorf("no default IPv6 gateway found in IPv6 route table")
+	}
+
+	return bestGateway, nil
 }
 
 // hexToIP converts a hex string (little-endian) to an IP address string.
diff --git a/pkg/monitors/network/gateway_test.go b/pkg/monitors/network/gateway_test.go
index 4b86ecd..fabd465 100644
--- a/pkg/monitors/network/gateway_test.go
+++ b/pkg/monitors/network/gateway_test.go
@@ -309,10 +309,40 @@ func TestDetectDefaultGateway(t *testing.T) {
 			wantErr: false,
 		},
 		{
-			name: "multiple interfaces - first default gateway",
+			// Equal-metric tie: first-seen default route wins.
+			name: "multiple interfaces equal metric - first default gateway",
 			routeData: "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n" +
 				"eth0\t00000000\t0101A8C0\t0003\t0\t0\t100\t00000000\t0\t0\t0\n" +
-				"wlan0\t00000000\t0A0AA8C0\t0003\t0\t0\t200\t00000000\t0\t0\t0\n",
+				"wlan0\t00000000\t0A0AA8C0\t0003\t0\t0\t100\t00000000\t0\t0\t0\n",
+			want:    "192.168.1.1",
+			wantErr: false,
+		},
+		{
+			// Lowest-metric default route wins even though it is NOT first in
+			// file order (proves metric-based selection, not first-wins).
+			name: "multiple defaults - lowest metric selected (not first)",
+			routeData: "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n" +
+				"eth0\t00000000\t0101A8C0\t0003\t0\t0\t200\t00000000\t0\t0\t0\n" +
+				"wlan0\t00000000\t0A0AA8C0\t0003\t0\t0\t50\t00000000\t0\t0\t0\n",
+			want:    "192.168.10.10",
+			wantErr: false,
+		},
+		{
+			// A malformed Metric field is treated as max metric, so the other
+			// well-formed default route is still selected.
+			name: "malformed metric on one default - other valid default selected",
+			routeData: "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n" +
+				"eth0\t00000000\t0101A8C0\t0003\t0\t0\tNOTANUM\t00000000\t0\t0\t0\n" +
+				"wlan0\t00000000\t0A0AA8C0\t0003\t0\t0\t300\t00000000\t0\t0\t0\n",
+			want:    "192.168.10.10",
+			wantErr: false,
+		},
+		{
+			// A single default route with a malformed Metric is still returned
+			// (graceful handling, no crash) since it is the only default route.
+			name: "single default with malformed metric still returned",
+			routeData: "Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n" +
+				"eth0\t00000000\t0101A8C0\t0003\t0\t0\tNOTANUM\t00000000\t0\t0\t0\n",
 			want:    "192.168.1.1",
 			wantErr: false,
 		},
@@ -470,11 +500,40 @@ func TestDetectDefaultIPv6Gateway(t *testing.T) {
 			errFrag: "in IPv6 route table",
 		},
 		{
-			name: "first default route wins",
+			// Equal-metric tie: the first-seen default route wins.
+			name: "equal metric tie - first default route wins",
 			content: "00000000000000000000000000000000 00 00000000000000000000000000000000 00 " +
 				"fe800000000000000000000000000001 00000400 00000003 00000000 00000003     eth0\n" +
 				"00000000000000000000000000000000 00 00000000000000000000000000000000 00 " +
-				"20010db800000000000000000000beef 00000800 00000001 00000000 00000003     eth1\n",
+				"20010db800000000000000000000beef 00000400 00000001 00000000 00000003     eth1\n",
+			want: "fe80::1",
+		},
+		{
+			// Lowest-metric default route wins even though it is NOT first in
+			// file order. Metric is parsed as HEX: 00000800=2048, 00000100=256.
+			name: "lowest metric default selected (not first)",
+			content: "00000000000000000000000000000000 00 00000000000000000000000000000000 00 " +
+				"fe800000000000000000000000000001 00000800 00000003 00000000 00000003     eth0\n" +
+				"00000000000000000000000000000000 00 00000000000000000000000000000000 00 " +
+				"20010db8000000000000000000000001 00000100 00000001 00000000 00000003     eth1\n",
+			want: "2001:db8::1",
+		},
+		{
+			// A malformed metric field is treated as max metric, so the other
+			// well-formed default route (higher position) is still selected.
+			name: "malformed metric on one default - other valid default selected",
+			content: "00000000000000000000000000000000 00 00000000000000000000000000000000 00 " +
+				"fe800000000000000000000000000001 zzzzzzzz 00000003 00000000 00000003     eth0\n" +
+				"00000000000000000000000000000000 00 00000000000000000000000000000000 00 " +
+				"20010db8000000000000000000000001 00000900 00000001 00000000 00000003     eth1\n",
+			want: "2001:db8::1",
+		},
+		{
+			// A single default route with a malformed metric is still returned
+			// (graceful handling, no crash) since it is the only default route.
+			name: "single default with malformed metric still returned",
+			content: "00000000000000000000000000000000 00 00000000000000000000000000000000 00 " +
+				"fe800000000000000000000000000001 zzzzzzzz 00000003 00000000 00000003     eth0\n",
 			want: "fe80::1",
 		},
 		{
@@ -537,6 +596,35 @@ func TestDetectDefaultIPv6Gateway(t *testing.T) {
 	})
 }
 
+// TestDetectDefaultGateway_MetricSelectionFixture verifies metric-based default
+// route selection against committed fixture files (one per family) that contain
+// MULTIPLE default routes with different metrics, where the lowest-metric route
+// is intentionally NOT first in file order.
+func TestDetectDefaultGateway_MetricSelectionFixture(t *testing.T) {
+	t.Run("ipv4 lowest-metric default selected from fixture", func(t *testing.T) {
+		got, err := detectDefaultGatewayFromFile("testdata/proc/net/route_multi_default")
+		if err != nil {
+			t.Fatalf("detectDefaultGatewayFromFile() error = %v", err)
+		}
+		// Lowest metric (50) is the second default route -> 192.168.10.10.
+		if want := "192.168.10.10"; got != want {
+			t.Errorf("detectDefaultGatewayFromFile() = %q, want %q", got, want)
+		}
+	})
+
+	t.Run("ipv6 lowest-metric default selected from fixture", func(t *testing.T) {
+		got, err := detectDefaultIPv6GatewayFromFile("testdata/proc/net/ipv6_route_multi_default")
+		if err != nil {
+			t.Fatalf("detectDefaultIPv6GatewayFromFile() error = %v", err)
+		}
+		// Lowest metric (0x100) is the last default route -> 2001:db8::1,
+		// even though fe80::1 (0x400) appears first.
+		if want := "2001:db8::1"; got != want {
+			t.Errorf("detectDefaultIPv6GatewayFromFile() = %q, want %q", got, want)
+		}
+	})
+}
+
 func TestGatewayMonitor_CheckGateway(t *testing.T) {
 	tests := []struct {
 		name                     string
diff --git a/pkg/monitors/network/testdata/proc/net/ipv6_route_multi_default b/pkg/monitors/network/testdata/proc/net/ipv6_route_multi_default
new file mode 100644
index 0000000..6b911cc
--- /dev/null
+++ b/pkg/monitors/network/testdata/proc/net/ipv6_route_multi_default
@@ -0,0 +1,3 @@
+00000000000000000000000000000000 00 00000000000000000000000000000000 00 fe800000000000000000000000000001 00000400 00000003 00000000 00000003     eth0
+fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000007 00000000 00000001     eth0
+00000000000000000000000000000000 00 00000000000000000000000000000000 00 20010db8000000000000000000000001 00000100 00000003 00000000 00000003     eth1
diff --git a/pkg/monitors/network/testdata/proc/net/route_multi_default b/pkg/monitors/network/testdata/proc/net/route_multi_default
new file mode 100644
index 0000000..b3e206d
--- /dev/null
+++ b/pkg/monitors/network/testdata/proc/net/route_multi_default
@@ -0,0 +1,4 @@
+Iface	Destination	Gateway 	Flags	RefCnt	Use	Metric	Mask		MTU	Window	IRTT
+eth0	00000000	0101A8C0	0003	0	0	200	00000000	0	0	0
+wlan0	00000000	0A0AA8C0	0003	0	0	50	00000000	0	0	0
+eth0	0000A8C0	00000000	0001	0	0	100	00FFFFFF	0	0	0

From fecf25d7e119aaaf041f5b9222c62d36665cb3b8 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 03:47:45 -0500
Subject: [PATCH 15/38] feat(network): warn when AAAA query drops A-only DNS
 checks (Task #17241)

When a custom DNS query is RecordType AAAA with TestEachNameserver and/or
ConsistencyCheck enabled (both A-only), emit a single Warning event
(AAAAFeatureUnsupported) per query naming the skipped feature(s) and
domain, instead of silently dropping them. AAAA lookup behavior and the
A-query path are unchanged. Tests cover each flag, both, neither, and A.
---
 pkg/monitors/network/dns.go      |  18 ++++
 pkg/monitors/network/dns_test.go | 165 +++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+)

diff --git a/pkg/monitors/network/dns.go b/pkg/monitors/network/dns.go
index 32137d8..94466ca 100644
--- a/pkg/monitors/network/dns.go
+++ b/pkg/monitors/network/dns.go
@@ -1608,6 +1608,24 @@ func (m *DNSMonitor) checkCustomQueries(ctx context.Context, status *types.Statu
 			continue
 		}
 
+		// TestEachNameserver and ConsistencyCheck are only implemented for A
+		// queries. For AAAA queries they are silently skipped, so emit a single
+		// warning per query naming the unsupported feature(s) and the domain.
+		if recordType == "AAAA" && (query.TestEachNameserver || query.ConsistencyCheck) {
+			var skipped []string
+			if query.TestEachNameserver {
+				skipped = append(skipped, "TestEachNameserver")
+			}
+			if query.ConsistencyCheck {
+				skipped = append(skipped, "ConsistencyCheck")
+			}
+			status.AddEvent(types.NewEvent(
+				types.EventWarning,
+				"AAAAFeatureUnsupported",
+				fmt.Sprintf("%s not supported for AAAA queries; skipping for domain %s (only the basic AAAA lookup runs)", strings.Join(skipped, " and "), query.Domain),
+			))
+		}
+
 		start := time.Now()
 		var resultCount int
 		var err error
diff --git a/pkg/monitors/network/dns_test.go b/pkg/monitors/network/dns_test.go
index 9a6fa95..b0d00ce 100644
--- a/pkg/monitors/network/dns_test.go
+++ b/pkg/monitors/network/dns_test.go
@@ -778,6 +778,171 @@ func TestCustomQueries(t *testing.T) {
 	}
 }
 
+// TestCustomQueriesAAAAUnsupportedFeatureWarning verifies that AAAA queries
+// configured with the A-only TestEachNameserver/ConsistencyCheck features emit
+// exactly one AAAAFeatureUnsupported warning (not one per nameserver) instead of
+// silently dropping the requested feature, while the basic AAAA lookup still runs.
+func TestCustomQueriesAAAAUnsupportedFeatureWarning(t *testing.T) {
+	const unsupportedReason = "AAAAFeatureUnsupported"
+
+	countReason := func(events []types.Event, reason string) int {
+		n := 0
+		for _, e := range events {
+			if e.Reason == reason {
+				n++
+			}
+		}
+		return n
+	}
+
+	tests := []struct {
+		name               string
+		query              DNSQuery
+		mockSetup          func(*mockResolver)
+		wantUnsupported    int
+		wantLookupMentions []string // substrings that must appear in the warning message
+		// checkAAAALookupRan asserts the basic AAAA lookup still happened by
+		// requiring no lookup-failure events (the mock returns "no such host"
+		// for an unconfigured lookup, which would surface as such an event).
+		checkAAAALookupRan bool
+	}{
+		{
+			name:  "AAAA with TestEachNameserver warns once and still resolves",
+			query: DNSQuery{Domain: "v6.example.com", RecordType: "AAAA", TestEachNameserver: true},
+			mockSetup: func(m *mockResolver) {
+				m.setIPResponse("ip6", "v6.example.com", []net.IP{net.ParseIP("2606:4700::1")})
+			},
+			wantUnsupported:    1,
+			wantLookupMentions: []string{"TestEachNameserver", "v6.example.com"},
+			checkAAAALookupRan: true,
+		},
+		{
+			name:  "AAAA with ConsistencyCheck warns once",
+			query: DNSQuery{Domain: "v6c.example.com", RecordType: "AAAA", ConsistencyCheck: true},
+			mockSetup: func(m *mockResolver) {
+				m.setIPResponse("ip6", "v6c.example.com", []net.IP{net.ParseIP("2606:4700::2")})
+			},
+			wantUnsupported:    1,
+			wantLookupMentions: []string{"ConsistencyCheck", "v6c.example.com"},
+		},
+		{
+			name:  "AAAA with both flags warns exactly once mentioning both",
+			query: DNSQuery{Domain: "v6b.example.com", RecordType: "AAAA", TestEachNameserver: true, ConsistencyCheck: true},
+			mockSetup: func(m *mockResolver) {
+				m.setIPResponse("ip6", "v6b.example.com", []net.IP{net.ParseIP("2606:4700::3")})
+			},
+			wantUnsupported:    1,
+			wantLookupMentions: []string{"TestEachNameserver", "ConsistencyCheck", "v6b.example.com"},
+		},
+		{
+			name:  "AAAA with neither flag does not warn",
+			query: DNSQuery{Domain: "v6plain.example.com", RecordType: "AAAA"},
+			mockSetup: func(m *mockResolver) {
+				m.setIPResponse("ip6", "v6plain.example.com", []net.IP{net.ParseIP("2606:4700::4")})
+			},
+			wantUnsupported: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mock := newMockResolver()
+			tt.mockSetup(mock)
+
+			monitor := &DNSMonitor{
+				name: "test-dns",
+				config: &DNSMonitorConfig{
+					ClusterDomains:   []string{},
+					ExternalDomains:  []string{},
+					CustomQueries:    []DNSQuery{tt.query},
+					LatencyThreshold: 1 * time.Second,
+					// Enabled so an A-query ConsistencyCheck path would activate;
+					// AAAA must still bypass it regardless of this config.
+					ConsistencyChecking: &ConsistencyCheckConfig{
+						Enabled:                true,
+						QueriesPerCheck:        3,
+						IntervalBetweenQueries: time.Millisecond,
+					},
+				},
+				resolver: mock,
+			}
+
+			ctx := context.Background()
+			status := &types.Status{Source: monitor.name, Timestamp: time.Now()}
+
+			monitor.checkCustomQueries(ctx, status)
+
+			if got := countReason(status.Events, unsupportedReason); got != tt.wantUnsupported {
+				t.Errorf("expected %d %s events, got %d (events: %+v)", tt.wantUnsupported, unsupportedReason, got, status.Events)
+			}
+
+			if tt.wantUnsupported > 0 {
+				var msg string
+				for _, e := range status.Events {
+					if e.Reason == unsupportedReason {
+						msg = e.Message
+						if e.Severity != types.EventWarning {
+							t.Errorf("expected %s severity, got %s", types.EventWarning, e.Severity)
+						}
+					}
+				}
+				for _, want := range tt.wantLookupMentions {
+					if !strings.Contains(msg, want) {
+						t.Errorf("warning message %q does not mention %q", msg, want)
+					}
+				}
+			}
+
+			if tt.checkAAAALookupRan {
+				// A configured, successful AAAA lookup produces no failure or
+				// no-records event; their absence confirms the lookup still ran.
+				for _, e := range status.Events {
+					if e.Reason == "CustomDNSQueryFailed" || e.Reason == "CustomDNSNoRecords" {
+						t.Errorf("AAAA lookup did not run as expected; got failure event: %+v", e)
+					}
+				}
+			}
+		})
+	}
+}
+
+// TestCustomQueriesAQueryNoAAAAWarning verifies that an A query with the
+// per-nameserver/consistency features enabled does not produce an
+// AAAAFeatureUnsupported warning (those features run on the A path as before).
+func TestCustomQueriesAQueryNoAAAAWarning(t *testing.T) {
+	mock := newMockResolver()
+	mock.setResponse("a.example.com", []string{"1.2.3.4"})
+
+	monitor := &DNSMonitor{
+		name: "test-dns",
+		config: &DNSMonitorConfig{
+			ClusterDomains:  []string{},
+			ExternalDomains: []string{},
+			CustomQueries: []DNSQuery{
+				{Domain: "a.example.com", RecordType: "A", TestEachNameserver: true, ConsistencyCheck: true},
+			},
+			LatencyThreshold: 1 * time.Second,
+			ConsistencyChecking: &ConsistencyCheckConfig{
+				Enabled:                true,
+				QueriesPerCheck:        3,
+				IntervalBetweenQueries: time.Millisecond,
+			},
+		},
+		resolver: mock,
+	}
+
+	ctx := context.Background()
+	status := &types.Status{Source: monitor.name, Timestamp: time.Now()}
+
+	monitor.checkCustomQueries(ctx, status)
+
+	for _, e := range status.Events {
+		if e.Reason == "AAAAFeatureUnsupported" {
+			t.Errorf("A query unexpectedly produced AAAAFeatureUnsupported event: %+v", e)
+		}
+	}
+}
+
 // TestNameserverChecks tests nameserver verification error handling.
 // Note: This primarily tests error paths since checkNameservers creates its own net.Resolver.
 func TestNameserverChecks(t *testing.T) {

From 75b543e4b7ff6b80faf3d51482bbb916d8246fe5 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 03:50:07 -0500
Subject: [PATCH 16/38] test(network): AAAA scoped/link-local address coverage
 (Task #17242)

Add TestCheckCustomQueries_AAAAScopedAddresses asserting the AAAA custom-
query path treats scoped/non-global IPv6 results (fe80::/10 link-local,
fc00::/7 ULA, ::1 loopback, and a scoped+global mix) as a successful
lookup emitting no error/no-records events. Test-only.
---
 pkg/monitors/network/dns_test.go | 52 ++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/pkg/monitors/network/dns_test.go b/pkg/monitors/network/dns_test.go
index b0d00ce..9b99414 100644
--- a/pkg/monitors/network/dns_test.go
+++ b/pkg/monitors/network/dns_test.go
@@ -1398,6 +1398,58 @@ func TestCheckCustomQueries(t *testing.T) {
 	}
 }
 
+// TestCheckCustomQueries_AAAAScopedAddresses covers the AAAA path when the
+// resolver returns non-global / scoped IPv6 addresses (link-local fe80::/10,
+// unique-local fc00::/7, loopback ::1, and a multi-address mix). These are
+// valid resolution results and must be treated as a successful AAAA lookup —
+// no error / no-records event — i.e. the monitor must not reject scoped
+// addresses. Spawned from Task #17201 (AAAA probe path).
+func TestCheckCustomQueries_AAAAScopedAddresses(t *testing.T) {
+	tests := []struct {
+		name string
+		ips  []net.IP
+	}{
+		{name: "link-local only", ips: []net.IP{net.ParseIP("fe80::1")}},
+		{name: "unique-local only", ips: []net.IP{net.ParseIP("fc00::1")}},
+		{name: "unique-local fd00", ips: []net.IP{net.ParseIP("fd12:3456::1")}},
+		{name: "ipv6 loopback", ips: []net.IP{net.ParseIP("::1")}},
+		{name: "mix of scoped and global", ips: []net.IP{net.ParseIP("fe80::1"), net.ParseIP("2606:4700::1")}},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			for _, ip := range tt.ips {
+				if ip == nil {
+					t.Fatal("test setup error: nil IP in fixture")
+				}
+			}
+
+			mock := newMockResolver()
+			mock.ipResponses["ip6|scoped.example.com"] = tt.ips
+
+			monitor := &DNSMonitor{
+				config: &DNSMonitorConfig{
+					CustomQueries:    []DNSQuery{{Domain: "scoped.example.com", RecordType: "AAAA"}},
+					LatencyThreshold: 500 * time.Millisecond,
+				},
+				resolver: mock,
+			}
+
+			status := types.NewStatus("test-dns")
+			monitor.checkCustomQueries(context.Background(), status)
+
+			// A successful AAAA resolution of scoped addresses emits no events.
+			if len(status.Events) != 0 {
+				reasons := make([]string, len(status.Events))
+				for i, e := range status.Events {
+					reasons[i] = e.Reason
+				}
+				t.Errorf("scoped AAAA resolution emitted unexpected events %v; want none (scoped addresses must count as a successful lookup)", reasons)
+			}
+		})
+	}
+}
+
 // TestParseDNSConfigTestEachNameserver tests parsing of testEachNameserver field.
 func TestParseDNSConfigTestEachNameserver(t *testing.T) {
 	tests := []struct {

From 9e46ce43491859196dced7214530f9a4ec4bb0ef Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 03:51:18 -0500
Subject: [PATCH 17/38] test(network): assert AAAA record type appears in DNS
 event messages (Task #17243)

Add TestCheckCustomQueries_AAAARecordTypeInEventMessage asserting the
failure, no-records, and high-latency events for AAAA custom queries
include the "AAAA" record-type string in their message. Test-only.
---
 pkg/monitors/network/dns_test.go | 64 ++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/pkg/monitors/network/dns_test.go b/pkg/monitors/network/dns_test.go
index 9b99414..2ae0161 100644
--- a/pkg/monitors/network/dns_test.go
+++ b/pkg/monitors/network/dns_test.go
@@ -1450,6 +1450,70 @@ func TestCheckCustomQueries_AAAAScopedAddresses(t *testing.T) {
 	}
 }
 
+// TestCheckCustomQueries_AAAARecordTypeInEventMessage asserts that events
+// emitted for AAAA custom queries name the record type ("AAAA") in their
+// message, so operators can tell IPv6 query results apart from A in logs/
+// events. Covers the failure, no-records, and high-latency event paths.
+// Spawned from Task #17201 (AAAA probe path).
+func TestCheckCustomQueries_AAAARecordTypeInEventMessage(t *testing.T) {
+	tests := []struct {
+		name       string
+		setupMock  func(*mockResolver)
+		wantReason string
+	}{
+		{
+			name:       "failure event names AAAA",
+			setupMock:  func(m *mockResolver) { m.ipErrors["ip6|v6.example.com"] = fmt.Errorf("no such host") },
+			wantReason: "CustomDNSQueryFailed",
+		},
+		{
+			name:       "no-records event names AAAA",
+			setupMock:  func(m *mockResolver) { m.ipResponses["ip6|v6.example.com"] = []net.IP{} },
+			wantReason: "CustomDNSNoRecords",
+		},
+		{
+			name: "high-latency event names AAAA",
+			setupMock: func(m *mockResolver) {
+				m.ipResponses["ip6|v6.example.com"] = []net.IP{net.ParseIP("2606:4700::1")}
+				m.latencies["v6.example.com"] = 2 * time.Second
+			},
+			wantReason: "HighCustomDNSLatency",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mock := newMockResolver()
+			tt.setupMock(mock)
+
+			monitor := &DNSMonitor{
+				config: &DNSMonitorConfig{
+					CustomQueries:    []DNSQuery{{Domain: "v6.example.com", RecordType: "AAAA"}},
+					LatencyThreshold: 500 * time.Millisecond,
+				},
+				resolver: mock,
+			}
+
+			status := types.NewStatus("test-dns")
+			monitor.checkCustomQueries(context.Background(), status)
+
+			var found bool
+			for _, e := range status.Events {
+				if e.Reason != tt.wantReason {
+					continue
+				}
+				found = true
+				if !strings.Contains(e.Message, "AAAA") {
+					t.Errorf("%s message %q does not contain the record type \"AAAA\"", tt.wantReason, e.Message)
+				}
+			}
+			if !found {
+				t.Fatalf("expected an event with reason %s; got %d events", tt.wantReason, len(status.Events))
+			}
+		})
+	}
+}
+
 // TestParseDNSConfigTestEachNameserver tests parsing of testEachNameserver field.
 func TestParseDNSConfigTestEachNameserver(t *testing.T) {
 	tests := []struct {

From 5cdb9e74064a552a646b10c6cc46906f73b8a66e Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 03:55:40 -0500
Subject: [PATCH 18/38] fix(network): refresh CNI peer Family on re-IP/toggle
 (Task #17247)

The #17202 carry-forward made PeerStatus.Family monotonic-sticky: a fresh
probe resolving a different family (peer re-IP v4<->v6, overlay-test
toggle) was ignored. Now the most recent check's resolved family wins;
the prior family is retained only when a check resolves no family at all
(transient failure / pinger error early-return), preserving #17202
behavior. Tests cover both re-IP directions + carry-forward boundary.
---
 pkg/monitors/network/cni.go      | 46 +++++++++++-----
 pkg/monitors/network/cni_test.go | 90 ++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 14 deletions(-)

diff --git a/pkg/monitors/network/cni.go b/pkg/monitors/network/cni.go
index 1bb0535..3de33a6 100644
--- a/pkg/monitors/network/cni.go
+++ b/pkg/monitors/network/cni.go
@@ -107,9 +107,13 @@ type PeerStatus struct {
 	LastSuccess      time.Time
 	ConsecutiveFails int
 	// Family is the address family observed on the most recent probe that
-	// resolved a target ("ipv4" or "ipv6"). It is captured from PingResult.Family
-	// and preserved across attempts that fail before the pinger can select a
-	// family (e.g., resolution failure with a nil pinger).
+	// resolved a target ("ipv4" or "ipv6"). It is captured from PingResult.Family.
+	// The most recent check that resolves a family always wins, so a peer re-IP
+	// (v4->v6 or v6->v4) or an overlay-test target toggle is reflected immediately.
+	// The prior family is retained only across checks where no result resolved a
+	// family at all (e.g., a pinger-level error or resolution failure before a
+	// family is selected), so a transient probe failure does not erase what we
+	// already know about the peer's address family.
 	Family string
 }
 
@@ -639,10 +643,6 @@ func (m *CNIMonitor) checkPeerConnectivity(ctx context.Context, peer Peer) *Peer
 	if exists {
 		peerStatus.ConsecutiveFails = existingStatus.ConsecutiveFails
 		peerStatus.FailureCount = existingStatus.FailureCount
-		// Carry forward the previously-observed family so a transient probe
-		// failure (which may produce no family signal) does not erase what we
-		// already know about the peer's address family.
-		peerStatus.Family = existingStatus.Family
 	}
 
 	// Determine which IP to ping based on overlay test mode
@@ -659,20 +659,31 @@ func (m *CNIMonitor) checkPeerConnectivity(ctx context.Context, peer Peer) *Peer
 		peerStatus.Reachable = false
 		peerStatus.ConsecutiveFails++
 		peerStatus.FailureCount++
+		// No results were produced, so this check resolved no family. Preserve the
+		// previously-observed family across this transient failure.
+		if exists {
+			peerStatus.Family = existingStatus.Family
+		}
 		return peerStatus
 	}
 
-	// Analyze ping results
+	// Analyze ping results.
+	//
+	// Resolve the address family from THIS check's results: a fresh probe that
+	// resolves a family always wins, so a peer re-IP (v4<->v6) or an overlay-test
+	// target toggle is reflected immediately rather than sticking to a stale
+	// value. All results in a batch target the same IP, so families are uniform;
+	// we take the first non-empty value (which also picks up failures that still
+	// resolved a family, e.g., a timeout after the listener bound). Only when no
+	// result resolved a family do we carry forward the previously-observed family
+	// so a transient probe failure does not erase what we already know.
 	successCount := 0
 	var totalRTT time.Duration
+	var freshFamily string
 
 	for _, result := range results {
-		// Capture address family from the first result that reports one. All
-		// results in a batch target the same IP, so families are uniform; we
-		// take the first non-empty value to also pick up failures that still
-		// resolved a family (e.g., timeout after the listener bound).
-		if peerStatus.Family == "" && result.Family != "" {
-			peerStatus.Family = result.Family
+		if freshFamily == "" && result.Family != "" {
+			freshFamily = result.Family
 		}
 		if result.Success {
 			successCount++
@@ -681,6 +692,13 @@ func (m *CNIMonitor) checkPeerConnectivity(ctx context.Context, peer Peer) *Peer
 		}
 	}
 
+	// Fresh family wins; fall back to the prior family only when this check
+	// resolved none.
+	peerStatus.Family = freshFamily
+	if peerStatus.Family == "" && exists {
+		peerStatus.Family = existingStatus.Family
+	}
+
 	// Majority of pings must succeed
 	if successCount > len(results)/2 {
 		peerStatus.Reachable = true
diff --git a/pkg/monitors/network/cni_test.go b/pkg/monitors/network/cni_test.go
index acd40bc..1c86e5b 100644
--- a/pkg/monitors/network/cni_test.go
+++ b/pkg/monitors/network/cni_test.go
@@ -1324,3 +1324,93 @@ func TestCNIMonitor_FamilyMixedResultsTakesFirstNonEmpty(t *testing.T) {
 		t.Errorf("Family = %q, want %q", status.Family, FamilyIPv4)
 	}
 }
+
+// TestCNIMonitor_FamilyRefreshesOnReIP verifies that a fresh probe which
+// resolves a family always wins over a previously-observed family. This covers
+// a peer re-IP (v4<->v6) and an overlay-test target toggle: the family must not
+// be monotonic-sticky (Task #17247 regression).
+func TestCNIMonitor_FamilyRefreshesOnReIP(t *testing.T) {
+	tests := []struct {
+		name        string
+		priorFamily string
+		pingResults []PingResult
+		wantFamily  string
+		wantReach   bool
+	}{
+		{
+			name:        "ipv4 to ipv6 re-IP refreshes family",
+			priorFamily: FamilyIPv4,
+			pingResults: []PingResult{
+				{Success: true, RTT: 8 * time.Millisecond, Family: FamilyIPv6},
+				{Success: true, RTT: 9 * time.Millisecond, Family: FamilyIPv6},
+				{Success: true, RTT: 10 * time.Millisecond, Family: FamilyIPv6},
+			},
+			wantFamily: FamilyIPv6,
+			wantReach:  true,
+		},
+		{
+			name:        "ipv6 to ipv4 re-IP refreshes family",
+			priorFamily: FamilyIPv6,
+			pingResults: []PingResult{
+				{Success: true, RTT: 8 * time.Millisecond, Family: FamilyIPv4},
+				{Success: true, RTT: 9 * time.Millisecond, Family: FamilyIPv4},
+				{Success: true, RTT: 10 * time.Millisecond, Family: FamilyIPv4},
+			},
+			wantFamily: FamilyIPv4,
+			wantReach:  true,
+		},
+		{
+			name:        "fresh family wins even when pings fail but listener bound",
+			priorFamily: FamilyIPv4,
+			pingResults: []PingResult{
+				{Success: false, Error: errors.New("timeout"), Family: FamilyIPv6},
+				{Success: false, Error: errors.New("timeout"), Family: FamilyIPv6},
+				{Success: false, Error: errors.New("timeout"), Family: FamilyIPv6},
+			},
+			wantFamily: FamilyIPv6,
+			wantReach:  false,
+		},
+		{
+			name:        "all-failure check with no family carries prior family forward",
+			priorFamily: FamilyIPv4,
+			pingResults: []PingResult{
+				{Success: false, Error: errors.New("timeout")},
+				{Success: false, Error: errors.New("timeout")},
+				{Success: false, Error: errors.New("timeout")},
+			},
+			wantFamily: FamilyIPv4,
+			wantReach:  false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			peer := Peer{Name: "peer-1", NodeName: "node-2", NodeIP: "10.0.0.2"}
+			monitor := &CNIMonitor{
+				name: "test-cni",
+				config: &CNIMonitorConfig{
+					Connectivity: ConnectivityConfig{
+						PingCount:   3,
+						PingTimeout: 5 * time.Second,
+					},
+				},
+				pinger: newMockPinger(tt.pingResults, nil),
+				peerStatuses: map[string]*PeerStatus{
+					// Seed with a prior status that observed a (now stale) family.
+					"node-2": {
+						Peer:   peer,
+						Family: tt.priorFamily,
+					},
+				},
+			}
+
+			status := monitor.checkPeerConnectivity(context.Background(), peer)
+			if status.Reachable != tt.wantReach {
+				t.Errorf("Reachable = %v, want %v", status.Reachable, tt.wantReach)
+			}
+			if status.Family != tt.wantFamily {
+				t.Errorf("Family = %q, want %q", status.Family, tt.wantFamily)
+			}
+		})
+	}
+}

From 3c6b49b89fc73e6761c894db0402c0586c240dc2 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:00:58 -0500
Subject: [PATCH 19/38] test(controller): fix TestCorrelationDetectionFlow
 port/start failures (Task #17232)

Root cause: the test started the controller server on hardcoded 0.0.0.0:
8080 only to run the correlator's background loop; the bind collided with
a port already in use, and two subtests ignored the Start() error so the
correlator loop never ran -> no correlations -> misleading assertion
failure. Fix (test-only): use an ephemeral port (Port=0), assert
Start() succeeds in every subtest, and replace fixed sleeps with the
Eventually poll helper. Assertions unchanged.
---
 .../controller/controller_integration_test.go | 65 ++++++++++++-------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/test/integration/controller/controller_integration_test.go b/test/integration/controller/controller_integration_test.go
index 486250a..83991c6 100644
--- a/test/integration/controller/controller_integration_test.go
+++ b/test/integration/controller/controller_integration_test.go
@@ -201,6 +201,14 @@ func TestCorrelationDetectionFlow(t *testing.T) {
 	config.Correlation.Enabled = true
 	config.Correlation.ClusterWideThreshold = 0.3 // 30% threshold
 	config.Correlation.EvaluationInterval = 100 * time.Millisecond
+	// Use an ephemeral port (0) instead of the hardcoded default 8080 so that
+	// server.Start() binds a free OS-assigned port. The test's actual HTTP
+	// traffic goes through the httptest server (ts.URL); server.Start() is only
+	// called to launch the correlator's background evaluation loop. Binding 8080
+	// makes the test non-hermetic: it collides across subtests/repeats and fails
+	// outright when 8080 is already in use (e.g. in CI). Port 0 keeps every
+	// subtest's Start() independent and parallel-safe.
+	config.Server.Port = 0
 
 	t.Run("infrastructure correlation detection", func(t *testing.T) {
 		server, ts, cleanup := createTestServer(t, config)
@@ -238,21 +246,27 @@ func TestCorrelationDetectionFlow(t *testing.T) {
 			test.AssertEqual(t, http.StatusAccepted, resp.StatusCode, "Report should be accepted")
 		}
 
-		// Wait for correlation evaluation
-		time.Sleep(300 * time.Millisecond)
+		// Poll for the correlator's background evaluation to detect the pattern
+		// rather than relying on a fixed sleep, which is racy against the ticker.
+		// 3/5 nodes (60%) report the "dns" problem, exceeding the 30% threshold,
+		// so an infrastructure correlation must appear once a cycle completes.
+		test.Eventually(t, func() bool {
+			resp, result := doRequest(t, ts, http.MethodGet, "/api/v1/correlations", nil)
+			if resp.StatusCode != http.StatusOK {
+				return false
+			}
+			correlations, ok := result["data"].([]interface{})
+			return ok && len(correlations) > 0
+		}, 3*time.Second, 50*time.Millisecond, "Should detect infrastructure correlation")
 
-		// Check for correlations
+		// Verify the detected correlation is of type "infrastructure".
 		resp, result := doRequest(t, ts, http.MethodGet, "/api/v1/correlations", nil)
 		test.AssertEqual(t, http.StatusOK, resp.StatusCode, "Correlations request should succeed")
-
 		correlations := result["data"].([]interface{})
 		test.AssertTrue(t, len(correlations) > 0, "Should detect infrastructure correlation")
-
-		if len(correlations) > 0 {
-			corr := correlations[0].(map[string]interface{})
-			test.AssertEqual(t, "infrastructure", corr["type"],
-				"Correlation type should be 'infrastructure'")
-		}
+		corr := correlations[0].(map[string]interface{})
+		test.AssertEqual(t, "infrastructure", corr["type"],
+			"Correlation type should be 'infrastructure'")
 	})
 
 	t.Run("common cause correlation detection", func(t *testing.T) {
@@ -260,7 +274,8 @@ func TestCorrelationDetectionFlow(t *testing.T) {
 		defer cleanup()
 
 		ctx := context.Background()
-		server.Start(ctx)
+		err := server.Start(ctx)
+		test.AssertNoError(t, err, "Failed to start server")
 		defer server.Stop(ctx)
 
 		// Submit reports with related problems (memory + disk pressure)
@@ -279,16 +294,21 @@ func TestCorrelationDetectionFlow(t *testing.T) {
 			test.AssertEqual(t, http.StatusAccepted, resp.StatusCode, "Report should be accepted")
 		}
 
-		// Wait for correlation evaluation
-		time.Sleep(300 * time.Millisecond)
-
-		// Check for correlations
-		resp, result := doRequest(t, ts, http.MethodGet, "/api/v1/correlations", nil)
-		test.AssertEqual(t, http.StatusOK, resp.StatusCode, "Correlations request should succeed")
-
-		correlations := result["data"].([]interface{})
-		// Should detect at least infrastructure correlation since 100% of nodes have same problems
-		test.AssertTrue(t, len(correlations) > 0, "Should detect some correlation")
+		// Poll for correlation evaluation rather than sleeping a fixed amount: the
+		// correlator runs on a background ticker (EvaluationInterval=100ms), so the
+		// number of completed cycles by any fixed deadline is non-deterministic.
+		// All 3 nodes report MemoryPressure+DiskPressure, which triggers both an
+		// infrastructure correlation (100% of nodes share each problem type) and the
+		// "resource-exhaustion" common-cause pattern, so at least one correlation
+		// must appear once an evaluation cycle completes.
+		test.Eventually(t, func() bool {
+			resp, result := doRequest(t, ts, http.MethodGet, "/api/v1/correlations", nil)
+			if resp.StatusCode != http.StatusOK {
+				return false
+			}
+			correlations, ok := result["data"].([]interface{})
+			return ok && len(correlations) > 0
+		}, 3*time.Second, 50*time.Millisecond, "Should detect some correlation")
 	})
 
 	t.Run("correlation resolution when nodes recover", func(t *testing.T) {
@@ -296,7 +316,8 @@ func TestCorrelationDetectionFlow(t *testing.T) {
 		defer cleanup()
 
 		ctx := context.Background()
-		server.Start(ctx)
+		err := server.Start(ctx)
+		test.AssertNoError(t, err, "Failed to start server")
 		defer server.Stop(ctx)
 
 		// First, create a problem state

From 6a03ab6ac8b92b1f039e0fa11a8d5ec951fe08cd Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:15:00 -0500
Subject: [PATCH 20/38] feat(metrics): add address_family label to network
 metrics (Task #17216)

Add address_family label (ipv4|ipv6|unknown) to GatewayLatencySeconds,
Peer{Latency,LatencyAvg,Reachable}, and DNSLatencySeconds. Add
AddressFamily to types.{PeerLatency,DNSLatency,APIServerLatency}
(GatewayLatency already had it) and populate from PeerStatus.Family,
DNS RecordType (AAAA->ipv6), and the classified API-server endpoint.
familyLabel() normalizes empty/unexpected -> unknown so the label is
never empty. No import cycle (monitors use existing family consts;
types untouched re imports). Tests for label emission + classifier.
---
 pkg/exporters/prometheus/exporter.go      |  25 +++-
 pkg/exporters/prometheus/metrics.go       |  10 +-
 pkg/exporters/prometheus/metrics_test.go  | 142 ++++++++++++++++++++++
 pkg/monitors/kubernetes/apiserver.go      |  34 +++++-
 pkg/monitors/kubernetes/apiserver_test.go |  26 ++++
 pkg/monitors/network/cni.go               |  11 +-
 pkg/monitors/network/dns.go               |  19 ++-
 pkg/types/types.go                        |  15 +++
 8 files changed, 259 insertions(+), 23 deletions(-)

diff --git a/pkg/exporters/prometheus/exporter.go b/pkg/exporters/prometheus/exporter.go
index 8cd47db..5bbaa22 100644
--- a/pkg/exporters/prometheus/exporter.go
+++ b/pkg/exporters/prometheus/exporter.go
@@ -227,7 +227,7 @@ func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 		latencySeconds := gw.LatencyMs / 1000.0
 
 		e.metrics.GatewayLatencySeconds.WithLabelValues(
-			e.nodeName, gw.GatewayIP).Set(latencySeconds)
+			e.nodeName, gw.GatewayIP, familyLabel(gw.AddressFamily)).Set(latencySeconds)
 
 		e.metrics.GatewayLatencyHistogram.WithLabelValues(
 			e.nodeName, gw.GatewayIP).Observe(latencySeconds)
@@ -240,11 +240,13 @@ func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 			latencySeconds := peer.LatencyMs / 1000.0
 			avgLatencySeconds := peer.AvgLatencyMs / 1000.0
 
+			family := familyLabel(peer.AddressFamily)
+
 			e.metrics.PeerLatencySeconds.WithLabelValues(
-				e.nodeName, peer.PeerNode, peer.PeerIP).Set(latencySeconds)
+				e.nodeName, peer.PeerNode, peer.PeerIP, family).Set(latencySeconds)
 
 			e.metrics.PeerLatencyAvgSeconds.WithLabelValues(
-				e.nodeName, peer.PeerNode, peer.PeerIP).Set(avgLatencySeconds)
+				e.nodeName, peer.PeerNode, peer.PeerIP, family).Set(avgLatencySeconds)
 
 			reachable := 0.0
 			if peer.Reachable {
@@ -252,7 +254,7 @@ func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 				reachableCount++
 			}
 			e.metrics.PeerReachable.WithLabelValues(
-				e.nodeName, peer.PeerNode, peer.PeerIP).Set(reachable)
+				e.nodeName, peer.PeerNode, peer.PeerIP, family).Set(reachable)
 
 			e.metrics.PeerLatencyHistogram.WithLabelValues(
 				e.nodeName, peer.PeerNode).Observe(latencySeconds)
@@ -267,7 +269,7 @@ func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 		latencySeconds := dns.LatencyMs / 1000.0
 
 		e.metrics.DNSLatencySeconds.WithLabelValues(
-			e.nodeName, dns.DNSServer, dns.Domain, dns.RecordType).Set(latencySeconds)
+			e.nodeName, dns.DNSServer, dns.Domain, dns.RecordType, familyLabel(dns.AddressFamily)).Set(latencySeconds)
 
 		e.metrics.DNSLatencyHistogram.WithLabelValues(
 			e.nodeName, dns.DomainType).Observe(latencySeconds)
@@ -324,6 +326,19 @@ func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 	}
 }
 
+// familyLabel normalizes an address-family string for use as a Prometheus
+// label value. It returns "ipv4" or "ipv6" only when the input matches one of
+// those exactly; any other value (including an empty string) maps to "unknown"
+// so the address_family label is never emitted empty.
+func familyLabel(s string) string {
+	switch s {
+	case "ipv4", "ipv6":
+		return s
+	default:
+		return "unknown"
+	}
+}
+
 // ExportProblem implements types.Exporter interface for problem exports
 func (e *PrometheusExporter) ExportProblem(ctx context.Context, problem *types.Problem) error {
 	if problem == nil {
diff --git a/pkg/exporters/prometheus/metrics.go b/pkg/exporters/prometheus/metrics.go
index ee2725b..e41ea74 100644
--- a/pkg/exporters/prometheus/metrics.go
+++ b/pkg/exporters/prometheus/metrics.go
@@ -212,7 +212,7 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 				Help:        "Current latency to the default gateway in seconds",
 				ConstLabels: labels,
 			},
-			[]string{"node", "gateway_ip"},
+			[]string{"node", "gateway_ip", "address_family"},
 		),
 
 		PeerLatencySeconds: prometheus.NewGaugeVec(
@@ -223,7 +223,7 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 				Help:        "Last measured latency to peer node in seconds",
 				ConstLabels: labels,
 			},
-			[]string{"node", "peer_node", "peer_ip"},
+			[]string{"node", "peer_node", "peer_ip", "address_family"},
 		),
 
 		PeerLatencyAvgSeconds: prometheus.NewGaugeVec(
@@ -234,7 +234,7 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 				Help:        "Average latency to peer node in seconds",
 				ConstLabels: labels,
 			},
-			[]string{"node", "peer_node", "peer_ip"},
+			[]string{"node", "peer_node", "peer_ip", "address_family"},
 		),
 
 		PeerReachable: prometheus.NewGaugeVec(
@@ -245,7 +245,7 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 				Help:        "Whether peer node is reachable (1 = reachable, 0 = unreachable)",
 				ConstLabels: labels,
 			},
-			[]string{"node", "peer_node", "peer_ip"},
+			[]string{"node", "peer_node", "peer_ip", "address_family"},
 		),
 
 		PeersTotal: prometheus.NewGaugeVec(
@@ -278,7 +278,7 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 				Help:        "DNS resolution latency in seconds",
 				ConstLabels: labels,
 			},
-			[]string{"node", "dns_server", "domain", "record_type"},
+			[]string{"node", "dns_server", "domain", "record_type", "address_family"},
 		),
 
 		DNSNameserverHealthScore: prometheus.NewGaugeVec(
diff --git a/pkg/exporters/prometheus/metrics_test.go b/pkg/exporters/prometheus/metrics_test.go
index 19533bd..3a90bf0 100644
--- a/pkg/exporters/prometheus/metrics_test.go
+++ b/pkg/exporters/prometheus/metrics_test.go
@@ -5,6 +5,8 @@ import (
 
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
+
+	"github.com/supporttools/node-doctor/pkg/types"
 )
 
 func TestNewMetrics(t *testing.T) {
@@ -299,6 +301,146 @@ func TestMetricLabels(t *testing.T) {
 	}
 }
 
+func TestFamilyLabel(t *testing.T) {
+	cases := map[string]string{
+		"ipv4":      "ipv4",
+		"ipv6":      "ipv6",
+		"":          "unknown",
+		"IPv4":      "unknown", // case-sensitive: only exact "ipv4"/"ipv6" pass through
+		"dualstack": "unknown",
+	}
+	for in, want := range cases {
+		if got := familyLabel(in); got != want {
+			t.Errorf("familyLabel(%q) = %q, want %q", in, got, want)
+		}
+	}
+}
+
+// findLabelValue returns the value of the named label on the first sample of the
+// metric family with the given name, or "" if not found.
+func findLabelValue(t *testing.T, families []*dto.MetricFamily, metricName, labelName string) (string, bool) {
+	t.Helper()
+	for _, mf := range families {
+		if mf.GetName() != metricName {
+			continue
+		}
+		for _, metric := range mf.Metric {
+			for _, label := range metric.Label {
+				if label.GetName() == labelName {
+					return label.GetValue(), true
+				}
+			}
+		}
+	}
+	return "", false
+}
+
+func TestAddressFamilyLabelEmitted(t *testing.T) {
+	registry := prometheus.NewRegistry()
+	metrics, err := NewMetrics("test", "", nil)
+	if err != nil {
+		t.Fatalf("failed to create metrics: %v", err)
+	}
+	if err := metrics.Register(registry); err != nil {
+		t.Fatalf("failed to register metrics: %v", err)
+	}
+
+	e := &PrometheusExporter{
+		nodeName: "test-node",
+		registry: registry,
+		metrics:  metrics,
+	}
+
+	status := (&types.Status{Source: "test"}).SetLatencyMetrics(&types.LatencyMetrics{
+		Gateway: &types.GatewayLatency{
+			GatewayIP:     "10.0.0.1",
+			LatencyMs:     1.0,
+			AddressFamily: "ipv4",
+		},
+		Peers: []types.PeerLatency{
+			{
+				PeerNode:      "peer-v6",
+				PeerIP:        "fd00::1",
+				LatencyMs:     2.0,
+				AvgLatencyMs:  2.0,
+				Reachable:     true,
+				AddressFamily: "ipv6",
+			},
+			{
+				PeerNode:     "peer-unknown",
+				PeerIP:       "10.0.0.9",
+				LatencyMs:    3.0,
+				AvgLatencyMs: 3.0,
+				Reachable:    true,
+				// AddressFamily intentionally empty -> "unknown"
+			},
+		},
+		DNS: []types.DNSLatency{
+			{
+				DNSServer:     "8.8.8.8",
+				Domain:        "example.com",
+				RecordType:    "AAAA",
+				DomainType:    "external",
+				LatencyMs:     4.0,
+				Success:       true,
+				AddressFamily: "ipv6",
+			},
+		},
+	})
+
+	e.recordLatencyMetrics(status)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	checks := []struct {
+		metric string
+		want   string
+	}{
+		{"test_gateway_latency_seconds", "ipv4"},
+		{"test_peer_latency_seconds", ""}, // multiple series; checked below
+		{"test_dns_latency_seconds", "ipv6"},
+	}
+	// Gateway and DNS each have a single series, so the first-sample lookup is deterministic.
+	for _, c := range checks {
+		if c.metric == "test_peer_latency_seconds" {
+			continue
+		}
+		got, ok := findLabelValue(t, families, c.metric, "address_family")
+		if !ok {
+			t.Errorf("%s: address_family label not found", c.metric)
+			continue
+		}
+		if got != c.want {
+			t.Errorf("%s: address_family = %q, want %q", c.metric, got, c.want)
+		}
+	}
+
+	// Peer metric has two series; assert that both expected family labels are present.
+	wantPeerFamilies := map[string]bool{"ipv6": false, "unknown": false}
+	for _, mf := range families {
+		if mf.GetName() != "test_peer_latency_seconds" {
+			continue
+		}
+		for _, metric := range mf.Metric {
+			for _, label := range metric.Label {
+				if label.GetName() == "address_family" {
+					if _, expected := wantPeerFamilies[label.GetValue()]; expected {
+						wantPeerFamilies[label.GetValue()] = true
+					}
+				}
+			}
+		}
+	}
+	for fam, seen := range wantPeerFamilies {
+		if !seen {
+			t.Errorf("peer_latency_seconds: expected an address_family=%q series, none found", fam)
+		}
+	}
+}
+
 func TestMetricsReset(t *testing.T) {
 	registry := prometheus.NewRegistry()
 	constLabels := prometheus.Labels{"env": "test"}
diff --git a/pkg/monitors/kubernetes/apiserver.go b/pkg/monitors/kubernetes/apiserver.go
index 9ffc0af..c0e466c 100644
--- a/pkg/monitors/kubernetes/apiserver.go
+++ b/pkg/monitors/kubernetes/apiserver.go
@@ -4,6 +4,8 @@ package kubernetes
 import (
 	"context"
 	"fmt"
+	"net"
+	"net/url"
 	"strings"
 	"sync"
 	"time"
@@ -421,14 +423,42 @@ func (m *APIServerMonitor) checkAPIServer(ctx context.Context) (*types.Status, e
 	// Set API server latency metrics for Prometheus export
 	status.SetLatencyMetrics(&types.LatencyMetrics{
 		APIServer: &types.APIServerLatency{
-			LatencyMs: float64(metrics.Latency.Microseconds()) / 1000.0,
-			Reachable: true,
+			LatencyMs:     float64(metrics.Latency.Microseconds()) / 1000.0,
+			Reachable:     true,
+			AddressFamily: classifyEndpointFamily(m.config.Endpoint),
 		},
 	})
 
 	return status, nil
 }
 
+// classifyEndpointFamily inspects an API server endpoint and returns the IP
+// address family it targets: "ipv4" or "ipv6" when the host is a literal IP,
+// or "" when the host is a DNS name (or otherwise cannot be classified). The
+// exporter normalizes an empty value to the "unknown" label.
+func classifyEndpointFamily(endpoint string) string {
+	host := endpoint
+	// Endpoints are typically URLs (e.g. "https://10.0.0.1:6443"); extract the
+	// host component when present so we classify the actual dial target.
+	if u, err := url.Parse(endpoint); err == nil && u.Host != "" {
+		host = u.Host
+	}
+	// Strip any port (and brackets around IPv6 literals).
+	if h, _, err := net.SplitHostPort(host); err == nil {
+		host = h
+	}
+	host = strings.Trim(host, "[]")
+
+	ip := net.ParseIP(host)
+	if ip == nil {
+		return ""
+	}
+	if ip.To4() != nil {
+		return "ipv4"
+	}
+	return "ipv6"
+}
+
 // ParseAPIServerConfig parses API server configuration from a generic config map.
 func ParseAPIServerConfig(configMap map[string]interface{}) (*APIServerMonitorConfig, error) {
 	config := &APIServerMonitorConfig{
diff --git a/pkg/monitors/kubernetes/apiserver_test.go b/pkg/monitors/kubernetes/apiserver_test.go
index cd65967..e5e57f3 100644
--- a/pkg/monitors/kubernetes/apiserver_test.go
+++ b/pkg/monitors/kubernetes/apiserver_test.go
@@ -41,6 +41,32 @@ func (m *mockAPIServerClient) GetVersion(ctx context.Context) (*version.Info, er
 	}, m.err
 }
 
+// TestClassifyEndpointFamily verifies address-family classification of API
+// server endpoints.
+func TestClassifyEndpointFamily(t *testing.T) {
+	cases := []struct {
+		name     string
+		endpoint string
+		want     string
+	}{
+		{"ipv4 url with port", "https://10.0.0.1:6443", "ipv4"},
+		{"ipv4 url no port", "https://10.0.0.1", "ipv4"},
+		{"ipv6 url bracketed with port", "https://[fd00::1]:6443", "ipv6"},
+		{"ipv6 url bracketed no port", "https://[2001:db8::1]", "ipv6"},
+		{"hostname url", "https://kubernetes.default.svc.cluster.local", ""},
+		{"bare ipv4", "10.0.0.1", "ipv4"},
+		{"bare ipv6", "fd00::1", "ipv6"},
+		{"empty", "", ""},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := classifyEndpointFamily(c.endpoint); got != c.want {
+				t.Errorf("classifyEndpointFamily(%q) = %q, want %q", c.endpoint, got, c.want)
+			}
+		})
+	}
+}
+
 // TestParseAPIServerConfig tests configuration parsing.
 func TestParseAPIServerConfig(t *testing.T) {
 	tests := []struct {
diff --git a/pkg/monitors/network/cni.go b/pkg/monitors/network/cni.go
index 3de33a6..d9a0c8c 100644
--- a/pkg/monitors/network/cni.go
+++ b/pkg/monitors/network/cni.go
@@ -611,11 +611,12 @@ func (m *CNIMonitor) checkCNI(ctx context.Context) (*types.Status, error) {
 	peerLatencies := make([]types.PeerLatency, 0, len(m.peerStatuses))
 	for _, ps := range m.peerStatuses {
 		peerLatencies = append(peerLatencies, types.PeerLatency{
-			PeerNode:     ps.Peer.NodeName,
-			PeerIP:       ps.Peer.NodeIP,
-			LatencyMs:    float64(ps.LastLatency.Microseconds()) / 1000.0,
-			AvgLatencyMs: float64(ps.AvgLatency.Microseconds()) / 1000.0,
-			Reachable:    ps.Reachable,
+			PeerNode:      ps.Peer.NodeName,
+			PeerIP:        ps.Peer.NodeIP,
+			LatencyMs:     float64(ps.LastLatency.Microseconds()) / 1000.0,
+			AvgLatencyMs:  float64(ps.AvgLatency.Microseconds()) / 1000.0,
+			Reachable:     ps.Reachable,
+			AddressFamily: ps.Family,
 		})
 	}
 	m.mu.Unlock()
diff --git a/pkg/monitors/network/dns.go b/pkg/monitors/network/dns.go
index 94466ca..8234024 100644
--- a/pkg/monitors/network/dns.go
+++ b/pkg/monitors/network/dns.go
@@ -1507,15 +1507,22 @@ func (m *DNSMonitor) checkDNS(ctx context.Context) (*types.Status, error) {
 
 // recordDNSLatency records a DNS latency measurement for Prometheus export.
 func (m *DNSMonitor) recordDNSLatency(domain, domainType, dnsServer, recordType string, latency time.Duration, success bool) {
+	// Derive the address family from the record type: AAAA queries resolve
+	// IPv6 addresses, everything else (A and the default) resolves IPv4.
+	family := FamilyIPv4
+	if strings.EqualFold(strings.TrimSpace(recordType), "AAAA") {
+		family = FamilyIPv6
+	}
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	m.latencyMetrics = append(m.latencyMetrics, types.DNSLatency{
-		DNSServer:  dnsServer,
-		Domain:     domain,
-		RecordType: recordType,
-		DomainType: strings.ToLower(domainType),
-		LatencyMs:  float64(latency.Microseconds()) / 1000.0,
-		Success:    success,
+		DNSServer:     dnsServer,
+		Domain:        domain,
+		RecordType:    recordType,
+		DomainType:    strings.ToLower(domainType),
+		LatencyMs:     float64(latency.Microseconds()) / 1000.0,
+		Success:       success,
+		AddressFamily: family,
 	})
 }
 
diff --git a/pkg/types/types.go b/pkg/types/types.go
index 923b33c..1177e93 100644
--- a/pkg/types/types.go
+++ b/pkg/types/types.go
@@ -465,6 +465,11 @@ type PeerLatency struct {
 	LatencyMs    float64 `json:"latency_ms"`
 	AvgLatencyMs float64 `json:"avg_latency_ms"`
 	Reachable    bool    `json:"reachable"`
+	// AddressFamily records which IP family the probed peer belongs to
+	// ("ipv4" or "ipv6"). It lets downstream consumers distinguish dual-stack
+	// peer probes. Empty when the family is unknown (e.g. a peer whose family
+	// could not be classified).
+	AddressFamily string `json:"address_family,omitempty"`
 }
 
 // DNSLatency represents DNS resolution latency.
@@ -475,12 +480,22 @@ type DNSLatency struct {
 	DomainType string  `json:"domain_type"` // "cluster", "external", "custom"
 	LatencyMs  float64 `json:"latency_ms"`
 	Success    bool    `json:"success"`
+	// AddressFamily records which IP family the query resolves
+	// ("ipv4" for A records, "ipv6" for AAAA records). It lets downstream
+	// consumers distinguish dual-stack DNS probes. Empty when the family is
+	// unknown.
+	AddressFamily string `json:"address_family,omitempty"`
 }
 
 // APIServerLatency represents Kubernetes API server response latency.
 type APIServerLatency struct {
 	LatencyMs float64 `json:"latency_ms"`
 	Reachable bool    `json:"reachable"`
+	// AddressFamily records which IP family the probed API server endpoint
+	// belongs to ("ipv4" or "ipv6"). It lets downstream consumers distinguish
+	// dual-stack API server probes. Empty when the family is unknown (e.g. a
+	// hostname endpoint whose family could not be classified).
+	AddressFamily string `json:"address_family,omitempty"`
 }
 
 // SetLatencyMetrics is a helper to set latency metrics in Status.Metadata.

From c20a48a5402e5a3ff53ad14667ab09ac2d2b559c Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:19:08 -0500
Subject: [PATCH 21/38] feat(dashboards): address_family template var +
 family-aware CNI panels (Task #17217)

Add an address_family query template variable (multi/includeAll) to the
CNI network-health dashboard and apply address_family=~"$address_family"
to the peer latency/reachability panels, grouping the per-node timeseries
by (node, address_family) so IPv4 vs IPv6 series are distinguishable. The
overview/system/kubernetes dashboards have no relabeled-metric panels and
are unchanged.
---
 .../node-doctor-cni-network-health.json       | 40 ++++++++++++++++---
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/dashboards/node-doctor-cni-network-health.json b/dashboards/node-doctor-cni-network-health.json
index b7de9c6..471605d 100644
--- a/dashboards/node-doctor-cni-network-health.json
+++ b/dashboards/node-doctor-cni-network-health.json
@@ -777,7 +777,7 @@
             "type": "prometheus",
             "uid": "${datasource}"
           },
-          "expr": "avg(node_doctor_monitor_peer_latency_seconds{node=~\"$node\"}) * 1000",
+          "expr": "avg(node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=~\"$address_family\"}) * 1000",
           "legendFormat": "Avg Latency",
           "refId": "A"
         }
@@ -844,7 +844,7 @@
             "type": "prometheus",
             "uid": "${datasource}"
           },
-          "expr": "max(node_doctor_monitor_peer_latency_seconds{node=~\"$node\"}) * 1000",
+          "expr": "max(node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=~\"$address_family\"}) * 1000",
           "legendFormat": "Max Latency",
           "refId": "A"
         }
@@ -1208,8 +1208,8 @@
             "type": "prometheus",
             "uid": "${datasource}"
           },
-          "expr": "avg by (node) (node_doctor_monitor_peer_latency_seconds{node=~\"$node\"}) * 1000",
-          "legendFormat": "{{node}}",
+          "expr": "avg by (node, address_family) (node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=~\"$address_family\"}) * 1000",
+          "legendFormat": "{{node}} ({{address_family}})",
           "refId": "A"
         }
       ],
@@ -1479,7 +1479,7 @@
             "type": "prometheus",
             "uid": "${datasource}"
           },
-          "expr": "node_doctor_monitor_peer_latency_seconds{node=~\"$node\"} * 1000",
+          "expr": "node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=~\"$address_family\"} * 1000",
           "format": "table",
           "instant": true,
           "legendFormat": "__auto",
@@ -1490,7 +1490,7 @@
             "type": "prometheus",
             "uid": "${datasource}"
           },
-          "expr": "node_doctor_monitor_peer_reachable{node=~\"$node\"}",
+          "expr": "node_doctor_monitor_peer_reachable{node=~\"$node\", address_family=~\"$address_family\"}",
           "format": "table",
           "instant": true,
           "legendFormat": "__auto",
@@ -2546,6 +2546,34 @@
         "skipUrlSync": false,
         "sort": 1,
         "type": "query"
+      },
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": true,
+          "text": ["All"],
+          "value": ["$__all"]
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "definition": "label_values(node_doctor_monitor_peer_latency_seconds, address_family)",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Address Family",
+        "multi": true,
+        "name": "address_family",
+        "options": [],
+        "query": {
+          "query": "label_values(node_doctor_monitor_peer_latency_seconds, address_family)",
+          "refId": "PrometheusVariableQueryEditor-VariableQuery"
+        },
+        "refresh": 2,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
       }
     ]
   },

From d261cf83b62a40351888ece834745ac6f41e163b Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:23:37 -0500
Subject: [PATCH 22/38] feat(dashboards): new IPv6 / dual-stack health Grafana
 dashboard (Task #17218)

Add dashboards/node-doctor-ipv6.json (29 panels): IPv6 monitor condition
health (condition_status{condition_type=~"IPv6.*"} for the six new
conditions), condition activity over time, dual-stack connectivity
(peer/gateway latency + reachability filtered by address_family, A-vs-AAAA
DNS), and IPv6 monitor-up/events. Mirrors the CNI dashboard schema/
templating; all PromQL uses verified metric+label names.
---
 dashboards/node-doctor-ipv6.json | 2245 ++++++++++++++++++++++++++++++
 1 file changed, 2245 insertions(+)
 create mode 100644 dashboards/node-doctor-ipv6.json

diff --git a/dashboards/node-doctor-ipv6.json b/dashboards/node-doctor-ipv6.json
new file mode 100644
index 0000000..fcb15ab
--- /dev/null
+++ b/dashboards/node-doctor-ipv6.json
@@ -0,0 +1,2245 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "Node Doctor IPv6 / Dual-Stack Health - IPv6 monitor conditions (sysctl, default route, link-local/global addresses, router advertisements, firewall blackhole), IPv6 peer connectivity and latency, IPv6 gateway latency, and AAAA DNS resolution. Requires the IPv6 network monitors to be enabled.",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [
+    {
+      "asDropdown": true,
+      "icon": "external link",
+      "includeVars": true,
+      "keepTime": true,
+      "tags": [
+        "node-doctor"
+      ],
+      "targetBlank": false,
+      "title": "Node Doctor Dashboards",
+      "type": "dashboards"
+    }
+  ],
+  "panels": [
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 1,
+      "panels": [],
+      "title": "IPv6 Monitor Health",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 0,
+        "y": 1
+      },
+      "id": 2,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_condition_status{node=~\"$node\", condition_type=\"IPv6SysctlMisconfigured\"} == 1) or vector(0)",
+          "legendFormat": "Affected",
+          "refId": "A"
+        }
+      ],
+      "title": "Sysctl Misconfigured",
+      "type": "stat",
+      "description": "Number of nodes currently reporting the IPv6SysctlMisconfigured condition (active = problem)"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 4,
+        "y": 1
+      },
+      "id": 3,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_condition_status{node=~\"$node\", condition_type=\"IPv6DefaultRouteMissing\"} == 1) or vector(0)",
+          "legendFormat": "Affected",
+          "refId": "A"
+        }
+      ],
+      "title": "Default Route Missing",
+      "type": "stat",
+      "description": "Number of nodes currently reporting the IPv6DefaultRouteMissing condition (active = problem)"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 8,
+        "y": 1
+      },
+      "id": 4,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_condition_status{node=~\"$node\", condition_type=\"IPv6LinkLocalMissing\"} == 1) or vector(0)",
+          "legendFormat": "Affected",
+          "refId": "A"
+        }
+      ],
+      "title": "Link-Local Missing",
+      "type": "stat",
+      "description": "Number of nodes currently reporting the IPv6LinkLocalMissing condition (active = problem)"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 12,
+        "y": 1
+      },
+      "id": 5,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_condition_status{node=~\"$node\", condition_type=\"IPv6GlobalAddressMissing\"} == 1) or vector(0)",
+          "legendFormat": "Affected",
+          "refId": "A"
+        }
+      ],
+      "title": "Global Address Missing",
+      "type": "stat",
+      "description": "Number of nodes currently reporting the IPv6GlobalAddressMissing condition (active = problem)"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 16,
+        "y": 1
+      },
+      "id": 6,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_condition_status{node=~\"$node\", condition_type=\"IPv6RouterAdvertisementDisabled\"} == 1) or vector(0)",
+          "legendFormat": "Affected",
+          "refId": "A"
+        }
+      ],
+      "title": "RA Disabled",
+      "type": "stat",
+      "description": "Number of nodes currently reporting the IPv6RouterAdvertisementDisabled condition (active = problem)"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 20,
+        "y": 1
+      },
+      "id": 7,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_condition_status{node=~\"$node\", condition_type=\"IPv6FirewallBlackhole\"} == 1) or vector(0)",
+          "legendFormat": "Affected",
+          "refId": "A"
+        }
+      ],
+      "title": "Firewall Blackhole",
+      "type": "stat",
+      "description": "Number of nodes currently reporting the IPv6FirewallBlackhole condition (active = problem)"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "description": "All currently-active IPv6 conditions (condition_status == 1) broken down by node and condition type.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "custom": {
+            "align": "auto",
+            "cellOptions": {
+              "type": "auto"
+            },
+            "inspect": false
+          },
+          "mappings": [
+            {
+              "options": {
+                "1": {
+                  "color": "red",
+                  "index": 0,
+                  "text": "ACTIVE"
+                }
+              },
+              "type": "value"
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Status"
+            },
+            "properties": [
+              {
+                "id": "custom.cellOptions",
+                "value": {
+                  "mode": "basic",
+                  "type": "color-background"
+                }
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 9
+      },
+      "id": 8,
+      "options": {
+        "cellHeight": "sm",
+        "footer": {
+          "countRows": false,
+          "fields": "",
+          "reducer": [
+            "sum"
+          ],
+          "show": false
+        },
+        "showHeader": true,
+        "sortBy": [
+          {
+            "desc": true,
+            "displayName": "Condition"
+          }
+        ]
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "node_doctor_monitor_condition_status{node=~\"$node\", condition_type=~\"IPv6.*\"} == 1",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "__auto",
+          "refId": "A"
+        }
+      ],
+      "title": "Active IPv6 Conditions by Node",
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true,
+              "__name__": true,
+              "cluster": true,
+              "container": true,
+              "endpoint": true,
+              "instance": true,
+              "job": true,
+              "namespace": true,
+              "pod": true,
+              "service": true
+            },
+            "indexByName": {
+              "node": 0,
+              "condition_type": 1
+            },
+            "renameByName": {
+              "Value": "Status",
+              "condition_type": "Condition",
+              "node": "Node"
+            }
+          }
+        }
+      ],
+      "type": "table"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 17
+      },
+      "id": 9,
+      "panels": [],
+      "title": "IPv6 Condition Activity",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Affected nodes",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "bars",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 18
+      },
+      "id": 10,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum by (condition_type) (node_doctor_monitor_condition_status{node=~\"$node\", condition_type=~\"IPv6.*\"})",
+          "legendFormat": "{{condition_type}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Active IPv6 Conditions Over Time",
+      "type": "timeseries",
+      "description": "Count of nodes with each IPv6 condition active over time."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 18
+      },
+      "id": 11,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "sum by (condition_type) (rate(node_doctor_monitor_conditions_total{node=~\"$node\", condition_type=~\"IPv6.*\", status=\"True\"}[5m])) * 60",
+          "legendFormat": "{{condition_type}}",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Condition Transitions (rate/min)",
+      "type": "timeseries",
+      "description": "Rate of IPv6 condition True transitions per minute."
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 26
+      },
+      "id": 12,
+      "panels": [],
+      "title": "Dual-Stack Connectivity",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 10
+              },
+              {
+                "color": "red",
+                "value": 50
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 0,
+        "y": 27
+      },
+      "id": 13,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "avg(node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=\"ipv6\"}) * 1000",
+          "legendFormat": "Avg",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Avg Peer Latency",
+      "type": "stat",
+      "description": "Average latency across all IPv6 peer connections."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 25
+              },
+              {
+                "color": "red",
+                "value": 100
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 4,
+        "y": 27
+      },
+      "id": 14,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "max(node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=\"ipv6\"}) * 1000",
+          "legendFormat": "Max",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Max Peer Latency",
+      "type": "stat",
+      "description": "Maximum IPv6 peer latency observed."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 80
+              },
+              {
+                "color": "green",
+                "value": 95
+              }
+            ]
+          },
+          "unit": "percent"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 8,
+        "y": 27
+      },
+      "id": 15,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "(count(node_doctor_monitor_peer_reachable{node=~\"$node\", address_family=\"ipv6\"} == 1) / count(node_doctor_monitor_peer_reachable{node=~\"$node\", address_family=\"ipv6\"})) * 100",
+          "legendFormat": "Reachable %",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Peer Reachability %",
+      "type": "stat",
+      "description": "Percentage of IPv6 peer connections currently reachable."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 10
+              },
+              {
+                "color": "red",
+                "value": 50
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 12,
+        "y": 27
+      },
+      "id": 16,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "avg(node_doctor_monitor_gateway_latency_seconds{node=~\"$node\", address_family=\"ipv6\"}) * 1000",
+          "legendFormat": "Gateway",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Gateway Avg Latency",
+      "type": "stat",
+      "description": "Average latency to the IPv6 default gateway."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 50
+              },
+              {
+                "color": "red",
+                "value": 200
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 16,
+        "y": 27
+      },
+      "id": 17,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "avg(node_doctor_monitor_dns_latency_seconds{node=~\"$node\", record_type=\"AAAA\"}) * 1000",
+          "legendFormat": "AAAA",
+          "refId": "A"
+        }
+      ],
+      "title": "AAAA DNS Avg Latency",
+      "type": "stat",
+      "description": "Average DNS resolution latency for AAAA (IPv6) records."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 20,
+        "y": 27
+      },
+      "id": 18,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_peer_reachable{node=~\"$node\", address_family=\"ipv6\"} == 0) or vector(0)",
+          "legendFormat": "Unreachable",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Unreachable Peers",
+      "type": "stat",
+      "description": "Number of IPv6 peer connections currently unreachable."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Latency (ms)",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 10
+              },
+              {
+                "color": "red",
+                "value": 50
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 31
+      },
+      "id": 19,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "avg by (address_family) (node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=~\"$address_family\"}) * 1000",
+          "legendFormat": "{{address_family}}",
+          "refId": "A"
+        }
+      ],
+      "title": "Peer Latency: IPv4 vs IPv6",
+      "type": "timeseries",
+      "description": "Average peer latency by address family for selected families."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Latency (ms)",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 10
+              },
+              {
+                "color": "red",
+                "value": 50
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 31
+      },
+      "id": 20,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "avg by (node) (node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=\"ipv6\"}) * 1000",
+          "legendFormat": "{{node}}",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Peer Latency by Node",
+      "type": "timeseries",
+      "description": "IPv6 peer latency over time grouped by source node."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Latency (ms)",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 50
+              },
+              {
+                "color": "red",
+                "value": 200
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 39
+      },
+      "id": 21,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "avg by (record_type) (node_doctor_monitor_dns_latency_seconds{node=~\"$node\", record_type=~\"A|AAAA\"}) * 1000",
+          "legendFormat": "{{record_type}}",
+          "refId": "A"
+        }
+      ],
+      "title": "DNS Latency: A vs AAAA",
+      "type": "timeseries",
+      "description": "DNS resolution latency comparing A (IPv4) vs AAAA (IPv6) record types."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Reachable (1=yes)",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 39
+      },
+      "id": 22,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "avg by (node) (node_doctor_monitor_peer_reachable{node=~\"$node\", address_family=\"ipv6\"})",
+          "legendFormat": "{{node}}",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Peer Reachability by Node",
+      "type": "timeseries",
+      "description": "IPv6 peer reachability over time grouped by source node (1 = reachable)."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "description": "Per-connection IPv6 peer latency and reachability matrix.",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "custom": {
+            "align": "auto",
+            "cellOptions": {
+              "type": "auto"
+            },
+            "inspect": false
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 10
+              },
+              {
+                "color": "red",
+                "value": 50
+              }
+            ]
+          },
+          "unit": "ms"
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Latency"
+            },
+            "properties": [
+              {
+                "id": "custom.cellOptions",
+                "value": {
+                  "mode": "gradient",
+                  "type": "color-background"
+                }
+              }
+            ]
+          },
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "Reachable"
+            },
+            "properties": [
+              {
+                "id": "mappings",
+                "value": [
+                  {
+                    "options": {
+                      "0": {
+                        "color": "red",
+                        "index": 1,
+                        "text": "No"
+                      },
+                      "1": {
+                        "color": "green",
+                        "index": 0,
+                        "text": "Yes"
+                      }
+                    },
+                    "type": "value"
+                  }
+                ]
+              },
+              {
+                "id": "custom.cellOptions",
+                "value": {
+                  "mode": "basic",
+                  "type": "color-background"
+                }
+              },
+              {
+                "id": "unit",
+                "value": "short"
+              }
+            ]
+          }
+        ]
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 24,
+        "x": 0,
+        "y": 47
+      },
+      "id": 23,
+      "options": {
+        "cellHeight": "sm",
+        "footer": {
+          "countRows": false,
+          "fields": "",
+          "reducer": [
+            "sum"
+          ],
+          "show": false
+        },
+        "showHeader": true,
+        "sortBy": [
+          {
+            "desc": true,
+            "displayName": "Latency"
+          }
+        ]
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "node_doctor_monitor_peer_latency_seconds{node=~\"$node\", address_family=\"ipv6\"} * 1000",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "__auto",
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "node_doctor_monitor_peer_reachable{node=~\"$node\", address_family=\"ipv6\"}",
+          "format": "table",
+          "instant": true,
+          "legendFormat": "__auto",
+          "refId": "B"
+        }
+      ],
+      "title": "IPv6 Node-to-Peer Matrix",
+      "transformations": [
+        {
+          "id": "merge",
+          "options": {}
+        },
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true,
+              "__name__": true,
+              "address_family": true,
+              "cluster": true,
+              "container": true,
+              "endpoint": true,
+              "instance": true,
+              "job": true,
+              "namespace": true,
+              "pod": true,
+              "service": true
+            },
+            "indexByName": {
+              "node": 0,
+              "peer_node": 1,
+              "peer_ip": 2
+            },
+            "renameByName": {
+              "Value #A": "Latency",
+              "Value #B": "Reachable",
+              "node": "Source Node",
+              "peer_ip": "Peer IP",
+              "peer_node": "Peer Node"
+            }
+          }
+        }
+      ],
+      "type": "table"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 55
+      },
+      "id": 24,
+      "panels": [],
+      "title": "IPv6 Monitor Status & Events",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 0,
+        "y": 56
+      },
+      "id": 25,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_monitor_up{node=~\"$node\", monitor_name=\"ipv6-sysctl-check\"} == 1) or vector(0)",
+          "legendFormat": "Up",
+          "refId": "A"
+        }
+      ],
+      "title": "ipv6-sysctl-check",
+      "type": "stat",
+      "description": "Number of nodes running the ipv6-sysctl-check monitor."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 4,
+        "y": 56
+      },
+      "id": 26,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_monitor_up{node=~\"$node\", monitor_name=\"ipv6-route-check\"} == 1) or vector(0)",
+          "legendFormat": "Up",
+          "refId": "A"
+        }
+      ],
+      "title": "ipv6-route-check",
+      "type": "stat",
+      "description": "Number of nodes running the ipv6-route-check monitor."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 8,
+        "y": 56
+      },
+      "id": 27,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_monitor_up{node=~\"$node\", monitor_name=\"ipv6-neighbor-check\"} == 1) or vector(0)",
+          "legendFormat": "Up",
+          "refId": "A"
+        }
+      ],
+      "title": "ipv6-neighbor-check",
+      "type": "stat",
+      "description": "Number of nodes running the ipv6-neighbor-check monitor."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "thresholds"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 4,
+        "x": 12,
+        "y": 56
+      },
+      "id": 28,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "none",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "count(node_doctor_monitor_monitor_up{node=~\"$node\", monitor_name=\"ipv6-firewall-check\"} == 1) or vector(0)",
+          "legendFormat": "Up",
+          "refId": "A"
+        }
+      ],
+      "title": "ipv6-firewall-check",
+      "type": "stat",
+      "description": "Number of nodes running the ipv6-firewall-check monitor."
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 10,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "never",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 4,
+        "w": 8,
+        "x": 16,
+        "y": 56
+      },
+      "id": 29,
+      "options": {
+        "legend": {
+          "calcs": [
+            "mean",
+            "max"
+          ],
+          "displayMode": "table",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      },
+      "pluginVersion": "10.0.0",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${datasource}"
+          },
+          "expr": "rate(node_doctor_monitor_events_total{node=~\"$node\", source=~\"ipv6-.*\"}[5m]) * 60",
+          "legendFormat": "{{source}} - {{severity}}",
+          "refId": "A"
+        }
+      ],
+      "title": "IPv6 Monitor Events Rate (per minute)",
+      "type": "timeseries",
+      "description": "Event rate from IPv6 monitors (sysctl/route/neighbor/firewall)."
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "tags": [
+    "node-doctor",
+    "ipv6",
+    "dual-stack",
+    "network",
+    "dns",
+    "kubernetes"
+  ],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "Prometheus",
+          "value": "prometheus"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": "Datasource",
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "queryValue": "",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": true,
+          "text": [
+            "All"
+          ],
+          "value": [
+            "$__all"
+          ]
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "definition": "label_values(node_doctor_monitor_info, node)",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Node",
+        "multi": true,
+        "name": "node",
+        "options": [],
+        "query": {
+          "query": "label_values(node_doctor_monitor_info, node)",
+          "refId": "PrometheusVariableQueryEditor-VariableQuery"
+        },
+        "refresh": 2,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      },
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": true,
+          "text": "ipv6",
+          "value": "ipv6"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "definition": "label_values(node_doctor_monitor_peer_latency_seconds, address_family)",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Address Family",
+        "multi": true,
+        "name": "address_family",
+        "options": [],
+        "query": {
+          "query": "label_values(node_doctor_monitor_peer_latency_seconds, address_family)",
+          "refId": "PrometheusVariableQueryEditor-VariableQuery"
+        },
+        "refresh": 2,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Node Doctor - IPv6 / Dual-Stack Health",
+  "uid": "node-doctor-ipv6",
+  "version": 1,
+  "weekStart": ""
+}

From 483c890655dbb89bf6fca4435aa7a4b6b68f9167 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:28:53 -0500
Subject: [PATCH 23/38] feat(alerts): per-family PrometheusRule alerts + IPv6
 condition alert (Task #17219)

Rewrite NodeDoctorHighPeerLatency and NodeDoctorLowPeerConnectivity to
aggregate by (node, address_family) using the family-labeled peer gauges,
so IPv4/IPv6 alert independently and the firing alert carries
address_family. Add NodeDoctorIPv6Misconfigured firing on
condition_status{condition_type=~"IPv6.*"}==1 (for 10m, address_family=
ipv6 label), gated by prometheusRule.warning.ipv6Misconfigured.enabled
(default true). Mirror the two rewrites + new alert into the static
deployment/prometheusrule.yaml. helm template renders; YAML valid.
---
 deployment/prometheusrule.yaml                | 31 +++++++++++---
 .../node-doctor/templates/prometheusrule.yaml | 41 ++++++++++++++++---
 helm/node-doctor/values.yaml                  |  8 ++++
 3 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/deployment/prometheusrule.yaml b/deployment/prometheusrule.yaml
index 6c434a6..f8b816b 100644
--- a/deployment/prometheusrule.yaml
+++ b/deployment/prometheusrule.yaml
@@ -137,26 +137,29 @@ spec:
             description: "Node {{ $labels.node }} is experiencing network degradation (high latency or partial connectivity)."
 
         - alert: NodeDoctorHighPeerLatency
+          # Per-family: grouped by address_family so IPv4 and IPv6 alert independently.
           expr: |
-            histogram_quantile(0.95, sum(rate(node_doctor_monitor_peer_latency_histogram_seconds_bucket[5m])) by (le, node)) * 1000 > 100
+            max by (node, address_family) (node_doctor_monitor_peer_latency_seconds) * 1000 > 100
           for: 10m
           labels:
             severity: warning
             component: network
           annotations:
-            summary: "High peer latency on {{ $labels.node }}"
-            description: "Node {{ $labels.node }} P95 peer latency exceeds 100ms."
+            summary: "High peer latency ({{ $labels.address_family }}) on {{ $labels.node }}"
+            description: "Node {{ $labels.node }} peer latency over {{ $labels.address_family }} exceeds 100ms."
 
         - alert: NodeDoctorLowPeerConnectivity
+          # Per-family: percentage of reachable peers grouped by address_family so a
+          # single-stack outage (e.g. IPv6 down) is not masked by a healthy IPv4 path.
           expr: |
-            (sum by (node) (node_doctor_monitor_peers_reachable_total) / sum by (node) (node_doctor_monitor_peers_total)) * 100 < 90
+            (avg by (node, address_family) (node_doctor_monitor_peer_reachable)) * 100 < 90
           for: 5m
           labels:
             severity: warning
             component: network
           annotations:
-            summary: "Low peer connectivity on {{ $labels.node }}"
-            description: "Node {{ $labels.node }} can only reach {{ $value | printf \"%.1f\" }}% of peers."
+            summary: "Low peer connectivity ({{ $labels.address_family }}) on {{ $labels.node }}"
+            description: "Node {{ $labels.node }} can only reach {{ $value | printf \"%.1f\" }}% of peers over {{ $labels.address_family }}."
 
         - alert: NodeDoctorAPIServerLatencyHigh
           expr: |
@@ -169,6 +172,22 @@ spec:
             summary: "High API server latency on {{ $labels.node }}"
             description: "Node {{ $labels.node }} is experiencing high latency communicating with the API server."
 
+        - alert: NodeDoctorIPv6Misconfigured
+          # Covers all IPv6-specific conditions (IPv6SysctlMisconfigured,
+          # IPv6DefaultRouteMissing, IPv6LinkLocalMissing, IPv6GlobalAddressMissing,
+          # IPv6RouterAdvertisementDisabled, IPv6FirewallBlackhole). The condition_status
+          # gauge carries only {node,condition_type}, so address_family is fixed to ipv6.
+          expr: |
+            node_doctor_monitor_condition_status{condition_type=~"IPv6.*"} == 1
+          for: 10m
+          labels:
+            severity: warning
+            component: network
+            address_family: ipv6
+          annotations:
+            summary: "IPv6 misconfiguration ({{ $labels.condition_type }}) on {{ $labels.node }}"
+            description: "Node {{ $labels.node }} has an active IPv6 condition {{ $labels.condition_type }}. IPv6 connectivity may be impaired."
+
     # Informational alerts - for awareness
     - name: node-doctor-info
       rules:
diff --git a/helm/node-doctor/templates/prometheusrule.yaml b/helm/node-doctor/templates/prometheusrule.yaml
index b3f092b..1807b5c 100644
--- a/helm/node-doctor/templates/prometheusrule.yaml
+++ b/helm/node-doctor/templates/prometheusrule.yaml
@@ -182,26 +182,30 @@ spec:
             description: "Node {{`{{ $labels.node }}`}} is experiencing network degradation (high latency or partial connectivity)."
 
         - alert: NodeDoctorHighPeerLatency
+          # Per-family: max P95-equivalent peer latency grouped by address_family so
+          # IPv4 and IPv6 alert independently and the firing alert carries the family.
           expr: |
-            histogram_quantile(0.95, sum(rate(node_doctor_monitor_peer_latency_histogram_seconds_bucket[5m])) by (le, node)) * 1000 > {{ .Values.prometheusRule.warning.highPeerLatency.thresholdMs }}
+            max by (node, address_family) (node_doctor_monitor_peer_latency_seconds) * 1000 > {{ .Values.prometheusRule.warning.highPeerLatency.thresholdMs }}
           for: {{ .Values.prometheusRule.warning.highPeerLatency.for }}
           labels:
             severity: warning
             component: network
           annotations:
-            summary: "High peer latency on {{`{{ $labels.node }}`}}"
-            description: "Node {{`{{ $labels.node }}`}} P95 peer latency exceeds {{ .Values.prometheusRule.warning.highPeerLatency.thresholdMs }}ms."
+            summary: "High peer latency ({{`{{ $labels.address_family }}`}}) on {{`{{ $labels.node }}`}}"
+            description: "Node {{`{{ $labels.node }}`}} peer latency over {{`{{ $labels.address_family }}`}} exceeds {{ .Values.prometheusRule.warning.highPeerLatency.thresholdMs }}ms."
 
         - alert: NodeDoctorLowPeerConnectivity
+          # Per-family: percentage of reachable peers grouped by address_family so a
+          # single-stack outage (e.g. IPv6 down) is not masked by a healthy IPv4 path.
           expr: |
-            (sum by (node) (node_doctor_monitor_peers_reachable_total) / sum by (node) (node_doctor_monitor_peers_total)) * 100 < {{ .Values.prometheusRule.warning.lowPeerConnectivity.thresholdPercent }}
+            (avg by (node, address_family) (node_doctor_monitor_peer_reachable)) * 100 < {{ .Values.prometheusRule.warning.lowPeerConnectivity.thresholdPercent }}
           for: {{ .Values.prometheusRule.warning.lowPeerConnectivity.for }}
           labels:
             severity: warning
             component: network
           annotations:
-            summary: "Low peer connectivity on {{`{{ $labels.node }}`}}"
-            description: "Node {{`{{ $labels.node }}`}} can only reach {{`{{ $value | printf \"%.1f\" }}`}}% of peers."
+            summary: "Low peer connectivity ({{`{{ $labels.address_family }}`}}) on {{`{{ $labels.node }}`}}"
+            description: "Node {{`{{ $labels.node }}`}} can only reach {{`{{ $value | printf \"%.1f\" }}`}}% of peers over {{`{{ $labels.address_family }}`}}."
 
         - alert: NodeDoctorAPIServerLatencyHigh
           expr: |
@@ -213,6 +217,31 @@ spec:
           annotations:
             summary: "High API server latency on {{`{{ $labels.node }}`}}"
             description: "Node {{`{{ $labels.node }}`}} is experiencing high latency communicating with the API server."
+        {{- if .Values.prometheusRule.warning.ipv6Misconfigured.enabled }}
+
+        - alert: NodeDoctorIPv6Misconfigured
+          # Covers all IPv6-specific conditions (IPv6SysctlMisconfigured,
+          # IPv6DefaultRouteMissing, IPv6LinkLocalMissing, IPv6GlobalAddressMissing,
+          # IPv6RouterAdvertisementDisabled, IPv6FirewallBlackhole). These conditions
+          # are not family-labeled metrics; condition_status carries only
+          # {node,condition_type}, so the address_family label is fixed to ipv6 here.
+          expr: |
+            node_doctor_monitor_condition_status{condition_type=~"IPv6.*"} == 1
+          for: {{ .Values.prometheusRule.warning.ipv6Misconfigured.for }}
+          labels:
+            severity: warning
+            component: network
+            address_family: ipv6
+            {{- with .Values.prometheusRule.warning.ipv6Misconfigured.labels }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          annotations:
+            summary: "IPv6 misconfiguration ({{`{{ $labels.condition_type }}`}}) on {{`{{ $labels.node }}`}}"
+            description: "Node {{`{{ $labels.node }}`}} has an active IPv6 condition {{`{{ $labels.condition_type }}`}}. IPv6 connectivity may be impaired."
+            {{- with .Values.prometheusRule.warning.ipv6Misconfigured.annotations }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+        {{- end }}
     {{- end }}
 
     {{- if .Values.prometheusRule.info.enabled }}
diff --git a/helm/node-doctor/values.yaml b/helm/node-doctor/values.yaml
index fe877c0..8942d7b 100644
--- a/helm/node-doctor/values.yaml
+++ b/helm/node-doctor/values.yaml
@@ -193,6 +193,14 @@ prometheusRule:
       thresholdPercent: 90
     apiServerLatencyHigh:
       for: 10m
+    # IPv6 misconfiguration alert - fires on any active IPv6* node condition
+    # (sysctl, default route, link-local, global address, RA, firewall blackhole).
+    # Carries address_family=ipv6 so Alertmanager can route IPv6 issues separately.
+    ipv6Misconfigured:
+      enabled: true
+      for: 10m
+      labels: {}
+      annotations: {}
 
   # Informational alerts - for awareness
   info:

From a5d116e20d4873e84658c1cd42834ee27965fca0 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:31:51 -0500
Subject: [PATCH 24/38] test(metrics): guard Go+Process collector wiring in
 served registry (Task #17210)

NewRegistry (the registry the exporter serves) already registers the Go
runtime and process collectors. Add TestNewRegistry_IncludesGoAndProcess
Collectors asserting go_goroutines and (on linux) process_* /
process_start_time_seconds are exposed, so the runtime/process self-
metrics can't silently regress. Test-only; functionality already present.
---
 pkg/exporters/prometheus/registry_test.go | 54 +++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 pkg/exporters/prometheus/registry_test.go

diff --git a/pkg/exporters/prometheus/registry_test.go b/pkg/exporters/prometheus/registry_test.go
new file mode 100644
index 0000000..b2af892
--- /dev/null
+++ b/pkg/exporters/prometheus/registry_test.go
@@ -0,0 +1,54 @@
+package prometheus
+
+import (
+	"runtime"
+	"strings"
+	"testing"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// TestNewRegistry_IncludesGoAndProcessCollectors pins the contract for task
+// #17210: the registry the exporter serves wires the standard Go-runtime and
+// process collectors so go_* and process_* metrics are exposed alongside the
+// node-doctor metrics. NewRegistry is the registry actually used by the
+// exporter (see exporter.go), so this guards against a regression that would
+// silently drop runtime/process self-observability.
+func TestNewRegistry_IncludesGoAndProcessCollectors(t *testing.T) {
+	reg := NewRegistry(prometheus.Labels{"node": "test-node"})
+
+	mfs, err := reg.Gather()
+	if err != nil {
+		t.Fatalf("registry.Gather() error: %v", err)
+	}
+
+	families := make(map[string]bool, len(mfs))
+	var goCount, processCount int
+	for _, mf := range mfs {
+		name := mf.GetName()
+		families[name] = true
+		if strings.HasPrefix(name, "go_") {
+			goCount++
+		}
+		if strings.HasPrefix(name, "process_") {
+			processCount++
+		}
+	}
+
+	// The Go collector is available on every platform; go_goroutines is a
+	// stable, always-present series.
+	if !families["go_goroutines"] {
+		t.Errorf("expected go_goroutines from the Go collector; got %d go_* families", goCount)
+	}
+
+	// The process collector only emits metrics on platforms it supports
+	// (Linux in CI/production). Guard so the test stays green elsewhere.
+	if runtime.GOOS == "linux" {
+		if processCount == 0 {
+			t.Errorf("expected process_* metrics from the process collector on linux, got none")
+		}
+		if !families["process_start_time_seconds"] {
+			t.Errorf("expected process_start_time_seconds from the process collector on linux")
+		}
+	}
+}

From e14e8a137c6099c289d154437a2c8898255de233 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:40:15 -0500
Subject: [PATCH 25/38] feat(metrics): monitor-cycle self-metrics (Task #17211)

Add monitor_cycles_total{node,monitor_name,result} and
monitor_cycle_last_timestamp_seconds{node,monitor_name} (last-run
heartbeat), recorded once per cycle in ExportStatus via new
RecordMonitorCycle. Also wire the previously-defined-but-unobserved
monitor_check_duration_seconds histogram. result=error when the status
carries any ConditionFalse. Tests for the recorder + ExportStatus path.
---
 pkg/exporters/prometheus/exporter.go      |  45 +++++++
 pkg/exporters/prometheus/exporter_test.go |  76 +++++++++++
 pkg/exporters/prometheus/metrics.go       |  32 +++++
 pkg/exporters/prometheus/metrics_test.go  | 150 ++++++++++++++++++++++
 4 files changed, 303 insertions(+)

diff --git a/pkg/exporters/prometheus/exporter.go b/pkg/exporters/prometheus/exporter.go
index 5bbaa22..25fde96 100644
--- a/pkg/exporters/prometheus/exporter.go
+++ b/pkg/exporters/prometheus/exporter.go
@@ -170,6 +170,12 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
 		return fmt.Errorf("status validation failed: %w", err)
 	}
 
+	// One ExportStatus call corresponds to one completed monitor check cycle:
+	// the monitor ran its check and emitted a status, which the detector forwards
+	// here. Time the cycle and record self-metrics at the end via RecordMonitorCycle.
+	cycleStart := time.Now()
+	cycleHadError := statusHasError(status)
+
 	timer := prometheus.NewTimer(e.metrics.ExportDuration.WithLabelValues(
 		e.nodeName, "prometheus", "status"))
 	defer timer.ObserveDuration()
@@ -209,11 +215,50 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
 	e.metrics.ExportOperationsTotal.WithLabelValues(
 		e.nodeName, "prometheus", "status", "success").Inc()
 
+	// Record monitor-cycle self-metrics. status.Source is the monitor name.
+	// A status carrying any ConditionFalse is treated as a failed cycle.
+	var cycleErr error
+	if cycleHadError {
+		cycleErr = fmt.Errorf("monitor %s reported an unhealthy condition", status.Source)
+	}
+	e.RecordMonitorCycle(status.Source, time.Since(cycleStart), cycleErr)
+
 	log.Printf("[DEBUG] Exported status from %s to Prometheus", status.Source)
 
 	return nil
 }
 
+// statusHasError reports whether a status carries any condition signalling an
+// unhealthy/failed monitor cycle (ConditionFalse). Conditions that are True or
+// Unknown (e.g. the synthetic MonitorBlocked condition) do not count as errors.
+func statusHasError(status *types.Status) bool {
+	for _, cond := range status.Conditions {
+		if cond.Status == types.ConditionFalse {
+			return true
+		}
+	}
+	return false
+}
+
+// RecordMonitorCycle records self-metrics for one completed monitor check cycle:
+//   - increments MonitorCyclesTotal with result="success" or result="error"
+//   - observes the cycle duration into MonitorCheckDuration
+//   - sets MonitorCycleLastTimestamp to the current time (a "last run" heartbeat)
+//
+// monitorName is the name of the monitor (status.Source). A non-nil err marks
+// the cycle as an error. This is the seam the detector's per-cycle path reaches
+// via ExportStatus; it is also safe to call directly.
+func (e *PrometheusExporter) RecordMonitorCycle(monitorName string, duration time.Duration, err error) {
+	result := "success"
+	if err != nil {
+		result = "error"
+	}
+
+	e.metrics.MonitorCyclesTotal.WithLabelValues(e.nodeName, monitorName, result).Inc()
+	e.metrics.MonitorCheckDuration.WithLabelValues(e.nodeName, monitorName).Observe(duration.Seconds())
+	e.metrics.MonitorCycleLastTimestamp.WithLabelValues(e.nodeName, monitorName).Set(float64(time.Now().Unix()))
+}
+
 // recordLatencyMetrics extracts latency metrics from status metadata and records them
 func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 	latencyMetrics := status.GetLatencyMetrics()
diff --git a/pkg/exporters/prometheus/exporter_test.go b/pkg/exporters/prometheus/exporter_test.go
index 4f4d814..84772dd 100644
--- a/pkg/exporters/prometheus/exporter_test.go
+++ b/pkg/exporters/prometheus/exporter_test.go
@@ -255,6 +255,82 @@ func TestExportStatus(t *testing.T) {
 	}
 }
 
+// TestExportStatusRecordsMonitorCycle verifies that ExportStatus records the
+// per-cycle self-metrics (MonitorCyclesTotal, MonitorCheckDuration,
+// MonitorCycleLastTimestamp) and classifies the cycle result based on the
+// presence of a ConditionFalse condition.
+func TestExportStatusRecordsMonitorCycle(t *testing.T) {
+	port := freePort(t)
+	config := &types.PrometheusExporterConfig{
+		Enabled:   true,
+		Port:      port,
+		Path:      "/metrics",
+		Namespace: "test",
+	}
+	settings := &types.GlobalSettings{NodeName: "test-node"}
+
+	exporter, err := NewPrometheusExporter(config, settings)
+	if err != nil {
+		t.Fatalf("failed to create exporter: %v", err)
+	}
+
+	ctx := context.Background()
+	if err := exporter.Start(ctx); err != nil {
+		t.Fatalf("failed to start exporter: %v", err)
+	}
+	defer exporter.Stop()
+
+	// Healthy cycle (no ConditionFalse) -> result=success.
+	healthy := &types.Status{
+		Source:    "disk-monitor",
+		Timestamp: time.Now(),
+		Conditions: []types.Condition{
+			{Type: "DiskHealthy", Status: types.ConditionTrue, Reason: "OK", Message: "ok", Transition: time.Now()},
+		},
+	}
+	if err := exporter.ExportStatus(ctx, healthy); err != nil {
+		t.Fatalf("failed to export healthy status: %v", err)
+	}
+
+	// Unhealthy cycle (ConditionFalse) -> result=error.
+	unhealthy := &types.Status{
+		Source:    "disk-monitor",
+		Timestamp: time.Now(),
+		Conditions: []types.Condition{
+			{Type: "DiskHealthy", Status: types.ConditionFalse, Reason: "Full", Message: "disk full", Transition: time.Now()},
+		},
+	}
+	if err := exporter.ExportStatus(ctx, unhealthy); err != nil {
+		t.Fatalf("failed to export unhealthy status: %v", err)
+	}
+
+	families, err := exporter.registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	if got, ok := counterValue(families, "test_monitor_cycles_total", map[string]string{
+		"monitor_name": "disk-monitor", "result": "success",
+	}); !ok || got != 1 {
+		t.Errorf("monitor_cycles_total{result=success} = %v (found=%v), want 1", got, ok)
+	}
+	if got, ok := counterValue(families, "test_monitor_cycles_total", map[string]string{
+		"monitor_name": "disk-monitor", "result": "error",
+	}); !ok || got != 1 {
+		t.Errorf("monitor_cycles_total{result=error} = %v (found=%v), want 1", got, ok)
+	}
+	if got, ok := histogramSampleCount(families, "test_monitor_check_duration_seconds", map[string]string{
+		"monitor_name": "disk-monitor",
+	}); !ok || got != 2 {
+		t.Errorf("monitor_check_duration_seconds sample count = %v (found=%v), want 2", got, ok)
+	}
+	if got, ok := gaugeValue(families, "test_monitor_cycle_last_timestamp_seconds", map[string]string{
+		"monitor_name": "disk-monitor",
+	}); !ok || got <= 0 {
+		t.Errorf("monitor_cycle_last_timestamp_seconds = %v (found=%v), want > 0", got, ok)
+	}
+}
+
 func TestExportProblem(t *testing.T) {
 	port := freePort(t)
 	config := &types.PrometheusExporterConfig{
diff --git a/pkg/exporters/prometheus/metrics.go b/pkg/exporters/prometheus/metrics.go
index e41ea74..57eb957 100644
--- a/pkg/exporters/prometheus/metrics.go
+++ b/pkg/exporters/prometheus/metrics.go
@@ -15,6 +15,7 @@ type Metrics struct {
 	ConditionsTotal       *prometheus.CounterVec
 	ExportOperationsTotal *prometheus.CounterVec
 	ExportErrorsTotal     *prometheus.CounterVec
+	MonitorCyclesTotal    *prometheus.CounterVec
 
 	// Gauge metrics
 	ProblemsActive   *prometheus.GaugeVec
@@ -24,6 +25,11 @@ type Metrics struct {
 	StartTimeSeconds *prometheus.GaugeVec
 	UptimeSeconds    *prometheus.GaugeVec
 
+	// MonitorCycleLastTimestamp records the unix-seconds time of each monitor's
+	// most recently completed check cycle. Used as a per-monitor "last run"
+	// heartbeat for staleness alerting.
+	MonitorCycleLastTimestamp *prometheus.GaugeVec
+
 	// Network latency gauge metrics
 	GatewayLatencySeconds         *prometheus.GaugeVec
 	PeerLatencySeconds            *prometheus.GaugeVec
@@ -136,6 +142,17 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 			[]string{"node", "exporter", "error_type"},
 		),
 
+		MonitorCyclesTotal: prometheus.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "monitor_cycles_total",
+				Help:        "Total number of completed monitor check cycles, partitioned by result (success/error)",
+				ConstLabels: labels,
+			},
+			[]string{"node", "monitor_name", "result"},
+		),
+
 		// Gauge metrics
 		ConditionStatus: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
@@ -203,6 +220,17 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 			[]string{"node"},
 		),
 
+		MonitorCycleLastTimestamp: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "monitor_cycle_last_timestamp_seconds",
+				Help:        "Unix timestamp (seconds) of each monitor's most recently completed check cycle",
+				ConstLabels: labels,
+			},
+			[]string{"node", "monitor_name"},
+		),
+
 		// Network latency gauge metrics
 		GatewayLatencySeconds: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
@@ -451,12 +479,14 @@ func (m *Metrics) Register(registry *prometheus.Registry) error {
 		m.ConditionsTotal,
 		m.ExportOperationsTotal,
 		m.ExportErrorsTotal,
+		m.MonitorCyclesTotal,
 		m.ProblemsActive,
 		m.MonitorUp,
 		m.ConditionStatus,
 		m.Info,
 		m.StartTimeSeconds,
 		m.UptimeSeconds,
+		m.MonitorCycleLastTimestamp,
 		m.MonitorCheckDuration,
 		m.ExportDuration,
 		// Network latency metrics
@@ -499,12 +529,14 @@ func (m *Metrics) Unregister(registry *prometheus.Registry) {
 		m.ConditionsTotal,
 		m.ExportOperationsTotal,
 		m.ExportErrorsTotal,
+		m.MonitorCyclesTotal,
 		m.ProblemsActive,
 		m.MonitorUp,
 		m.ConditionStatus,
 		m.Info,
 		m.StartTimeSeconds,
 		m.UptimeSeconds,
+		m.MonitorCycleLastTimestamp,
 		m.MonitorCheckDuration,
 		m.ExportDuration,
 		// Network latency metrics
diff --git a/pkg/exporters/prometheus/metrics_test.go b/pkg/exporters/prometheus/metrics_test.go
index 3a90bf0..4b041e9 100644
--- a/pkg/exporters/prometheus/metrics_test.go
+++ b/pkg/exporters/prometheus/metrics_test.go
@@ -1,7 +1,9 @@
 package prometheus
 
 import (
+	"fmt"
 	"testing"
+	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
@@ -104,6 +106,12 @@ func TestNewMetrics(t *testing.T) {
 			if metrics.ExportDuration == nil {
 				t.Error("ExportDuration metric not created")
 			}
+			if metrics.MonitorCyclesTotal == nil {
+				t.Error("MonitorCyclesTotal metric not created")
+			}
+			if metrics.MonitorCycleLastTimestamp == nil {
+				t.Error("MonitorCycleLastTimestamp metric not created")
+			}
 		})
 	}
 }
@@ -193,6 +201,10 @@ func TestMetricUpdates(t *testing.T) {
 	timer2 := prometheus.NewTimer(metrics.ExportDuration.WithLabelValues("test-node", "prometheus", "status"))
 	timer2.ObserveDuration()
 
+	// Monitor-cycle self-metrics
+	metrics.MonitorCyclesTotal.WithLabelValues("test-node", "disk-monitor", "success").Inc()
+	metrics.MonitorCycleLastTimestamp.WithLabelValues("test-node", "disk-monitor").Set(1640995200)
+
 	// Gather metrics to verify they were updated
 	metricFamilies, err := registry.Gather()
 	if err != nil {
@@ -224,6 +236,8 @@ func TestMetricUpdates(t *testing.T) {
 		"test_uptime_seconds",
 		"test_monitor_check_duration_seconds",
 		"test_export_duration_seconds",
+		"test_monitor_cycles_total",
+		"test_monitor_cycle_last_timestamp_seconds",
 	}
 
 	for _, expectedMetric := range expectedMetrics {
@@ -441,6 +455,142 @@ func TestAddressFamilyLabelEmitted(t *testing.T) {
 	}
 }
 
+// counterValue returns the value of the first sample of the named counter metric
+// family whose labels include all of wantLabels, or (0, false) if not found.
+func counterValue(families []*dto.MetricFamily, metricName string, wantLabels map[string]string) (float64, bool) {
+	for _, mf := range families {
+		if mf.GetName() != metricName {
+			continue
+		}
+		for _, metric := range mf.Metric {
+			labels := make(map[string]string)
+			for _, l := range metric.Label {
+				labels[l.GetName()] = l.GetValue()
+			}
+			match := true
+			for k, v := range wantLabels {
+				if labels[k] != v {
+					match = false
+					break
+				}
+			}
+			if match && metric.Counter != nil {
+				return metric.Counter.GetValue(), true
+			}
+		}
+	}
+	return 0, false
+}
+
+// gaugeValue returns the value of the first sample of the named gauge metric
+// family whose labels include all of wantLabels, or (0, false) if not found.
+func gaugeValue(families []*dto.MetricFamily, metricName string, wantLabels map[string]string) (float64, bool) {
+	for _, mf := range families {
+		if mf.GetName() != metricName {
+			continue
+		}
+		for _, metric := range mf.Metric {
+			labels := make(map[string]string)
+			for _, l := range metric.Label {
+				labels[l.GetName()] = l.GetValue()
+			}
+			match := true
+			for k, v := range wantLabels {
+				if labels[k] != v {
+					match = false
+					break
+				}
+			}
+			if match && metric.Gauge != nil {
+				return metric.Gauge.GetValue(), true
+			}
+		}
+	}
+	return 0, false
+}
+
+// histogramSampleCount returns the sample count of the named histogram metric
+// family whose labels include all of wantLabels, or (0, false) if not found.
+func histogramSampleCount(families []*dto.MetricFamily, metricName string, wantLabels map[string]string) (uint64, bool) {
+	for _, mf := range families {
+		if mf.GetName() != metricName {
+			continue
+		}
+		for _, metric := range mf.Metric {
+			labels := make(map[string]string)
+			for _, l := range metric.Label {
+				labels[l.GetName()] = l.GetValue()
+			}
+			match := true
+			for k, v := range wantLabels {
+				if labels[k] != v {
+					match = false
+					break
+				}
+			}
+			if match && metric.Histogram != nil {
+				return metric.Histogram.GetSampleCount(), true
+			}
+		}
+	}
+	return 0, false
+}
+
+func TestRecordMonitorCycle(t *testing.T) {
+	registry := prometheus.NewRegistry()
+	metrics, err := NewMetrics("test", "", nil)
+	if err != nil {
+		t.Fatalf("failed to create metrics: %v", err)
+	}
+	if err := metrics.Register(registry); err != nil {
+		t.Fatalf("failed to register metrics: %v", err)
+	}
+
+	e := &PrometheusExporter{
+		nodeName: "test-node",
+		registry: registry,
+		metrics:  metrics,
+	}
+
+	// Two successful cycles and one errored cycle for the same monitor.
+	e.RecordMonitorCycle("disk-monitor", 50*time.Millisecond, nil)
+	e.RecordMonitorCycle("disk-monitor", 75*time.Millisecond, nil)
+	e.RecordMonitorCycle("disk-monitor", 10*time.Millisecond, fmt.Errorf("check failed"))
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	// Success counter should be 2.
+	if got, ok := counterValue(families, "test_monitor_cycles_total", map[string]string{
+		"monitor_name": "disk-monitor", "result": "success",
+	}); !ok || got != 2 {
+		t.Errorf("monitor_cycles_total{result=success} = %v (found=%v), want 2", got, ok)
+	}
+
+	// Error counter should be 1.
+	if got, ok := counterValue(families, "test_monitor_cycles_total", map[string]string{
+		"monitor_name": "disk-monitor", "result": "error",
+	}); !ok || got != 1 {
+		t.Errorf("monitor_cycles_total{result=error} = %v (found=%v), want 1", got, ok)
+	}
+
+	// MonitorCheckDuration should have observed all 3 cycles.
+	if got, ok := histogramSampleCount(families, "test_monitor_check_duration_seconds", map[string]string{
+		"monitor_name": "disk-monitor",
+	}); !ok || got != 3 {
+		t.Errorf("monitor_check_duration_seconds sample count = %v (found=%v), want 3", got, ok)
+	}
+
+	// Last-timestamp heartbeat gauge should be set to a positive unix time.
+	if got, ok := gaugeValue(families, "test_monitor_cycle_last_timestamp_seconds", map[string]string{
+		"monitor_name": "disk-monitor",
+	}); !ok || got <= 0 {
+		t.Errorf("monitor_cycle_last_timestamp_seconds = %v (found=%v), want > 0", got, ok)
+	}
+}
+
 func TestMetricsReset(t *testing.T) {
 	registry := prometheus.NewRegistry()
 	constLabels := prometheus.Labels{"env": "test"}

From cebbb681a7267cda74c2aee73f5977dfd1e0e5c3 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:46:41 -0500
Subject: [PATCH 26/38] feat(metrics): exporter-health self-metrics (Task
 #17212)

Add exporter_healthy, exporter_last_success_timestamp_seconds, and
exporter_consecutive_failures gauges ({node,exporter}), updated via a
recordExportHealth helper at the success/error branches of both
ExportStatus and ExportProblem. Consecutive failures tracked in a
mutex-guarded field mirrored to the gauge. Validation-failure branches
now also record ExportErrorsTotal (previously had no production site).
Tests cover success/failure/reset. Race-clean.
---
 pkg/exporters/prometheus/exporter.go     | 48 +++++++++++-
 pkg/exporters/prometheus/metrics.go      | 47 +++++++++++
 pkg/exporters/prometheus/metrics_test.go | 99 ++++++++++++++++++++++++
 3 files changed, 193 insertions(+), 1 deletion(-)

diff --git a/pkg/exporters/prometheus/exporter.go b/pkg/exporters/prometheus/exporter.go
index 25fde96..f1b9dfd 100644
--- a/pkg/exporters/prometheus/exporter.go
+++ b/pkg/exporters/prometheus/exporter.go
@@ -25,6 +25,11 @@ type PrometheusExporter struct {
 	activeProblems map[string]*types.Problem // key is problem ID for tracking active problems
 	mu             sync.RWMutex
 	started        bool
+
+	// consecutiveFailures tracks the running count of failed exports since the
+	// last successful export. It backs the ExporterConsecutiveFailures gauge and
+	// is guarded by mu to avoid racy read-modify-write on the gauge itself.
+	consecutiveFailures int
 }
 
 // NewPrometheusExporter creates a new Prometheus exporter with the given configuration
@@ -167,6 +172,11 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
 
 	// Validate status
 	if err := status.Validate(); err != nil {
+		e.metrics.ExportErrorsTotal.WithLabelValues(
+			e.nodeName, "prometheus", "validation").Inc()
+		e.mu.Lock()
+		e.recordExportHealth(false)
+		e.mu.Unlock()
 		return fmt.Errorf("status validation failed: %w", err)
 	}
 
@@ -214,6 +224,9 @@ func (e *PrometheusExporter) ExportStatus(ctx context.Context, status *types.Sta
 	// Record successful export
 	e.metrics.ExportOperationsTotal.WithLabelValues(
 		e.nodeName, "prometheus", "status", "success").Inc()
+	e.mu.Lock()
+	e.recordExportHealth(true)
+	e.mu.Unlock()
 
 	// Record monitor-cycle self-metrics. status.Source is the monitor name.
 	// A status carrying any ConditionFalse is treated as a failed cycle.
@@ -259,6 +272,32 @@ func (e *PrometheusExporter) RecordMonitorCycle(monitorName string, duration tim
 	e.metrics.MonitorCycleLastTimestamp.WithLabelValues(e.nodeName, monitorName).Set(float64(time.Now().Unix()))
 }
 
+// recordExportHealth updates the exporter-health self-metrics for the
+// "prometheus" exporter after an export attempt:
+//   - on success: ExporterHealthy=1, ExporterLastSuccessTimestamp=now, and the
+//     consecutive-failure counter is reset to 0 (ExporterConsecutiveFailures=0).
+//   - on failure: ExporterHealthy=0 and the consecutive-failure counter is
+//     incremented (ExporterConsecutiveFailures=count).
+//
+// The running failure count is tracked in the exporter's consecutiveFailures
+// field rather than via a racy gauge read-modify-write. The caller MUST hold
+// e.mu (write lock) so the field update is safe.
+func (e *PrometheusExporter) recordExportHealth(success bool) {
+	const exporterLabel = "prometheus"
+
+	if success {
+		e.consecutiveFailures = 0
+		e.metrics.ExporterHealthy.WithLabelValues(e.nodeName, exporterLabel).Set(1)
+		e.metrics.ExporterLastSuccessTimestamp.WithLabelValues(e.nodeName, exporterLabel).Set(float64(time.Now().Unix()))
+		e.metrics.ExporterConsecutiveFailures.WithLabelValues(e.nodeName, exporterLabel).Set(0)
+		return
+	}
+
+	e.consecutiveFailures++
+	e.metrics.ExporterHealthy.WithLabelValues(e.nodeName, exporterLabel).Set(0)
+	e.metrics.ExporterConsecutiveFailures.WithLabelValues(e.nodeName, exporterLabel).Set(float64(e.consecutiveFailures))
+}
+
 // recordLatencyMetrics extracts latency metrics from status metadata and records them
 func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 	latencyMetrics := status.GetLatencyMetrics()
@@ -400,6 +439,11 @@ func (e *PrometheusExporter) ExportProblem(ctx context.Context, problem *types.P
 
 	// Validate problem
 	if err := problem.Validate(); err != nil {
+		e.metrics.ExportErrorsTotal.WithLabelValues(
+			e.nodeName, "prometheus", "validation").Inc()
+		e.mu.Lock()
+		e.recordExportHealth(false)
+		e.mu.Unlock()
 		return fmt.Errorf("problem validation failed: %w", err)
 	}
 
@@ -431,9 +475,11 @@ func (e *PrometheusExporter) ExportProblem(ctx context.Context, problem *types.P
 	uptime := time.Since(e.startTime).Seconds()
 	e.metrics.UptimeSeconds.WithLabelValues(e.nodeName).Set(uptime)
 
-	// Record successful export
+	// Record successful export. mu is already held here, so recordExportHealth
+	// is called directly (it must not re-acquire the lock).
 	e.metrics.ExportOperationsTotal.WithLabelValues(
 		e.nodeName, "prometheus", "problem", "success").Inc()
+	e.recordExportHealth(true)
 
 	log.Printf("[DEBUG] Exported problem %s on %s to Prometheus", problem.Type, problem.Resource)
 
diff --git a/pkg/exporters/prometheus/metrics.go b/pkg/exporters/prometheus/metrics.go
index 57eb957..677c9de 100644
--- a/pkg/exporters/prometheus/metrics.go
+++ b/pkg/exporters/prometheus/metrics.go
@@ -30,6 +30,14 @@ type Metrics struct {
 	// heartbeat for staleness alerting.
 	MonitorCycleLastTimestamp *prometheus.GaugeVec
 
+	// Exporter-health self-metrics. These make the exporter's own health
+	// observable so operators can alert on a stuck or failing exporter.
+	// They are keyed by the exporter identity label only (not operation),
+	// since health is per-exporter.
+	ExporterHealthy              *prometheus.GaugeVec
+	ExporterLastSuccessTimestamp *prometheus.GaugeVec
+	ExporterConsecutiveFailures  *prometheus.GaugeVec
+
 	// Network latency gauge metrics
 	GatewayLatencySeconds         *prometheus.GaugeVec
 	PeerLatencySeconds            *prometheus.GaugeVec
@@ -231,6 +239,39 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 			[]string{"node", "monitor_name"},
 		),
 
+		ExporterHealthy: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "exporter_healthy",
+				Help:        "Whether the most recent export succeeded (1 = success, 0 = failure)",
+				ConstLabels: labels,
+			},
+			[]string{"node", "exporter"},
+		),
+
+		ExporterLastSuccessTimestamp: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "exporter_last_success_timestamp_seconds",
+				Help:        "Unix timestamp (seconds) of the most recent successful export (last-success heartbeat for staleness alerting)",
+				ConstLabels: labels,
+			},
+			[]string{"node", "exporter"},
+		),
+
+		ExporterConsecutiveFailures: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "exporter_consecutive_failures",
+				Help:        "Number of consecutive failed exports since the last successful export (reset to 0 on success)",
+				ConstLabels: labels,
+			},
+			[]string{"node", "exporter"},
+		),
+
 		// Network latency gauge metrics
 		GatewayLatencySeconds: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
@@ -487,6 +528,9 @@ func (m *Metrics) Register(registry *prometheus.Registry) error {
 		m.StartTimeSeconds,
 		m.UptimeSeconds,
 		m.MonitorCycleLastTimestamp,
+		m.ExporterHealthy,
+		m.ExporterLastSuccessTimestamp,
+		m.ExporterConsecutiveFailures,
 		m.MonitorCheckDuration,
 		m.ExportDuration,
 		// Network latency metrics
@@ -537,6 +581,9 @@ func (m *Metrics) Unregister(registry *prometheus.Registry) {
 		m.StartTimeSeconds,
 		m.UptimeSeconds,
 		m.MonitorCycleLastTimestamp,
+		m.ExporterHealthy,
+		m.ExporterLastSuccessTimestamp,
+		m.ExporterConsecutiveFailures,
 		m.MonitorCheckDuration,
 		m.ExportDuration,
 		// Network latency metrics
diff --git a/pkg/exporters/prometheus/metrics_test.go b/pkg/exporters/prometheus/metrics_test.go
index 4b041e9..7239b15 100644
--- a/pkg/exporters/prometheus/metrics_test.go
+++ b/pkg/exporters/prometheus/metrics_test.go
@@ -112,6 +112,15 @@ func TestNewMetrics(t *testing.T) {
 			if metrics.MonitorCycleLastTimestamp == nil {
 				t.Error("MonitorCycleLastTimestamp metric not created")
 			}
+			if metrics.ExporterHealthy == nil {
+				t.Error("ExporterHealthy metric not created")
+			}
+			if metrics.ExporterLastSuccessTimestamp == nil {
+				t.Error("ExporterLastSuccessTimestamp metric not created")
+			}
+			if metrics.ExporterConsecutiveFailures == nil {
+				t.Error("ExporterConsecutiveFailures metric not created")
+			}
 		})
 	}
 }
@@ -205,6 +214,11 @@ func TestMetricUpdates(t *testing.T) {
 	metrics.MonitorCyclesTotal.WithLabelValues("test-node", "disk-monitor", "success").Inc()
 	metrics.MonitorCycleLastTimestamp.WithLabelValues("test-node", "disk-monitor").Set(1640995200)
 
+	// Exporter-health self-metrics
+	metrics.ExporterHealthy.WithLabelValues("test-node", "prometheus").Set(1)
+	metrics.ExporterLastSuccessTimestamp.WithLabelValues("test-node", "prometheus").Set(1640995200)
+	metrics.ExporterConsecutiveFailures.WithLabelValues("test-node", "prometheus").Set(0)
+
 	// Gather metrics to verify they were updated
 	metricFamilies, err := registry.Gather()
 	if err != nil {
@@ -238,6 +252,9 @@ func TestMetricUpdates(t *testing.T) {
 		"test_export_duration_seconds",
 		"test_monitor_cycles_total",
 		"test_monitor_cycle_last_timestamp_seconds",
+		"test_exporter_healthy",
+		"test_exporter_last_success_timestamp_seconds",
+		"test_exporter_consecutive_failures",
 	}
 
 	for _, expectedMetric := range expectedMetrics {
@@ -591,6 +608,88 @@ func TestRecordMonitorCycle(t *testing.T) {
 	}
 }
 
+func TestRecordExportHealth(t *testing.T) {
+	registry := prometheus.NewRegistry()
+	metrics, err := NewMetrics("test", "", nil)
+	if err != nil {
+		t.Fatalf("failed to create metrics: %v", err)
+	}
+	if err := metrics.Register(registry); err != nil {
+		t.Fatalf("failed to register metrics: %v", err)
+	}
+
+	e := &PrometheusExporter{
+		nodeName: "test-node",
+		registry: registry,
+		metrics:  metrics,
+	}
+
+	healthLabels := map[string]string{"node": "test-node", "exporter": "prometheus"}
+
+	// recordExportHealth requires the caller to hold e.mu; mirror real usage.
+	record := func(success bool) {
+		e.mu.Lock()
+		e.recordExportHealth(success)
+		e.mu.Unlock()
+	}
+
+	// A successful export: Healthy=1, timestamp>0, consecutive failures=0.
+	record(true)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	if got, ok := gaugeValue(families, "test_exporter_healthy", healthLabels); !ok || got != 1 {
+		t.Errorf("exporter_healthy after success = %v (found=%v), want 1", got, ok)
+	}
+	if got, ok := gaugeValue(families, "test_exporter_last_success_timestamp_seconds", healthLabels); !ok || got <= 0 {
+		t.Errorf("exporter_last_success_timestamp_seconds after success = %v (found=%v), want > 0", got, ok)
+	}
+	if got, ok := gaugeValue(families, "test_exporter_consecutive_failures", healthLabels); !ok || got != 0 {
+		t.Errorf("exporter_consecutive_failures after success = %v (found=%v), want 0", got, ok)
+	}
+
+	// Capture the last-success timestamp so we can confirm failures don't bump it.
+	lastSuccess, _ := gaugeValue(families, "test_exporter_last_success_timestamp_seconds", healthLabels)
+
+	// Two consecutive failures: Healthy=0, consecutive failures increments to 2.
+	record(false)
+	record(false)
+
+	families, err = registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	if got, ok := gaugeValue(families, "test_exporter_healthy", healthLabels); !ok || got != 0 {
+		t.Errorf("exporter_healthy after failures = %v (found=%v), want 0", got, ok)
+	}
+	if got, ok := gaugeValue(families, "test_exporter_consecutive_failures", healthLabels); !ok || got != 2 {
+		t.Errorf("exporter_consecutive_failures after 2 failures = %v (found=%v), want 2", got, ok)
+	}
+	// Last-success timestamp must not change on failure.
+	if got, ok := gaugeValue(families, "test_exporter_last_success_timestamp_seconds", healthLabels); !ok || got != lastSuccess {
+		t.Errorf("exporter_last_success_timestamp_seconds changed on failure = %v, want %v", got, lastSuccess)
+	}
+
+	// A success after failures: Healthy=1, consecutive failures reset to 0.
+	record(true)
+
+	families, err = registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	if got, ok := gaugeValue(families, "test_exporter_healthy", healthLabels); !ok || got != 1 {
+		t.Errorf("exporter_healthy after recovery = %v (found=%v), want 1", got, ok)
+	}
+	if got, ok := gaugeValue(families, "test_exporter_consecutive_failures", healthLabels); !ok || got != 0 {
+		t.Errorf("exporter_consecutive_failures after recovery = %v (found=%v), want 0", got, ok)
+	}
+}
+
 func TestMetricsReset(t *testing.T) {
 	registry := prometheus.NewRegistry()
 	constLabels := prometheus.Labels{"env": "test"}

From 59e6567b28027342fd191a191a6e9b1262424844 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 04:56:45 -0500
Subject: [PATCH 27/38] feat(metrics): remediator circuit-breaker state gauge
 (Task #17213)

Expose the remediator circuit-breaker state as gauge
remediator_circuit_breaker_state{node} (0=closed,1=open,2=half-open) via
a push-observer: remediators defines CircuitStateObserver (int-based, no
prometheus import), the registry notifies it on every transition and on
SetCircuitStateObserver (current-state push); the prometheus exporter
implements ObserveCircuitState to set the gauge; main wires them when
both exist. No import cycle. Tests on both sides.
---
 cmd/node-doctor/main.go                    |  19 +++-
 cmd/node-doctor/main_additional_test.go    |  22 ++--
 cmd/node-doctor/main_comprehensive_test.go |   2 +-
 pkg/exporters/prometheus/exporter.go       |   9 ++
 pkg/exporters/prometheus/metrics.go        |  20 ++++
 pkg/exporters/prometheus/metrics_test.go   |  61 +++++++++++
 pkg/remediators/registry.go                |  61 +++++++++++
 pkg/remediators/registry_test.go           | 113 +++++++++++++++++++++
 8 files changed, 292 insertions(+), 15 deletions(-)

diff --git a/cmd/node-doctor/main.go b/cmd/node-doctor/main.go
index d612a3d..ed300b4 100644
--- a/cmd/node-doctor/main.go
+++ b/cmd/node-doctor/main.go
@@ -220,13 +220,22 @@ func main() {
 	if remediatorRegistry != nil {
 		historyProvider = &remediationHistoryAdapter{registry: remediatorRegistry}
 	}
-	exporters, exporterInterfaces, err := createExporters(ctx, config, historyProvider)
+	exporters, exporterInterfaces, promExporter, err := createExporters(ctx, config, historyProvider)
 	if err != nil {
 		log.Fatalf("Failed to create exporters: %v", err)
 	}
 
 	log.Printf("[INFO] Created %d exporters", len(exporters))
 
+	// Expose the remediator circuit-breaker state as a Prometheus gauge. Only wire
+	// when both the registry (remediation enabled) and the Prometheus exporter are
+	// present. SetCircuitStateObserver pushes the current state immediately and on
+	// every subsequent transition.
+	if remediatorRegistry != nil && promExporter != nil {
+		remediatorRegistry.SetCircuitStateObserver(promExporter)
+		log.Printf("[INFO] Remediator circuit-breaker state wired to Prometheus gauge")
+	}
+
 	// Create monitor factory for hot reload
 	monitorFactory := &monitorFactoryAdapter{ctx: ctx}
 
@@ -328,9 +337,12 @@ func (a *remediationHistoryAdapter) GetHistory(limit int) interface{} {
 // createExporters creates and configures all exporters from the configuration.
 // remediationProvider is optional; when non-nil it is wired to the health server
 // before Start() so /remediation/history is available immediately on first request.
-func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remediationProvider health.RemediationHistoryProvider) ([]ExporterLifecycle, []types.Exporter, error) {
+func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remediationProvider health.RemediationHistoryProvider) ([]ExporterLifecycle, []types.Exporter, *prometheusexporter.PrometheusExporter, error) {
 	var exporters []ExporterLifecycle
 	var exporterInterfaces []types.Exporter
+	// promExporterTyped keeps a typed reference to the Prometheus exporter (if one
+	// is created and started) so the caller can wire it as a circuit-state observer.
+	var promExporterTyped *prometheusexporter.PrometheusExporter
 
 	// Create Kubernetes exporter if enabled
 	if config.Exporters.Kubernetes != nil && config.Exporters.Kubernetes.Enabled {
@@ -416,6 +428,7 @@ func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remedi
 			} else {
 				exporters = append(exporters, promExporter)
 				exporterInterfaces = append(exporterInterfaces, promExporter)
+				promExporterTyped = promExporter
 				log.Printf("[INFO] Prometheus exporter created and started on port %d", config.Exporters.Prometheus.Port)
 			}
 		}
@@ -429,7 +442,7 @@ func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remedi
 		exporterInterfaces = append(exporterInterfaces, noopExp)
 	}
 
-	return exporters, exporterInterfaces, nil
+	return exporters, exporterInterfaces, promExporterTyped, nil
 }
 
 // dumpConfiguration prints the effective configuration as JSON
diff --git a/cmd/node-doctor/main_additional_test.go b/cmd/node-doctor/main_additional_test.go
index 0cdd9c9..91b2a14 100644
--- a/cmd/node-doctor/main_additional_test.go
+++ b/cmd/node-doctor/main_additional_test.go
@@ -157,7 +157,7 @@ func TestCreateExporters_Current(t *testing.T) {
 			},
 		}
 
-		exporters, interfaces, err := createExporters(ctx, config, nil)
+		exporters, interfaces, _, err := createExporters(ctx, config, nil)
 		if err != nil {
 			t.Errorf("createExporters() error = %v, want nil", err)
 		}
@@ -198,7 +198,7 @@ func TestCreateExporters_Current(t *testing.T) {
 			},
 		}
 
-		exporters, _, err := createExporters(ctx, config, nil)
+		exporters, _, _, err := createExporters(ctx, config, nil)
 		if err != nil {
 			t.Errorf("createExporters() error = %v, want nil", err)
 		}
@@ -348,7 +348,7 @@ func TestCreateExporters_HTTPExporterEnabled(t *testing.T) {
 		},
 	}
 
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -391,7 +391,7 @@ func TestCreateExporters_PrometheusExporterEnabled(t *testing.T) {
 		},
 	}
 
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -435,7 +435,7 @@ func TestCreateExporters_KubernetesExporterEnabled(t *testing.T) {
 
 	// This should not panic even without valid kubeconfig
 	// It will log a warning but continue
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -484,7 +484,7 @@ func TestCreateExporters_AllExportersEnabled(t *testing.T) {
 		},
 	}
 
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -524,7 +524,7 @@ func TestCreateExporters_HealthServerCreation(t *testing.T) {
 		},
 	}
 
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -558,7 +558,7 @@ func TestCreateExporters_NoopFallbackVerification(t *testing.T) {
 		},
 	}
 
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -629,7 +629,7 @@ func TestCreateExporters_HTTPExporterWithValidConfig(t *testing.T) {
 		},
 	}
 
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -677,7 +677,7 @@ func TestCreateExporters_KubernetesExporterWithValidConfig(t *testing.T) {
 	}
 
 	// This will fail without kubeconfig but should exercise the validation path
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
@@ -742,7 +742,7 @@ func TestCreateExporters_MultipleExportersWithValidConfig(t *testing.T) {
 		},
 	}
 
-	exporters, interfaces, err := createExporters(ctx, config, nil)
+	exporters, interfaces, _, err := createExporters(ctx, config, nil)
 	if err != nil {
 		t.Errorf("createExporters() error = %v, want nil", err)
 	}
diff --git a/cmd/node-doctor/main_comprehensive_test.go b/cmd/node-doctor/main_comprehensive_test.go
index 1dc9545..707c62e 100644
--- a/cmd/node-doctor/main_comprehensive_test.go
+++ b/cmd/node-doctor/main_comprehensive_test.go
@@ -267,7 +267,7 @@ func TestCreateExporters_TableDriven(t *testing.T) {
 			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 			defer cancel()
 
-			exporters, interfaces, err := createExporters(ctx, tt.config, nil)
+			exporters, interfaces, _, err := createExporters(ctx, tt.config, nil)
 			if err != nil {
 				t.Errorf("createExporters() error = %v", err)
 				return
diff --git a/pkg/exporters/prometheus/exporter.go b/pkg/exporters/prometheus/exporter.go
index f1b9dfd..2a62964 100644
--- a/pkg/exporters/prometheus/exporter.go
+++ b/pkg/exporters/prometheus/exporter.go
@@ -298,6 +298,15 @@ func (e *PrometheusExporter) recordExportHealth(success bool) {
 	e.metrics.ExporterConsecutiveFailures.WithLabelValues(e.nodeName, exporterLabel).Set(float64(e.consecutiveFailures))
 }
 
+// ObserveCircuitState sets the remediator circuit-breaker state gauge to the
+// supplied value. It implements the remediators.CircuitStateObserver interface
+// so the remediator registry can push state transitions here without importing
+// this package. The state int uses the registry's encoding: 0=closed, 1=open,
+// 2=half-open.
+func (e *PrometheusExporter) ObserveCircuitState(state int) {
+	e.metrics.RemediatorCircuitBreakerState.WithLabelValues(e.nodeName).Set(float64(state))
+}
+
 // recordLatencyMetrics extracts latency metrics from status metadata and records them
 func (e *PrometheusExporter) recordLatencyMetrics(status *types.Status) {
 	latencyMetrics := status.GetLatencyMetrics()
diff --git a/pkg/exporters/prometheus/metrics.go b/pkg/exporters/prometheus/metrics.go
index 677c9de..30c4387 100644
--- a/pkg/exporters/prometheus/metrics.go
+++ b/pkg/exporters/prometheus/metrics.go
@@ -58,6 +58,13 @@ type Metrics struct {
 
 	APIServerLatencySeconds *prometheus.GaugeVec
 
+	// RemediatorCircuitBreakerState exposes the remediator registry's circuit
+	// breaker state. The value encodes the state as: 0=closed (normal operation),
+	// 1=open (remediations blocked after too many failures), 2=half-open (testing
+	// recovery). This encoding matches the CircuitBreakerState iota in
+	// pkg/remediators/registry.go exactly.
+	RemediatorCircuitBreakerState *prometheus.GaugeVec
+
 	// Histogram metrics
 	MonitorCheckDuration      *prometheus.HistogramVec
 	ExportDuration            *prometheus.HistogramVec
@@ -434,6 +441,17 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 			[]string{"node"},
 		),
 
+		RemediatorCircuitBreakerState: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "remediator_circuit_breaker_state",
+				Help:        "Current remediator circuit breaker state (0=closed, 1=open, 2=half-open)",
+				ConstLabels: labels,
+			},
+			[]string{"node"},
+		),
+
 		// Histogram metrics
 		MonitorCheckDuration: prometheus.NewHistogramVec(
 			prometheus.HistogramOpts{
@@ -549,6 +567,7 @@ func (m *Metrics) Register(registry *prometheus.Registry) error {
 		m.DNSPredictedBreachSeconds,
 		m.DNSPredictionConfidence,
 		m.APIServerLatencySeconds,
+		m.RemediatorCircuitBreakerState,
 		m.GatewayLatencyHistogram,
 		m.PeerLatencyHistogram,
 		m.DNSLatencyHistogram,
@@ -602,6 +621,7 @@ func (m *Metrics) Unregister(registry *prometheus.Registry) {
 		m.DNSPredictedBreachSeconds,
 		m.DNSPredictionConfidence,
 		m.APIServerLatencySeconds,
+		m.RemediatorCircuitBreakerState,
 		m.GatewayLatencyHistogram,
 		m.PeerLatencyHistogram,
 		m.DNSLatencyHistogram,
diff --git a/pkg/exporters/prometheus/metrics_test.go b/pkg/exporters/prometheus/metrics_test.go
index 7239b15..fde4733 100644
--- a/pkg/exporters/prometheus/metrics_test.go
+++ b/pkg/exporters/prometheus/metrics_test.go
@@ -749,3 +749,64 @@ func TestMetricsReset(t *testing.T) {
 		t.Error("monitor_up metric should still exist after ProblemsActive reset")
 	}
 }
+
+func TestRemediatorCircuitBreakerStateGauge(t *testing.T) {
+	registry := prometheus.NewRegistry()
+	metrics, err := NewMetrics("test", "", nil)
+	if err != nil {
+		t.Fatalf("failed to create metrics: %v", err)
+	}
+	if err := metrics.Register(registry); err != nil {
+		t.Fatalf("failed to register metrics: %v", err)
+	}
+
+	e := &PrometheusExporter{
+		nodeName: "test-node",
+		registry: registry,
+		metrics:  metrics,
+	}
+
+	// ObserveCircuitState(2) should set the gauge to 2 (half-open).
+	e.ObserveCircuitState(2)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	const metricName = "test_remediator_circuit_breaker_state"
+
+	// The gauge must be present in the gathered (registered) set.
+	found := false
+	for _, mf := range families {
+		if mf.GetName() == metricName {
+			found = true
+			break
+		}
+	}
+	if !found {
+		t.Fatalf("%s not present in registered/gathered metrics", metricName)
+	}
+
+	got, ok := gaugeValue(families, metricName, map[string]string{"node": "test-node"})
+	if !ok {
+		t.Fatalf("%s{node=test-node} not found", metricName)
+	}
+	if got != 2 {
+		t.Errorf("%s = %v, want 2 (half-open)", metricName, got)
+	}
+
+	// A subsequent transition value should overwrite the gauge.
+	e.ObserveCircuitState(0)
+	families, err = registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+	got, ok = gaugeValue(families, metricName, map[string]string{"node": "test-node"})
+	if !ok {
+		t.Fatalf("%s{node=test-node} not found after second observe", metricName)
+	}
+	if got != 0 {
+		t.Errorf("%s = %v, want 0 (closed)", metricName, got)
+	}
+}
diff --git a/pkg/remediators/registry.go b/pkg/remediators/registry.go
index 39dd570..073bc95 100644
--- a/pkg/remediators/registry.go
+++ b/pkg/remediators/registry.go
@@ -67,6 +67,22 @@ func (s CircuitBreakerState) String() string {
 	}
 }
 
+// CircuitStateObserver is notified whenever the circuit breaker state changes.
+// It is intentionally minimal so the remediators package stays decoupled from any
+// concrete metrics/exporter implementation: the state is passed as an int that
+// matches the CircuitBreakerState iota encoding (0=Closed, 1=Open, 2=HalfOpen).
+//
+// Implementations (e.g. the Prometheus exporter) translate the int into whatever
+// observability primitive they expose. This avoids an import edge from
+// pkg/remediators into pkg/exporters/prometheus.
+type CircuitStateObserver interface {
+	// ObserveCircuitState is called with the current circuit breaker state encoded
+	// as an int (0=Closed, 1=Open, 2=HalfOpen). It is called once immediately when
+	// the observer is registered (to seed the current state) and then on every
+	// subsequent state transition.
+	ObserveCircuitState(state int)
+}
+
 // RemediatorFactory is a function that creates a new remediator instance.
 // It returns a remediator that implements the types.Remediator interface.
 type RemediatorFactory func() (types.Remediator, error)
@@ -212,6 +228,11 @@ type RemediatorRegistry struct {
 
 	// Controller coordination (optional)
 	leaseClient *LeaseClient
+
+	// circuitStateObserver is notified on every circuit breaker state change
+	// (optional). It lets an exporter expose the circuit state as a metric
+	// without the remediators package depending on a concrete exporter.
+	circuitStateObserver CircuitStateObserver
 }
 
 // DefaultCircuitBreakerConfig provides sensible defaults for the circuit breaker.
@@ -314,6 +335,41 @@ func (r *RemediatorRegistry) SetLeaseClient(leaseClient *LeaseClient) {
 	r.logInfof("Lease client configured for controller coordination")
 }
 
+// SetCircuitStateObserver registers an observer that is notified of circuit
+// breaker state changes. The observer is called once immediately with the
+// current state (so a backing metric is correct from the start) and then on
+// every subsequent transition. Passing a nil observer clears any existing one.
+//
+// The state is passed to the observer as an int matching the
+// CircuitBreakerState iota encoding (0=Closed, 1=Open, 2=HalfOpen).
+func (r *RemediatorRegistry) SetCircuitStateObserver(o CircuitStateObserver) {
+	r.mu.Lock()
+	r.circuitStateObserver = o
+	current := r.circuitState
+	r.mu.Unlock()
+
+	// Push the current state immediately so the observer (e.g. a gauge) reflects
+	// reality from the moment it is wired. Done outside the lock to avoid holding
+	// it across the observer callback.
+	if o != nil {
+		o.ObserveCircuitState(int(current))
+	}
+}
+
+// notifyCircuitStateObserver invokes the registered circuit-state observer (if
+// any) with the given state. It MUST be called with r.mu held; it captures the
+// observer reference under the lock and is safe when no observer is set (no-op).
+//
+// The observer callback is invoked while the lock is held. This is acceptable
+// because observers (such as the Prometheus exporter) only translate the int
+// into a metric and never call back into the registry, so there is no risk of
+// re-entrant deadlock.
+func (r *RemediatorRegistry) notifyCircuitStateObserver(state CircuitBreakerState) {
+	if r.circuitStateObserver != nil {
+		r.circuitStateObserver.ObserveCircuitState(int(state))
+	}
+}
+
 // GetLeaseClient returns the configured lease client, if any.
 func (r *RemediatorRegistry) GetLeaseClient() *LeaseClient {
 	r.mu.RLock()
@@ -613,6 +669,7 @@ func (r *RemediatorRegistry) checkCircuitBreaker() error {
 			r.circuitState = CircuitHalfOpen
 			r.circuitLastStateChange = time.Now()
 			r.consecutiveSuccesses = 0
+			r.notifyCircuitStateObserver(r.circuitState)
 			r.logInfof("Circuit breaker transitioning to HalfOpen (timeout elapsed)")
 			return nil
 		}
@@ -687,6 +744,7 @@ func (r *RemediatorRegistry) recordCircuitBreakerSuccess() {
 			r.circuitState = CircuitClosed
 			r.circuitLastStateChange = time.Now()
 			r.consecutiveSuccesses = 0
+			r.notifyCircuitStateObserver(r.circuitState)
 			r.logInfof("Circuit breaker transitioning to Closed (success threshold reached)")
 		}
 	}
@@ -706,6 +764,7 @@ func (r *RemediatorRegistry) recordCircuitBreakerFailure() {
 		r.circuitOpenedAt = time.Now()
 		r.circuitLastStateChange = time.Now()
 		r.consecutiveFailures = 1 // Reset counter
+		r.notifyCircuitStateObserver(r.circuitState)
 		r.logWarnf("Circuit breaker transitioning to Open (failure in HalfOpen state)")
 		return
 	}
@@ -716,6 +775,7 @@ func (r *RemediatorRegistry) recordCircuitBreakerFailure() {
 			r.circuitState = CircuitOpen
 			r.circuitOpenedAt = time.Now()
 			r.circuitLastStateChange = time.Now()
+			r.notifyCircuitStateObserver(r.circuitState)
 			r.logWarnf("Circuit breaker transitioning to Open (failure threshold %d reached)",
 				r.circuitConfig.Threshold)
 		}
@@ -849,6 +909,7 @@ func (r *RemediatorRegistry) ResetCircuitBreaker() {
 	r.consecutiveSuccesses = 0
 	r.circuitOpenedAt = time.Time{}
 	r.circuitLastStateChange = time.Now()
+	r.notifyCircuitStateObserver(r.circuitState)
 	r.logInfof("Circuit breaker manually reset to Closed")
 }
 
diff --git a/pkg/remediators/registry_test.go b/pkg/remediators/registry_test.go
index 75607d9..35988d0 100644
--- a/pkg/remediators/registry_test.go
+++ b/pkg/remediators/registry_test.go
@@ -1494,3 +1494,116 @@ func TestRemediatorRegistry_LogWithLogger(t *testing.T) {
 		t.Errorf("expected 1 error message, got %d", len(logger.errorMessages))
 	}
 }
+
+// fakeCircuitStateObserver records every ObserveCircuitState call for assertions.
+type fakeCircuitStateObserver struct {
+	mu     sync.Mutex
+	states []int
+}
+
+func (f *fakeCircuitStateObserver) ObserveCircuitState(state int) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.states = append(f.states, state)
+}
+
+func (f *fakeCircuitStateObserver) snapshot() []int {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	out := make([]int, len(f.states))
+	copy(out, f.states)
+	return out
+}
+
+func (f *fakeCircuitStateObserver) last() (int, bool) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	if len(f.states) == 0 {
+		return 0, false
+	}
+	return f.states[len(f.states)-1], true
+}
+
+func TestSetCircuitStateObserver(t *testing.T) {
+	t.Run("pushes current state immediately on registration", func(t *testing.T) {
+		registry := NewRegistry(100, 100)
+		obs := &fakeCircuitStateObserver{}
+
+		registry.SetCircuitStateObserver(obs)
+
+		states := obs.snapshot()
+		if len(states) != 1 {
+			t.Fatalf("expected exactly 1 immediate observation, got %d (%v)", len(states), states)
+		}
+		// Fresh registry starts Closed (iota 0).
+		if states[0] != int(CircuitClosed) {
+			t.Errorf("immediate observed state = %d, want %d (CircuitClosed)", states[0], int(CircuitClosed))
+		}
+	})
+
+	t.Run("notifies on forced state transition", func(t *testing.T) {
+		registry := NewRegistry(100, 100)
+		obs := &fakeCircuitStateObserver{}
+
+		// Drive the circuit to Open via failures so the observer (once registered)
+		// will see the transition value. Register the observer first so it is wired
+		// before the transition fires.
+		registry.SetCircuitStateObserver(obs)
+
+		config := CircuitBreakerConfig{
+			Threshold:        2,
+			Timeout:          1 * time.Second,
+			SuccessThreshold: 2,
+		}
+		if err := registry.SetCircuitBreakerConfig(config); err != nil {
+			t.Fatalf("SetCircuitBreakerConfig() failed: %v", err)
+		}
+
+		mock := newMockRemediator("test", true)
+		registry.Register(RemediatorInfo{
+			Type:    "test",
+			Factory: func() (types.Remediator, error) { return mock, nil },
+		})
+
+		problem := createTestProblem("test-type", "test-resource")
+		for i := 0; i < 2; i++ {
+			mock.ClearCooldown(problem)
+			_ = registry.Remediate(context.Background(), "test", problem)
+		}
+
+		if registry.GetCircuitState() != CircuitOpen {
+			t.Fatalf("circuit state = %v, want Open", registry.GetCircuitState())
+		}
+
+		last, ok := obs.last()
+		if !ok {
+			t.Fatal("observer received no notifications")
+		}
+		if last != int(CircuitOpen) {
+			t.Errorf("last observed state = %d, want %d (CircuitOpen)", last, int(CircuitOpen))
+		}
+	})
+
+	t.Run("ResetCircuitBreaker notifies observer with closed state", func(t *testing.T) {
+		registry := NewRegistry(100, 100)
+		obs := &fakeCircuitStateObserver{}
+		registry.SetCircuitStateObserver(obs)
+
+		registry.ResetCircuitBreaker()
+
+		last, ok := obs.last()
+		if !ok {
+			t.Fatal("observer received no notifications")
+		}
+		if last != int(CircuitClosed) {
+			t.Errorf("last observed state = %d, want %d (CircuitClosed)", last, int(CircuitClosed))
+		}
+	})
+
+	t.Run("nil observer is a no-op", func(t *testing.T) {
+		registry := NewRegistry(100, 100)
+		// Should not panic when no observer is set and a transition fires.
+		registry.SetCircuitStateObserver(nil)
+		registry.ResetCircuitBreaker()
+	})
+}

From 4a73e4de9f248c7d42c92ab8246b67d5e48d7f6c Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:07:07 -0500
Subject: [PATCH 28/38] feat(metrics): config hot-reload self-metrics (Task
 #17214)

Add config_reloads_total{node,result}, config_reload_last_timestamp_
seconds, config_reload_last_success, config_reload_duration_seconds.
ReloadCoordinator gains an injected ReloadMetricsRecorder func (no
prometheus import); performReload uses a named return + deferred closure
so every return path (load/validation/callback error, no-op, success)
records exactly once. Exporter.RecordConfigReload implements it; wired in
main via det.SetReloadMetricsRecorder(promExporter.RecordConfigReload).
Tests on both sides; no import cycle.
---
 cmd/node-doctor/main.go                  |   9 +
 pkg/detector/detector.go                 |  14 ++
 pkg/exporters/prometheus/exporter.go     |  26 +++
 pkg/exporters/prometheus/metrics.go      |  71 +++++++
 pkg/exporters/prometheus/metrics_test.go |  70 +++++++
 pkg/reload/coordinator.go                |  38 +++-
 pkg/reload/coordinator_metrics_test.go   | 230 +++++++++++++++++++++++
 7 files changed, 457 insertions(+), 1 deletion(-)
 create mode 100644 pkg/reload/coordinator_metrics_test.go

diff --git a/cmd/node-doctor/main.go b/cmd/node-doctor/main.go
index ed300b4..3c9d2d7 100644
--- a/cmd/node-doctor/main.go
+++ b/cmd/node-doctor/main.go
@@ -253,6 +253,15 @@ func main() {
 		det.SetRemediatorRegistry(remediatorRegistry)
 	}
 
+	// Wire config hot-reload self-metrics. The detector owns the reload
+	// coordinator but only sees exporters via types.Exporter; pass a closure over
+	// the concrete Prometheus exporter's RecordConfigReload. Only wired when the
+	// Prometheus exporter is present (nil otherwise).
+	if promExporter != nil {
+		det.SetReloadMetricsRecorder(promExporter.RecordConfigReload)
+		log.Printf("[INFO] Config hot-reload self-metrics wired to Prometheus exporter")
+	}
+
 	// Start the detector
 	log.Printf("[INFO] Starting detector...")
 	if err := det.Start(); err != nil {
diff --git a/pkg/detector/detector.go b/pkg/detector/detector.go
index a5ce325..c9fc6c7 100644
--- a/pkg/detector/detector.go
+++ b/pkg/detector/detector.go
@@ -221,6 +221,20 @@ func (pd *ProblemDetector) SetRemediatorRegistry(r RemediationExecutor) {
 	pd.remediatorRegistry = r
 }
 
+// SetReloadMetricsRecorder wires a config hot-reload self-metrics recorder into
+// the detector's reload coordinator. The detector only sees exporters via the
+// types.Exporter interface and does not hold the concrete Prometheus exporter,
+// so the recorder is injected here as a decoupled closure (mirroring the
+// EventEmitter pattern). Nil-safe: a nil recorder disables recording, and the
+// call is a no-op if the coordinator has not been constructed.
+func (pd *ProblemDetector) SetReloadMetricsRecorder(recorder reload.ReloadMetricsRecorder) {
+	pd.mu.Lock()
+	defer pd.mu.Unlock()
+	if pd.reloadCoordinator != nil {
+		pd.reloadCoordinator.SetMetricsRecorder(recorder)
+	}
+}
+
 // IsRunning returns true if the detector is currently running
 func (pd *ProblemDetector) IsRunning() bool {
 	pd.mu.RLock()
diff --git a/pkg/exporters/prometheus/exporter.go b/pkg/exporters/prometheus/exporter.go
index 2a62964..3f2279c 100644
--- a/pkg/exporters/prometheus/exporter.go
+++ b/pkg/exporters/prometheus/exporter.go
@@ -272,6 +272,32 @@ func (e *PrometheusExporter) RecordMonitorCycle(monitorName string, duration tim
 	e.metrics.MonitorCycleLastTimestamp.WithLabelValues(e.nodeName, monitorName).Set(float64(time.Now().Unix()))
 }
 
+// RecordConfigReload records self-metrics for one completed config hot-reload
+// attempt:
+//   - increments ConfigReloadsTotal with result="success" or result="failure"
+//   - sets ConfigReloadLastSuccess to 1 (success) or 0 (failure)
+//   - sets ConfigReloadLastTimestamp to the current time (last-attempt heartbeat,
+//     updated on both success and failure)
+//   - observes the reload duration into ConfigReloadDuration
+//
+// It implements the reload.ReloadMetricsRecorder signature so the reload
+// coordinator can push reload outcomes here via an injected closure without
+// importing this package. Safe to call directly and concurrently (the
+// underlying prometheus vecs are goroutine-safe).
+func (e *PrometheusExporter) RecordConfigReload(success bool, duration time.Duration) {
+	result := "failure"
+	lastSuccess := 0.0
+	if success {
+		result = "success"
+		lastSuccess = 1.0
+	}
+
+	e.metrics.ConfigReloadsTotal.WithLabelValues(e.nodeName, result).Inc()
+	e.metrics.ConfigReloadLastSuccess.WithLabelValues(e.nodeName).Set(lastSuccess)
+	e.metrics.ConfigReloadLastTimestamp.WithLabelValues(e.nodeName).Set(float64(time.Now().Unix()))
+	e.metrics.ConfigReloadDuration.WithLabelValues(e.nodeName).Observe(duration.Seconds())
+}
+
 // recordExportHealth updates the exporter-health self-metrics for the
 // "prometheus" exporter after an export attempt:
 //   - on success: ExporterHealthy=1, ExporterLastSuccessTimestamp=now, and the
diff --git a/pkg/exporters/prometheus/metrics.go b/pkg/exporters/prometheus/metrics.go
index 30c4387..45047fa 100644
--- a/pkg/exporters/prometheus/metrics.go
+++ b/pkg/exporters/prometheus/metrics.go
@@ -38,6 +38,19 @@ type Metrics struct {
 	ExporterLastSuccessTimestamp *prometheus.GaugeVec
 	ExporterConsecutiveFailures  *prometheus.GaugeVec
 
+	// Config hot-reload self-metrics. These make the configuration hot-reload
+	// path observable so operators can alert on reload failures or a reload
+	// loop that has gone quiet. All are keyed by the node label only.
+	//
+	// Timestamp semantics: ConfigReloadLastTimestamp records the time of the most
+	// recent reload ATTEMPT (success or failure), updated at the end of every
+	// attempt. Pair it with ConfigReloadLastSuccess (1 if that most-recent attempt
+	// succeeded, 0 if it failed) to distinguish "reloaded recently and it worked"
+	// from "tried recently and it failed".
+	ConfigReloadsTotal        *prometheus.CounterVec
+	ConfigReloadLastTimestamp *prometheus.GaugeVec
+	ConfigReloadLastSuccess   *prometheus.GaugeVec
+
 	// Network latency gauge metrics
 	GatewayLatencySeconds         *prometheus.GaugeVec
 	PeerLatencySeconds            *prometheus.GaugeVec
@@ -72,6 +85,10 @@ type Metrics struct {
 	PeerLatencyHistogram      *prometheus.HistogramVec
 	DNSLatencyHistogram       *prometheus.HistogramVec
 	APIServerLatencyHistogram *prometheus.HistogramVec
+
+	// ConfigReloadDuration observes the wall-clock time of each completed config
+	// reload attempt (performReload), keyed by the node label.
+	ConfigReloadDuration *prometheus.HistogramVec
 }
 
 // NewMetrics creates a new Metrics instance with all metric definitions
@@ -279,6 +296,40 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 			[]string{"node", "exporter"},
 		),
 
+		// Config hot-reload self-metrics
+		ConfigReloadsTotal: prometheus.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "config_reloads_total",
+				Help:        "Total number of completed configuration reload attempts, partitioned by result (success/failure)",
+				ConstLabels: labels,
+			},
+			[]string{"node", "result"},
+		),
+
+		ConfigReloadLastTimestamp: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "config_reload_last_timestamp_seconds",
+				Help:        "Unix timestamp (seconds) of the most recent configuration reload attempt (success or failure)",
+				ConstLabels: labels,
+			},
+			[]string{"node"},
+		),
+
+		ConfigReloadLastSuccess: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "config_reload_last_success",
+				Help:        "Whether the most recent configuration reload attempt succeeded (1 = success, 0 = failure)",
+				ConstLabels: labels,
+			},
+			[]string{"node"},
+		),
+
 		// Network latency gauge metrics
 		GatewayLatencySeconds: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
@@ -524,6 +575,18 @@ func NewMetrics(namespace, subsystem string, constLabels prometheus.Labels) (*Me
 			},
 			[]string{"node"},
 		),
+
+		ConfigReloadDuration: prometheus.NewHistogramVec(
+			prometheus.HistogramOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "config_reload_duration_seconds",
+				Help:        "Duration of configuration reload attempts in seconds",
+				ConstLabels: labels,
+				Buckets:     prometheus.DefBuckets,
+			},
+			[]string{"node"},
+		),
 	}
 
 	return m, nil
@@ -549,8 +612,12 @@ func (m *Metrics) Register(registry *prometheus.Registry) error {
 		m.ExporterHealthy,
 		m.ExporterLastSuccessTimestamp,
 		m.ExporterConsecutiveFailures,
+		m.ConfigReloadsTotal,
+		m.ConfigReloadLastTimestamp,
+		m.ConfigReloadLastSuccess,
 		m.MonitorCheckDuration,
 		m.ExportDuration,
+		m.ConfigReloadDuration,
 		// Network latency metrics
 		m.GatewayLatencySeconds,
 		m.PeerLatencySeconds,
@@ -603,8 +670,12 @@ func (m *Metrics) Unregister(registry *prometheus.Registry) {
 		m.ExporterHealthy,
 		m.ExporterLastSuccessTimestamp,
 		m.ExporterConsecutiveFailures,
+		m.ConfigReloadsTotal,
+		m.ConfigReloadLastTimestamp,
+		m.ConfigReloadLastSuccess,
 		m.MonitorCheckDuration,
 		m.ExportDuration,
+		m.ConfigReloadDuration,
 		// Network latency metrics
 		m.GatewayLatencySeconds,
 		m.PeerLatencySeconds,
diff --git a/pkg/exporters/prometheus/metrics_test.go b/pkg/exporters/prometheus/metrics_test.go
index fde4733..de62cc2 100644
--- a/pkg/exporters/prometheus/metrics_test.go
+++ b/pkg/exporters/prometheus/metrics_test.go
@@ -810,3 +810,73 @@ func TestRemediatorCircuitBreakerStateGauge(t *testing.T) {
 		t.Errorf("%s = %v, want 0 (closed)", metricName, got)
 	}
 }
+
+func TestRecordConfigReload(t *testing.T) {
+	registry := prometheus.NewRegistry()
+	metrics, err := NewMetrics("test", "", nil)
+	if err != nil {
+		t.Fatalf("failed to create metrics: %v", err)
+	}
+	if err := metrics.Register(registry); err != nil {
+		t.Fatalf("failed to register metrics: %v", err)
+	}
+
+	e := &PrometheusExporter{
+		nodeName: "test-node",
+		registry: registry,
+		metrics:  metrics,
+	}
+
+	nodeLabels := map[string]string{"node": "test-node"}
+
+	// A successful reload: LastSuccess=1, timestamp>0, success counter=1,
+	// duration histogram observed once.
+	e.RecordConfigReload(true, 25*time.Millisecond)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	if got, ok := gaugeValue(families, "test_config_reload_last_success", nodeLabels); !ok || got != 1 {
+		t.Errorf("config_reload_last_success after success = %v (found=%v), want 1", got, ok)
+	}
+	if got, ok := gaugeValue(families, "test_config_reload_last_timestamp_seconds", nodeLabels); !ok || got <= 0 {
+		t.Errorf("config_reload_last_timestamp_seconds after success = %v (found=%v), want > 0", got, ok)
+	}
+	if got, ok := counterValue(families, "test_config_reloads_total", map[string]string{
+		"node": "test-node", "result": "success",
+	}); !ok || got != 1 {
+		t.Errorf("config_reloads_total{result=success} = %v (found=%v), want 1", got, ok)
+	}
+	if got, ok := histogramSampleCount(families, "test_config_reload_duration_seconds", nodeLabels); !ok || got != 1 {
+		t.Errorf("config_reload_duration_seconds sample count = %v (found=%v), want 1", got, ok)
+	}
+
+	// A failed reload: LastSuccess flips to 0, failure counter=1, timestamp still
+	// advances (last-attempt heartbeat), duration histogram observed again.
+	e.RecordConfigReload(false, 10*time.Millisecond)
+
+	families, err = registry.Gather()
+	if err != nil {
+		t.Fatalf("failed to gather metrics: %v", err)
+	}
+
+	if got, ok := gaugeValue(families, "test_config_reload_last_success", nodeLabels); !ok || got != 0 {
+		t.Errorf("config_reload_last_success after failure = %v (found=%v), want 0", got, ok)
+	}
+	if got, ok := counterValue(families, "test_config_reloads_total", map[string]string{
+		"node": "test-node", "result": "failure",
+	}); !ok || got != 1 {
+		t.Errorf("config_reloads_total{result=failure} = %v (found=%v), want 1", got, ok)
+	}
+	// Success counter must be unchanged.
+	if got, ok := counterValue(families, "test_config_reloads_total", map[string]string{
+		"node": "test-node", "result": "success",
+	}); !ok || got != 1 {
+		t.Errorf("config_reloads_total{result=success} after failure = %v (found=%v), want 1", got, ok)
+	}
+	if got, ok := histogramSampleCount(families, "test_config_reload_duration_seconds", nodeLabels); !ok || got != 2 {
+		t.Errorf("config_reload_duration_seconds sample count = %v (found=%v), want 2", got, ok)
+	}
+}
diff --git a/pkg/reload/coordinator.go b/pkg/reload/coordinator.go
index 3ed0afd..b1a4c3e 100644
--- a/pkg/reload/coordinator.go
+++ b/pkg/reload/coordinator.go
@@ -17,12 +17,22 @@ type ReloadCallback func(ctx context.Context, newConfig *types.NodeDoctorConfig,
 // EventEmitter emits reload status events.
 type EventEmitter func(severity types.EventSeverity, reason, message string)
 
+// ReloadMetricsRecorder records the outcome of a completed reload attempt.
+// success is true when the reload applied (or determined there were no changes)
+// without error, false when any step failed. duration is the wall-clock time
+// spent in performReload. It is invoked exactly once per reload attempt.
+//
+// This is an injected hook (mirroring EventEmitter) so the reload package never
+// imports the prometheus exporter, avoiding coupling/cycles.
+type ReloadMetricsRecorder func(success bool, duration time.Duration)
+
 // ReloadCoordinator orchestrates configuration reload operations.
 type ReloadCoordinator struct {
 	configPath       string
 	currentConfig    *types.NodeDoctorConfig
 	reloadCallback   ReloadCallback
 	eventEmitter     EventEmitter
+	metricsRecorder  ReloadMetricsRecorder
 	validator        *ConfigValidator
 	mu               sync.Mutex
 	reloadInProgress bool
@@ -82,9 +92,21 @@ func (rc *ReloadCoordinator) TriggerReload(ctx context.Context) error {
 }
 
 // performReload executes the reload process.
-func (rc *ReloadCoordinator) performReload(ctx context.Context) error {
+//
+// The named return value err is inspected by a deferred closure that records
+// reload self-metrics exactly once, on EVERY return path (load error, validation
+// error, no-changes success, callback error, full success). success is derived
+// from err == nil at the moment of return, so adding a new early return cannot
+// silently skip metric recording.
+func (rc *ReloadCoordinator) performReload(ctx context.Context) (err error) {
 	startTime := time.Now()
 
+	// Record reload self-metrics exactly once when performReload returns,
+	// regardless of which path produced the result.
+	defer func() {
+		rc.recordMetrics(err == nil, time.Since(startTime))
+	}()
+
 	// Emit start event
 	rc.emitEvent(types.EventInfo, "ConfigReloadStarted", "Configuration reload initiated")
 
@@ -187,6 +209,20 @@ func (rc *ReloadCoordinator) emitEvent(severity types.EventSeverity, reason, mes
 	}
 }
 
+// SetMetricsRecorder sets (or clears) the reload self-metrics recorder. It is
+// nil-safe: passing nil disables metric recording. Safe to call before the
+// coordinator is used to trigger reloads.
+func (rc *ReloadCoordinator) SetMetricsRecorder(recorder ReloadMetricsRecorder) {
+	rc.metricsRecorder = recorder
+}
+
+// recordMetrics invokes the metrics recorder, if one is set.
+func (rc *ReloadCoordinator) recordMetrics(success bool, duration time.Duration) {
+	if rc.metricsRecorder != nil {
+		rc.metricsRecorder(success, duration)
+	}
+}
+
 // GetCurrentConfig returns the current active configuration (thread-safe).
 func (rc *ReloadCoordinator) GetCurrentConfig() *types.NodeDoctorConfig {
 	rc.mu.Lock()
diff --git a/pkg/reload/coordinator_metrics_test.go b/pkg/reload/coordinator_metrics_test.go
new file mode 100644
index 0000000..406bf96
--- /dev/null
+++ b/pkg/reload/coordinator_metrics_test.go
@@ -0,0 +1,230 @@
+package reload
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/supporttools/node-doctor/pkg/types"
+)
+
+// fakeReloadRecorder captures invocations of a ReloadMetricsRecorder.
+type fakeReloadRecorder struct {
+	mu        sync.Mutex
+	calls     int
+	lastOK    bool
+	lastDur   time.Duration
+	successes int
+	failures  int
+}
+
+func (f *fakeReloadRecorder) record(success bool, duration time.Duration) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.calls++
+	f.lastOK = success
+	f.lastDur = duration
+	if success {
+		f.successes++
+	} else {
+		f.failures++
+	}
+}
+
+// TestPerformReload_RecorderSuccess verifies the metrics recorder is invoked
+// exactly once with success=true and a non-negative duration on a good reload
+// that applies changes.
+func TestPerformReload_RecorderSuccess(t *testing.T) {
+	tempDir := t.TempDir()
+	configPath := filepath.Join(tempDir, "config.yaml")
+
+	configYAML := `
+apiVersion: v1
+kind: NodeDoctorConfig
+metadata:
+  name: test-config
+settings:
+  nodeName: test-node
+monitors:
+  - name: new-monitor
+    type: kubernetes-kubelet-check
+    enabled: true
+    interval: 30s
+    timeout: 10s
+exporters:
+  kubernetes:
+    enabled: true
+    namespace: default
+remediation:
+  enabled: false
+`
+	if err := os.WriteFile(configPath, []byte(configYAML), 0644); err != nil {
+		t.Fatalf("Failed to create config file: %v", err)
+	}
+
+	config := &types.NodeDoctorConfig{
+		APIVersion: "v1",
+		Kind:       "NodeDoctorConfig",
+		Metadata:   types.ConfigMetadata{Name: "test-config"},
+		Settings:   types.GlobalSettings{NodeName: "test-node"},
+		Exporters: types.ExporterConfigs{
+			Kubernetes: &types.KubernetesExporterConfig{Enabled: true, Namespace: "default"},
+		},
+		Remediation: types.RemediationConfig{Enabled: false},
+	}
+
+	callback := func(ctx context.Context, newConfig *types.NodeDoctorConfig, diff *ConfigDiff) error {
+		return nil
+	}
+	emitter := func(severity types.EventSeverity, reason, message string) {}
+
+	coordinator := NewReloadCoordinator(configPath, config, callback, emitter)
+
+	rec := &fakeReloadRecorder{}
+	coordinator.SetMetricsRecorder(rec.record)
+
+	if err := coordinator.TriggerReload(context.Background()); err != nil {
+		t.Fatalf("Unexpected error: %v", err)
+	}
+
+	if rec.calls != 1 {
+		t.Fatalf("expected recorder to be invoked exactly once, got %d", rec.calls)
+	}
+	if !rec.lastOK {
+		t.Errorf("expected success=true, got false")
+	}
+	if rec.lastDur < 0 {
+		t.Errorf("expected non-negative duration, got %v", rec.lastDur)
+	}
+	if rec.successes != 1 || rec.failures != 0 {
+		t.Errorf("expected 1 success / 0 failures, got %d/%d", rec.successes, rec.failures)
+	}
+}
+
+// TestPerformReload_RecorderFailure verifies the metrics recorder is invoked
+// exactly once with success=false when the reload fails (validation failure).
+func TestPerformReload_RecorderFailure(t *testing.T) {
+	tempDir := t.TempDir()
+	configPath := filepath.Join(tempDir, "config.yaml")
+
+	// Config that loads but fails validation (empty monitor name/type, no exporters).
+	configYAML := `
+apiVersion: v1
+kind: NodeDoctorConfig
+metadata:
+  name: test-config
+settings:
+  nodeName: test-node
+monitors:
+  - name: ""
+    type: ""
+    enabled: true
+    interval: 30s
+    timeout: 10s
+exporters:
+  kubernetes:
+    enabled: false
+  http:
+    enabled: false
+  prometheus:
+    enabled: false
+remediation:
+  enabled: false
+`
+	if err := os.WriteFile(configPath, []byte(configYAML), 0644); err != nil {
+		t.Fatalf("Failed to create config file: %v", err)
+	}
+
+	config := &types.NodeDoctorConfig{
+		APIVersion: "v1",
+		Kind:       "NodeDoctorConfig",
+		Metadata:   types.ConfigMetadata{Name: "test-config"},
+		Settings:   types.GlobalSettings{NodeName: "test-node"},
+	}
+
+	callbackCalled := false
+	callback := func(ctx context.Context, newConfig *types.NodeDoctorConfig, diff *ConfigDiff) error {
+		callbackCalled = true
+		return nil
+	}
+	emitter := func(severity types.EventSeverity, reason, message string) {}
+
+	coordinator := NewReloadCoordinator(configPath, config, callback, emitter)
+
+	rec := &fakeReloadRecorder{}
+	coordinator.SetMetricsRecorder(rec.record)
+
+	if err := coordinator.TriggerReload(context.Background()); err == nil {
+		t.Fatal("expected reload to fail validation")
+	}
+	if callbackCalled {
+		t.Error("callback should not run on a failed reload")
+	}
+
+	if rec.calls != 1 {
+		t.Fatalf("expected recorder to be invoked exactly once, got %d", rec.calls)
+	}
+	if rec.lastOK {
+		t.Errorf("expected success=false, got true")
+	}
+	if rec.lastDur < 0 {
+		t.Errorf("expected non-negative duration, got %v", rec.lastDur)
+	}
+	if rec.successes != 0 || rec.failures != 1 {
+		t.Errorf("expected 0 success / 1 failure, got %d/%d", rec.successes, rec.failures)
+	}
+}
+
+// TestPerformReload_NilRecorder ensures reloads are nil-safe when no recorder is set.
+func TestPerformReload_NilRecorder(t *testing.T) {
+	tempDir := t.TempDir()
+	configPath := filepath.Join(tempDir, "config.yaml")
+
+	configYAML := `
+apiVersion: v1
+kind: NodeDoctorConfig
+metadata:
+  name: test-config
+settings:
+  nodeName: test-node
+monitors:
+  - name: new-monitor
+    type: kubernetes-kubelet-check
+    enabled: true
+    interval: 30s
+    timeout: 10s
+exporters:
+  kubernetes:
+    enabled: true
+    namespace: default
+remediation:
+  enabled: false
+`
+	if err := os.WriteFile(configPath, []byte(configYAML), 0644); err != nil {
+		t.Fatalf("Failed to create config file: %v", err)
+	}
+
+	config := &types.NodeDoctorConfig{
+		APIVersion: "v1",
+		Kind:       "NodeDoctorConfig",
+		Metadata:   types.ConfigMetadata{Name: "test-config"},
+		Settings:   types.GlobalSettings{NodeName: "test-node"},
+		Exporters: types.ExporterConfigs{
+			Kubernetes: &types.KubernetesExporterConfig{Enabled: true, Namespace: "default"},
+		},
+	}
+
+	callback := func(ctx context.Context, newConfig *types.NodeDoctorConfig, diff *ConfigDiff) error {
+		return nil
+	}
+	emitter := func(severity types.EventSeverity, reason, message string) {}
+
+	coordinator := NewReloadCoordinator(configPath, config, callback, emitter)
+	// No recorder set; must not panic.
+	if err := coordinator.TriggerReload(context.Background()); err != nil {
+		t.Fatalf("unexpected error with nil recorder: %v", err)
+	}
+}

From b0bccc26ddf4d23567740a37f03f8fcdfc6412ee Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:10:07 -0500
Subject: [PATCH 29/38] test(metrics): authoritative self-metrics registration
 test (Task #17215)

Add TestSelfMetricsRegistered: registers Metrics into NewRegistry (incl.
go/process collectors), populates every self-metric via its recorder
(RecordMonitorCycle/recordExportHealth/ObserveCircuitState/
RecordConfigReload + direct export-ops Inc/Observe), gathers, and asserts
all 14 node_doctor_* self-metric families plus go_goroutines and (linux)
process_start_time_seconds are present. Test-only.
---
 pkg/exporters/prometheus/metrics_test.go | 133 +++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/pkg/exporters/prometheus/metrics_test.go b/pkg/exporters/prometheus/metrics_test.go
index de62cc2..6681ed3 100644
--- a/pkg/exporters/prometheus/metrics_test.go
+++ b/pkg/exporters/prometheus/metrics_test.go
@@ -2,6 +2,8 @@ package prometheus
 
 import (
 	"fmt"
+	"runtime"
+	"strings"
 	"testing"
 	"time"
 
@@ -880,3 +882,134 @@ func TestRecordConfigReload(t *testing.T) {
 		t.Errorf("config_reload_duration_seconds sample count = %v (found=%v), want 2", got, ok)
 	}
 }
+
+// TestSelfMetricsRegistered is the authoritative "all self-metrics registered"
+// test for task #17215. It proves the full self-metrics surface (observability
+// about node-doctor itself) is both registered in the exporter's registry and
+// actually gather-able after each metric has been recorded.
+//
+// Registry choice: it wires NewRegistry(...) — the exact constructor the
+// production exporter uses (see NewPrometheusExporter in exporter.go) — and then
+// registers the node-doctor Metrics into it via metrics.Register. This is the
+// only configuration that lets one test assert BOTH the node-doctor self-metric
+// families AND the standard go_*/process_* collector families that NewRegistry
+// adds. A namespace of "node_doctor" (the production default) is used so the
+// asserted family names carry the real production prefix, e.g.
+// node_doctor_monitor_cycles_total.
+//
+// Population: each self-metric is exercised through the same recorder method
+// production uses (RecordMonitorCycle, recordExportHealth, RecordConfigReload,
+// ObserveCircuitState). The three export-operation self-metrics
+// (export_operations_total, export_errors_total, export_duration_seconds) have
+// no single dedicated recorder, so they are populated directly via
+// WithLabelValues(...).Inc()/Observe() to yield at least one series each.
+func TestSelfMetricsRegistered(t *testing.T) {
+	const (
+		namespace = "node_doctor"
+		nodeName  = "test-node"
+		exporter  = "prometheus"
+	)
+
+	// Use the production registry constructor so go_*/process_* collectors are
+	// present, then register node-doctor metrics into it exactly as the exporter
+	// does.
+	registry := NewRegistry(prometheus.Labels{})
+	metrics, err := NewMetrics(namespace, "", nil)
+	if err != nil {
+		t.Fatalf("NewMetrics() error: %v", err)
+	}
+	if err := metrics.Register(registry); err != nil {
+		t.Fatalf("metrics.Register() error: %v", err)
+	}
+
+	e := &PrometheusExporter{
+		nodeName: nodeName,
+		registry: registry,
+		metrics:  metrics,
+	}
+
+	// --- Populate every self-metric so each family yields at least one series. ---
+
+	// Monitor-cycle self-metrics: monitor_cycles_total,
+	// monitor_check_duration_seconds, monitor_cycle_last_timestamp_seconds.
+	e.RecordMonitorCycle("disk-monitor", 25*time.Millisecond, nil)
+
+	// Exporter-health self-metrics: exporter_healthy,
+	// exporter_last_success_timestamp_seconds, exporter_consecutive_failures.
+	// recordExportHealth requires the caller to hold e.mu; mirror real usage.
+	e.mu.Lock()
+	e.recordExportHealth(true)
+	e.mu.Unlock()
+
+	// Export-operation self-metrics: export_operations_total, export_errors_total,
+	// export_duration_seconds. No single recorder covers these, so drive the vecs
+	// directly to create a series in each family.
+	metrics.ExportOperationsTotal.WithLabelValues(nodeName, exporter, "status", "success").Inc()
+	metrics.ExportErrorsTotal.WithLabelValues(nodeName, exporter, "timeout").Inc()
+	metrics.ExportDuration.WithLabelValues(nodeName, exporter, "status").Observe(0.01)
+
+	// Circuit-breaker self-metric: remediator_circuit_breaker_state.
+	e.ObserveCircuitState(0)
+
+	// Config-reload self-metrics: config_reloads_total,
+	// config_reload_last_timestamp_seconds, config_reload_last_success,
+	// config_reload_duration_seconds.
+	e.RecordConfigReload(true, 15*time.Millisecond)
+
+	families, err := registry.Gather()
+	if err != nil {
+		t.Fatalf("registry.Gather() error: %v", err)
+	}
+
+	present := make(map[string]bool, len(families))
+	for _, mf := range families {
+		present[mf.GetName()] = true
+	}
+
+	// Full self-metrics surface, with the production node_doctor_ prefix derived
+	// from the namespace passed to NewMetrics (subsystem is empty).
+	expected := []string{
+		// monitor cycle
+		"node_doctor_monitor_cycles_total",
+		"node_doctor_monitor_cycle_last_timestamp_seconds",
+		"node_doctor_monitor_check_duration_seconds",
+		// exporter health
+		"node_doctor_exporter_healthy",
+		"node_doctor_exporter_last_success_timestamp_seconds",
+		"node_doctor_exporter_consecutive_failures",
+		// export ops
+		"node_doctor_export_operations_total",
+		"node_doctor_export_errors_total",
+		"node_doctor_export_duration_seconds",
+		// circuit breaker
+		"node_doctor_remediator_circuit_breaker_state",
+		// config reload
+		"node_doctor_config_reloads_total",
+		"node_doctor_config_reload_last_timestamp_seconds",
+		"node_doctor_config_reload_last_success",
+		"node_doctor_config_reload_duration_seconds",
+	}
+
+	var missing []string
+	for _, name := range expected {
+		if !present[name] {
+			missing = append(missing, name)
+		}
+	}
+	if len(missing) > 0 {
+		t.Errorf("self-metric families missing from registered/gathered set: %s", strings.Join(missing, ", "))
+	}
+
+	// Because NewRegistry wires the Go and process collectors, the runtime/process
+	// self-observability families must also be exposed. go_goroutines is present
+	// on every platform; process_* is only emitted on platforms the collector
+	// supports (Linux in CI/production).
+	if !present["go_goroutines"] {
+		t.Errorf("expected go_goroutines from the Go collector wired by NewRegistry; not present")
+	}
+	if runtime.GOOS == "linux" {
+		if !present["process_start_time_seconds"] {
+			t.Errorf("expected process_start_time_seconds from the process collector on linux; not present")
+		}
+	}
+}

From e8e2b18e74f230cfdec8cbeb78c7cce6c91cc924 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:15:38 -0500
Subject: [PATCH 30/38] feat(remediators): ip -6 route flush cache operation
 (Task #17220)

Add NetworkFlushIPv6Route ("flush-ipv6-route") operation to
NetworkRemediator: runs ip -6 route flush cache, optional ip -6 route show
backup when BackupRouting. Unlike reset-routing, a flush failure is a hard
error (dedicated flush remediator). verifyOperation is a no-op (immediate).
Registration/enablement left to #17222. Tests assert exact command args,
failure semantics, backup ordering, and verify path.
---
 pkg/remediators/network.go      |  49 ++++++++-
 pkg/remediators/network_test.go | 176 ++++++++++++++++++++++++++++++++
 2 files changed, 223 insertions(+), 2 deletions(-)

diff --git a/pkg/remediators/network.go b/pkg/remediators/network.go
index 0463432..ce4fe6d 100644
--- a/pkg/remediators/network.go
+++ b/pkg/remediators/network.go
@@ -22,6 +22,9 @@ const (
 
 	// NetworkResetRouting resets the routing table to defaults
 	NetworkResetRouting NetworkOperation = "reset-routing"
+
+	// NetworkFlushIPv6Route flushes the IPv6 route cache
+	NetworkFlushIPv6Route NetworkOperation = "flush-ipv6-route"
 )
 
 // NetworkConfig contains configuration for the network remediator.
@@ -150,10 +153,10 @@ func NewNetworkRemediator(config NetworkConfig) (*NetworkRemediator, error) {
 func validateNetworkConfig(config NetworkConfig) error {
 	// Validate operation
 	switch config.Operation {
-	case NetworkFlushDNS, NetworkRestartInterface, NetworkResetRouting:
+	case NetworkFlushDNS, NetworkRestartInterface, NetworkResetRouting, NetworkFlushIPv6Route:
 		// Valid operation
 	default:
-		return fmt.Errorf("invalid operation: %s (must be flush-dns, restart-interface, or reset-routing)", config.Operation)
+		return fmt.Errorf("invalid operation: %s (must be flush-dns, restart-interface, reset-routing, or flush-ipv6-route)", config.Operation)
 	}
 
 	// RestartInterface requires an interface name
@@ -203,6 +206,8 @@ func (r *NetworkRemediator) executeOperation(ctx context.Context) error {
 		return r.restartInterface(ctx)
 	case NetworkResetRouting:
 		return r.resetRouting(ctx)
+	case NetworkFlushIPv6Route:
+		return r.flushIPv6RouteCache(ctx)
 	default:
 		return fmt.Errorf("unknown operation: %s", r.config.Operation)
 	}
@@ -294,6 +299,41 @@ func (r *NetworkRemediator) resetRouting(ctx context.Context) error {
 	return nil
 }
 
+// flushIPv6RouteCache flushes the IPv6 route cache via "ip -6 route flush cache".
+//
+// Failure semantics: unlike resetRouting (which treats a cache-flush failure as a
+// non-fatal warning because it is one step of a broader reset), this is a dedicated
+// flush remediator. A failed flush means the remediation did not accomplish its sole
+// purpose, so the error is returned (wrapped with command output) and the remediation
+// is reported as failed.
+func (r *NetworkRemediator) flushIPv6RouteCache(ctx context.Context) error {
+	r.logInfof("Flushing IPv6 route cache")
+
+	// Backup current IPv6 routing table if configured
+	var routingBackup string
+	if r.config.BackupRouting {
+		backup, err := r.networkExecutor.ExecuteCommand(ctx, "ip", "-6", "route", "show")
+		if err != nil {
+			r.logWarnf("Failed to backup IPv6 routing table: %v", err)
+		} else {
+			routingBackup = backup
+			r.logInfof("Backed up IPv6 routing table (%d bytes)", len(routingBackup))
+		}
+	}
+
+	output, err := r.networkExecutor.ExecuteCommand(ctx, "ip", "-6", "route", "flush", "cache")
+	if err != nil {
+		return fmt.Errorf("failed to flush IPv6 route cache: %w (output: %s)", err, output)
+	}
+
+	r.logInfof("IPv6 route cache flush complete")
+	if routingBackup != "" {
+		r.logInfof("IPv6 routing backup available for restore if needed")
+	}
+
+	return nil
+}
+
 // verifyOperation verifies that the network operation succeeded.
 func (r *NetworkRemediator) verifyOperation(ctx context.Context) error {
 	// Create a context with timeout for verification
@@ -314,6 +354,11 @@ func (r *NetworkRemediator) verifyOperation(ctx context.Context) error {
 		// Verify routing table exists after reset
 		return r.verifyRoutingTable(verifyCtx)
 
+	case NetworkFlushIPv6Route:
+		// IPv6 route cache flush is immediate, no verification needed
+		r.logInfof("IPv6 route cache flush operation requires no verification")
+		return nil
+
 	default:
 		return fmt.Errorf("unknown operation for verification: %s", r.config.Operation)
 	}
diff --git a/pkg/remediators/network_test.go b/pkg/remediators/network_test.go
index de943f7..c882daf 100644
--- a/pkg/remediators/network_test.go
+++ b/pkg/remediators/network_test.go
@@ -177,6 +177,21 @@ func TestNewNetworkRemediator(t *testing.T) {
 			},
 			wantErr: false,
 		},
+		{
+			name: "valid flush ipv6 route config",
+			config: NetworkConfig{
+				Operation: NetworkFlushIPv6Route,
+			},
+			wantErr: false,
+		},
+		{
+			name: "valid flush ipv6 route config with backup",
+			config: NetworkConfig{
+				Operation:     NetworkFlushIPv6Route,
+				BackupRouting: true,
+			},
+			wantErr: false,
+		},
 		{
 			name: "invalid operation",
 			config: NetworkConfig{
@@ -884,3 +899,164 @@ func TestNetworkRemediator_LogWithLogger(t *testing.T) {
 		t.Errorf("expected 1 warn message, got %d", len(logger.warnMessages))
 	}
 }
+
+// TestNetworkRemediator_FlushIPv6Route tests IPv6 route cache flushing success.
+func TestNetworkRemediator_FlushIPv6Route(t *testing.T) {
+	config := NetworkConfig{
+		Operation: NetworkFlushIPv6Route,
+	}
+
+	r, err := NewNetworkRemediator(config)
+	if err != nil {
+		t.Fatalf("NewNetworkRemediator() error: %v", err)
+	}
+
+	mockExec := &mockNetworkExecutor{}
+	r.SetNetworkExecutor(mockExec)
+
+	problem := types.Problem{
+		Type:     "ipv6-routing-failure",
+		Resource: "ipv6-routing-table",
+		Severity: types.ProblemCritical,
+	}
+
+	ctx := context.Background()
+	err = r.Remediate(ctx, problem)
+	if err != nil {
+		t.Errorf("Remediate() unexpected error: %v", err)
+	}
+
+	// Verify the exact IPv6 route cache flush command + args were issued.
+	mockExec.mu.Lock()
+	commands := mockExec.executedCommands
+	mockExec.mu.Unlock()
+
+	foundFlush := false
+	for _, cmd := range commands {
+		if cmd == "ip -6 route flush cache" {
+			foundFlush = true
+		}
+	}
+	if !foundFlush {
+		t.Errorf("expected exact command 'ip -6 route flush cache' to be executed, got: %v", commands)
+	}
+}
+
+// TestNetworkRemediator_FlushIPv6Route_Failure verifies that a flush failure is a hard error.
+func TestNetworkRemediator_FlushIPv6Route_Failure(t *testing.T) {
+	config := NetworkConfig{
+		Operation: NetworkFlushIPv6Route,
+	}
+
+	r, err := NewNetworkRemediator(config)
+	if err != nil {
+		t.Fatalf("NewNetworkRemediator() error: %v", err)
+	}
+
+	mockExec := &mockNetworkExecutor{
+		shouldFailCommand: true,
+	}
+	r.SetNetworkExecutor(mockExec)
+
+	problem := types.Problem{
+		Type:     "ipv6-routing-failure",
+		Resource: "ipv6-routing-table",
+		Severity: types.ProblemCritical,
+	}
+
+	ctx := context.Background()
+	err = r.Remediate(ctx, problem)
+	if err == nil {
+		t.Errorf("Remediate() expected error for failed IPv6 route cache flush, got nil")
+	}
+}
+
+// TestNetworkRemediator_FlushIPv6Route_Backup verifies the IPv6 routing table is
+// backed up (via "ip -6 route show") before the flush when BackupRouting is set.
+func TestNetworkRemediator_FlushIPv6Route_Backup(t *testing.T) {
+	config := NetworkConfig{
+		Operation:     NetworkFlushIPv6Route,
+		BackupRouting: true,
+	}
+
+	r, err := NewNetworkRemediator(config)
+	if err != nil {
+		t.Fatalf("NewNetworkRemediator() error: %v", err)
+	}
+
+	mockExec := &mockNetworkExecutor{
+		routingTable: "default via fe80::1 dev eth0",
+	}
+	r.SetNetworkExecutor(mockExec)
+
+	problem := types.Problem{
+		Type:     "ipv6-routing-failure",
+		Resource: "ipv6-routing-table",
+		Severity: types.ProblemCritical,
+	}
+
+	ctx := context.Background()
+	err = r.Remediate(ctx, problem)
+	if err != nil {
+		t.Errorf("Remediate() unexpected error: %v", err)
+	}
+
+	mockExec.mu.Lock()
+	commands := mockExec.executedCommands
+	mockExec.mu.Unlock()
+
+	// The backup ("ip -6 route show") must precede the flush ("ip -6 route flush cache").
+	showIdx := -1
+	flushIdx := -1
+	for i, cmd := range commands {
+		if cmd == "ip -6 route show" && showIdx == -1 {
+			showIdx = i
+		}
+		if cmd == "ip -6 route flush cache" && flushIdx == -1 {
+			flushIdx = i
+		}
+	}
+
+	if showIdx == -1 {
+		t.Errorf("expected 'ip -6 route show' backup command, got: %v", commands)
+	}
+	if flushIdx == -1 {
+		t.Errorf("expected 'ip -6 route flush cache' command, got: %v", commands)
+	}
+	if showIdx != -1 && flushIdx != -1 && showIdx >= flushIdx {
+		t.Errorf("expected backup (idx %d) to precede flush (idx %d): %v", showIdx, flushIdx, commands)
+	}
+}
+
+// TestNetworkRemediator_VerifyOperation_FlushIPv6Route verifies that the IPv6
+// route cache flush requires no verification (returns nil) when VerifyAfter is set.
+func TestNetworkRemediator_VerifyOperation_FlushIPv6Route(t *testing.T) {
+	config := NetworkConfig{
+		Operation:     NetworkFlushIPv6Route,
+		VerifyAfter:   true,
+		VerifyTimeout: 2 * time.Second,
+	}
+
+	r, err := NewNetworkRemediator(config)
+	if err != nil {
+		t.Fatalf("NewNetworkRemediator() error: %v", err)
+	}
+
+	mockExec := &mockNetworkExecutor{}
+	r.SetNetworkExecutor(mockExec)
+
+	ctx := context.Background()
+	if err := r.verifyOperation(ctx); err != nil {
+		t.Errorf("verifyOperation() unexpected error: %v", err)
+	}
+
+	problem := types.Problem{
+		Type:     "ipv6-routing-failure",
+		Resource: "ipv6-routing-table",
+		Severity: types.ProblemCritical,
+	}
+
+	if err := r.Remediate(ctx, problem); err != nil {
+		t.Errorf("Remediate() with VerifyAfter unexpected error: %v", err)
+	}
+}

From ac0355620e636a7cb5e6aca113be8055c5210240 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:18:18 -0500
Subject: [PATCH 31/38] test(remediators): verify DNS flush covers IPv6/AAAA
 (Task #17221)

Document and test that flushDNS is address-family agnostic: resolvectl
flush-caches / systemd-resolve --flush-caches clear the full resolver
cache (A and AAAA), so no separate IPv6 DNS-flush op is needed. New
TestNetworkRemediator_FlushDNS_CoversIPv6 asserts the exact full-cache
flush command for both backends and that no family/type-restricting flag
(-4/-6/--type) is passed.
---
 pkg/remediators/network.go      |  5 ++++
 pkg/remediators/network_test.go | 53 +++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/pkg/remediators/network.go b/pkg/remediators/network.go
index ce4fe6d..77c3964 100644
--- a/pkg/remediators/network.go
+++ b/pkg/remediators/network.go
@@ -214,6 +214,11 @@ func (r *NetworkRemediator) executeOperation(ctx context.Context) error {
 }
 
 // flushDNS flushes the DNS resolver cache.
+//
+// The flush is address-family agnostic: both resolvectl flush-caches and
+// systemd-resolve --flush-caches clear the resolver's ENTIRE cache, including
+// AAAA (IPv6) records, not just A (IPv4). There is therefore no separate
+// IPv6 DNS-flush operation — this one covers both families (see Task #17221).
 func (r *NetworkRemediator) flushDNS(ctx context.Context) error {
 	r.logInfof("Flushing DNS cache")
 
diff --git a/pkg/remediators/network_test.go b/pkg/remediators/network_test.go
index c882daf..295a79f 100644
--- a/pkg/remediators/network_test.go
+++ b/pkg/remediators/network_test.go
@@ -298,6 +298,59 @@ func TestNetworkRemediator_FlushDNS(t *testing.T) {
 	}
 }
 
+// TestNetworkRemediator_FlushDNS_CoversIPv6 verifies that the DNS flush is
+// address-family agnostic and therefore covers IPv6 (AAAA) records (Task
+// #17221). resolvectl flush-caches / systemd-resolve --flush-caches clear the
+// resolver's entire cache; the remediator must NOT pass any family-restricting
+// flag (e.g. -4/-6/--type) that would leave AAAA entries cached. This asserts
+// the exact flush command and the absence of any such restriction.
+func TestNetworkRemediator_FlushDNS_CoversIPv6(t *testing.T) {
+	for _, method := range []string{"resolvectl", "systemd-resolve"} {
+		t.Run(method, func(t *testing.T) {
+			mock := &mockNetworkExecutor{dnsFlushMethod: method}
+			config := NetworkConfig{
+				Operation:     NetworkFlushDNS,
+				VerifyTimeout: 2 * time.Second,
+			}
+			r, err := NewNetworkRemediator(config)
+			if err != nil {
+				t.Fatalf("NewNetworkRemediator: %v", err)
+			}
+			r.networkExecutor = mock
+
+			if err := r.Remediate(context.Background(), types.Problem{}); err != nil {
+				t.Fatalf("flush-dns remediation failed: %v", err)
+			}
+
+			var flushCmd string
+			for _, c := range mock.executedCommands {
+				if strings.HasPrefix(c, method) {
+					flushCmd = c
+				}
+			}
+			if flushCmd == "" {
+				t.Fatalf("expected a %s flush command; executed: %v", method, mock.executedCommands)
+			}
+			// Full-cache flush, no per-family restriction.
+			var wantCmd string
+			switch method {
+			case "resolvectl":
+				wantCmd = "resolvectl flush-caches"
+			case "systemd-resolve":
+				wantCmd = "systemd-resolve --flush-caches"
+			}
+			if flushCmd != wantCmd {
+				t.Errorf("flush command = %q, want %q (a family-agnostic full-cache flush)", flushCmd, wantCmd)
+			}
+			for _, restrict := range []string{"-4", "-6", "--type", "ipv4", "ipv6"} {
+				if strings.Contains(flushCmd, restrict) {
+					t.Errorf("flush command %q contains family/type restriction %q; AAAA entries would not be cleared", flushCmd, restrict)
+				}
+			}
+		})
+	}
+}
+
 // TestNetworkRemediator_RestartInterface tests interface restart.
 func TestNetworkRemediator_RestartInterface(t *testing.T) {
 	config := NetworkConfig{

From 57001840eabc43059e2672925a50607056468779 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:25:05 -0500
Subject: [PATCH 32/38] test(network): build-tagged IPv6 pinger integration
 tests (Task #17223)

Add pinger_ipv6_integration_test.go (//go:build integration) exercising
the real defaultPinger against ::1 and a discovered link-local target,
asserting FamilyIPv6 classification; skips cleanly when IPv6 loopback or
CAP_NET_RAW is unavailable. Complements the untagged platform-agnostic
v6 unit tests (zone/family/destAddr/peer-match) in pinger_test.go. The
-short unit tier never opens raw sockets.
---
 .../network/pinger_ipv6_integration_test.go   | 134 ++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 pkg/monitors/network/pinger_ipv6_integration_test.go

diff --git a/pkg/monitors/network/pinger_ipv6_integration_test.go b/pkg/monitors/network/pinger_ipv6_integration_test.go
new file mode 100644
index 0000000..2874def
--- /dev/null
+++ b/pkg/monitors/network/pinger_ipv6_integration_test.go
@@ -0,0 +1,134 @@
+//go:build integration
+
+// Package network IPv6 pinger integration tests.
+//
+// These tests exercise the REAL defaultPinger against the IPv6 loopback (::1)
+// and therefore require raw ICMPv6 socket privileges (CAP_NET_RAW) and a host
+// with IPv6 enabled. They are gated behind the `integration` build tag so the
+// default `go test -short` unit run never attempts raw sockets:
+//
+//	go test -tags=integration -run IPv6 ./pkg/monitors/network/...
+//
+// The platform-agnostic IPv6 pinger UNIT tests (address-family/zone parsing,
+// destination building, reply matching) live untagged in pinger_test.go; this
+// file adds the live-socket v6 coverage that cannot run in the unit tier.
+package network
+
+import (
+	"context"
+	"net"
+	"testing"
+	"time"
+)
+
+// ipv6LoopbackAvailable reports whether the host has a usable IPv6 loopback,
+// so the test can skip cleanly on IPv4-only / IPv6-disabled environments
+// instead of failing.
+func ipv6LoopbackAvailable(t *testing.T) bool {
+	t.Helper()
+	ln, err := net.Listen("tcp6", "[::1]:0")
+	if err != nil {
+		return false
+	}
+	_ = ln.Close()
+	return true
+}
+
+// TestDefaultPinger_IPv6Loopback_Integration pings ::1 with the real pinger and
+// asserts the result is classified as the IPv6 family. It skips (not fails) when
+// IPv6 loopback is unavailable or raw ICMPv6 sockets require privileges the test
+// process lacks.
+func TestDefaultPinger_IPv6Loopback_Integration(t *testing.T) {
+	if !ipv6LoopbackAvailable(t) {
+		t.Skip("IPv6 loopback not available on this host; skipping IPv6 ping integration test")
+	}
+
+	pinger := newDefaultPinger()
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	results, err := pinger.Ping(ctx, "::1", 2, 2*time.Second)
+	if err != nil {
+		// Raw ICMPv6 sockets need elevated privileges; treat as a skip so the
+		// test is meaningful where it can run and silent where it cannot.
+		t.Skipf("IPv6 ping to ::1 could not run (likely missing CAP_NET_RAW): %v", err)
+	}
+
+	if len(results) == 0 {
+		t.Fatal("expected at least one ping result for ::1")
+	}
+
+	var sawSuccess bool
+	for i, r := range results {
+		if r.Family != FamilyIPv6 {
+			t.Errorf("result[%d].Family = %q, want %q for ::1", i, r.Family, FamilyIPv6)
+		}
+		if r.Success {
+			sawSuccess = true
+		}
+	}
+	if !sawSuccess {
+		t.Errorf("expected at least one successful ICMPv6 echo to ::1; got %+v", results)
+	}
+}
+
+// TestDefaultPinger_IPv6LinkLocal_Integration verifies that pinging a link-local
+// target with a zone does not error at the resolve/send layer (it may legitimately
+// time out with no reply). It guards on IPv6 loopback availability and treats a
+// privilege/socket error as a skip.
+func TestDefaultPinger_IPv6LinkLocal_Integration(t *testing.T) {
+	if !ipv6LoopbackAvailable(t) {
+		t.Skip("IPv6 loopback not available on this host; skipping link-local integration test")
+	}
+
+	// Resolve a usable link-local target+zone from the host's interfaces; skip
+	// if none is present (e.g. minimal container netns).
+	target, ok := firstLinkLocalTarget()
+	if !ok {
+		t.Skip("no IPv6 link-local address with a zone found on this host")
+	}
+
+	pinger := newDefaultPinger()
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+	defer cancel()
+
+	results, err := pinger.Ping(ctx, target, 1, 1*time.Second)
+	if err != nil {
+		t.Skipf("link-local ping to %s could not run (likely missing CAP_NET_RAW): %v", target, err)
+	}
+	if len(results) == 0 {
+		t.Fatalf("expected a result for %s", target)
+	}
+	// The probe may time out (no reply), but the family must still be classified v6.
+	if results[0].Family != FamilyIPv6 {
+		t.Errorf("result.Family = %q, want %q for link-local %s", results[0].Family, FamilyIPv6, target)
+	}
+}
+
+// firstLinkLocalTarget returns the first fe80::/10 address found on a non-loopback
+// interface formatted as "addr%zone", and whether one was found.
+func firstLinkLocalTarget() (string, bool) {
+	ifaces, err := net.Interfaces()
+	if err != nil {
+		return "", false
+	}
+	for _, iface := range ifaces {
+		if iface.Flags&net.FlagLoopback != 0 || iface.Flags&net.FlagUp == 0 {
+			continue
+		}
+		addrs, err := iface.Addrs()
+		if err != nil {
+			continue
+		}
+		for _, a := range addrs {
+			ipnet, ok := a.(*net.IPNet)
+			if !ok {
+				continue
+			}
+			if ipnet.IP.To4() == nil && ipnet.IP.IsLinkLocalUnicast() {
+				return ipnet.IP.String() + "%" + iface.Name, true
+			}
+		}
+	}
+	return "", false
+}

From 053c4ea72666eb24d823aa50b8461280c0c517e8 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:32:07 -0500
Subject: [PATCH 33/38] test(integration): kind dual-stack cluster integration
 test (Task #17226)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add test/integration/testdata/kind-dualstack.yaml (networking.ipFamily:
dual, dual pod/service subnets) and test/integration/ipv6_kind_test.go
(//go:build integration): creates a uniquely-named dual-stack kind
cluster, waits for node readiness, and asserts dual-stack — node PodCIDRs
+ InternalIPs carry both families and a RequireDualStack Service gets
both an IPv4 and IPv6 ClusterIP (fails on single-stack). Skips cleanly
when short/kind-missing/docker-down; t.Cleanup always deletes the
cluster; isolated temp kubeconfig.
---
 test/integration/ipv6_kind_test.go            | 334 ++++++++++++++++++
 test/integration/testdata/kind-dualstack.yaml |  38 ++
 2 files changed, 372 insertions(+)
 create mode 100644 test/integration/ipv6_kind_test.go
 create mode 100644 test/integration/testdata/kind-dualstack.yaml

diff --git a/test/integration/ipv6_kind_test.go b/test/integration/ipv6_kind_test.go
new file mode 100644
index 0000000..0a26c5d
--- /dev/null
+++ b/test/integration/ipv6_kind_test.go
@@ -0,0 +1,334 @@
+// Copyright 2025 Support Tools Contributors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+
+//go:build integration
+// +build integration
+
+// Package integration contains top-level integration tests that exercise
+// Node Doctor against real infrastructure. This file brings up a dual-stack
+// (IPv4 + IPv6) kind cluster and asserts that the cluster is genuinely
+// dual-stack, validating the project's IPv6/dual-stack code paths end-to-end.
+//
+// The test is gated behind the `integration` build tag and skips cleanly when
+// the environment cannot run it (no kind binary, no Docker, or -short). It is
+// intended to run in CI where Docker is available; local/dev sandboxes without
+// Docker will skip rather than fail or hang.
+//
+// Run with:
+//
+//	go test -tags=integration ./test/integration/... -run IPv6Kind -v
+package integration
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/clientcmd"
+)
+
+const (
+	// dualStackConfigPath is the kind config that enables dual-stack networking.
+	dualStackConfigPath = "testdata/kind-dualstack.yaml"
+
+	// clusterCreateTimeout bounds the (potentially slow) cluster bring-up.
+	clusterCreateTimeout = 6 * time.Minute
+
+	// nodeReadyTimeout bounds waiting for nodes to report Ready.
+	nodeReadyTimeout = 3 * time.Minute
+
+	// clusterDeleteTimeout bounds teardown.
+	clusterDeleteTimeout = 2 * time.Minute
+)
+
+// TestIPv6KindDualStackCluster brings up a dual-stack kind cluster and asserts
+// the cluster is genuinely dual-stack (both an IPv4 and an IPv6 pod CIDR on the
+// node, plus IPv6 service IPs in kube-system). The dual-stack assertions are
+// designed to FAIL on a single-stack cluster.
+func TestIPv6KindDualStackCluster(t *testing.T) {
+	// ---- Skip guards: never fail or hang on an unusable environment. ----
+
+	// Guard 1: -short skips heavy infra tests.
+	if testing.Short() {
+		t.Skip("skipping dual-stack kind integration test in -short mode")
+	}
+
+	// Guard 2: kind binary must be installed.
+	if _, err := exec.LookPath("kind"); err != nil {
+		t.Skip("skipping: kind binary not found in PATH")
+	}
+
+	// Guard 3: Docker must be available AND running. `docker info` is a cheap
+	// check that fails fast (non-zero exit) when the daemon is unreachable,
+	// so we skip instead of letting `kind create cluster` hang/fail later.
+	if _, err := exec.LookPath("docker"); err != nil {
+		t.Skip("skipping: docker binary not found in PATH")
+	}
+	{
+		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cancel()
+		if out, err := exec.CommandContext(ctx, "docker", "info").CombinedOutput(); err != nil {
+			t.Skipf("skipping: docker daemon not available (docker info failed: %v): %s",
+				err, strings.TrimSpace(string(out)))
+		}
+	}
+
+	// Config file must exist relative to this package directory.
+	if _, err := os.Stat(dualStackConfigPath); err != nil {
+		t.Skipf("skipping: dual-stack kind config not found at %s: %v", dualStackConfigPath, err)
+	}
+
+	// ---- Create the cluster. ----
+
+	// Unique cluster name so parallel/CI runs don't collide.
+	clusterName := fmt.Sprintf("nd-dualstack-%d", time.Now().UnixNano())
+
+	// Always register cleanup BEFORE create returns, so a panic or failure
+	// mid-create still tears the cluster down. kind delete is a no-op if the
+	// cluster doesn't exist.
+	t.Cleanup(func() {
+		ctx, cancel := context.WithTimeout(context.Background(), clusterDeleteTimeout)
+		defer cancel()
+		deleteKindCluster(ctx, t, clusterName)
+	})
+
+	createCtx, cancelCreate := context.WithTimeout(context.Background(), clusterCreateTimeout)
+	defer cancelCreate()
+
+	t.Logf("creating dual-stack kind cluster %q from %s", clusterName, dualStackConfigPath)
+	createArgs := []string{
+		"create", "cluster",
+		"--name", clusterName,
+		"--config", dualStackConfigPath,
+		"--wait", "120s",
+	}
+	cmd := exec.CommandContext(createCtx, "kind", createArgs...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		// A real failure to create a cluster in an environment that claimed to
+		// have Docker is a genuine test failure.
+		t.Fatalf("kind create cluster failed: %v", err)
+	}
+
+	// ---- Build a kube client from the cluster's kubeconfig. ----
+
+	kubeconfigPath := writeKindKubeconfig(t, clusterName)
+	restCfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
+	if err != nil {
+		t.Fatalf("failed to build rest config from kubeconfig %s: %v", kubeconfigPath, err)
+	}
+	clientset, err := kubernetes.NewForConfig(restCfg)
+	if err != nil {
+		t.Fatalf("failed to create kubernetes clientset: %v", err)
+	}
+
+	// ---- Wait for node(s) Ready. ----
+
+	waitForNodesReady(t, clientset, nodeReadyTimeout)
+
+	// ---- Dual-stack assertions. ----
+
+	assertNodesDualStack(t, clientset)
+	assertDualStackServiceGetsIPv6(t, clientset)
+}
+
+// deleteKindCluster tears down the named kind cluster. Failures are logged, not
+// fatal, since this runs in cleanup.
+func deleteKindCluster(ctx context.Context, t *testing.T, name string) {
+	t.Helper()
+	t.Logf("deleting kind cluster %q", name)
+	cmd := exec.CommandContext(ctx, "kind", "delete", "cluster", "--name", name)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	if err := cmd.Run(); err != nil {
+		t.Logf("warning: failed to delete kind cluster %q: %v", name, err)
+	}
+}
+
+// writeKindKubeconfig exports the cluster's kubeconfig to a temp file and
+// returns its path. Using an isolated kubeconfig avoids mutating the user's
+// ~/.kube/config.
+func writeKindKubeconfig(t *testing.T, clusterName string) string {
+	t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	out, err := exec.CommandContext(ctx, "kind", "get", "kubeconfig", "--name", clusterName).Output()
+	if err != nil {
+		t.Fatalf("kind get kubeconfig failed for %q: %v", clusterName, err)
+	}
+
+	path := filepath.Join(t.TempDir(), "kubeconfig")
+	if err := os.WriteFile(path, out, 0o600); err != nil {
+		t.Fatalf("failed to write kubeconfig to %s: %v", path, err)
+	}
+	return path
+}
+
+// waitForNodesReady polls until every node reports Ready or the timeout elapses.
+func waitForNodesReady(t *testing.T, clientset kubernetes.Interface, timeout time.Duration) {
+	t.Helper()
+	deadline := time.Now().Add(timeout)
+	for time.Now().Before(deadline) {
+		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+		nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+		cancel()
+		if err == nil && len(nodes.Items) > 0 && allNodesReady(nodes.Items) {
+			t.Logf("all %d node(s) Ready", len(nodes.Items))
+			return
+		}
+		time.Sleep(5 * time.Second)
+	}
+	t.Fatalf("nodes did not become Ready within %s", timeout)
+}
+
+func allNodesReady(nodes []corev1.Node) bool {
+	for _, n := range nodes {
+		ready := false
+		for _, c := range n.Status.Conditions {
+			if c.Type == corev1.NodeReady && c.Status == corev1.ConditionTrue {
+				ready = true
+				break
+			}
+		}
+		if !ready {
+			return false
+		}
+	}
+	return true
+}
+
+// assertNodesDualStack is the PRIMARY dual-stack assertion. On a single-stack
+// cluster a node has exactly one pod CIDR (and only IPv4 internal addresses);
+// these checks would fail there.
+func assertNodesDualStack(t *testing.T, clientset kubernetes.Interface) {
+	t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("failed to list nodes: %v", err)
+	}
+	if len(nodes.Items) == 0 {
+		t.Fatal("no nodes found in cluster")
+	}
+
+	for _, node := range nodes.Items {
+		// Collect pod CIDRs (PodCIDRs is the dual-stack-aware field; PodCIDR is
+		// the legacy single value).
+		cidrs := node.Spec.PodCIDRs
+		if len(cidrs) == 0 && node.Spec.PodCIDR != "" {
+			cidrs = []string{node.Spec.PodCIDR}
+		}
+		t.Logf("node %q PodCIDRs=%v", node.Name, cidrs)
+
+		hasV4CIDR, hasV6CIDR := false, false
+		for _, c := range cidrs {
+			if isIPv6CIDR(c) {
+				hasV6CIDR = true
+			} else {
+				hasV4CIDR = true
+			}
+		}
+		if !hasV4CIDR || !hasV6CIDR {
+			t.Errorf("node %q is not dual-stack: PodCIDRs=%v (want both IPv4 and IPv6)", node.Name, cidrs)
+		}
+
+		// Node addresses should also include both families.
+		hasV4Addr, hasV6Addr := false, false
+		for _, addr := range node.Status.Addresses {
+			if addr.Type != corev1.NodeInternalIP {
+				continue
+			}
+			if isIPv6(addr.Address) {
+				hasV6Addr = true
+			} else {
+				hasV4Addr = true
+			}
+		}
+		if !hasV4Addr || !hasV6Addr {
+			t.Errorf("node %q internal addresses are not dual-stack: %v (want both IPv4 and IPv6)",
+				node.Name, node.Status.Addresses)
+		}
+	}
+}
+
+// assertDualStackServiceGetsIPv6 creates a Service with
+// ipFamilyPolicy=RequireDualStack and asserts the apiserver allocates BOTH an
+// IPv4 and an IPv6 ClusterIP. On a single-stack cluster the apiserver rejects
+// RequireDualStack outright, so this assertion is impossible to satisfy without
+// a real dual-stack service CIDR range — making it a definitive dual-stack
+// proof that complements the node-level checks.
+func assertDualStackServiceGetsIPv6(t *testing.T, clientset kubernetes.Interface) {
+	t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+	defer cancel()
+
+	policy := corev1.IPFamilyPolicyRequireDualStack
+	svc := &corev1.Service{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "nd-dualstack-probe",
+			Namespace: "default",
+		},
+		Spec: corev1.ServiceSpec{
+			Type:           corev1.ServiceTypeClusterIP,
+			IPFamilyPolicy: &policy,
+			Ports: []corev1.ServicePort{
+				{Name: "http", Port: 80},
+			},
+			Selector: map[string]string{"app": "nd-dualstack-probe"},
+		},
+	}
+
+	created, err := clientset.CoreV1().Services("default").Create(ctx, svc, metav1.CreateOptions{})
+	if err != nil {
+		// The apiserver rejects RequireDualStack on a single-stack cluster.
+		t.Fatalf("failed to create RequireDualStack service (cluster not dual-stack?): %v", err)
+	}
+	t.Cleanup(func() {
+		delCtx, delCancel := context.WithTimeout(context.Background(), 15*time.Second)
+		defer delCancel()
+		_ = clientset.CoreV1().Services("default").Delete(delCtx, created.Name, metav1.DeleteOptions{})
+	})
+
+	ips := created.Spec.ClusterIPs
+	if len(ips) == 0 && created.Spec.ClusterIP != "" {
+		ips = []string{created.Spec.ClusterIP}
+	}
+	t.Logf("RequireDualStack service got ClusterIPs=%v IPFamilies=%v", ips, created.Spec.IPFamilies)
+
+	hasV4, hasV6 := false, false
+	for _, ip := range ips {
+		if isIPv6(ip) {
+			hasV6 = true
+		} else {
+			hasV4 = true
+		}
+	}
+	if !hasV4 || !hasV6 {
+		t.Errorf("RequireDualStack service did not get both IP families: ClusterIPs=%v (want IPv4 and IPv6)", ips)
+	}
+}
+
+// isIPv6CIDR reports whether a CIDR string is an IPv6 CIDR (heuristic: contains
+// a colon). Pod/Service CIDRs are well-formed, so this is sufficient.
+func isIPv6CIDR(cidr string) bool {
+	return strings.Contains(cidr, ":")
+}
+
+// isIPv6 reports whether an IP string is IPv6 (heuristic: contains a colon).
+func isIPv6(ip string) bool {
+	return strings.Contains(ip, ":")
+}
diff --git a/test/integration/testdata/kind-dualstack.yaml b/test/integration/testdata/kind-dualstack.yaml
new file mode 100644
index 0000000..6268522
--- /dev/null
+++ b/test/integration/testdata/kind-dualstack.yaml
@@ -0,0 +1,38 @@
+# KIND cluster configuration for Node Doctor dual-stack integration tests
+#
+# This configuration creates a single control-plane node cluster with
+# DUAL-STACK (IPv4 + IPv6) networking enabled. It is consumed by
+# test/integration/ipv6_kind_test.go (build tag: integration) to validate
+# that Node Doctor's IPv6/dual-stack code paths run against a real cluster
+# whose nodes advertise both IPv4 and IPv6 pod CIDRs and addresses.
+#
+# Based on test/e2e/cluster/kind-config.yaml to stay consistent with the
+# repo's existing kind usage (privileged containers, allow-privileged).
+
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+
+# Single node cluster (sufficient for dual-stack assertions)
+nodes:
+  - role: control-plane
+    # Enable privileged containers (required for Node Doctor)
+    kubeadmConfigPatches:
+      - |
+        kind: ClusterConfiguration
+        apiServer:
+          extraArgs:
+            "allow-privileged": "true"
+
+# Networking configuration: DUAL-STACK (IPv4 + IPv6)
+networking:
+  # Enable dual-stack networking. This is the key field that flips the
+  # cluster from single-stack to dual-stack; the integration test asserts
+  # on it via the kube API.
+  ipFamily: dual
+
+  # Explicit dual pod/service CIDRs (comma-separated IPv4,IPv6).
+  podSubnet: "10.244.0.0/16,fd00:10:244::/56"
+  serviceSubnet: "10.96.0.0/16,fd00:10:96::/112"
+
+  # Keep the default CNI (kindnet) which supports dual-stack.
+  disableDefaultCNI: false

From cfff0aaae36de97509373006cc3fc593f33e1757 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:35:27 -0500
Subject: [PATCH 34/38] ci: dedicated IPv6/dual-stack integration workflow
 (Task #17227)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add .github/workflows/ci-ipv6.yml with two jobs: ipv6-icmp (compile
network tests with -tags=integration, run the v6 ICMP tests under sudo +
NODE_DOCTOR_ICMP_INTEGRATION for CAP_NET_RAW) and ipv6-kind-dualstack
(enable Docker daemon IPv6 with ip6tables+experimental — required for
kind dual-stack — install pinned kind, run TestIPv6KindDualStackCluster
with 20m timeout + always-cleanup). Path-filtered to IPv6-relevant dirs,
workflow_dispatch, concurrency cancel. Action/Go versions match ci.yml.
---
 .github/workflows/ci-ipv6.yml | 205 ++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 .github/workflows/ci-ipv6.yml

diff --git a/.github/workflows/ci-ipv6.yml b/.github/workflows/ci-ipv6.yml
new file mode 100644
index 0000000..0ee2a4f
--- /dev/null
+++ b/.github/workflows/ci-ipv6.yml
@@ -0,0 +1,205 @@
+name: CI - IPv6 / Dual-Stack
+
+# Runs IPv6 and dual-stack integration tests that require Docker with IPv6
+# enabled (for the kind cluster) and CAP_NET_RAW (for raw ICMPv6). Kept as a
+# separate workflow so these heavy/privileged jobs do not block the main CI
+# aggregate (ci-success) and so a kind cluster flake does not mask ICMP
+# results (and vice versa).
+
+on:
+  push:
+    branches:
+      - main
+    tags:
+      - 'v*'
+    paths:
+      - 'pkg/monitors/network/**'
+      - 'pkg/exporters/**'
+      - 'test/integration/**'
+      - '.github/workflows/ci-ipv6.yml'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'pkg/monitors/network/**'
+      - 'pkg/exporters/**'
+      - 'test/integration/**'
+      - '.github/workflows/ci-ipv6.yml'
+  workflow_dispatch:
+
+env:
+  GO_VERSION: '1.25'
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
+
+# Cancel in-flight runs for the same PR / branch so stale runs don't waste
+# runner minutes when new commits are pushed.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # ---------------------------------------------------------------------------
+  # Job: ipv6-icmp
+  #
+  # Compiles the network package test binary as the runner user (preserving the
+  # Go module cache) and then re-executes ONLY the IPv6 integration tests under
+  # sudo to obtain CAP_NET_RAW for raw ICMPv6 sockets.
+  #
+  # NODE_DOCTOR_ICMP_INTEGRATION=1 causes socket / permission errors to be hard
+  # failures rather than skips, so a misconfigured runner surfaces loudly.
+  #
+  # Kept separate from ipv6-kind-dualstack so a kind cluster flake does not
+  # hide ICMP failures and vice versa.
+  # ---------------------------------------------------------------------------
+  ipv6-icmp:
+    name: IPv6 ICMP Integration
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Setup Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          cache: true
+
+      - name: Download dependencies
+        run: go mod download
+
+      # Enable the IPv6 loopback so ::1 is reachable inside the runner netns.
+      - name: Enable IPv6 loopback sysctl
+        run: |
+          sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0
+          sudo sysctl -w net.ipv6.conf.default.disable_ipv6=0
+          sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=0
+
+      # Compile as the runner user so the binary inherits the Go env / module
+      # cache; the -tags flag includes the `integration` gate.
+      - name: Compile network test binary (with integration tag)
+        run: go test -c -tags=integration -o /tmp/nd-net.test ./pkg/monitors/network/
+
+      # Execute only the IPv6 integration tests under sudo for CAP_NET_RAW.
+      - name: Run IPv6 ICMP integration tests (privileged)
+        run: |
+          sudo NODE_DOCTOR_ICMP_INTEGRATION=1 /tmp/nd-net.test \
+            -test.run 'TestDefaultPinger_IPv6Loopback_Integration|TestDefaultPinger_IPv6LinkLocal_Integration' \
+            -test.v \
+            -test.timeout 2m
+
+  # ---------------------------------------------------------------------------
+  # Job: ipv6-kind-dualstack
+  #
+  # Enables Docker IPv6 on the GitHub runner, installs the kind binary and
+  # kubectl, then runs TestIPv6KindDualStackCluster which creates its own
+  # dual-stack kind cluster from test/integration/testdata/kind-dualstack.yaml.
+  #
+  # The Go test owns full cluster lifecycle (create + t.Cleanup delete).  A
+  # safety-net step with `if: always()` calls `kind delete clusters --all`
+  # after the test so lingering clusters do not consume runner resources on
+  # test failure / panic.
+  # ---------------------------------------------------------------------------
+  ipv6-kind-dualstack:
+    name: IPv6 Kind Dual-Stack Cluster
+    runs-on: ubuntu-latest
+    timeout-minutes: 40
+    permissions:
+      contents: read
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Setup Go
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ env.GO_VERSION }}
+          cache: true
+
+      - name: Download dependencies
+        run: go mod download
+
+      # -----------------------------------------------------------------------
+      # Enable Docker daemon IPv6 BEFORE kind creates the dual-stack cluster.
+      # Without this, kind's kindnet CNI cannot plumb IPv6 pod addresses and
+      # `kind create cluster --config kind-dualstack.yaml` fails or produces a
+      # single-stack cluster.
+      #
+      # We write daemon.json, restart Docker, and wait until the socket comes
+      # back before proceeding.  The `ip6tables` + `experimental` flags are
+      # required for kernel IPv6 NAT support inside the kind node containers.
+      # -----------------------------------------------------------------------
+      - name: Enable Docker daemon IPv6
+        run: |
+          sudo mkdir -p /etc/docker
+          # Merge our IPv6 settings into daemon.json (overwrite is safe on a
+          # fresh GitHub runner where /etc/docker/daemon.json does not exist).
+          cat <<'EOF' | sudo tee /etc/docker/daemon.json
+          {
+            "ipv6": true,
+            "fixed-cidr-v6": "2001:db8:1::/64",
+            "experimental": true,
+            "ip6tables": true
+          }
+          EOF
+          sudo systemctl restart docker
+          # Poll until the Docker socket is back (up to 30 s).
+          for i in $(seq 1 30); do
+            if docker info >/dev/null 2>&1; then
+              echo "Docker daemon is back (attempt $i)"
+              break
+            fi
+            echo "Waiting for Docker daemon... ($i/30)"
+            sleep 1
+          done
+          docker info
+
+      # Enable IPv6 on the host so the runner's kernel and kind node containers
+      # can create IPv6 interfaces.
+      - name: Enable IPv6 sysctls
+        run: |
+          sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0
+          sudo sysctl -w net.ipv6.conf.default.disable_ipv6=0
+          sudo sysctl -w net.ipv6.conf.all.forwarding=1
+
+      # Install the kind binary directly so the Go test can call `kind` from
+      # PATH without kind-action creating its own cluster first (which would
+      # collide with the test's dynamically-named dual-stack cluster).
+      - name: Install kind binary
+        run: |
+          KIND_VERSION="v0.27.0"
+          curl -fsSL "https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-amd64" \
+            -o /tmp/kind
+          chmod +x /tmp/kind
+          sudo mv /tmp/kind /usr/local/bin/kind
+          kind version
+
+      # kubectl is available on ubuntu-latest runners but pin a version check
+      # so the step fails fast if the binary is unexpectedly missing.
+      - name: Verify kubectl is available
+        run: kubectl version --client
+
+      # Run ONLY the dual-stack kind integration test. The test constructs its
+      # own unique cluster name to avoid collisions with parallel runs.
+      # -timeout covers: cluster create (≤6 min) + node-ready wait (≤3 min)
+      # + assertions + cleanup (≤2 min) = 11 min with headroom to 20 min.
+      - name: Run dual-stack kind integration test
+        run: |
+          go test -tags=integration -v -timeout 20m \
+            ./test/integration/... \
+            -run TestIPv6KindDualStackCluster
+
+      # Safety-net cleanup: delete any kind clusters left over by a test
+      # failure or panic.  Runs unconditionally (if: always()) so a flaky
+      # cluster create does not leave a zombie cluster consuming runner resources.
+      - name: Cleanup kind clusters (safety net)
+        if: always()
+        run: |
+          echo "Existing kind clusters:"
+          kind get clusters || true
+          kind delete clusters --all || true
+          echo "Cleanup complete"

From 94c06d82f94508698b4b5ba84e91af75cbc58409 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:37:45 -0500
Subject: [PATCH 35/38] chore(config): add IPv6/dual-stack monitors to default
 config (Task #17228)

Add explicit ipv6-sysctl-check, ipv6-route-check, ipv6-neighbor-check,
and ipv6-firewall-check entries (matching each monitor's DefaultConfig)
to config/node-doctor.yaml, with a note that they are detection-only and
degrade gracefully on IPv4-only nodes (set expectIPv6Enabled/enabled
false to silence). Validated via --validate-config.
---
 config/node-doctor.yaml | 50 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/config/node-doctor.yaml b/config/node-doctor.yaml
index ba82427..0a9e764 100644
--- a/config/node-doctor.yaml
+++ b/config/node-doctor.yaml
@@ -208,6 +208,56 @@ monitors:
       warningLatency: 50
       criticalLatency: 100
 
+  # ---------------------------------------------------------------------------
+  # IPv6 / dual-stack monitors (detection-only — never modify host settings).
+  # These degrade gracefully on IPv4-only nodes: a missing IPv6 stack is
+  # reported as a warning, not an error. On IPv4-only clusters set
+  # expectIPv6Enabled: false (or enabled: false) to silence them.
+  # ---------------------------------------------------------------------------
+
+  # IPv6 disable_ipv6 sysctl monitor — flags IPv6 disabled when it is expected on.
+  - name: ipv6-sysctl-check
+    type: network-ipv6-sysctl
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true
+      checkPerInterface: false
+      procPath: /proc
+
+  # IPv6 default-route monitor — flags a missing IPv6 default route when expected.
+  - name: ipv6-route-check
+    type: network-ipv6-route
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectDefaultRoute: true
+      procPath: /proc
+
+  # IPv6 RA/SLAAC + address-presence monitor (link-local/global address, accept_ra).
+  - name: ipv6-neighbor-check
+    type: network-ipv6-neighbor
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true
+      checkPerInterface: true
+      requireGlobalAddress: false
+      procPath: /proc
+
+  # IPv6 firewall sanity monitor (detection-only ip6tables/nft listing).
+  - name: ipv6-firewall-check
+    type: network-ipv6-firewall
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true
+      backend: auto
+
 # Exporters - export monitoring data to various systems
 exporters:
   # Kubernetes Exporter - Updates node conditions and creates events

From 34135d474243b2bd5b69deb1daadc91edcb328cf Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:42:16 -0500
Subject: [PATCH 36/38] feat(helm): IPv6 monitors + dual-stack bind address in
 chart (Task #17229)

Add ipv6Sysctl/ipv6Route/ipv6Neighbor/ipv6Firewall monitor blocks to
values.yaml(.template) and render them in configmap.yaml (each gated by
its .enabled), using procPath /host/proc (host fs bind-mount). Switch
exporters.http.bindAddress default 0.0.0.0 -> :: (dual-stack; server
falls back to 0.0.0.0 when IPv6 is disabled). helm template renders;
rendered config contains the four network-ipv6-* types and bindAddress "::".
---
 helm/node-doctor/templates/configmap.yaml | 51 +++++++++++++++++++++++
 helm/node-doctor/values.yaml              | 46 +++++++++++++++++++-
 helm/node-doctor/values.yaml.template     | 46 +++++++++++++++++++-
 3 files changed, 141 insertions(+), 2 deletions(-)

diff --git a/helm/node-doctor/templates/configmap.yaml b/helm/node-doctor/templates/configmap.yaml
index dc44446..63e2ca1 100644
--- a/helm/node-doctor/templates/configmap.yaml
+++ b/helm/node-doctor/templates/configmap.yaml
@@ -93,6 +93,57 @@ data:
           procPath: /host/proc
       {{- end }}
 
+      {{- if .Values.monitors.ipv6Sysctl.enabled }}
+      # IPv6 sysctl Monitor — detection-only, degrades gracefully on IPv4-only nodes
+      - name: ipv6-sysctl-check
+        type: network-ipv6-sysctl
+        enabled: true
+        interval: {{ .Values.monitors.ipv6Sysctl.interval }}
+        timeout: {{ .Values.monitors.ipv6Sysctl.timeout }}
+        config:
+          expectIPv6Enabled: {{ .Values.monitors.ipv6Sysctl.expectIPv6Enabled }}
+          checkPerInterface: {{ .Values.monitors.ipv6Sysctl.checkPerInterface }}
+          procPath: {{ .Values.monitors.ipv6Sysctl.procPath }}
+      {{- end }}
+
+      {{- if .Values.monitors.ipv6Route.enabled }}
+      # IPv6 Route Monitor — detection-only, degrades gracefully on IPv4-only nodes
+      - name: ipv6-route-check
+        type: network-ipv6-route
+        enabled: true
+        interval: {{ .Values.monitors.ipv6Route.interval }}
+        timeout: {{ .Values.monitors.ipv6Route.timeout }}
+        config:
+          expectDefaultRoute: {{ .Values.monitors.ipv6Route.expectDefaultRoute }}
+          procPath: {{ .Values.monitors.ipv6Route.procPath }}
+      {{- end }}
+
+      {{- if .Values.monitors.ipv6Neighbor.enabled }}
+      # IPv6 Neighbor Monitor — detection-only, degrades gracefully on IPv4-only nodes
+      - name: ipv6-neighbor-check
+        type: network-ipv6-neighbor
+        enabled: true
+        interval: {{ .Values.monitors.ipv6Neighbor.interval }}
+        timeout: {{ .Values.monitors.ipv6Neighbor.timeout }}
+        config:
+          expectIPv6Enabled: {{ .Values.monitors.ipv6Neighbor.expectIPv6Enabled }}
+          checkPerInterface: {{ .Values.monitors.ipv6Neighbor.checkPerInterface }}
+          requireGlobalAddress: {{ .Values.monitors.ipv6Neighbor.requireGlobalAddress }}
+          procPath: {{ .Values.monitors.ipv6Neighbor.procPath }}
+      {{- end }}
+
+      {{- if .Values.monitors.ipv6Firewall.enabled }}
+      # IPv6 Firewall Monitor — detection-only, degrades gracefully on IPv4-only nodes
+      - name: ipv6-firewall-check
+        type: network-ipv6-firewall
+        enabled: true
+        interval: {{ .Values.monitors.ipv6Firewall.interval }}
+        timeout: {{ .Values.monitors.ipv6Firewall.timeout }}
+        config:
+          expectIPv6Enabled: {{ .Values.monitors.ipv6Firewall.expectIPv6Enabled }}
+          backend: {{ .Values.monitors.ipv6Firewall.backend }}
+      {{- end }}
+
       {{- if .Values.overlayTest.enabled }}
       # CNI Connectivity Monitor - Tests overlay network connectivity
       # Uses overlay-test pods for accurate CNI testing
diff --git a/helm/node-doctor/values.yaml b/helm/node-doctor/values.yaml
index 8942d7b..492376f 100644
--- a/helm/node-doctor/values.yaml
+++ b/helm/node-doctor/values.yaml
@@ -362,6 +362,48 @@ monitors:
     checkIPv6: true
     checkPerInterface: false
 
+  # IPv6 / dual-stack monitors (detection-only — never modify host settings).
+  # These degrade gracefully on IPv4-only nodes: a missing IPv6 stack is
+  # reported as a warning, not an error. On IPv4-only clusters set
+  # expectIPv6Enabled: false (or enabled: false) to silence them entirely.
+
+  # Checks disable_ipv6 sysctl — flags IPv6 disabled when it is expected on.
+  ipv6Sysctl:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectIPv6Enabled: true
+    checkPerInterface: false
+    # procPath is always /host/proc inside the container (host fs is bind-mounted)
+    procPath: /host/proc
+
+  # Checks for a valid IPv6 default route — flags missing route when expected.
+  ipv6Route:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectDefaultRoute: true
+    procPath: /host/proc
+
+  # Checks RA/SLAAC address presence (link-local/global) and accept_ra sysctl.
+  ipv6Neighbor:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectIPv6Enabled: true
+    checkPerInterface: true
+    requireGlobalAddress: false
+    procPath: /host/proc
+
+  # Detection-only ip6tables/nftables listing — flags unexpected firewall state.
+  ipv6Firewall:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectIPv6Enabled: true
+    # backend: auto selects ip6tables or nftables based on what is present on the node
+    backend: auto
+
 # Exporters configuration
 exporters:
   kubernetes:
@@ -382,7 +424,9 @@ exporters:
 
   http:
     enabled: false
-    bindAddress: "0.0.0.0"
+    # "::" = dual-stack (listens on IPv4 and IPv6); the server falls back to
+    # "0.0.0.0" automatically if the host kernel has IPv6 disabled.
+    bindAddress: "::"
     port: 8080
     tlsEnabled: false
 
diff --git a/helm/node-doctor/values.yaml.template b/helm/node-doctor/values.yaml.template
index 222feae..43b31db 100644
--- a/helm/node-doctor/values.yaml.template
+++ b/helm/node-doctor/values.yaml.template
@@ -354,6 +354,48 @@ monitors:
     checkIPv6: true
     checkPerInterface: false
 
+  # IPv6 / dual-stack monitors (detection-only — never modify host settings).
+  # These degrade gracefully on IPv4-only nodes: a missing IPv6 stack is
+  # reported as a warning, not an error. On IPv4-only clusters set
+  # expectIPv6Enabled: false (or enabled: false) to silence them entirely.
+
+  # Checks disable_ipv6 sysctl — flags IPv6 disabled when it is expected on.
+  ipv6Sysctl:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectIPv6Enabled: true
+    checkPerInterface: false
+    # procPath is always /host/proc inside the container (host fs is bind-mounted)
+    procPath: /host/proc
+
+  # Checks for a valid IPv6 default route — flags missing route when expected.
+  ipv6Route:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectDefaultRoute: true
+    procPath: /host/proc
+
+  # Checks RA/SLAAC address presence (link-local/global) and accept_ra sysctl.
+  ipv6Neighbor:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectIPv6Enabled: true
+    checkPerInterface: true
+    requireGlobalAddress: false
+    procPath: /host/proc
+
+  # Detection-only ip6tables/nftables listing — flags unexpected firewall state.
+  ipv6Firewall:
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    expectIPv6Enabled: true
+    # backend: auto selects ip6tables or nftables based on what is present on the node
+    backend: auto
+
 # Exporters configuration
 exporters:
   kubernetes:
@@ -374,7 +416,9 @@ exporters:
 
   http:
     enabled: false
-    bindAddress: "0.0.0.0"
+    # "::" = dual-stack (listens on IPv4 and IPv6); the server falls back to
+    # "0.0.0.0" automatically if the host kernel has IPv6 disabled.
+    bindAddress: "::"
     port: 8080
     tlsEnabled: false
 

From 329a3efb91b32af3c3e8dc372b5eb6f47b050d43 Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:44:10 -0500
Subject: [PATCH 37/38] docs(helm): IPv6 / dual-stack section in chart README
 (Task #17230)

Document the four detection-only IPv6 monitors (ipv6Sysctl/Route/Neighbor/
Firewall), the dual-stack :: bindAddress default with IPv4 fallback, how
they degrade gracefully on IPv4-only nodes (and how to disable them), and
the address_family metric label + IPv6 PrometheusRule alerts.
---
 helm/node-doctor/README.md | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/helm/node-doctor/README.md b/helm/node-doctor/README.md
index d4461bb..858a0c5 100644
--- a/helm/node-doctor/README.md
+++ b/helm/node-doctor/README.md
@@ -91,6 +91,44 @@ The following table lists the configurable parameters of the Node Doctor chart a
 | `serviceMonitor.interval` | Scrape interval | `30s` |
 | `serviceMonitor.scrapeTimeout` | Scrape timeout | `10s` |
 
+### IPv6 / Dual-Stack
+
+Node Doctor ships four **detection-only** IPv6 monitors (they never modify host
+settings) and binds its HTTP/metrics endpoints dual-stack by default.
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `monitors.ipv6Sysctl.enabled` | Detect `disable_ipv6` sysctl set when IPv6 is expected on | `true` |
+| `monitors.ipv6Route.enabled` | Detect a missing IPv6 default route when expected | `true` |
+| `monitors.ipv6Neighbor.enabled` | Detect missing RA/SLAAC address (link-local/global) and `accept_ra` disabled | `true` |
+| `monitors.ipv6Firewall.enabled` | Sanity-check ip6tables/nftables for an IPv6 black-hole (detection only) | `true` |
+| `monitors.ipv6Sysctl.expectIPv6Enabled` | Treat IPv6-disabled as a problem (shared key across the IPv6 monitors) | `true` |
+| `monitors.ipv6Firewall.backend` | Firewall backend to read: `auto`, `ip6tables`, or `nft` | `auto` |
+| `exporters.http.bindAddress` | Listen address; `::` = dual-stack (IPv4+IPv6), falls back to `0.0.0.0` if the kernel has IPv6 disabled | `"::"` |
+
+These monitors **degrade gracefully on IPv4-only nodes**: a missing IPv6 stack is
+reported as a warning, not an error, and the conditions stay healthy when IPv6
+cannot be confirmed. On purely IPv4 clusters you can silence them by setting
+`expectIPv6Enabled: false` (or `enabled: false`) on each:
+
+```yaml
+monitors:
+  ipv6Sysctl:
+    enabled: false
+  ipv6Route:
+    enabled: false
+  ipv6Neighbor:
+    enabled: false
+  ipv6Firewall:
+    enabled: false
+```
+
+Network metrics (`gateway_latency_seconds`, `peer_latency_seconds`,
+`peer_reachable`, `dns_latency_seconds`) carry an `address_family` label
+(`ipv4`/`ipv6`/`unknown`) so dashboards and alerts can distinguish the families;
+the bundled PrometheusRule alerts (`prometheusRule.enabled`) include a
+`NodeDoctorIPv6Misconfigured` alert and per-family peer alerts.
+
 ## Security Considerations
 
 Node Doctor requires privileged access to monitor system health effectively. This includes:

From 4f0362a84f3fb4a8866ee6724945d5ac3d0cecbe Mon Sep 17 00:00:00 2001
From: Matthew Mattox <mmattox@support.tools>
Date: Thu, 25 Jun 2026 05:49:06 -0500
Subject: [PATCH 38/38] docs: IPv6/dual-stack updates to configuration,
 monitors, remediation (Task #17231)

configuration.md: IPv6/dual-stack subsection (4 detection monitors with
config keys, gateway addressFamily, DNS recordType: AAAA, :: bind +
fallback, IPv4-only degradation/disable). monitors.md: section + TOC for
the four network-ipv6-* monitors (checks, keys, conditions) + address_
family metric label. remediation.md: flush-ipv6-route operation (ip -6
route flush cache) and family-agnostic flush-dns note. Verified all type
strings/conditions/keys against source; did not claim a registered
ipv6-route-flush remediator TYPE (that registration is blocked, #19263).
---
 docs/configuration.md |  92 +++++++++++++++++++++++++++++++
 docs/monitors.md      | 122 ++++++++++++++++++++++++++++++++++++++++++
 docs/remediation.md   |  31 ++++++++---
 3 files changed, 239 insertions(+), 6 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 46d9953..2cfc160 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -241,6 +241,98 @@ See [docs/monitors.md](./monitors.md) for detailed configuration of all 11 monit
 2. **Network Monitors:** dns-check, gateway-check, connectivity-check
 3. **Kubernetes Monitors:** kubelet-check, apiserver-check, runtime-check, capacity-check
 4. **Custom Monitors:** plugin-check, log-pattern-check
+5. **IPv6 / Dual-Stack Monitors:** network-ipv6-sysctl, network-ipv6-route, network-ipv6-neighbor, network-ipv6-firewall (see [IPv6 / dual-stack](#ipv6--dual-stack))
+
+### IPv6 / Dual-Stack
+
+Node Doctor ships detection-only IPv6 monitors and dual-stack options for several existing monitors and the HTTP/health server. These are configured the same way as any other monitor/exporter — this section collects the IPv6-specific keys in one place. See [docs/monitors.md](./monitors.md#ipv6--dual-stack-monitors) for the conditions/events each monitor emits.
+
+**Graceful IPv4-only degradation:** the IPv6 monitors never modify host settings and degrade gracefully on IPv4-only nodes — a missing IPv6 stack is recorded as a non-actionable condition rather than a problem. On IPv4-only clusters, set `expectIPv6Enabled: false` (or `expectDefaultRoute: false` for the route monitor) to silence the warnings, or `enabled: false` to disable a monitor entirely.
+
+**IPv6 detection monitors** (mirrors the `monitors` block in `config/node-doctor.yaml`):
+
+```yaml
+monitors:
+  # IPv6 disable_ipv6 sysctl monitor — flags IPv6 disabled when expected on.
+  - name: ipv6-sysctl-check
+    type: network-ipv6-sysctl
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true      # Default: true
+      checkPerInterface: false     # Default: false
+      procPath: /proc              # Default: /proc
+
+  # IPv6 default-route monitor — flags a missing IPv6 default route when expected.
+  - name: ipv6-route-check
+    type: network-ipv6-route
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectDefaultRoute: true     # Default: true
+      procPath: /proc              # Default: /proc
+
+  # IPv6 RA/SLAAC + address-presence monitor (link-local/global address, accept_ra).
+  - name: ipv6-neighbor-check
+    type: network-ipv6-neighbor
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true      # Default: true
+      checkPerInterface: true      # Default: true
+      requireGlobalAddress: false  # Default: false
+      procPath: /proc              # Default: /proc
+
+  # IPv6 firewall sanity monitor (detection-only ip6tables/nft listing).
+  - name: ipv6-firewall-check
+    type: network-ipv6-firewall
+    enabled: true
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true      # Default: true
+      backend: auto                # Default: auto — one of "auto", "ip6tables", "nft"
+```
+
+**Gateway address family:** the gateway monitor (`network-gateway-check`) accepts an `addressFamily` key selecting which IP family's default route to probe:
+
+```yaml
+monitors:
+  - name: gateway-health
+    type: network-gateway-check
+    config:
+      addressFamily: ipv4          # "ipv4" (default), "ipv6", or "auto"
+```
+
+- `ipv4` (default) — probe the IPv4 default gateway only (preserves pre-dual-stack behavior).
+- `ipv6` — probe the IPv6 default route from `/proc/net/ipv6_route`.
+- `auto` — prefer the IPv4 default route and fall back to the IPv6 default route when no IPv4 default route exists.
+
+**DNS AAAA queries:** the DNS monitor (`network-dns-check`) accepts a per-query `recordType` of `AAAA` for IPv6 resolution checks (`A` is the default):
+
+```yaml
+monitors:
+  - name: dns-health
+    type: network-dns-check
+    config:
+      domains:
+        - domain: kubernetes.default.svc.cluster.local
+          recordType: AAAA         # "A" (default) or "AAAA"
+```
+
+Unsupported record types emit an `UnsupportedQueryType` event. Per-nameserver and consistency-check paths currently support `A` only; for `AAAA` queries they are skipped (an `AAAAFeatureUnsupported` event is emitted).
+
+**Dual-stack bind address:** the HTTP / health server binds dual-stack by default. The Helm chart exposes this as `exporters.http.bindAddress` (default `"::"`), which listens on both IPv4 and IPv6; the server falls back to IPv4 automatically if the host cannot bind `::`.
+
+```yaml
+# helm/node-doctor/values.yaml
+exporters:
+  http:
+    bindAddress: "::"              # Dual-stack (IPv4 + IPv6); falls back to IPv4
+```
 
 ### Monitor Validation
 
diff --git a/docs/monitors.md b/docs/monitors.md
index fe5eac0..3b7fa1b 100644
--- a/docs/monitors.md
+++ b/docs/monitors.md
@@ -15,6 +15,7 @@ This document provides comprehensive information about all monitor types availab
   - [Gateway Monitor](#gateway-monitor)
   - [Connectivity Monitor](#connectivity-monitor)
   - [CNI Monitor](#cni-monitor)
+  - [IPv6 / Dual-Stack Monitors](#ipv6--dual-stack-monitors)
 - [Kubernetes Monitors](#kubernetes-monitors)
   - [Kubelet Monitor](#kubelet-monitor)
   - [API Server Monitor](#api-server-monitor)
@@ -1298,6 +1299,127 @@ Since node-doctor runs with `hostNetwork: true`, the CNI monitor:
 
 ---
 
+## IPv6 / Dual-Stack Monitors
+
+Four detection-only monitors validate that an IPv6 / dual-stack node has a working IPv6 stack. They **never modify host settings** — they read kernel state (sysctls, `/proc/net/ipv6_route`, interface addresses, firewall listings) and emit conditions/events only.
+
+All four degrade gracefully on IPv4-only nodes: when `expectIPv6Enabled` (or `expectDefaultRoute` for the route monitor) is `false`, a missing IPv6 stack is recorded as a non-actionable `*NotExpected` condition rather than a problem. To silence a monitor entirely, set `enabled: false`. See [configuration.md](./configuration.md#ipv6--dual-stack) for the default config block and related dual-stack options (gateway `addressFamily`, DNS `recordType: AAAA`, dual-stack bind address).
+
+> **Metric label:** network metrics now carry an `address_family` label (`ipv4` / `ipv6` / `unknown`) so dual-stack probes can be distinguished in Prometheus. See `pkg/exporters/prometheus/metrics.go`.
+
+### IPv6 Sysctl Monitor
+
+Flags IPv6 being disabled via the `disable_ipv6` sysctl when it is expected to be enabled.
+
+**Monitor Type:** `network-ipv6-sysctl`
+
+**Source File:** `pkg/monitors/network/ipv6_sysctl.go`
+
+**Configuration:**
+
+```yaml
+monitors:
+  - name: ipv6-sysctl-check
+    type: network-ipv6-sysctl
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true       # Default: true
+      checkPerInterface: false      # Default: false — also glob per-interface disable_ipv6
+      procPath: /proc               # Default: /proc
+```
+
+**What It Checks:** Reads `net.ipv6.conf.all.disable_ipv6` (and per-interface `disable_ipv6` sysctls when `checkPerInterface: true`) under `procPath`.
+
+**Conditions:**
+- `IPv6SysctlMisconfigured`: True when IPv6 is disabled but `expectIPv6Enabled: true`; held at False (with an explanatory `*NotExpected`-style reason) when `expectIPv6Enabled: false`.
+
+### IPv6 Route Monitor
+
+Flags a missing IPv6 default route when one is expected.
+
+**Monitor Type:** `network-ipv6-route`
+
+**Source File:** `pkg/monitors/network/ipv6_route.go`
+
+**Configuration:**
+
+```yaml
+monitors:
+  - name: ipv6-route-check
+    type: network-ipv6-route
+    interval: 60s
+    timeout: 5s
+    config:
+      expectDefaultRoute: true      # Default: true
+      procPath: /proc               # Default: /proc
+```
+
+**What It Checks:** Parses `<procPath>/net/ipv6_route` for a `::/0` default route.
+
+**Conditions:**
+- `IPv6DefaultRouteMissing`: True when no IPv6 default route is present but `expectDefaultRoute: true`; held at False when `expectDefaultRoute: false`.
+
+### IPv6 Neighbor Monitor
+
+Checks IPv6 address presence (link-local / global / SLAAC) and Router Advertisement (RA / `accept_ra`) state per interface.
+
+**Monitor Type:** `network-ipv6-neighbor`
+
+**Source File:** `pkg/monitors/network/ipv6_neighbor.go`
+
+**Configuration:**
+
+```yaml
+monitors:
+  - name: ipv6-neighbor-check
+    type: network-ipv6-neighbor
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true       # Default: true
+      checkPerInterface: true       # Default: true — evaluate RA/autoconf per interface
+      requireGlobalAddress: false   # Default: false — when true, a missing global/SLAAC address is flagged
+      procPath: /proc               # Default: /proc
+```
+
+**What It Checks:** Per-interface IPv6 link-local and global/SLAAC address presence plus `accept_ra` state.
+
+**Conditions:**
+- `IPv6LinkLocalMissing`: True when an interface lacks an IPv6 link-local address and IPv6 is expected.
+- `IPv6GlobalAddressMissing`: True when an interface lacks a global/SLAAC IPv6 address and `requireGlobalAddress: true`.
+- `IPv6RouterAdvertisementDisabled`: True when `accept_ra` is disabled where RA-based autoconfiguration is expected.
+
+When `expectIPv6Enabled: false` (or `requireGlobalAddress: false` for the global-address case) these conditions are held at False with a non-actionable reason.
+
+### IPv6 Firewall Monitor
+
+Detection-only sanity check that the IPv6 firewall is not black-holing all IPv6 traffic.
+
+**Monitor Type:** `network-ipv6-firewall`
+
+**Source File:** `pkg/monitors/network/ipv6_firewall.go`
+
+**Configuration:**
+
+```yaml
+monitors:
+  - name: ipv6-firewall-check
+    type: network-ipv6-firewall
+    interval: 60s
+    timeout: 5s
+    config:
+      expectIPv6Enabled: true       # Default: true
+      backend: auto                 # Default: auto — one of "auto", "ip6tables", "nft"
+```
+
+**What It Checks:** Reads (does not modify) the IPv6 firewall ruleset. In `auto` mode it prefers `nft` when present and falls back to `ip6tables`.
+
+**Conditions:**
+- `IPv6FirewallBlackhole`: True when the IPv6 firewall appears to black-hole all IPv6 traffic and IPv6 is expected; held at False (`IPv6FirewallBlackholeNotExpected`) when `expectIPv6Enabled: false`. This monitor is detection-only and never edits firewall rules.
+
+---
+
 ## Kubernetes Monitors
 
 ### Kubelet Monitor
diff --git a/docs/remediation.md b/docs/remediation.md
index 9f17fe7..308d90f 100644
--- a/docs/remediation.md
+++ b/docs/remediation.md
@@ -540,15 +540,16 @@ remediator, err := NewCustomRemediator(config)
 
 **Purpose**: Fixes network connectivity issues through DNS cache flushing, interface restarts, and routing table operations.
 
-**Supported Operations** (`pkg/remediators/network.go:13-25`):
-- `flush-dns` - Flushes DNS resolver cache
+**Supported Operations** (`pkg/remediators/network.go`):
+- `flush-dns` - Flushes DNS resolver cache (address-family agnostic — clears IPv4 **and** IPv6/AAAA records; see below)
 - `restart-interface` - Restarts network interface (down/up)
 - `reset-routing` - Resets routing table to defaults
+- `flush-ipv6-route` - Flushes the IPv6 routing cache via `ip -6 route flush cache`
 
 **Configuration**:
 ```go
 type NetworkConfig struct {
-    Operation       NetworkOperation  // flush-dns, restart-interface, reset-routing
+    Operation       NetworkOperation  // flush-dns, restart-interface, reset-routing, flush-ipv6-route
     InterfaceName   string           // Required for restart-interface (e.g., "eth0")
     BackupRouting   bool             // Backup routing table before reset
     VerifyAfter     bool             // Verify operation succeeded
@@ -557,13 +558,21 @@ type NetworkConfig struct {
 }
 ```
 
-**DNS Cache Flush** (`pkg/remediators/network.go:212-230`):
+**DNS Cache Flush** (`pkg/remediators/network.go`):
 
 Tries multiple methods in order:
 1. `resolvectl flush-caches` (modern systemd)
 2. `systemd-resolve --flush-caches` (older systemd)
 
-**Interface Restart** (`pkg/remediators/network.go:233-263`):
+Both methods clear the resolver's **entire** cache, including AAAA (IPv6) records as well as A (IPv4). The flush is therefore address-family agnostic — there is no separate IPv6 DNS-flush operation, because `flush-dns` already covers both families.
+
+**IPv6 Route Cache Flush** (`pkg/remediators/network.go`):
+
+The `flush-ipv6-route` operation (`NetworkFlushIPv6Route`) flushes the IPv6 routing **cache** — it runs `ip -6 route flush cache` and does not alter routing-table entries. When `BackupRouting` is set it first captures `ip -6 route show`. Unlike the cache flush that is one step of `reset-routing` (where a flush failure is a non-fatal warning), this dedicated operation treats a failed flush as a failed remediation and returns the error.
+
+> **Note:** Only the network *operation* `flush-ipv6-route` exists. There is no separately registered top-level remediator type for IPv6 route flushing — it is invoked via `NetworkConfig.Operation` on the `NetworkRemediator`.
+
+**Interface Restart** (`pkg/remediators/network.go`):
 ```go
 // Safety: Verify interface exists first
 // 1. Bring interface down: ip link set <iface> down
@@ -588,7 +597,17 @@ Tries multiple methods in order:
 ```go
 config := NetworkConfig{
     Operation:   NetworkFlushDNS,
-    VerifyAfter: false,  // DNS flush is immediate
+    VerifyAfter: false,  // DNS flush is immediate; clears A and AAAA records
+}
+
+remediator, err := NewNetworkRemediator(config)
+```
+
+**Example - IPv6 Route Cache Flush**:
+```go
+config := NetworkConfig{
+    Operation:     NetworkFlushIPv6Route,  // "flush-ipv6-route"
+    BackupRouting: true,                   // Capture "ip -6 route show" first
 }
 
 remediator, err := NewNetworkRemediator(config)