Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
1ccc308
feat(network): IPv6 disable_ipv6 sysctl detection monitor (Task #17205)
mattmattox Jun 25, 2026
f2b98f4
feat(network): dual-stack gateway selection via addressFamily config …
mattmattox Jun 25, 2026
8f69d70
feat(network): standalone IPv6 default-route detection monitor (Task …
mattmattox Jun 25, 2026
dc665d3
feat(network): IPv6 RA/SLAAC + address-presence detection monitor (Ta…
mattmattox Jun 25, 2026
09e1ae3
feat(network): IPv6 firewall sanity detection monitor (Task #17208)
mattmattox Jun 25, 2026
b8d6a79
test(cmd): verify new IPv6 monitors register + auto-enable (Task #17209)
mattmattox Jun 25, 2026
6074ae7
feat(net): dual-stack (::) default bind for exporter + health server …
mattmattox Jun 25, 2026
6bea045
feat(kubernetes): IPv6 loopback fallback for kubelet probes (Task #17…
mattmattox Jun 25, 2026
c11fc21
feat(network): IPv6 link-local zone ID support in ICMP pinger (Task #…
mattmattox Jun 25, 2026
6117038
feat(network): httpPinger emits Family for hostname targets (Task #17…
mattmattox Jun 25, 2026
218988e
chore(network): ICMPv6 socket polish — filter, per-instance ID (Task …
mattmattox Jun 25, 2026
8e8c080
chore(ci): run ICMP pinger integration test instead of silently skipp…
mattmattox Jun 25, 2026
5751627
test(network): cover resolveTarget hostname DNS path (Task #17237)
mattmattox Jun 25, 2026
c04bfff
feat(network): metric-based default-route selection v4+v6 (Task #17240)
mattmattox Jun 25, 2026
fecf25d
feat(network): warn when AAAA query drops A-only DNS checks (Task #17…
mattmattox Jun 25, 2026
75b543e
test(network): AAAA scoped/link-local address coverage (Task #17242)
mattmattox Jun 25, 2026
9e46ce4
test(network): assert AAAA record type appears in DNS event messages …
mattmattox Jun 25, 2026
5cdb9e7
fix(network): refresh CNI peer Family on re-IP/toggle (Task #17247)
mattmattox Jun 25, 2026
3c6b49b
test(controller): fix TestCorrelationDetectionFlow port/start failure…
mattmattox Jun 25, 2026
6a03ab6
feat(metrics): add address_family label to network metrics (Task #17216)
mattmattox Jun 25, 2026
c20a48a
feat(dashboards): address_family template var + family-aware CNI pane…
mattmattox Jun 25, 2026
d261cf8
feat(dashboards): new IPv6 / dual-stack health Grafana dashboard (Tas…
mattmattox Jun 25, 2026
483c890
feat(alerts): per-family PrometheusRule alerts + IPv6 condition alert…
mattmattox Jun 25, 2026
a5d116e
test(metrics): guard Go+Process collector wiring in served registry (…
mattmattox Jun 25, 2026
e14e8a1
feat(metrics): monitor-cycle self-metrics (Task #17211)
mattmattox Jun 25, 2026
cebbb68
feat(metrics): exporter-health self-metrics (Task #17212)
mattmattox Jun 25, 2026
59e6567
feat(metrics): remediator circuit-breaker state gauge (Task #17213)
mattmattox Jun 25, 2026
4a73e4d
feat(metrics): config hot-reload self-metrics (Task #17214)
mattmattox Jun 25, 2026
b0bccc2
test(metrics): authoritative self-metrics registration test (Task #17…
mattmattox Jun 25, 2026
e8e2b18
feat(remediators): ip -6 route flush cache operation (Task #17220)
mattmattox Jun 25, 2026
ac03556
test(remediators): verify DNS flush covers IPv6/AAAA (Task #17221)
mattmattox Jun 25, 2026
5700184
test(network): build-tagged IPv6 pinger integration tests (Task #17223)
mattmattox Jun 25, 2026
053c4ea
test(integration): kind dual-stack cluster integration test (Task #17…
mattmattox Jun 25, 2026
cfff0aa
ci: dedicated IPv6/dual-stack integration workflow (Task #17227)
mattmattox Jun 25, 2026
94c06d8
chore(config): add IPv6/dual-stack monitors to default config (Task #…
mattmattox Jun 25, 2026
34135d4
feat(helm): IPv6 monitors + dual-stack bind address in chart (Task #1…
mattmattox Jun 25, 2026
329a3ef
docs(helm): IPv6 / dual-stack section in chart README (Task #17230)
mattmattox Jun 25, 2026
4f0362a
docs: IPv6/dual-stack updates to configuration, monitors, remediation…
mattmattox Jun 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 205 additions & 0 deletions .github/workflows/ci-ipv6.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
name: CI - IPv6 / Dual-Stack

# Runs IPv6 and dual-stack integration tests that require Docker with IPv6
# enabled (for the kind cluster) and CAP_NET_RAW (for raw ICMPv6). Kept as a
# separate workflow so these heavy/privileged jobs do not block the main CI
# aggregate (ci-success) and so a kind cluster flake does not mask ICMP
# results (and vice versa).

on:
push:
branches:
- main
tags:
- 'v*'
paths:
- 'pkg/monitors/network/**'
- 'pkg/exporters/**'
- 'test/integration/**'
- '.github/workflows/ci-ipv6.yml'
pull_request:
branches:
- main
paths:
- 'pkg/monitors/network/**'
- 'pkg/exporters/**'
- 'test/integration/**'
- '.github/workflows/ci-ipv6.yml'
workflow_dispatch:

env:
GO_VERSION: '1.25'
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

# Cancel in-flight runs for the same PR / branch so stale runs don't waste
# runner minutes when new commits are pushed.
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
# ---------------------------------------------------------------------------
# Job: ipv6-icmp
#
# Compiles the network package test binary as the runner user (preserving the
# Go module cache) and then runs ONLY the IPv6 integration tests under sudo
# to obtain CAP_NET_RAW for raw ICMPv6 sockets.
#
# NODE_DOCTOR_ICMP_INTEGRATION=1 causes socket / permission errors to be hard
# failures rather than skips, so a misconfigured runner surfaces loudly.
#
# Kept separate from ipv6-kind-dualstack so a kind cluster flake does not
# hide ICMP failures and vice versa.
# ---------------------------------------------------------------------------
ipv6-icmp:
name: IPv6 ICMP Integration
runs-on: ubuntu-latest
timeout-minutes: 10
permissions:
contents: read

steps:
- name: Checkout code
uses: actions/checkout@v5

- name: Setup Go
uses: actions/setup-go@v6
with:
go-version: ${{ env.GO_VERSION }}
cache: true

- name: Download dependencies
run: go mod download

# Enable the IPv6 loopback so ::1 is reachable inside the runner netns.
- name: Enable IPv6 loopback sysctl
run: |
sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0
sudo sysctl -w net.ipv6.conf.default.disable_ipv6=0
sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=0

# Compile as the runner user so the build uses the runner's Go env / module
# cache; -tags=integration compiles in the tests behind the `integration` gate.
- name: Compile network test binary (with integration tag)
run: go test -c -tags=integration -o /tmp/nd-net.test ./pkg/monitors/network/

# Execute only the IPv6 integration tests under sudo for CAP_NET_RAW.
- name: Run IPv6 ICMP integration tests (privileged)
run: |
sudo NODE_DOCTOR_ICMP_INTEGRATION=1 /tmp/nd-net.test \
-test.run 'TestDefaultPinger_IPv6Loopback_Integration|TestDefaultPinger_IPv6LinkLocal_Integration' \
-test.v \
-test.timeout 2m

# ---------------------------------------------------------------------------
# Job: ipv6-kind-dualstack
#
# Enables Docker IPv6 on the GitHub runner, installs the kind binary and
# kubectl, then runs TestIPv6KindDualStackCluster which creates its own
# dual-stack kind cluster from test/integration/testdata/kind-dualstack.yaml.
#
# The Go test owns full cluster lifecycle (create + t.Cleanup delete). A
# safety-net step with `if: always()` calls `kind delete clusters --all`
# after the test so lingering clusters do not consume runner resources on
# test failure / panic.
# ---------------------------------------------------------------------------
ipv6-kind-dualstack:
name: IPv6 Kind Dual-Stack Cluster
runs-on: ubuntu-latest
timeout-minutes: 40
permissions:
contents: read

steps:
- name: Checkout code
uses: actions/checkout@v5

- name: Setup Go
uses: actions/setup-go@v6
with:
go-version: ${{ env.GO_VERSION }}
cache: true

- name: Download dependencies
run: go mod download

# -----------------------------------------------------------------------
# Enable Docker daemon IPv6 BEFORE kind creates the dual-stack cluster.
# Without this, kind's kindnet CNI cannot plumb IPv6 pod addresses and
# `kind create cluster --config kind-dualstack.yaml` fails or produces a
# single-stack cluster.
#
# We write daemon.json, restart Docker, and wait until the socket comes
# back before proceeding. The `ip6tables` + `experimental` flags are
# required for kernel IPv6 NAT support inside the kind node containers.
# -----------------------------------------------------------------------
- name: Enable Docker daemon IPv6
run: |
sudo mkdir -p /etc/docker
# Write our IPv6 settings to daemon.json. This overwrites the file rather
# than merging into it, which is safe on a fresh GitHub runner where
# /etc/docker/daemon.json does not exist.
cat <<'EOF' | sudo tee /etc/docker/daemon.json
{
"ipv6": true,
"fixed-cidr-v6": "2001:db8:1::/64",
"experimental": true,
"ip6tables": true
}
EOF
sudo systemctl restart docker
# Poll until the Docker socket is back (up to 30 s).
for i in $(seq 1 30); do
if docker info >/dev/null 2>&1; then
echo "Docker daemon is back (attempt $i)"
break
fi
echo "Waiting for Docker daemon... ($i/30)"
sleep 1
done
docker info

# Enable IPv6 on the host so the runner's kernel and kind node containers
# can create IPv6 interfaces.
- name: Enable IPv6 sysctls
run: |
sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0
sudo sysctl -w net.ipv6.conf.default.disable_ipv6=0
sudo sysctl -w net.ipv6.conf.all.forwarding=1

# Install the kind binary directly so the Go test can call `kind` from
# PATH without kind-action creating its own cluster first (which would
# collide with the test's dynamically-named dual-stack cluster).
- name: Install kind binary
run: |
KIND_VERSION="v0.27.0"
curl -fsSL "https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-amd64" \
-o /tmp/kind
chmod +x /tmp/kind
sudo mv /tmp/kind /usr/local/bin/kind
kind version

# kubectl is preinstalled on ubuntu-latest runners; run a client version
# check so the job fails fast if the binary is unexpectedly missing.
- name: Verify kubectl is available
run: kubectl version --client

# Run ONLY the dual-stack kind integration test. The test constructs its
# own unique cluster name to avoid collisions with parallel runs.
# -timeout covers: cluster create (≤6 min) + node-ready wait (≤3 min)
# + assertions + cleanup (≤2 min) ≈ 11 min, leaving headroom up to 20 min.
- name: Run dual-stack kind integration test
run: |
go test -tags=integration -v -timeout 20m \
./test/integration/... \
-run TestIPv6KindDualStackCluster

# Safety-net cleanup: delete any kind clusters left over by a test
# failure or panic. Runs unconditionally (if: always()) so a flaky
# cluster create does not leave a zombie cluster consuming runner resources.
- name: Cleanup kind clusters (safety net)
if: always()
run: |
echo "Existing kind clusters:"
kind get clusters || true
kind delete clusters --all || true
echo "Cleanup complete"
32 changes: 32 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,38 @@ jobs:
fail_ci_if_error: false
token: ${{ secrets.CODECOV_TOKEN }}

# Pinger ICMP integration - runs the real raw-ICMP loopback test under privilege.
#
# Kept as a SEPARATE job (not a step in `test`) so a privileged-socket flake on
# the runner does not block the main unit-test/coverage job. The runner user
# compiles the test binary (preserving the Go env / cache), then runs ONLY the
# integration test via sudo so it has CAP_NET_RAW. NODE_DOCTOR_ICMP_INTEGRATION=1
# makes socket/permission errors HARD failures so a misconfigured runner surfaces
# loudly instead of silently passing without exercising real ICMP.
pinger-icmp-integration:
name: Pinger ICMP Integration
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v5

- name: Setup Go
uses: actions/setup-go@v6
with:
go-version: ${{ env.GO_VERSION }}
cache: true

- name: Download dependencies
run: go mod download

- name: Compile network test binary
run: go test -c -o /tmp/nd-network.test ./pkg/monitors/network/

- name: Run ICMP integration test (privileged)
run: |
sudo NODE_DOCTOR_ICMP_INTEGRATION=1 /tmp/nd-network.test \
-test.run '^TestDefaultPinger_Integration$' -test.v

# Security scan - gosec
security-gosec:
name: Security Scan (gosec)
Expand Down
15 changes: 15 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
gh-status gh-watch gh-logs gh-builds \
check-prerequisites check-docker check-kubectl \
build test test-integration test-e2e test-all \
test-net-icmp-integration \
lint fmt clean install-deps \
docker-build docker-push \
coverage-check
Expand Down Expand Up @@ -232,6 +233,20 @@ test-e2e:
fi
@$(call print_success,"E2E tests completed")

# Run the real ICMP pinger integration test under privilege.
#
# The default pinger opens RAW ICMP sockets (CAP_NET_RAW), so this must run as
# root. We compile the test binary as the normal user first (preserving the Go
# environment / module cache) and then run ONLY this test under sudo with the
# integration env var set, so socket/permission failures are HARD failures
# instead of silent skips.
test-net-icmp-integration:
@$(call print_status,"Compiling network test binary...")
@go test -c -o /tmp/nd-network.test ./pkg/monitors/network/
@$(call print_status,"Running ICMP integration test as root (CAP_NET_RAW)...")
@sudo NODE_DOCTOR_ICMP_INTEGRATION=1 /tmp/nd-network.test -test.run '^TestDefaultPinger_Integration$$' -test.v
@$(call print_success,"ICMP integration test passed")

# Run all tests with coverage
test-all:
@$(call print_status,"Running all tests with coverage...")
Expand Down
34 changes: 29 additions & 5 deletions cmd/node-doctor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,13 +220,22 @@ func main() {
if remediatorRegistry != nil {
historyProvider = &remediationHistoryAdapter{registry: remediatorRegistry}
}
exporters, exporterInterfaces, err := createExporters(ctx, config, historyProvider)
exporters, exporterInterfaces, promExporter, err := createExporters(ctx, config, historyProvider)
if err != nil {
log.Fatalf("Failed to create exporters: %v", err)
}

log.Printf("[INFO] Created %d exporters", len(exporters))

// Expose the remediator circuit-breaker state as a Prometheus gauge. Only wire
// when both the registry (remediation enabled) and the Prometheus exporter are
// present. SetCircuitStateObserver pushes the current state immediately and on
// every subsequent transition.
if remediatorRegistry != nil && promExporter != nil {
remediatorRegistry.SetCircuitStateObserver(promExporter)
log.Printf("[INFO] Remediator circuit-breaker state wired to Prometheus gauge")
}

// Create monitor factory for hot reload
monitorFactory := &monitorFactoryAdapter{ctx: ctx}

Expand All @@ -244,6 +253,15 @@ func main() {
det.SetRemediatorRegistry(remediatorRegistry)
}

// Wire config hot-reload self-metrics. The detector owns the reload
// coordinator but only sees exporters via types.Exporter, so pass the
// concrete Prometheus exporter's RecordConfigReload method value. Only wired
// when the Prometheus exporter is present (promExporter is nil otherwise).
if promExporter != nil {
det.SetReloadMetricsRecorder(promExporter.RecordConfigReload)
log.Printf("[INFO] Config hot-reload self-metrics wired to Prometheus exporter")
}

// Start the detector
log.Printf("[INFO] Starting detector...")
if err := det.Start(); err != nil {
Expand Down Expand Up @@ -328,9 +346,12 @@ func (a *remediationHistoryAdapter) GetHistory(limit int) interface{} {
// createExporters creates and configures all exporters from the configuration.
// remediationProvider is optional; when non-nil it is wired to the health server
// before Start() so /remediation/history is available immediately on first request.
func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remediationProvider health.RemediationHistoryProvider) ([]ExporterLifecycle, []types.Exporter, error) {
func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remediationProvider health.RemediationHistoryProvider) ([]ExporterLifecycle, []types.Exporter, *prometheusexporter.PrometheusExporter, error) {
var exporters []ExporterLifecycle
var exporterInterfaces []types.Exporter
// promExporterTyped keeps a typed reference to the Prometheus exporter (if one
// is created and started) so the caller can wire it as a circuit-state observer.
var promExporterTyped *prometheusexporter.PrometheusExporter

// Create Kubernetes exporter if enabled
if config.Exporters.Kubernetes != nil && config.Exporters.Kubernetes.Enabled {
Expand Down Expand Up @@ -375,8 +396,10 @@ func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remedi
// Create Health Server (always enabled for Kubernetes probes)
log.Printf("[INFO] Creating health server...")
healthServer, err := health.NewServer(&health.Config{
Enabled: true,
BindAddress: "0.0.0.0",
Enabled: true,
// "::" binds dual-stack (IPv4 + IPv6) with graceful fallback to
// "0.0.0.0" when IPv6 is disabled on the node (handled in Start()).
BindAddress: "::",
Port: 8080,
ReadTimeout: 5 * time.Second,
WriteTimeout: 10 * time.Second,
Expand Down Expand Up @@ -414,6 +437,7 @@ func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remedi
} else {
exporters = append(exporters, promExporter)
exporterInterfaces = append(exporterInterfaces, promExporter)
promExporterTyped = promExporter
log.Printf("[INFO] Prometheus exporter created and started on port %d", config.Exporters.Prometheus.Port)
}
}
Expand All @@ -427,7 +451,7 @@ func createExporters(ctx context.Context, config *types.NodeDoctorConfig, remedi
exporterInterfaces = append(exporterInterfaces, noopExp)
}

return exporters, exporterInterfaces, nil
return exporters, exporterInterfaces, promExporterTyped, nil
}

// dumpConfiguration prints the effective configuration as JSON
Expand Down
Loading
Loading