diff --git a/internal/assets/commands/text/errors.yaml b/internal/assets/commands/text/errors.yaml index 716615774..4ca32bacf 100644 --- a/internal/assets/commands/text/errors.yaml +++ b/internal/assets/commands/text/errors.yaml @@ -649,6 +649,8 @@ err.hub.duplicate-project: short: 'project already registered: %q' err.hub.invalid-peer-action: short: "action must be 'add' or 'remove', got %q" +err.hub.connect-sync-locked: + short: 'connect sync already in progress; if no sync is running, remove the stale lock %s: %w' err.serve.no-running-hub: short: 'no running hub: %w' err.serve.invalid-pid: diff --git a/internal/cli/connection/core/sync/state.go b/internal/cli/connection/core/sync/state.go index f7ab4de97..fec97a904 100644 --- a/internal/cli/connection/core/sync/state.go +++ b/internal/cli/connection/core/sync/state.go @@ -14,6 +14,7 @@ import ( "github.com/ActiveMemory/ctx/internal/config/fs" cfgHub "github.com/ActiveMemory/ctx/internal/config/hub" cfgWarn "github.com/ActiveMemory/ctx/internal/config/warn" + errHub "github.com/ActiveMemory/ctx/internal/err/hub" "github.com/ActiveMemory/ctx/internal/io" logWarn "github.com/ActiveMemory/ctx/internal/log/warn" "github.com/ActiveMemory/ctx/internal/rc" @@ -41,18 +42,23 @@ func loadState() (state, func(), error) { return s, nil, mkErr } - // Acquire lock: fail if another sync is running. - if _, statErr := os.Stat(lockPath); statErr == nil { - return s, nil, os.ErrExist + // Acquire lock: fail if another sync is running. The + // O_CREATE|O_EXCL create-or-fail is a single syscall, so + // there is no check-to-write window for a concurrent sync + // to slip through. + acquired, lockErr := io.SafeTryLock(lockPath, fs.PermFile) + if lockErr != nil { + return s, nil, lockErr } - if writeErr := io.SafeWriteFile( - lockPath, []byte(cfgHub.LockSentinel), fs.PermFile, - ); writeErr != nil { - return s, nil, writeErr + if !acquired { + // Another sync holds the lock — or a crashed sync left it + // stale. Name the path so the wedge is self-documenting; + // the error still wraps os.ErrExist for errors.Is callers. + return s, nil, errHub.ConnectSyncLocked(lockPath) } release := func() { - if rmErr := os.Remove(lockPath); rmErr != nil { + if rmErr := io.SafeUnlock(lockPath); rmErr != nil { logWarn.Warn(cfgWarn.Remove, lockPath, rmErr) } } diff --git a/internal/cli/connection/core/sync/state_test.go b/internal/cli/connection/core/sync/state_test.go new file mode 100644 index 000000000..7fd312ebd --- /dev/null +++ b/internal/cli/connection/core/sync/state_test.go @@ -0,0 +1,188 @@ +// / ctx: https://ctx.ist +// ,'`./ do you remember? +// `.,'\ +// \ Copyright 2026-present Context contributors. +// SPDX-License-Identifier: Apache-2.0 + +package sync + +import ( + "errors" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/ActiveMemory/ctx/internal/config/fs" + cfgHub "github.com/ActiveMemory/ctx/internal/config/hub" + "github.com/ActiveMemory/ctx/internal/io" + "github.com/ActiveMemory/ctx/internal/testutil/testctx" +) + +// declareContext positions the test in a temp project with a +// materialized .context/ directory and returns its path. +func declareContext(t *testing.T) string { + t.Helper() + dir := t.TempDir() + ctxDir := testctx.Declare(t, dir) + if mkErr := os.MkdirAll(ctxDir, fs.PermExec); mkErr != nil { + t.Fatal(mkErr) + } + return ctxDir +} + +func TestLoadState_RejectsConcurrentSyncs(t *testing.T) { + declareContext(t) + + const attempts = 16 + + type outcome struct { + release func() + err error + } + + start := make(chan struct{}) + results := make(chan outcome, attempts) + for i := 0; i < attempts; i++ { + go func() { + <-start + _, release, lockErr := loadState() + results <- outcome{release: release, err: lockErr} + }() + } + close(start) + + // Winners must not release until every attempt has + // finished, or a late goroutine could legitimately + // re-acquire the freed lock and skew the count. + var winners []func() + var contended int + for i := 0; i < attempts; i++ { + got := <-results + switch { + case got.err == nil: + winners = append(winners, got.release) + case errors.Is(got.err, os.ErrExist): + contended++ + default: + t.Fatalf("unexpected error: %v", got.err) + } + } + + if len(winners) != 1 { + t.Errorf("winners = %d, want exactly 1", len(winners)) + } + if contended != attempts-1 { + t.Errorf( + "contended = %d, want %d", contended, attempts-1, + ) + } + for _, release := range winners { + release() + } +} + +func TestLoadState_ReleaseRemovesLock(t *testing.T) { + ctxDir := declareContext(t) + lockPath := filepath.Join( + ctxDir, cfgHub.DirHub, cfgHub.FileSyncLock, + ) + + _, release, lockErr := loadState() + if lockErr != nil { + t.Fatalf("first loadState: %v", lockErr) + } + if _, statErr := os.Stat(lockPath); statErr != nil { + t.Fatalf("lock file should exist while held: %v", statErr) + } + + release() + + if _, statErr := os.Stat(lockPath); !os.IsNotExist(statErr) { + t.Errorf( + "lock file should be gone after release: %v", statErr, + ) + } + + // The next sync must be able to proceed. + _, release, lockErr = loadState() + if lockErr != nil { + t.Fatalf("loadState after release: %v", lockErr) + } + release() +} + +func TestLoadState_ReleasesLockOnCorruptState(t *testing.T) { + ctxDir := declareContext(t) + hubDir := filepath.Join(ctxDir, cfgHub.DirHub) + if mkErr := io.SafeMkdirAll(hubDir, fs.PermKeyDir); mkErr != nil { + t.Fatal(mkErr) + } + statePath := filepath.Join(hubDir, cfgHub.FileSyncState) + if writeErr := os.WriteFile( + statePath, []byte("{not json"), fs.PermFile, + ); writeErr != nil { + t.Fatal(writeErr) + } + + _, _, loadErr := loadState() + if loadErr == nil { + t.Fatal("loadState should fail on corrupt state") + } + + lockPath := filepath.Join(hubDir, cfgHub.FileSyncLock) + if _, statErr := os.Stat(lockPath); !os.IsNotExist(statErr) { + t.Errorf( + "lock must not leak after a failed load: %v", statErr, + ) + } +} + +func TestLoadState_LockedErrorIsActionable(t *testing.T) { + ctxDir := declareContext(t) + lockPath := filepath.Join( + ctxDir, cfgHub.DirHub, cfgHub.FileSyncLock, + ) + + _, release, lockErr := loadState() + if lockErr != nil { + t.Fatalf("first loadState: %v", lockErr) + } + defer release() + + _, _, contendErr := loadState() + if contendErr == nil { + t.Fatal("second loadState should fail while lock is held") + } + // Contract preserved: still matches the pre-existing sentinel + // so any errors.Is(err, os.ErrExist) caller keeps working. + if !errors.Is(contendErr, os.ErrExist) { + t.Errorf( + "contended error must wrap os.ErrExist, got: %v", + contendErr, + ) + } + // Actionable: names the lock path so a wedged stale lock is + // self-documenting instead of an opaque "file already exists". + if !strings.Contains(contendErr.Error(), lockPath) { + t.Errorf( + "error should name the lock path %q, got: %v", + lockPath, contendErr, + ) + } +} + +func TestLoadState_LockFileLocation(t *testing.T) { + ctxDir := declareContext(t) + + _, release, lockErr := loadState() + if lockErr != nil { + t.Fatalf("loadState: %v", lockErr) + } + defer release() + + want := filepath.Join(ctxDir, cfgHub.DirHub, cfgHub.FileSyncLock) + if _, statErr := os.Stat(want); statErr != nil { + t.Errorf("lock file not at %s: %v", want, statErr) + } +} diff --git a/internal/cli/connection/core/sync/testmain_test.go b/internal/cli/connection/core/sync/testmain_test.go new file mode 100644 index 000000000..77bb34e55 --- /dev/null +++ b/internal/cli/connection/core/sync/testmain_test.go @@ -0,0 +1,22 @@ +// / ctx: https://ctx.ist +// ,'`./ do you remember? +// `.,'\ +// \ Copyright 2026-present Context contributors. +// SPDX-License-Identifier: Apache-2.0 + +package sync + +import ( + "os" + "testing" + + "github.com/ActiveMemory/ctx/internal/assets/read/lookup" +) + +// TestMain initialises the embedded asset lookup so the +// lock-contention error (errHub.ConnectSyncLocked) renders its +// parsed format string rather than the empty default. +func TestMain(m *testing.M) { + lookup.Init() + os.Exit(m.Run()) +} diff --git a/internal/config/embed/text/err_hub.go b/internal/config/embed/text/err_hub.go index c22588c83..3de9f67f1 100644 --- a/internal/config/embed/text/err_hub.go +++ b/internal/config/embed/text/err_hub.go @@ -20,4 +20,8 @@ const ( // DescKeyErrHubInvalidPeerAction is the text key for // unrecognized peer action errors. DescKeyErrHubInvalidPeerAction = "err.hub.invalid-peer-action" + // DescKeyErrHubConnectSyncLocked is the text key for a + // connect-sync lock-contention failure; names the lock path + // so a wedged stale lock is self-documenting. + DescKeyErrHubConnectSyncLocked = "err.hub.connect-sync-locked" ) diff --git a/internal/config/hub/doc.go b/internal/config/hub/doc.go index 0eb87a683..d22c657ab 100644 --- a/internal/config/hub/doc.go +++ b/internal/config/hub/doc.go @@ -59,7 +59,7 @@ // connection config files // - FilePID ("hub.pid"), FileAdminToken, // DirHubData: daemon management files -// - JSONIndent, LockSentinel, SuffixPluralMD: +// - JSONIndent, SuffixPluralMD: // formatting and naming helpers // // # Raft Cluster Configuration diff --git a/internal/config/hub/hub.go b/internal/config/hub/hub.go index 0d8fe4e70..16e7cc15d 100644 --- a/internal/config/hub/hub.go +++ b/internal/config/hub/hub.go @@ -167,8 +167,6 @@ const ( FileConnect = ".connect.enc" // JSONIndent is the indentation string for JSON marshaling. JSONIndent = " " - // LockSentinel is the content written to lock files. - LockSentinel = "lock" // SuffixPluralMD is the suffix for typed hub markdown // filenames (e.g. "decisions.md"). SuffixPluralMD = "s.md" diff --git a/internal/err/hub/hub.go b/internal/err/hub/hub.go index 222e16033..8124dacee 100644 --- a/internal/err/hub/hub.go +++ b/internal/err/hub/hub.go @@ -8,6 +8,7 @@ package hub import ( "fmt" + "os" "github.com/ActiveMemory/ctx/internal/assets/read/desc" "github.com/ActiveMemory/ctx/internal/config/embed/text" @@ -67,3 +68,22 @@ func InvalidPeerAction(action string) error { action, ) } + +// ConnectSyncLocked returns the error surfaced when connect +// sync cannot acquire its lock because another sync holds it. +// It names the lock path so a wedged stale lock (e.g. left by +// a crashed sync) is self-documenting, and wraps os.ErrExist +// so callers matching the pre-existing contract via +// errors.Is(err, os.ErrExist) still match. +// +// Parameters: +// - lockPath: absolute path to the contended lock file +// +// Returns: +// - error: lock-contention error wrapping os.ErrExist +func ConnectSyncLocked(lockPath string) error { + return fmt.Errorf( + desc.Text(text.DescKeyErrHubConnectSyncLocked), + lockPath, os.ErrExist, + ) +} diff --git a/internal/io/security.go b/internal/io/security.go index 49666c3b8..b47775975 100644 --- a/internal/io/security.go +++ b/internal/io/security.go @@ -139,6 +139,13 @@ func SafeCreateFile(path string, perm os.FileMode) (*os.File, error) { // holder). The handle is closed before returning; the lock is held by // the file's existence until SafeUnlock removes it. // +// If the create succeeds but closing the handle fails, the freshly +// created lock file is removed and (false, err) is returned: callers +// treat a non-nil error as "not acquired" and get no release func, so +// leaving the file behind would wedge every future caller with a lock +// nobody owns. Removing it keeps the on-disk state consistent with the +// reported outcome. +// // Parameters: // - path: lock file path // - perm: file permission bits @@ -161,7 +168,12 @@ func SafeTryLock(path string, perm os.FileMode) (bool, error) { } return false, openErr } - return true, f.Close() + if closeErr := f.Close(); closeErr != nil { + // Best-effort cleanup so a half-acquired lock cannot leak. + _ = os.Remove(clean) + return false, closeErr + } + return true, nil } // SafeUnlock releases a lock acquired by SafeTryLock by removing the diff --git a/specs/fix-connect-sync-lock-toctou.md b/specs/fix-connect-sync-lock-toctou.md new file mode 100644 index 000000000..4ade0b831 --- /dev/null +++ b/specs/fix-connect-sync-lock-toctou.md @@ -0,0 +1,125 @@ +# Fix Connect Sync Lock TOCTOU Race + +`internal/cli/connection/core/sync.loadState` guarded +`ctx connect sync` against concurrent execution with a +check-then-write pattern (`os.Stat` followed by +`io.SafeWriteFile`), not a real lock. Upstream issue: +[ActiveMemory/ctx#93](https://github.com/ActiveMemory/ctx/issues/93). + +## Problem + +The window between the existence check and the write is +unbounded, and the pattern is racy by construction: + +```go +// Acquire lock: fail if another sync is running. +if _, statErr := os.Stat(lockPath); statErr == nil { + return s, nil, os.ErrExist +} +if writeErr := io.SafeWriteFile( + lockPath, []byte(cfgHub.LockSentinel), fs.PermFile, +); writeErr != nil { + return s, nil, writeErr +} +``` + +Two processes racing past the `os.Stat` (hook-triggered plus +manual invocation; cron plus interactive run; two terminals) +can both decide the lock is free, both write the lock file, +both load the same `LastSequence`, both fetch the same entries +from the hub, and both write duplicate content into +`.context/hub/`. (Issue #93 says `.context/shared/`; that is +the pre-rename path — the renderer joins `cfgHub.DirHub` at +`internal/cli/connection/core/render/render.go:32`.) + +The doc comment ("Acquires a lock file to prevent concurrent +access.") overstated what the code did. + +The package had no test coverage at all: no test exercised +the contention path, the release path, or the lock location. + +## Solution + +Replace the stat-then-write with the atomic create-or-fail +primitive that already exists for exactly this purpose: +`io.SafeTryLock` (`O_CREATE|O_EXCL` in a single syscall) and +its counterpart `io.SafeUnlock`. Both landed with the dream +consolidator and have prior art at +`internal/cli/dream/core/pass/pass.go:62-70`. + +1. `internal/cli/connection/core/sync/state.go` — swap the + `os.Stat` / `io.SafeWriteFile` pair for one + `io.SafeTryLock` call. `acquired == false` maps to the + existing `os.ErrExist` contract so `Run`'s caller-facing + behavior is unchanged. The `release` closure delegates to + `io.SafeUnlock`, keeping the warn-on-failure logging. +2. `internal/config/hub/hub.go` — delete `LockSentinel`. Its + only consumer was the racy write; `SafeTryLock` creates an + empty lock file (the lock is the file's existence, not its + content), so the constant is dead. Leaving it would be a + broken window. +3. `internal/config/hub/doc.go` — drop `LockSentinel` from + the package-doc constant inventory. +4. `internal/cli/connection/core/sync/state_test.go` — new; + regression-pins the contract: + - `TestLoadState_RejectsConcurrentSyncs`: N goroutines + race `loadState`; exactly one acquires, the rest observe + `os.ErrExist`. + - `TestLoadState_ReleaseRemovesLock`: after `release()`, + the lock file is gone and a subsequent `loadState` + succeeds. + - `TestLoadState_ReleasesLockOnCorruptState`: a corrupt + `.sync-state.json` makes `loadState` fail *after* + acquisition; the lock must not leak. + - `TestLoadState_LockFileLocation`: the lock lives at + `/hub/.sync.lock` (pins `DirHub` + + `FileSyncLock` composition). + +## Review Follow-ups (PR #113) + +Code review approved the concurrency fix and raised two +hardening points (both non-blocking); addressed here: + +1. **Actionable lock-contention error.** Making the lock + *reliable* raises the stakes of a wedged stale lock: every + subsequent sync now hard-fails, and `Run` surfaced the raw + `os.ErrExist` ("file already exists") with no hint. The + `!acquired` branch now returns `errHub.ConnectSyncLocked(lockPath)` + (new `err.hub.connect-sync-locked` text key) which names the + lock path — "remove the stale lock `/hub/.sync.lock`" + — and wraps `os.ErrExist` with `%w` so the pre-existing + `errors.Is(err, os.ErrExist)` contract (and the contention + test) still holds. This is the cheap half of the deferred + stale-lock work: a self-documenting wedge ahead of automatic + PID/age recovery. `TestLoadState_LockedErrorIsActionable` + pins both the path hint and the `os.ErrExist` wrap. +2. **`SafeTryLock` no longer leaks on close failure.** + Previously a create-succeeds-but-`Close`-fails returned + `(true, closeErr)`; callers treat any non-nil error as + "not acquired" and get no release func, yet the file + persisted — a leaked lock that would wedge every future + caller. `SafeTryLock` now removes the freshly created file + (best-effort) and returns `(false, closeErr)` so the on-disk + state matches the reported outcome. Both callers (`loadState` + and `dream/pass`) check the error before `!acquired`, so the + observable behavior is unchanged except the file no longer + leaks. The close-failure path is exotic (only ENOSPC-on-close + / NFS) and not unit-testable without fault injection, so it + carries no dedicated test. + +The remaining review footnotes (NFS `O_EXCL` caveat on ancient +NFSv2; the lock lives in local `.context/hub/`) are unchanged +and remain negligible. + +## Out of Scope + +- Stale-lock detection (PID-in-lockfile, age-based cleanup). + A crashed process still leaves a stale lock; the issue + explicitly defers this to a follow-up. The review-follow-up + error above makes such a wedge self-documenting, but does not + auto-recover it. +- `flock`-based locking (issue's Option B). Rejected for now: + `syscall.Flock` is Unix-only and `SafeTryLock` already + matches the existing lockfile-as-sentinel model. +- The hubsync hook's silent error suppression + (ActiveMemory/ctx#100) — adjacent code, separate issue.