From 9e60ec8ef947f7e330f0f756a3c11f244f6255a2 Mon Sep 17 00:00:00 2001 From: Omer Kocaoglu Date: Wed, 10 Jun 2026 22:23:03 -0400 Subject: [PATCH 1/2] fix(connect): close TOCTOU race in connect sync lock acquisition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sync lock was a stat-then-write: two processes racing past the existence check could both acquire, both load the same LastSequence, and both write duplicate entries into .context/hub/. Replace the pair with the atomic io.SafeTryLock (O_CREATE|O_EXCL, single syscall) and release via io.SafeUnlock — the same primitive the dream pass already uses, preserving the os.ErrExist contract for callers. - LockSentinel removed: the lock is the file's existence, not its content, and the racy write was the constant's only consumer. - state_test.go regression-pins the contract: 16-way contention yields exactly one winner, release frees the next sync, a corrupt state file does not leak the lock, and the lock path stays at /hub/.sync.lock. Closes #93. Spec: specs/fix-connect-sync-lock-toctou.md Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Omer Kocaoglu --- internal/cli/connection/core/sync/state.go | 18 ++- .../cli/connection/core/sync/state_test.go | 153 ++++++++++++++++++ internal/config/hub/doc.go | 2 +- internal/config/hub/hub.go | 2 - specs/fix-connect-sync-lock-toctou.md | 87 ++++++++++ 5 files changed, 251 insertions(+), 11 deletions(-) create mode 100644 internal/cli/connection/core/sync/state_test.go create mode 100644 specs/fix-connect-sync-lock-toctou.md diff --git a/internal/cli/connection/core/sync/state.go b/internal/cli/connection/core/sync/state.go index f7ab4de97..36095e72f 100644 --- a/internal/cli/connection/core/sync/state.go +++ b/internal/cli/connection/core/sync/state.go @@ -41,18 +41,20 @@ func loadState() (state, func(), error) { return s, nil, mkErr } - // Acquire lock: fail if another sync is running. - if _, statErr := os.Stat(lockPath); statErr == nil { - return s, nil, os.ErrExist + // Acquire lock: fail if another sync is running. The + // O_CREATE|O_EXCL create-or-fail is a single syscall, so + // there is no check-to-write window for a concurrent sync + // to slip through. + acquired, lockErr := io.SafeTryLock(lockPath, fs.PermFile) + if lockErr != nil { + return s, nil, lockErr } - if writeErr := io.SafeWriteFile( - lockPath, []byte(cfgHub.LockSentinel), fs.PermFile, - ); writeErr != nil { - return s, nil, writeErr + if !acquired { + return s, nil, os.ErrExist } release := func() { - if rmErr := os.Remove(lockPath); rmErr != nil { + if rmErr := io.SafeUnlock(lockPath); rmErr != nil { logWarn.Warn(cfgWarn.Remove, lockPath, rmErr) } } diff --git a/internal/cli/connection/core/sync/state_test.go b/internal/cli/connection/core/sync/state_test.go new file mode 100644 index 000000000..dedafe601 --- /dev/null +++ b/internal/cli/connection/core/sync/state_test.go @@ -0,0 +1,153 @@ +// / ctx: https://ctx.ist +// ,'`./ do you remember? +// `.,'\ +// \ Copyright 2026-present Context contributors. +// SPDX-License-Identifier: Apache-2.0 + +package sync + +import ( + "errors" + "os" + "path/filepath" + "testing" + + "github.com/ActiveMemory/ctx/internal/config/fs" + cfgHub "github.com/ActiveMemory/ctx/internal/config/hub" + "github.com/ActiveMemory/ctx/internal/io" + "github.com/ActiveMemory/ctx/internal/testutil/testctx" +) + +// declareContext positions the test in a temp project with a +// materialized .context/ directory and returns its path. +func declareContext(t *testing.T) string { + t.Helper() + dir := t.TempDir() + ctxDir := testctx.Declare(t, dir) + if mkErr := os.MkdirAll(ctxDir, fs.PermExec); mkErr != nil { + t.Fatal(mkErr) + } + return ctxDir +} + +func TestLoadState_RejectsConcurrentSyncs(t *testing.T) { + declareContext(t) + + const attempts = 16 + + type outcome struct { + release func() + err error + } + + start := make(chan struct{}) + results := make(chan outcome, attempts) + for i := 0; i < attempts; i++ { + go func() { + <-start + _, release, lockErr := loadState() + results <- outcome{release: release, err: lockErr} + }() + } + close(start) + + // Winners must not release until every attempt has + // finished, or a late goroutine could legitimately + // re-acquire the freed lock and skew the count. + var winners []func() + var contended int + for i := 0; i < attempts; i++ { + got := <-results + switch { + case got.err == nil: + winners = append(winners, got.release) + case errors.Is(got.err, os.ErrExist): + contended++ + default: + t.Fatalf("unexpected error: %v", got.err) + } + } + + if len(winners) != 1 { + t.Errorf("winners = %d, want exactly 1", len(winners)) + } + if contended != attempts-1 { + t.Errorf( + "contended = %d, want %d", contended, attempts-1, + ) + } + for _, release := range winners { + release() + } +} + +func TestLoadState_ReleaseRemovesLock(t *testing.T) { + ctxDir := declareContext(t) + lockPath := filepath.Join( + ctxDir, cfgHub.DirHub, cfgHub.FileSyncLock, + ) + + _, release, lockErr := loadState() + if lockErr != nil { + t.Fatalf("first loadState: %v", lockErr) + } + if _, statErr := os.Stat(lockPath); statErr != nil { + t.Fatalf("lock file should exist while held: %v", statErr) + } + + release() + + if _, statErr := os.Stat(lockPath); !os.IsNotExist(statErr) { + t.Errorf( + "lock file should be gone after release: %v", statErr, + ) + } + + // The next sync must be able to proceed. + _, release, lockErr = loadState() + if lockErr != nil { + t.Fatalf("loadState after release: %v", lockErr) + } + release() +} + +func TestLoadState_ReleasesLockOnCorruptState(t *testing.T) { + ctxDir := declareContext(t) + hubDir := filepath.Join(ctxDir, cfgHub.DirHub) + if mkErr := io.SafeMkdirAll(hubDir, fs.PermKeyDir); mkErr != nil { + t.Fatal(mkErr) + } + statePath := filepath.Join(hubDir, cfgHub.FileSyncState) + if writeErr := os.WriteFile( + statePath, []byte("{not json"), fs.PermFile, + ); writeErr != nil { + t.Fatal(writeErr) + } + + _, _, loadErr := loadState() + if loadErr == nil { + t.Fatal("loadState should fail on corrupt state") + } + + lockPath := filepath.Join(hubDir, cfgHub.FileSyncLock) + if _, statErr := os.Stat(lockPath); !os.IsNotExist(statErr) { + t.Errorf( + "lock must not leak after a failed load: %v", statErr, + ) + } +} + +func TestLoadState_LockFileLocation(t *testing.T) { + ctxDir := declareContext(t) + + _, release, lockErr := loadState() + if lockErr != nil { + t.Fatalf("loadState: %v", lockErr) + } + defer release() + + want := filepath.Join(ctxDir, cfgHub.DirHub, cfgHub.FileSyncLock) + if _, statErr := os.Stat(want); statErr != nil { + t.Errorf("lock file not at %s: %v", want, statErr) + } +} diff --git a/internal/config/hub/doc.go b/internal/config/hub/doc.go index 0eb87a683..d22c657ab 100644 --- a/internal/config/hub/doc.go +++ b/internal/config/hub/doc.go @@ -59,7 +59,7 @@ // connection config files // - FilePID ("hub.pid"), FileAdminToken, // DirHubData: daemon management files -// - JSONIndent, LockSentinel, SuffixPluralMD: +// - JSONIndent, SuffixPluralMD: // formatting and naming helpers // // # Raft Cluster Configuration diff --git a/internal/config/hub/hub.go b/internal/config/hub/hub.go index 0d8fe4e70..16e7cc15d 100644 --- a/internal/config/hub/hub.go +++ b/internal/config/hub/hub.go @@ -167,8 +167,6 @@ const ( FileConnect = ".connect.enc" // JSONIndent is the indentation string for JSON marshaling. JSONIndent = " " - // LockSentinel is the content written to lock files. - LockSentinel = "lock" // SuffixPluralMD is the suffix for typed hub markdown // filenames (e.g. "decisions.md"). SuffixPluralMD = "s.md" diff --git a/specs/fix-connect-sync-lock-toctou.md b/specs/fix-connect-sync-lock-toctou.md new file mode 100644 index 000000000..021f9edd5 --- /dev/null +++ b/specs/fix-connect-sync-lock-toctou.md @@ -0,0 +1,87 @@ +# Fix Connect Sync Lock TOCTOU Race + +`internal/cli/connection/core/sync.loadState` guarded +`ctx connect sync` against concurrent execution with a +check-then-write pattern (`os.Stat` followed by +`io.SafeWriteFile`), not a real lock. Upstream issue: +[ActiveMemory/ctx#93](https://github.com/ActiveMemory/ctx/issues/93). + +## Problem + +The window between the existence check and the write is +unbounded, and the pattern is racy by construction: + +```go +// Acquire lock: fail if another sync is running. +if _, statErr := os.Stat(lockPath); statErr == nil { + return s, nil, os.ErrExist +} +if writeErr := io.SafeWriteFile( + lockPath, []byte(cfgHub.LockSentinel), fs.PermFile, +); writeErr != nil { + return s, nil, writeErr +} +``` + +Two processes racing past the `os.Stat` (hook-triggered plus +manual invocation; cron plus interactive run; two terminals) +can both decide the lock is free, both write the lock file, +both load the same `LastSequence`, both fetch the same entries +from the hub, and both write duplicate content into +`.context/hub/`. (Issue #93 says `.context/shared/`; that is +the pre-rename path — the renderer joins `cfgHub.DirHub` at +`internal/cli/connection/core/render/render.go:32`.) + +The doc comment ("Acquires a lock file to prevent concurrent +access.") overstated what the code did. + +The package had no test coverage at all: no test exercised +the contention path, the release path, or the lock location. + +## Solution + +Replace the stat-then-write with the atomic create-or-fail +primitive that already exists for exactly this purpose: +`io.SafeTryLock` (`O_CREATE|O_EXCL` in a single syscall) and +its counterpart `io.SafeUnlock`. Both landed with the dream +consolidator and have prior art at +`internal/cli/dream/core/pass/pass.go:62-70`. + +1. `internal/cli/connection/core/sync/state.go` — swap the + `os.Stat` / `io.SafeWriteFile` pair for one + `io.SafeTryLock` call. `acquired == false` maps to the + existing `os.ErrExist` contract so `Run`'s caller-facing + behavior is unchanged. The `release` closure delegates to + `io.SafeUnlock`, keeping the warn-on-failure logging. +2. `internal/config/hub/hub.go` — delete `LockSentinel`. Its + only consumer was the racy write; `SafeTryLock` creates an + empty lock file (the lock is the file's existence, not its + content), so the constant is dead. Leaving it would be a + broken window. +3. `internal/config/hub/doc.go` — drop `LockSentinel` from + the package-doc constant inventory. +4. `internal/cli/connection/core/sync/state_test.go` — new; + regression-pins the contract: + - `TestLoadState_RejectsConcurrentSyncs`: N goroutines + race `loadState`; exactly one acquires, the rest observe + `os.ErrExist`. + - `TestLoadState_ReleaseRemovesLock`: after `release()`, + the lock file is gone and a subsequent `loadState` + succeeds. + - `TestLoadState_ReleasesLockOnCorruptState`: a corrupt + `.sync-state.json` makes `loadState` fail *after* + acquisition; the lock must not leak. + - `TestLoadState_LockFileLocation`: the lock lives at + `/hub/.sync.lock` (pins `DirHub` + + `FileSyncLock` composition). + +## Out of Scope + +- Stale-lock detection (PID-in-lockfile, age-based cleanup). + A crashed process still leaves a stale lock; the issue + explicitly defers this to a follow-up. +- `flock`-based locking (issue's Option B). Rejected for now: + `syscall.Flock` is Unix-only and `SafeTryLock` already + matches the existing lockfile-as-sentinel model. +- The hubsync hook's silent error suppression + (ActiveMemory/ctx#100) — adjacent code, separate issue. From c85a52dbbb162c6e0a2f3eec6864268db3769a62 Mon Sep 17 00:00:00 2001 From: Omer Kocaoglu Date: Sat, 27 Jun 2026 22:22:25 -0400 Subject: [PATCH 2/2] fix(connect): name the contended sync lock; stop SafeTryLock leaking (#113 review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review of the #93 TOCTOU fix raised two non-blocking hardening points; addressed here: - Making the lock reliable raised the stakes of a wedged stale lock: loadState surfaced the raw os.ErrExist ("file already exists") with no hint. The !acquired branch now returns errHub.ConnectSyncLocked(lockPath) — a new err.hub.connect-sync-locked text key that names the lock path so the wedge is self-documenting — and wraps os.ErrExist via %w so the pre-existing errors.Is(err, os.ErrExist) contract and the contention test keep working. - SafeTryLock returned (true, closeErr) when the create succeeded but Close failed; callers treat a non-nil error as "not acquired" and get no release func, so the created file leaked and wedged future callers. It now removes the file (best-effort) and returns (false, closeErr) so the on-disk state matches the reported outcome. Both callers (loadState, dream/pass) check the error before !acquired, so observable behavior is otherwise unchanged. TestLoadState_LockedErrorIsActionable pins the path hint and the os.ErrExist wrap. Spec: specs/fix-connect-sync-lock-toctou.md Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Omer Kocaoglu --- internal/assets/commands/text/errors.yaml | 2 + internal/cli/connection/core/sync/state.go | 6 ++- .../cli/connection/core/sync/state_test.go | 35 ++++++++++++++++ .../cli/connection/core/sync/testmain_test.go | 22 ++++++++++ internal/config/embed/text/err_hub.go | 4 ++ internal/err/hub/hub.go | 20 ++++++++++ internal/io/security.go | 14 ++++++- specs/fix-connect-sync-lock-toctou.md | 40 ++++++++++++++++++- 8 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 internal/cli/connection/core/sync/testmain_test.go diff --git a/internal/assets/commands/text/errors.yaml b/internal/assets/commands/text/errors.yaml index 716615774..4ca32bacf 100644 --- a/internal/assets/commands/text/errors.yaml +++ b/internal/assets/commands/text/errors.yaml @@ -649,6 +649,8 @@ err.hub.duplicate-project: short: 'project already registered: %q' err.hub.invalid-peer-action: short: "action must be 'add' or 'remove', got %q" +err.hub.connect-sync-locked: + short: 'connect sync already in progress; if no sync is running, remove the stale lock %s: %w' err.serve.no-running-hub: short: 'no running hub: %w' err.serve.invalid-pid: diff --git a/internal/cli/connection/core/sync/state.go b/internal/cli/connection/core/sync/state.go index 36095e72f..fec97a904 100644 --- a/internal/cli/connection/core/sync/state.go +++ b/internal/cli/connection/core/sync/state.go @@ -14,6 +14,7 @@ import ( "github.com/ActiveMemory/ctx/internal/config/fs" cfgHub "github.com/ActiveMemory/ctx/internal/config/hub" cfgWarn "github.com/ActiveMemory/ctx/internal/config/warn" + errHub "github.com/ActiveMemory/ctx/internal/err/hub" "github.com/ActiveMemory/ctx/internal/io" logWarn "github.com/ActiveMemory/ctx/internal/log/warn" "github.com/ActiveMemory/ctx/internal/rc" @@ -50,7 +51,10 @@ func loadState() (state, func(), error) { return s, nil, lockErr } if !acquired { - return s, nil, os.ErrExist + // Another sync holds the lock — or a crashed sync left it + // stale. Name the path so the wedge is self-documenting; + // the error still wraps os.ErrExist for errors.Is callers. + return s, nil, errHub.ConnectSyncLocked(lockPath) } release := func() { diff --git a/internal/cli/connection/core/sync/state_test.go b/internal/cli/connection/core/sync/state_test.go index dedafe601..7fd312ebd 100644 --- a/internal/cli/connection/core/sync/state_test.go +++ b/internal/cli/connection/core/sync/state_test.go @@ -10,6 +10,7 @@ import ( "errors" "os" "path/filepath" + "strings" "testing" "github.com/ActiveMemory/ctx/internal/config/fs" @@ -137,6 +138,40 @@ func TestLoadState_ReleasesLockOnCorruptState(t *testing.T) { } } +func TestLoadState_LockedErrorIsActionable(t *testing.T) { + ctxDir := declareContext(t) + lockPath := filepath.Join( + ctxDir, cfgHub.DirHub, cfgHub.FileSyncLock, + ) + + _, release, lockErr := loadState() + if lockErr != nil { + t.Fatalf("first loadState: %v", lockErr) + } + defer release() + + _, _, contendErr := loadState() + if contendErr == nil { + t.Fatal("second loadState should fail while lock is held") + } + // Contract preserved: still matches the pre-existing sentinel + // so any errors.Is(err, os.ErrExist) caller keeps working. + if !errors.Is(contendErr, os.ErrExist) { + t.Errorf( + "contended error must wrap os.ErrExist, got: %v", + contendErr, + ) + } + // Actionable: names the lock path so a wedged stale lock is + // self-documenting instead of an opaque "file already exists". + if !strings.Contains(contendErr.Error(), lockPath) { + t.Errorf( + "error should name the lock path %q, got: %v", + lockPath, contendErr, + ) + } +} + func TestLoadState_LockFileLocation(t *testing.T) { ctxDir := declareContext(t) diff --git a/internal/cli/connection/core/sync/testmain_test.go b/internal/cli/connection/core/sync/testmain_test.go new file mode 100644 index 000000000..77bb34e55 --- /dev/null +++ b/internal/cli/connection/core/sync/testmain_test.go @@ -0,0 +1,22 @@ +// / ctx: https://ctx.ist +// ,'`./ do you remember? +// `.,'\ +// \ Copyright 2026-present Context contributors. +// SPDX-License-Identifier: Apache-2.0 + +package sync + +import ( + "os" + "testing" + + "github.com/ActiveMemory/ctx/internal/assets/read/lookup" +) + +// TestMain initialises the embedded asset lookup so the +// lock-contention error (errHub.ConnectSyncLocked) renders its +// parsed format string rather than the empty default. +func TestMain(m *testing.M) { + lookup.Init() + os.Exit(m.Run()) +} diff --git a/internal/config/embed/text/err_hub.go b/internal/config/embed/text/err_hub.go index c22588c83..3de9f67f1 100644 --- a/internal/config/embed/text/err_hub.go +++ b/internal/config/embed/text/err_hub.go @@ -20,4 +20,8 @@ const ( // DescKeyErrHubInvalidPeerAction is the text key for // unrecognized peer action errors. DescKeyErrHubInvalidPeerAction = "err.hub.invalid-peer-action" + // DescKeyErrHubConnectSyncLocked is the text key for a + // connect-sync lock-contention failure; names the lock path + // so a wedged stale lock is self-documenting. + DescKeyErrHubConnectSyncLocked = "err.hub.connect-sync-locked" ) diff --git a/internal/err/hub/hub.go b/internal/err/hub/hub.go index 222e16033..8124dacee 100644 --- a/internal/err/hub/hub.go +++ b/internal/err/hub/hub.go @@ -8,6 +8,7 @@ package hub import ( "fmt" + "os" "github.com/ActiveMemory/ctx/internal/assets/read/desc" "github.com/ActiveMemory/ctx/internal/config/embed/text" @@ -67,3 +68,22 @@ func InvalidPeerAction(action string) error { action, ) } + +// ConnectSyncLocked returns the error surfaced when connect +// sync cannot acquire its lock because another sync holds it. +// It names the lock path so a wedged stale lock (e.g. left by +// a crashed sync) is self-documenting, and wraps os.ErrExist +// so callers matching the pre-existing contract via +// errors.Is(err, os.ErrExist) still match. +// +// Parameters: +// - lockPath: absolute path to the contended lock file +// +// Returns: +// - error: lock-contention error wrapping os.ErrExist +func ConnectSyncLocked(lockPath string) error { + return fmt.Errorf( + desc.Text(text.DescKeyErrHubConnectSyncLocked), + lockPath, os.ErrExist, + ) +} diff --git a/internal/io/security.go b/internal/io/security.go index 49666c3b8..b47775975 100644 --- a/internal/io/security.go +++ b/internal/io/security.go @@ -139,6 +139,13 @@ func SafeCreateFile(path string, perm os.FileMode) (*os.File, error) { // holder). The handle is closed before returning; the lock is held by // the file's existence until SafeUnlock removes it. // +// If the create succeeds but closing the handle fails, the freshly +// created lock file is removed and (false, err) is returned: callers +// treat a non-nil error as "not acquired" and get no release func, so +// leaving the file behind would wedge every future caller with a lock +// nobody owns. Removing it keeps the on-disk state consistent with the +// reported outcome. +// // Parameters: // - path: lock file path // - perm: file permission bits @@ -161,7 +168,12 @@ func SafeTryLock(path string, perm os.FileMode) (bool, error) { } return false, openErr } - return true, f.Close() + if closeErr := f.Close(); closeErr != nil { + // Best-effort cleanup so a half-acquired lock cannot leak. + _ = os.Remove(clean) + return false, closeErr + } + return true, nil } // SafeUnlock releases a lock acquired by SafeTryLock by removing the diff --git a/specs/fix-connect-sync-lock-toctou.md b/specs/fix-connect-sync-lock-toctou.md index 021f9edd5..4ade0b831 100644 --- a/specs/fix-connect-sync-lock-toctou.md +++ b/specs/fix-connect-sync-lock-toctou.md @@ -75,11 +75,49 @@ consolidator and have prior art at `/hub/.sync.lock` (pins `DirHub` + `FileSyncLock` composition). +## Review Follow-ups (PR #113) + +Code review approved the concurrency fix and raised two +hardening points (both non-blocking); addressed here: + +1. **Actionable lock-contention error.** Making the lock + *reliable* raises the stakes of a wedged stale lock: every + subsequent sync now hard-fails, and `Run` surfaced the raw + `os.ErrExist` ("file already exists") with no hint. The + `!acquired` branch now returns `errHub.ConnectSyncLocked(lockPath)` + (new `err.hub.connect-sync-locked` text key) which names the + lock path — "remove the stale lock `/hub/.sync.lock`" + — and wraps `os.ErrExist` with `%w` so the pre-existing + `errors.Is(err, os.ErrExist)` contract (and the contention + test) still holds. This is the cheap half of the deferred + stale-lock work: a self-documenting wedge ahead of automatic + PID/age recovery. `TestLoadState_LockedErrorIsActionable` + pins both the path hint and the `os.ErrExist` wrap. +2. **`SafeTryLock` no longer leaks on close failure.** + Previously a create-succeeds-but-`Close`-fails returned + `(true, closeErr)`; callers treat any non-nil error as + "not acquired" and get no release func, yet the file + persisted — a leaked lock that would wedge every future + caller. `SafeTryLock` now removes the freshly created file + (best-effort) and returns `(false, closeErr)` so the on-disk + state matches the reported outcome. Both callers (`loadState` + and `dream/pass`) check the error before `!acquired`, so the + observable behavior is unchanged except the file no longer + leaks. The close-failure path is exotic (only ENOSPC-on-close + / NFS) and not unit-testable without fault injection, so it + carries no dedicated test. + +The remaining review footnotes (NFS `O_EXCL` caveat on ancient +NFSv2; the lock lives in local `.context/hub/`) are unchanged +and remain negligible. + ## Out of Scope - Stale-lock detection (PID-in-lockfile, age-based cleanup). A crashed process still leaves a stale lock; the issue - explicitly defers this to a follow-up. + explicitly defers this to a follow-up. The review-follow-up + error above makes such a wedge self-documenting, but does not + auto-recover it. - `flock`-based locking (issue's Option B). Rejected for now: `syscall.Flock` is Unix-only and `SafeTryLock` already matches the existing lockfile-as-sentinel model.