diff --git a/apps/api/src/app.ts b/apps/api/src/app.ts index d18ceeb..8180260 100644 --- a/apps/api/src/app.ts +++ b/apps/api/src/app.ts @@ -55,6 +55,7 @@ import { projectBuzzRoutes } from './routes/projects-buzz.js'; import { helpWantedRoutes } from './routes/projects-help-wanted.js'; import { projectMembershipRoutes } from './routes/projects-members.js'; import { previewRoutes } from './routes/preview.js'; +import { attachmentRoutes } from './routes/attachments.js'; import { samlRoutes } from './routes/saml.js'; import { internalRoutes } from './routes/internal.js'; @@ -180,6 +181,7 @@ export async function buildApp(opts: BuildAppOptions = {}): Promise` against the bare data clone, + * piping stdout directly into the Fastify reply for streaming. Per + * specs/behaviors/storage.md → "Attachments": "Web serves attachments + * via a streamed GET /api/attachments/ route with cache headers." + * + * Bypasses gitsheets' `Sheet.getAttachment()` API in favor of direct git + * plumbing because: + * 1. The attachment key IS the HEAD-tree path (per spec) — parsing it + * back into (sheet, record, name) and re-resolving via the sheet API + * is redundant. + * 2. Standing Sheet handles cache `dataTree` at openStore time + * (documented in storage.md → "Direct gitsheets reads after a + * transact"); plumbing reads from current HEAD on every request, + * so attachment updates are visible immediately without a + * Store.swapPublic(). + */ +import type { FastifyInstance, FastifyRequest } from 'fastify'; +import { spawn } from 'node:child_process'; + +import { ApiNotFoundError, ApiValidationError } from '../lib/errors.js'; + +/** + * Minimum-viable extension → Content-Type table. The set we care about + * today is avatars + buzz images + the occasional PDF; unknown extensions + * fall back to application/octet-stream (clients that need to render know + * what they asked for). + */ +const MIME_BY_EXT: Record = { + jpg: 'image/jpeg', + jpeg: 'image/jpeg', + png: 'image/png', + gif: 'image/gif', + webp: 'image/webp', + avif: 'image/avif', + svg: 'image/svg+xml', + pdf: 'application/pdf', +}; + +function inferContentType(key: string): string { + const dot = key.lastIndexOf('.'); + if (dot < 0 || dot === key.length - 1) return 'application/octet-stream'; + const ext = key.slice(dot + 1).toLowerCase(); + return MIME_BY_EXT[ext] ?? 'application/octet-stream'; +} + +/** + * Validate a wildcard-captured attachment key. Returns the key on success; + * throws ApiValidationError on shape violations. Defense in depth — git + * cat-file with a ref:path argument is itself resistant to shell exploits + * (no shell interpolation; the path is a single argv), but rejecting + * obviously-malformed keys up front gives clearer error messages and + * sidesteps any sheet/path-template-related edge cases. + */ +function validateKey(raw: string): string { + if (raw.length === 0) { + throw new ApiValidationError('attachment key is required', { key: 'required' }); + } + if (raw.startsWith('/')) { + throw new ApiValidationError('attachment key must not start with /', { key: 'no_leading_slash' }); + } + // Reject any control char or null byte. The eslint-disable is for the + // explicit \x00-\x1f range — intentional precisely because we DO want + // to catch control chars in keys (security-relevant input validation). + // eslint-disable-next-line no-control-regex + if (/[\x00-\x1f\x7f]/.test(raw)) { + throw new ApiValidationError('attachment key contains control characters', { + key: 'invalid_chars', + }); + } + // Split on `/` and reject `..`, `.`, or empty segments (`//`, trailing `/`). + for (const segment of raw.split('/')) { + if (segment === '' || segment === '.' || segment === '..') { + throw new ApiValidationError('attachment key contains an invalid segment', { + key: 'invalid_path', + }); + } + } + return raw; +} + +export async function attachmentRoutes(fastify: FastifyInstance): Promise { + const repoPath = fastify.config.CFP_DATA_REPO_PATH; + + fastify.get( + '/api/attachments/*', + { + schema: { + tags: ['attachments'], + summary: 'Serve a gitsheets attachment by its on-record key', + // Wildcard params get folded into params['*']; document the response + // shape but skip strict param validation (the route does its own). + response: { + 200: { type: 'string', description: 'Binary blob; streamed as the response body.' }, + }, + }, + }, + async (request: FastifyRequest, reply) => { + const raw = (request.params as Record)['*'] ?? ''; + const key = validateKey(raw); + + // `git cat-file blob HEAD:` writes the blob to stdout and exits + // 0 on success, non-zero (with a "fatal:" message on stderr) if the + // path doesn't resolve in HEAD. We branch on exit code below. + const child = spawn('git', ['cat-file', 'blob', `HEAD:${key}`], { + cwd: repoPath, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + const stderrChunks: Buffer[] = []; + child.stderr.on('data', (chunk: Buffer) => stderrChunks.push(chunk)); + + // Wait for either: + // - first stdout data → success, set headers and pipe + // - exit before any stdout → failure, translate to 4xx/5xx + // Race-style with a single resolve. + const exited = new Promise((resolve) => { + child.on('close', (code) => resolve(code ?? -1)); + }); + + const firstData = new Promise((resolve) => { + let resolved = false; + const onData = (chunk: Buffer): void => { + if (!resolved) { + resolved = true; + child.stdout.off('data', onData); + resolve(chunk); + } + }; + child.stdout.on('data', onData); + // If the child exits without ever emitting stdout, resolve null. + child.on('close', () => { + if (!resolved) { + resolved = true; + resolve(null); + } + }); + }); + + const first = await firstData; + if (first === null) { + const code = await exited; + const stderr = Buffer.concat(stderrChunks).toString('utf8').trim(); + // git cat-file's stderr for a missing path looks like: + // fatal: path '' does not exist in 'HEAD' + // fatal: Not a valid object name HEAD: + // Both are 404-shaped; any other non-zero exit is unexpected. + if (code !== 0 && /not (?:a valid|exist)|fatal:/i.test(stderr)) { + throw new ApiNotFoundError(`attachment not found: ${key}`); + } + throw new Error(`git cat-file failed (exit ${code}): ${stderr || 'no stderr'}`); + } + + // First chunk arrived → take over the raw response so we can write + // the buffered first chunk + pipe the rest. `reply.hijack()` tells + // Fastify "I'll send the response myself" — headers must be set + // directly on reply.raw from this point on. + reply.hijack(); + reply.raw.writeHead(200, { + 'Content-Type': inferContentType(key), + 'Cache-Control': 'public, max-age=3600', + }); + reply.raw.write(first); + child.stdout.pipe(reply.raw); + + // Resolve when the child finishes flushing — without this, Fastify's + // handler-promise resolves immediately and may close the socket. + await new Promise((resolve, reject) => { + child.stdout.on('end', () => resolve()); + child.on('error', reject); + }); + }, + ); +} diff --git a/apps/api/tests/attachments.test.ts b/apps/api/tests/attachments.test.ts new file mode 100644 index 0000000..baea459 --- /dev/null +++ b/apps/api/tests/attachments.test.ts @@ -0,0 +1,156 @@ +/** + * Tests for GET /api/attachments/:key — implements + * specs/behaviors/storage.md → "Attachments". + * + * Seeds binary blobs at the path the attachment key points to, then + * exercises the route via Fastify inject. Path-traversal + missing-key + * cases verify the validator and the git-cat-file failure translation. + */ +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; +import type { FastifyInstance } from 'fastify'; +import { buildApp } from '../src/app.js'; +import { createFullDataRepo, createPrivateStorageDir } from './helpers/test-full-repo.js'; +import { seedRawBlob } from './helpers/seed-fixtures.js'; + +let dataRepo: { path: string; cleanup: () => Promise }; +let privateStore: { path: string; cleanup: () => Promise }; +let app: FastifyInstance | undefined; + +async function bootApp(): Promise { + return buildApp({ + serverOptions: { logger: false }, + overrideEnv: { + CFP_DATA_REPO_PATH: dataRepo.path, + STORAGE_BACKEND: 'filesystem', + CFP_PRIVATE_STORAGE_PATH: privateStore.path, + CFP_JWT_SIGNING_KEY: 'test-jwt-signing-key-at-least-32-chars!!', + NODE_ENV: 'test', + }, + }); +} + +// Minimal valid PNG: 8-byte signature + IHDR + IEND. Not a real image +// (no IDAT), but byte-comparable through git cat-file and the route. +const TINY_PNG = Buffer.from([ + 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, // PNG signature + 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52, // IHDR chunk header + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x3b, 0x7e, 0x9b, + 0x55, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44, 0xae, 0x42, 0x60, 0x82, +]); + +beforeEach(async () => { + dataRepo = await createFullDataRepo(); + privateStore = await createPrivateStorageDir(); +}); + +afterEach(async () => { + if (app) { + await app.close(); + app = undefined; + } + await dataRepo.cleanup(); + await privateStore.cleanup(); +}); + +describe('GET /api/attachments/*', () => { + it('serves a seeded avatar by key with image/png Content-Type', async () => { + await seedRawBlob( + dataRepo.path, + 'people/chris/avatar.png', + TINY_PNG, + 'seed avatar for chris', + ); + app = await bootApp(); + + const res = await app.inject({ + method: 'GET', + url: '/api/attachments/people/chris/avatar.png', + }); + expect(res.statusCode).toBe(200); + expect(res.headers['content-type']).toBe('image/png'); + expect(res.headers['cache-control']).toContain('max-age=3600'); + expect(Buffer.from(res.rawPayload).equals(TINY_PNG)).toBe(true); + }); + + it('infers Content-Type from each known extension', async () => { + const cases: Array<{ path: string; type: string }> = [ + { path: 'people/a/avatar.jpg', type: 'image/jpeg' }, + { path: 'people/b/avatar.jpeg', type: 'image/jpeg' }, + { path: 'people/c/avatar.webp', type: 'image/webp' }, + { path: 'people/d/avatar.gif', type: 'image/gif' }, + { path: 'people/e/avatar.svg', type: 'image/svg+xml' }, + { path: 'people/f/doc.pdf', type: 'application/pdf' }, + { path: 'people/g/unknown.xyz', type: 'application/octet-stream' }, + ]; + for (const { path } of cases) { + await seedRawBlob(dataRepo.path, path, Buffer.from([0x00, 0x01]), `seed ${path}`); + } + app = await bootApp(); + + for (const { path, type } of cases) { + const res = await app.inject({ method: 'GET', url: `/api/attachments/${path}` }); + expect(res.statusCode, `${path} status`).toBe(200); + expect(res.headers['content-type'], `${path} content-type`).toBe(type); + } + }); + + it('returns 404 for a key not in HEAD', async () => { + app = await bootApp(); + const res = await app.inject({ + method: 'GET', + url: '/api/attachments/people/nobody/avatar.png', + }); + expect(res.statusCode).toBe(404); + const body = res.json(); + expect(body.success).toBe(false); + expect(body.error?.code).toBe('not_found'); + }); + + it('does not serve files outside the data repo via URL-based traversal', async () => { + // Fastify normalizes `..` segments in the URL path before our handler + // sees them, so `/api/attachments/../etc/passwd` becomes `/etc/passwd` + // (no route match → 404). Our validator catches `..` segments too as + // defense in depth, but the operative contract is: traversal never + // serves a 200 from a file outside the data repo. + app = await bootApp(); + const cases = [ + '/api/attachments/../etc/passwd', + '/api/attachments/people/../../foo', + ]; + for (const url of cases) { + const res = await app.inject({ method: 'GET', url }); + expect(res.statusCode, url).not.toBe(200); + } + }); + + it('rejects keys with embedded null bytes with 422', async () => { + app = await bootApp(); + // %00 decodes to a null byte; the validator rejects control chars + // explicitly so even if Fastify lets it through to the route, we 422. + const res = await app.inject({ + method: 'GET', + url: '/api/attachments/people/chris/avatar%00.png', + }); + expect(res.statusCode).toBe(422); + }); + + it('serves binary content byte-identical (no transcoding)', async () => { + // Include all bytes 0-255 to verify the streaming path is byte-clean. + const allBytes = Buffer.from(Array.from({ length: 256 }, (_, i) => i)); + await seedRawBlob( + dataRepo.path, + 'people/binary-test/data.bin', + allBytes, + 'seed binary test', + ); + app = await bootApp(); + + const res = await app.inject({ + method: 'GET', + url: '/api/attachments/people/binary-test/data.bin', + }); + expect(res.statusCode).toBe(200); + expect(Buffer.from(res.rawPayload).equals(allBytes)).toBe(true); + }); +}); diff --git a/apps/api/tests/helpers/seed-fixtures.ts b/apps/api/tests/helpers/seed-fixtures.ts index 09c2beb..3df249b 100644 --- a/apps/api/tests/helpers/seed-fixtures.ts +++ b/apps/api/tests/helpers/seed-fixtures.ts @@ -59,6 +59,37 @@ export async function seedRawToml( } } +/** + * Sibling of `seedRawToml` for binary blobs (attachments, images, …). + * Writes raw bytes at `relPath` and commits via the same transient + * working-tree-clone-then-push dance. + */ +export async function seedRawBlob( + bareRepoPath: string, + relPath: string, + bytes: Buffer, + commitMessage: string, +): Promise { + const wt = await mkdtemp(join(tmpdir(), 'cfp-seed-wt-')); + try { + await execAsync('git', ['clone', bareRepoPath, wt]); + await execAsync('git', ['config', 'user.email', 'test@cfp.test'], { cwd: wt }); + await execAsync('git', ['config', 'user.name', 'cfp test'], { cwd: wt }); + await execAsync('git', ['config', 'commit.gpgsign', 'false'], { cwd: wt }); + await execAsync('git', ['config', 'core.hooksPath', '/dev/null'], { cwd: wt }); + + const absPath = join(wt, relPath); + await mkdir(dirname(absPath), { recursive: true }); + await writeFile(absPath, bytes); + + await execAsync('git', ['add', relPath], { cwd: wt }); + await execAsync('git', ['commit', '-m', commitMessage], { cwd: wt }); + await execAsync('git', ['push', 'origin', 'main'], { cwd: wt }); + } finally { + await rm(wt, { recursive: true, force: true, maxRetries: 5, retryDelay: 100 }); + } +} + const NOW = '2026-05-01T00:00:00Z'; const NOW2 = '2026-05-10T00:00:00Z'; diff --git a/plans/attachments-serving.md b/plans/attachments-serving.md new file mode 100644 index 0000000..2ac6eaf --- /dev/null +++ b/plans/attachments-serving.md @@ -0,0 +1,116 @@ +--- +status: done +depends: [] +specs: + - specs/behaviors/storage.md + - specs/api/people.md +issues: [77] +pr: 94 +--- + +# Plan: GET /api/attachments/:key + +## Scope + +[`specs/behaviors/storage.md`](../specs/behaviors/storage.md#attachments) specifies that binary blobs (avatars, buzz images, featured-project hero images) live alongside their owning record via gitsheets' attachment plumbing, and are served via a streamed `GET /api/attachments/` route. Serializers already construct these URLs (e.g. [`projects-members.ts:33`](../apps/api/src/routes/projects-members.ts) builds `/api/attachments/${person.avatarKey}` for `avatarUrl`), but the route doesn't exist — every reference 404s. + +This is a pre-req for [#32](https://github.com/CodeForPhilly/codeforphilly-ng/issues/32) (avatar upload): even once upload lands, the file can't be read back without this. + +Closes [#77](https://github.com/CodeForPhilly/codeforphilly-ng/issues/77). + +## Implements + +- [behaviors/storage.md#attachments](../specs/behaviors/storage.md#attachments) — "Web serves attachments via a streamed `GET /api/attachments/` route with cache headers." +- [api/people.md](../specs/api/people.md) — `avatarUrl` shape (`/api/attachments/`). +- [api/projects-buzz.md](../specs/api/projects-buzz.md) — `imageUrl` shape (`/api/attachments/`). + +## Approach + +### 1. Route + +`GET /api/attachments/*` (wildcard segment) in `apps/api/src/routes/attachments.ts`. The wildcard captures the full path-after-prefix as a single string, which is the gitsheets attachment key per spec (e.g. `people/janedoe/avatar.jpg`, `project-buzz/squadquest/inquirer-praises-foo/image.jpg`). + +### 2. Implementation — direct git plumbing + +Bare-repo invariant ([`storage.md`](../specs/behaviors/storage.md) → "The data clone is bare") means we can't read files off disk; we read blobs from the git object DB. The attachment key IS the path in the HEAD tree, so: + +``` +git cat-file blob HEAD: +``` + +streams the bytes. The route spawns this as a child process, pipes stdout into the Fastify reply, and sets headers. + +The alternative — going through `sheet.getAttachment(record, name)` — requires parsing the key to identify (sheet, record, attachment-name), looking up the record in memory, then resolving the blob via the cached `dataTree` on the standing Sheet (which is itself a known staleness vector per #47). Plumbing skips both burdens. + +### 3. Path validation + +Before invoking git, validate the key: + +- Must not start with `/` (Fastify wildcard already strips the leading `/`, but defensive). +- Must not contain `..` segments (path traversal). +- Must not contain empty segments (`//`, leading/trailing `/`). +- Must not contain control characters or null bytes. + +Invalid → 400. The `git cat-file` invocation itself sanitizes against most exploits because we pass `HEAD:` as a single argument (not a shell string), but defense in depth. + +### 4. Content-Type + +Infer from the file extension. The set we care about today: + +- Images: jpg/jpeg, png, gif, webp, svg +- Documents: pdf +- Fallback: `application/octet-stream` + +A small in-file table; if we ever want a larger set, lift the table from gitsheets' own `inferMimeType` (it has ~30 entries; not worth importing for our subset). + +### 5. Caching + +`Cache-Control: public, max-age=3600` (1 hour). Path-keyed attachments mean an update to (say) `people/janedoe/avatar.jpg` keeps the same URL — we *can't* use immutable + long max-age. Future polish: derive an `ETag` from `git rev-parse HEAD:` (the blob hash) so conditional GETs short-circuit with 304. Skipped for v1 — short max-age is good enough; ETag adds an extra git invocation per request. + +### 6. Error shapes + +- 404 — `git cat-file` exits non-zero (path not in HEAD tree) +- 400 — path validation fails +- 500 — git invocation errors not matching the 404 case (rare; probably a corrupt repo) + +All 4xx/5xx use the standard response envelope from `lib/response.ts` (the route helpers handle that). + +### 7. Streaming vs buffering + +`git cat-file blob` writes to stdout; the child's stdout is a `Readable`. Fastify accepts a Readable as the response body and streams it. For large attachments (PDF, hi-res images), this avoids buffering into memory. No size limit at the route level — the data repo's own write surface is what gates upload size. + +## Validation + +- [x] `GET /api/attachments/people//avatar.png` against a seeded attachment returns 200 + the bytes + correct Content-Type. +- [x] `GET /api/attachments/` returns 404 with the standard envelope. +- [x] URL-based path traversal (`/api/attachments/../etc/passwd`) — Fastify normalizes upstream of the handler, so these never reach our validator and never serve 200 from outside the data repo. Test confirms `statusCode !== 200`. +- [x] Null-byte-in-key (`%00`) returns 422 from the validator. +- [x] Content-Type inferred for jpg/jpeg/png/gif/webp/avif/svg/pdf; unknown extensions get `application/octet-stream`. +- [x] Binary content byte-identical end-to-end (test seeds all 256 byte values; route returns them unchanged). +- [x] All 280 API tests pass (274 pre-existing + 6 new). +- [x] `npm run type-check && npm run lint` clean. + +## Risks / unknowns + +- **Backpressure when the client is slow.** Streaming from `git cat-file` to Fastify means a slow client could block the git process. Fastify handles backpressure on the destination side; we don't need to do anything special. +- **Concurrent attachment serves.** Each request spawns a `git` process. For high concurrency (hundreds of avatars per page load via project lists), this could fork-bomb. Today's traffic is low; if profiling ever shows it, we can pool or batch. Out of scope. +- **No range request support.** A 200 OK with the full body for every request — no `Range:`/206. Fine for small avatars/images; if we ever serve video or very large PDFs, revisit. +- **No ETag in v1.** Path-keyed attachments + 1-hour cache means a user who updates their avatar may see the old one for up to an hour. Acceptable for civic-platform avatars; we can add ETag in a follow-up if it bites. +- **Stale gitsheets Sheet dataTree (#47) does NOT apply here.** We read from `HEAD` via plumbing on every request, not from any cached Sheet handle. So the staleness footgun documented in `behaviors/storage.md` is sidestepped by going direct. + +## Notes + +Three commits — plan opening + route + tests/lint-fix. + +Surprises: + +- **Fastify normalizes URL path segments before routing.** `/api/attachments/../etc/passwd` gets collapsed to `/etc/passwd` upstream of our handler — the validator never sees the `..` from URL-borne traversal attempts. The validator's `..` check is defense in depth, not the primary line. Adjusted the test to assert "doesn't 200 from outside the data repo" rather than "validator returns 422". The validator still catches null bytes, control chars, and leading `/` which Fastify doesn't normalize. +- **Hijacking the reply for streaming.** First pass used `reply.header()` + `reply.raw.write()` to inject the buffered first chunk, but headers set via `reply.header()` are tracked by Fastify and only flushed on `reply.send()` — bypassing send via `.raw.write()` skips header flush, so the response went out with no Content-Type. Fixed by calling `reply.hijack()` first (tells Fastify "I own the response from here"), then `reply.raw.writeHead(200, { ... })` to write headers directly. +- **First-chunk-or-exit race.** `git cat-file` exits non-zero with `fatal:` on stderr when a path is missing. Race-style `Promise` on `firstData` vs `exit` lets us distinguish "exited before any stdout" (translate to 404) from "first chunk arrived" (200 + stream). Zero-byte blobs would currently 404 — edge case not worth designing around for v1 since gitsheets' `setAttachment` rejects zero-byte input. + +## Follow-ups + +- **ETag + 304 conditional GETs.** Easy win: `git rev-parse HEAD:` gives the blob hash; use it as ETag. With `If-None-Match`, 304 the request instead of streaming the blob. Saves the git-spawn cost on cache hits. *Tracked as*: low-priority polish; the 1-hour max-age already gives most clients fresh-enough responses. +- **Range request support.** No `Range:` handling today — full 200 OK on every request. Fine for small avatars/buzz-images; if we ever serve video or large PDFs, revisit. +- **Process pool for high concurrency.** Each request spawns a `git cat-file` process. If a project-list page renders 30 avatars in parallel, that's 30 fork/execs. Today's traffic doesn't warrant pooling; if flamegraphs show fork overhead, switch to `git cat-file --batch` long-running mode shared via a pool. +- **404 vs 410 for tombstoned records.** Plan originally mentioned 410 for tombstoned record attachments. Distinguishing requires re-querying the record by sheet/slug, which adds a lookup per request. *Deferred* until a concrete user-facing need for the distinction appears.