diff --git a/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts index 68e3643f9e9..13adb18cab7 100644 --- a/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts @@ -23,6 +23,9 @@ import { } from "~/v3/mollifier/readFallback.server"; import { generatePresignedUrl } from "~/v3/objectStore.server"; import { runStore } from "~/v3/runStore.server"; +import { hydrateParentAndRoot, hydrateChildRuns } from "~/v3/runHierarchy.server"; +import { v2RunsMayExist } from "~/v3/runTableV2Status.server"; +import { env as serverEnv } from "~/env.server"; import { tracer } from "~/v3/tracer.server"; import { startSpanWithEnv } from "~/v3/tracing.server"; @@ -133,21 +136,44 @@ export class ApiRetrieveRunPresenter { attemptNumber: true, engine: true, taskEventStore: true, - parentTaskRun: { - select: commonRunSelect, - }, - rootTaskRun: { - select: commonRunSelect, - }, - childRuns: { - select: commonRunSelect, - }, + parentTaskRunId: true, + rootTaskRunId: true, }, }, $replica ); - if (pgRow) return { ...pgRow, isBuffered: false }; + if (pgRow) { + // Resolve parent/root/children across both run tables. A single Prisma + // relation select is table-bound, so a v2 run's legacy parent (or a + // legacy run's v2 children), which arise in the mixed window, would come + // back null/empty. Resolve parent/root by id (RunStore routes by format) + // and children by a both-table predicate. + // Scope the cross-table reads on whether a v2 run could exist at all, NOT + // the org's current flag: a run's table is fixed by its id format, and an + // org that was on v2 then flipped off still HAS v2 runs (and v2 children) + // that stay readable. pgRow is routed here by id format, so it can be a v2 + // run for a now-non-v2 org; scoping to "legacy" would then silently drop + // its v2 children/parent. 
v2RunsMayExist is monotonic (native on now, OR + // task_run_v2 already has rows), so turning the native master switch off + // does not re-scope to legacy and hide existing v2 runs. While no v2 run + // has ever existed it stays "legacy" and skips the empty task_run_v2 query. + // The reads also run in parallel. + const tables = v2RunsMayExist(serverEnv.REALTIME_BACKEND_NATIVE_ENABLED === "1") + ? "both" + : "legacy"; + const [{ parentTaskRun, rootTaskRun }, childRuns] = await Promise.all([ + hydrateParentAndRoot( + { parentTaskRunId: pgRow.parentTaskRunId, rootTaskRunId: pgRow.rootTaskRunId }, + { runtimeEnvironmentId: env.id, tables }, + commonRunSelect, + $replica + ), + hydrateChildRuns(pgRow.id, { runtimeEnvironmentId: env.id, tables }, commonRunSelect, $replica), + ]); + + return { ...pgRow, parentTaskRun, rootTaskRun, childRuns, isBuffered: false }; + } // Postgres miss → fall back to the mollifier buffer. When the gate // diverted a trigger, the run lives in Redis until the drainer replays diff --git a/apps/webapp/app/presenters/v3/RunPresenter.server.ts b/apps/webapp/app/presenters/v3/RunPresenter.server.ts index c4c3ac88c48..9b37448b88e 100644 --- a/apps/webapp/app/presenters/v3/RunPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/RunPresenter.server.ts @@ -9,6 +9,7 @@ import { isFinalRunStatus } from "~/v3/taskStatus"; import { env } from "~/env.server"; import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server"; import { runStore } from "~/v3/runStore.server"; +import { hydrateParentAndRoot } from "~/v3/runHierarchy.server"; type Result = Awaited>; export type Run = Result["run"]; @@ -93,20 +94,8 @@ export class RunPresenter { completedAt: true, logsDeletedAt: true, annotations: true, - rootTaskRun: { - select: { - friendlyId: true, - spanId: true, - createdAt: true, - }, - }, - parentTaskRun: { - select: { - friendlyId: true, - spanId: true, - createdAt: true, - }, - }, + rootTaskRunId: true, + parentTaskRunId: true, 
runtimeEnvironment: { select: { id: true, @@ -143,6 +132,16 @@ export class RunPresenter { const showLogs = showDeletedLogs || !run.logsDeletedAt; + // Resolve parent/root across both physical run tables: a v2 run can have a + // legacy parent/root (or vice versa) in the mixed window, which a + // table-bound Prisma relation select would miss. + const { parentTaskRun, rootTaskRun } = await hydrateParentAndRoot( + { parentTaskRunId: run.parentTaskRunId, rootTaskRunId: run.rootTaskRunId }, + { runtimeEnvironmentId: run.runtimeEnvironment.id }, + { friendlyId: true, spanId: true, createdAt: true }, + this.#prismaClient + ); + const runData = { id: run.id, number: run.number, @@ -154,8 +153,8 @@ export class RunPresenter { startedAt: run.startedAt, completedAt: run.completedAt, logsDeletedAt: showDeletedLogs ? null : run.logsDeletedAt, - rootTaskRun: run.rootTaskRun, - parentTaskRun: run.parentTaskRun, + rootTaskRun, + parentTaskRun, environment: { id: run.runtimeEnvironment.id, organizationId: run.runtimeEnvironment.organizationId, @@ -184,7 +183,7 @@ export class RunPresenter { getTaskEventStoreTableForRun(run), run.runtimeEnvironment.id, run.traceId, - run.rootTaskRun?.createdAt ?? run.createdAt, + rootTaskRun?.createdAt ?? run.createdAt, run.completedAt ?? 
undefined, { includeDebugLogs: showDebug } ); diff --git a/apps/webapp/app/presenters/v3/SpanPresenter.server.ts b/apps/webapp/app/presenters/v3/SpanPresenter.server.ts index 49d8f303560..30541e0c2c1 100644 --- a/apps/webapp/app/presenters/v3/SpanPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/SpanPresenter.server.ts @@ -587,22 +587,9 @@ export class SpanPresenter extends BasePresenter { filePath: true, }, }, - //relationships - rootTaskRun: { - select: { - taskIdentifier: true, - friendlyId: true, - spanId: true, - createdAt: true, - }, - }, - parentTaskRun: { - select: { - taskIdentifier: true, - friendlyId: true, - spanId: true, - }, - }, + //relationships (resolved across both run tables after the fetch) + rootTaskRunId: true, + parentTaskRunId: true, batch: { select: { friendlyId: true, @@ -626,7 +613,31 @@ export class SpanPresenter extends BasePresenter { this._replica ); - return run; + if (!run) { + return run; + } + + // Resolve parent/root across both run tables: a v2 run can reference a + // legacy parent/root (or vice versa) in the mixed window, which a + // table-bound Prisma relation select on a single table would miss. + const [parentTaskRun, rootTaskRun] = await Promise.all([ + run.parentTaskRunId + ? runStore.findRun( + { id: run.parentTaskRunId, runtimeEnvironmentId: environmentId }, + { select: { taskIdentifier: true, friendlyId: true, spanId: true } }, + this._replica + ) + : Promise.resolve(null), + run.rootTaskRunId + ? 
runStore.findRun( + { id: run.rootTaskRunId, runtimeEnvironmentId: environmentId }, + { select: { taskIdentifier: true, friendlyId: true, spanId: true, createdAt: true } }, + this._replica + ) + : Promise.resolve(null), + ]); + + return { ...run, parentTaskRun, rootTaskRun }; } async #getSpan({ diff --git a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts index 513616470a0..621143c54b4 100644 --- a/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts +++ b/apps/webapp/app/routes/admin.api.v1.orgs.$organizationId.feature-flags.ts @@ -2,7 +2,7 @@ import { ActionFunctionArgs, LoaderFunctionArgs, json } from "@remix-run/server- import { z } from "zod"; import { prisma } from "~/db.server"; import { requireAdminApiRequest } from "~/services/personalAccessToken.server"; -import { validatePartialFeatureFlags } from "~/v3/featureFlags"; +import { validateFeatureFlagInvariants, validatePartialFeatureFlags } from "~/v3/featureFlags"; const ParamsSchema = z.object({ organizationId: z.string(), @@ -85,6 +85,14 @@ export async function action({ request, params }: ActionFunctionArgs) { ...validationResult.data, }; + // Enforce cross-flag invariants on the merged result (e.g. runTableV2 + // requires realtimeBackend=native). Checked on the merge so it also rejects + // turning realtime back to Electric while runTableV2 stays on. 
+ const invariant = validateFeatureFlagInvariants(mergedFlags); + if (!invariant.ok) { + return json({ error: invariant.error }, { status: 400 }); + } + // Update the organization's feature flags const updatedOrganization = await prisma.organization.update({ where: { diff --git a/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts b/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts index 6081febb526..c2a85833f72 100644 --- a/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts +++ b/apps/webapp/app/routes/admin.api.v2.orgs.$organizationId.feature-flags.ts @@ -5,7 +5,12 @@ import { z } from "zod"; import { prisma } from "~/db.server"; import { requireUser } from "~/services/session.server"; import { flags as getGlobalFlags } from "~/v3/featureFlags.server"; -import { FEATURE_FLAG, validatePartialFeatureFlags, getAllFlagControlTypes } from "~/v3/featureFlags"; +import { + FEATURE_FLAG, + validateFeatureFlagInvariants, + validatePartialFeatureFlags, + getAllFlagControlTypes, +} from "~/v3/featureFlags"; import { featuresForRequest } from "~/features.server"; // Session-auth route for the admin feature flags dialog. @@ -113,6 +118,15 @@ export async function action({ request, params }: ActionFunctionArgs) { { status: 400 } ); } + + // Enforce cross-flag invariants (e.g. runTableV2 requires + // realtimeBackend=native). This route replaces the whole set, so the + // validated data IS the final resolved set. 
+ const invariant = validateFeatureFlagInvariants(validationResult.data); + if (!invariant.ok) { + return json({ error: invariant.error }, { status: 400 }); + } + featureFlags = validationResult.data; } diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts index 061199f33e9..4e8d85b7cf8 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts @@ -126,6 +126,11 @@ export const loader = createLoaderApiRoute( const triggeredRuns = await runStore.findRuns( { take: 50, + // A parentSpanId predicate spans both run tables (it carries no id), so + // the cross-table store requires a total-order key to bound the merge; + // createdAt also makes the 50-row cap deterministic (most recent first) + // rather than an arbitrary single-table slice. + orderBy: { createdAt: "desc" }, select: { friendlyId: true, taskIdentifier: true, diff --git a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts index 7cb813a6dec..ec9c11568cb 100644 --- a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts +++ b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.append.ts @@ -36,16 +36,8 @@ const { action } = createActionApiRoute( select: { id: true, friendlyId: true, - parentTaskRun: { - select: { - friendlyId: true, - }, - }, - rootTaskRun: { - select: { - friendlyId: true, - }, - }, + parentTaskRunId: true, + rootTaskRunId: true, }, }, $replica @@ -55,12 +47,24 @@ const { action } = createActionApiRoute( return new Response("Run not found", { status: 404 }); } - const targetId = - params.target === "self" - ? run.friendlyId - : params.target === "parent" - ? 
run.parentTaskRun?.friendlyId - : run.rootTaskRun?.friendlyId; + // parentTaskRunId/rootTaskRunId are scalar ids that may point at a run in + // the OTHER physical table (the runTableV2 mixed window), so resolve the + // target's friendlyId by id (RunStore routes by id format) rather than via a + // table-bound relation select, which would return null cross-table. + let targetId: string | undefined; + if (params.target === "self") { + targetId = run.friendlyId; + } else { + const targetScalarId = params.target === "parent" ? run.parentTaskRunId : run.rootTaskRunId; + if (targetScalarId) { + const target = await runStore.findRun( + { id: targetScalarId, runtimeEnvironmentId: authentication.environment.id }, + { select: { friendlyId: true } }, + $replica + ); + targetId = target?.friendlyId; + } + } if (!targetId) { return new Response("Target not found", { status: 404 }); diff --git a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts index c71ad48d121..fa2cee9b110 100644 --- a/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts +++ b/apps/webapp/app/routes/realtime.v1.streams.$runId.$target.$streamId.ts @@ -14,6 +14,23 @@ const ParamsSchema = z.object({ streamId: z.string(), }); +// Resolve a parent/root stream target across BOTH run tables. The scalar +// parentTaskRunId/rootTaskRunId may reference a run in the other physical table +// during the runTableV2 mixed window; findRun routes by id format, so this +// resolves the target whichever table it lives in (a table-bound relation +// select would resolve null for a cross-table parent/root). 
+async function resolveStreamTargetById( + targetScalarId: string | null, + runtimeEnvironmentId: string +): Promise<{ friendlyId: string; streamBasinName: string | null } | null> { + if (!targetScalarId) return null; + return runStore.findRun( + { id: targetScalarId, runtimeEnvironmentId }, + { select: { friendlyId: true, streamBasinName: true } }, + $replica + ); +} + const { action } = createActionApiRoute( { params: ParamsSchema, @@ -29,18 +46,8 @@ const { action } = createActionApiRoute( id: true, friendlyId: true, streamBasinName: true, - parentTaskRun: { - select: { - friendlyId: true, - streamBasinName: true, - }, - }, - rootTaskRun: { - select: { - friendlyId: true, - streamBasinName: true, - }, - }, + parentTaskRunId: true, + rootTaskRunId: true, }, }, $replica @@ -50,12 +57,18 @@ const { action } = createActionApiRoute( return new Response("Run not found", { status: 404 }); } + // Resolve the target across BOTH run tables. parentTaskRunId/rootTaskRunId + // are scalar pointers that may reference a run in the OTHER physical table + // (the runTableV2 mixed window); a table-bound relation select would resolve + // null and 404 a target that exists. findRun routes by id format; "self" is + // the run itself. const targetRun = params.target === "self" - ? run - : params.target === "parent" - ? run.parentTaskRun - : run.rootTaskRun; + ? { friendlyId: run.friendlyId, streamBasinName: run.streamBasinName } + : await resolveStreamTargetById( + params.target === "parent" ? 
run.parentTaskRunId : run.rootTaskRunId, + authentication.environment.id + ); if (!targetRun?.friendlyId) { return new Response("Target not found", { status: 404 }); @@ -164,18 +177,8 @@ const loader = createLoaderApiRoute( id: true, friendlyId: true, streamBasinName: true, - parentTaskRun: { - select: { - friendlyId: true, - streamBasinName: true, - }, - }, - rootTaskRun: { - select: { - friendlyId: true, - streamBasinName: true, - }, - }, + parentTaskRunId: true, + rootTaskRunId: true, }, }, $replica @@ -187,12 +190,15 @@ const loader = createLoaderApiRoute( return new Response("Run not found", { status: 404 }); } + // Resolve the target across both run tables by id (the scalar parent/root + // pointer may be cross-table in the mixed window); "self" is the run itself. const targetRun = params.target === "self" - ? run - : params.target === "parent" - ? run.parentTaskRun - : run.rootTaskRun; + ? { friendlyId: run.friendlyId, streamBasinName: run.streamBasinName } + : await resolveStreamTargetById( + params.target === "parent" ? 
run.parentTaskRunId : run.rootTaskRunId, + authentication.environment.id + ); if (!targetRun?.friendlyId) { return new Response("Target not found", { status: 404 }); diff --git a/apps/webapp/app/routes/resources.runs.$runParam.ts b/apps/webapp/app/routes/resources.runs.$runParam.ts index 38e17531f6f..878f611e97c 100644 --- a/apps/webapp/app/routes/resources.runs.$runParam.ts +++ b/apps/webapp/app/routes/resources.runs.$runParam.ts @@ -7,6 +7,7 @@ import { requireUserId } from "~/services/session.server"; import { v3RunParamsSchema } from "~/utils/pathBuilder"; import { machinePresetFromName, machinePresetFromRun } from "~/v3/machinePresets.server"; import { runStore } from "~/v3/runStore.server"; +import { hydrateParentAndRoot } from "~/v3/runHierarchy.server"; import { FINAL_ATTEMPT_STATUSES, isFinalRunStatus } from "~/v3/taskStatus"; export type RunInspectorData = UseDataFunctionReturn; @@ -102,16 +103,11 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }, }, }, - parentTaskRun: { - select: { - friendlyId: true, - }, - }, - rootTaskRun: { - select: { - friendlyId: true, - }, - }, + // Scalar parent/root pointers, NOT the table-bound relations: a relation + // select resolves null for a cross-table parent/root (a v2 run's legacy + // parent or vice versa in the mixed window). Resolve by id below. + parentTaskRunId: true, + rootTaskRunId: true, }, }, $replica @@ -121,6 +117,15 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw new Response("Not found", { status: 404 }); } + // Resolve parent/root across both run tables by id (RunStore routes by id + // format), scoped to this run's environment. 
+ const { parentTaskRun, rootTaskRun } = await hydrateParentAndRoot( + { parentTaskRunId: run.parentTaskRunId, rootTaskRunId: run.rootTaskRunId }, + { runtimeEnvironmentId: run.runtimeEnvironment.id }, + { friendlyId: true }, + $replica + ); + const isFinished = isFinalRunStatus(run.status); const finishedAttempt = isFinished @@ -187,8 +192,8 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { baseCostInCents: run.baseCostInCents, maxAttempts: run.maxAttempts ?? undefined, version: run.lockedToVersion?.version, - parentTaskRunId: run.parentTaskRun?.friendlyId ?? undefined, - rootTaskRunId: run.rootTaskRun?.friendlyId ?? undefined, + parentTaskRunId: parentTaskRun?.friendlyId ?? undefined, + rootTaskRunId: rootTaskRun?.friendlyId ?? undefined, }, queue: { name: run.queue, diff --git a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts index 02d0ec957f2..39a38e14d81 100644 --- a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts +++ b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts @@ -1,5 +1,5 @@ import { RunId } from "@trigger.dev/core/v3/isomorphic"; -import type { PrismaClientOrTransaction, TaskRun } from "@trigger.dev/database"; +import type { Prisma, PrismaClientOrTransaction, TaskRun } from "@trigger.dev/database"; import { env } from "~/env.server"; import { logger } from "~/services/logger.server"; import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server"; @@ -11,6 +11,8 @@ import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.se import { claimOrAwait } from "~/v3/mollifier/idempotencyClaim.server"; import { makeResolveMollifierFlag } from "~/v3/mollifier/mollifierGate.server"; import { runStore } from "~/v3/runStore.server"; +import { shouldUseV2RunTable } from "~/v3/runTableV2.server"; +import { v2RunsMayExist } from "~/v3/runTableV2Status.server"; import type { TraceEventConcern, 
TriggerTaskRequest } from "../types"; // In-memory per-org mollifier-enabled check, shared with `evaluateGate` @@ -20,6 +22,18 @@ import type { TraceEventConcern, TriggerTaskRequest } from "../types"; // handleTriggerRequest. const resolveOrgMollifierFlag = makeResolveMollifierFlag(); +// Reserved task slot for the cross-table one-time-use-token claim. The DB +// constraint `@@unique([oneTimeUseToken])` is TASK-INDEPENDENT, so the claim +// must be keyed on the token alone, not (task, token): a single token can +// authorise more than one task, and two presentations for different tasks +// straddling a `runTableV2` flip would otherwise build different claim keys and +// both proceed. Folding the token into one constant task slot makes the claim +// key (envId, token)-scoped, matching the DB constraint's scope. Paired with +// the `otu:` idempotencyKey prefix, collision with a real task's idempotency +// claim would require a task literally named this AND an idempotency key of the +// form `otu:`. +const ONE_TIME_USE_TOKEN_CLAIM_TASK = "__one_time_use_token__"; + // Claim ownership context returned to the caller when the // IdempotencyKeyConcern won a pre-gate claim. Caller MUST publish the // winning runId on pipeline success (`publishClaim`) or release the @@ -136,6 +150,73 @@ export class IdempotencyKeyConcern { return synthetic as unknown as TaskRun; } + // Return an already-resolved idempotent run as a cache hit, blocking the + // parent on the run's waitpoint when this is a triggerAndWait + // (`resumeParentOnCompletion`). Shared by the direct PG/buffer existing-run + // path and the claim-`resolved` path (a concurrent same-key trigger that won + // the claim): a v2-cutover triggerAndWait that loses the claim must still + // block its parent, because the per-table unique constraints don't dedup + // across TaskRun/task_run_v2 — the claim is what serialises these. 
+ private async returnCachedIdempotentRun( + request: TriggerTaskRequest, + parentStore: string | undefined, + existingRun: Prisma.TaskRunGetPayload<{ include: { associatedWaitpoint: true } }>, + idempotencyKey: string + ): Promise { + const parentRunId = request.body.options?.parentRunId; + const resumeParentOnCompletion = request.body.options?.resumeParentOnCompletion; + + //We're using `andWait` so we need to block the parent run with a waitpoint + if (resumeParentOnCompletion && parentRunId) { + // Get or create waitpoint lazily (existing run may not have one if it was standalone) + let associatedWaitpoint = existingRun.associatedWaitpoint; + if (!associatedWaitpoint) { + associatedWaitpoint = await this.engine.getOrCreateRunWaitpoint({ + runId: existingRun.id, + projectId: request.environment.projectId, + environmentId: request.environment.id, + }); + } + + await this.traceEventConcern.traceIdempotentRun( + request, + parentStore, + { + existingRun, + idempotencyKey, + incomplete: associatedWaitpoint.status === "PENDING", + isError: associatedWaitpoint.outputIsError, + }, + async (event) => { + const spanId = + request.options?.parentAsLinkType === "replay" + ? event.spanId + : event.traceparent?.spanId + ? `${event.traceparent.spanId}:${event.spanId}` + : event.spanId; + + //block run with waitpoint + await this.engine.blockRunWithWaitpoint({ + runId: RunId.fromFriendlyId(parentRunId), + waitpoints: associatedWaitpoint!.id, + spanIdToComplete: spanId, + batch: request.options?.batchId + ? { + id: request.options.batchId, + index: request.options.batchIndex ?? 
0, + } + : undefined, + projectId: request.environment.projectId, + organizationId: request.environment.organizationId, + tx: this.prisma, + }); + } + ); + } + + return { isCached: true, run: existingRun }; + } + async handleTriggerRequest( request: TriggerTaskRequest, parentStore: string | undefined @@ -147,9 +228,103 @@ export class IdempotencyKeyConcern { new Date(Date.now() + 24 * 60 * 60 * 1000 * 30); // 30 days if (!idempotencyKey) { + // A one-time-use token with NO idempotency key would otherwise skip the + // claim path below entirely. During a `runTableV2` flag flip, two + // concurrent presentations of the same token can mint into DIFFERENT + // physical tables (cuid -> TaskRun, ksuid -> task_run_v2); the per-table + // unique constraint on `oneTimeUseToken` can't see across the two tables, + // so neither INSERT raises P2002 and one token spawns two runs. For + // v2-cutover orgs, serialise on the token via a Redis claim so the first + // presentation wins and the rest are rejected as already-used. Not + // excluded for resumeParentOnCompletion: for v2 orgs the idempotency-keyed + // claim covers triggerAndWait too (claimEligible short-circuits on + // shouldUseV2RunTable), so the token claim is consistent in doing the same; + // the loser is rejected (not returned a cached run), so there is no + // waitpoint-blocking subtlety to avoid. + const oneTimeUseToken = request.options?.oneTimeUseToken; + if (oneTimeUseToken) { + const orgFeatureFlags = + (request.environment.organization?.featureFlags as + | Record + | null + | undefined) ?? null; + if ( + shouldUseV2RunTable(orgFeatureFlags, { + nativeRealtimeEnabled: env.REALTIME_BACKEND_NATIVE_ENABLED === "1", + }) + ) { + // Key the claim on (envId, token), task-independent, to match the DB's + // task-independent oneTimeUseToken constraint (see the constant's + // comment). 
The TTL is a fixed pipeline-dwell bound, NOT the customer + // idempotencyKeyTTL: there is no idempotency key in this path, so a + // client-supplied TTL has no meaning here, and a tiny value would + // expire the claim mid-flight and reopen the cross-table dup window. + const claimKey = `otu:${oneTimeUseToken}`; + const outcome = await claimOrAwait({ + envId: request.environment.id, + taskIdentifier: ONE_TIME_USE_TOKEN_CLAIM_TASK, + idempotencyKey: claimKey, + ttlSeconds: env.TRIGGER_MOLLIFIER_CLAIM_TTL_SECONDS, + safetyNetMs: env.TRIGGER_MOLLIFIER_CLAIM_WAIT_MS, + pollStepMs: env.TRIGGER_MOLLIFIER_CLAIM_POLL_MS, + }); + if (outcome.kind === "resolved") { + // A concurrent presentation of the same one-time token already won + // and committed a run. Reject this one exactly as the within-table + // path does (the per-table oneTimeUseToken unique constraint raises + // P2002 -> RunOneTimeUseTokenError -> this same 4xx), preserving the + // "token already used" contract while closing the cross-table gap. + throw new ServiceValidationError( + `Cannot trigger ${request.taskId} with a one-time use token as it has already been used.` + ); + } else if (outcome.kind === "timed_out") { + throw new ServiceValidationError( + "One-time-use token claim resolution timed out", + 503 + ); + } else if (outcome.kind === "claimed") { + // We own the claim. The trigger pipeline MUST publish (on success) + // or release (on error) it — wired through the returned `claim`, + // exactly like the idempotency-keyed path. + return { + isCached: false, + idempotencyKey, + idempotencyKeyExpiresAt, + claim: { + envId: request.environment.id, + taskIdentifier: ONE_TIME_USE_TOKEN_CLAIM_TASK, + idempotencyKey: claimKey, + token: outcome.token, + }, + }; + } + } + } return { isCached: false, idempotencyKey, idempotencyKeyExpiresAt }; } + // Resolve whether THIS org currently mints v2 runs ONCE, for the pre-gate + // claim further down (claimEligible). 
+ const orgFeatureFlags = + (request.environment.organization?.featureFlags as + | Record + | null + | undefined) ?? null; + const orgUsesV2 = shouldUseV2RunTable(orgFeatureFlags, { + nativeRealtimeEnabled: env.REALTIME_BACKEND_NATIVE_ENABLED === "1", + }); + + // Scope the idempotency dedup read on whether a v2 run could exist at all, + // NOT on whether this org currently mints v2. A run's table is fixed by its + // id format, so an org that was on v2 then flipped off still holds v2 runs an + // idempotency key can match; gating the read on orgUsesV2 would miss them and + // let a duplicate through. v2RunsMayExist is monotonic (native on now, OR + // task_run_v2 already has rows), so turning the native master switch off + // after v2 runs exist does NOT re-scope the read back to legacy and hide + // them. While no v2 run has ever existed it stays "legacy" and skips the + // empty task_run_v2 query on the trigger hot path. + const anyV2RunsPossible = v2RunsMayExist(env.REALTIME_BACKEND_NATIVE_ENABLED === "1"); + const existingRun = idempotencyKey ? await runStore.findRun( { @@ -161,6 +336,7 @@ export class IdempotencyKeyConcern { include: { associatedWaitpoint: true, }, + tables: anyV2RunsPossible ? 
"both" : "legacy", }, this.prisma ) @@ -219,66 +395,18 @@ export class IdempotencyKeyConcern { return { isCached: false, idempotencyKey, idempotencyKeyExpiresAt }; } - // We have an idempotent run, so we return it - const parentRunId = request.body.options?.parentRunId; - const resumeParentOnCompletion = request.body.options?.resumeParentOnCompletion; - - //We're using `andWait` so we need to block the parent run with a waitpoint - if (resumeParentOnCompletion && parentRunId) { - // Get or create waitpoint lazily (existing run may not have one if it was standalone) - let associatedWaitpoint = existingRun.associatedWaitpoint; - if (!associatedWaitpoint) { - associatedWaitpoint = await this.engine.getOrCreateRunWaitpoint({ - runId: existingRun.id, - projectId: request.environment.projectId, - environmentId: request.environment.id, - }); - } - - await this.traceEventConcern.traceIdempotentRun( - request, - parentStore, - { - existingRun, - idempotencyKey, - incomplete: associatedWaitpoint.status === "PENDING", - isError: associatedWaitpoint.outputIsError, - }, - async (event) => { - const spanId = - request.options?.parentAsLinkType === "replay" - ? event.spanId - : event.traceparent?.spanId - ? `${event.traceparent.spanId}:${event.spanId}` - : event.spanId; - - //block run with waitpoint - await this.engine.blockRunWithWaitpoint({ - runId: RunId.fromFriendlyId(parentRunId), - waitpoints: associatedWaitpoint!.id, - spanIdToComplete: spanId, - batch: request.options?.batchId - ? { - id: request.options.batchId, - index: request.options.batchIndex ?? 0, - } - : undefined, - projectId: request.environment.projectId, - organizationId: request.environment.organizationId, - tx: this.prisma, - }); - } - ); - } - - return { isCached: true, run: existingRun }; + // We have an idempotent run, so we return it (blocking the parent on its + // waitpoint for triggerAndWait). 
+ return this.returnCachedIdempotentRun(request, parentStore, existingRun, idempotencyKey); } // Pre-gate claim — closes the PG+buffer race during gate transition. // All same-key triggers serialise here before evaluateGate decides - // PG-pass-through vs mollify. Skipped for triggerAndWait - // (resumeParentOnCompletion) — that path bypasses the gate entirely - // and its existing PG-side dedup is sufficient. + // PG-pass-through vs mollify. For mollifier-only orgs this is skipped for + // triggerAndWait (resumeParentOnCompletion) — that path bypasses the gate + // and its PG-side dedup is sufficient there. v2-cutover orgs do NOT skip it + // (see the claimEligible comment below): cross-table dedup has no shared + // unique constraint, so the claim must cover triggerAndWait too. // // Also gated on the same per-org mollifier flag the gate uses: when // `TRIGGER_MOLLIFIER_ENABLED=1` globally for staged rollout, the buffer @@ -298,20 +426,39 @@ export class IdempotencyKeyConcern { // trigger hot path. Excluding them keeps the claim aligned with the // gate — if the gate would never mollify the request, there's no // buffer to serialise against. + // Also serialise when the org is cut over to the v2 run table, even if it + // isn't on the mollifier. Concurrent same-key triggers that straddle a + // `runTableV2` flag flip can mint into DIFFERENT physical tables (cuid -> + // TaskRun, ksuid -> task_run_v2); the per-table idempotency unique + // constraints can't see each other, so neither INSERT raises P2002 and two + // runs share one key. The Redis claim is the only backstop in that window. 
+ // v2-cutover orgs: an idempotency-keyed trigger can straddle a `runTableV2` + // flag flip into different physical tables (cuid -> TaskRun, ksuid -> + // task_run_v2), and the per-table idempotency-key unique constraints can't + // see across the two tables, so this claim (keyed on the idempotency key) + // is the only backstop that serialises same-key triggers across the flip, + // including triggerAndWait (resumeParentOnCompletion) and debounce. The + // resumeParentOnCompletion/debounce/oneTimeUseToken exclusions below are + // mollifier-gate alignment optimisations (those requests always return + // pass_through from the gate, so there's no buffer to serialise against); + // they don't apply to v2 orgs, which short-circuit to claimEligible via + // shouldUseV2RunTable regardless. oneTimeUseToken triggers with NO + // idempotency key are serialised separately by the token claim in the + // early-return block above; the residual same-token-with-two-different-keys + // case is not covered here (each key claims its own slot) and would require + // a pathological client. shouldUseV2RunTable is checked first so a v2 org + // skips the mollifier-flag resolve entirely. const claimEligible = - !request.body.options?.resumeParentOnCompletion && - !request.body.options?.debounce && - !request.options?.oneTimeUseToken && - (await resolveOrgMollifierFlag({ - envId: request.environment.id, - orgId: request.environment.organizationId, - taskId: request.taskId, - orgFeatureFlags: - ((request.environment.organization?.featureFlags as - | Record - | null - | undefined) ?? 
null), - })); + orgUsesV2 || + (!request.body.options?.resumeParentOnCompletion && + !request.body.options?.debounce && + !request.options?.oneTimeUseToken && + (await resolveOrgMollifierFlag({ + envId: request.environment.id, + orgId: request.environment.organizationId, + taskId: request.taskId, + orgFeatureFlags, + }))); if (claimEligible) { const ttlSeconds = Math.max( 1, @@ -342,7 +489,15 @@ export class IdempotencyKeyConcern { this.prisma ); if (writerRun) { - return { isCached: true, run: writerRun }; + // The concurrent winner already committed. Return it as a cache hit, + // and for triggerAndWait block our parent on the winner's waitpoint + // (the claim is what serialises v2 cross-table triggerAndWait). + return this.returnCachedIdempotentRun( + request, + parentStore, + writerRun, + idempotencyKey + ); } const buffered = await this.findBufferedRunWithIdempotency( request.environment.id, diff --git a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts index 031411844b4..a9920bbfd6b 100644 --- a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts @@ -10,6 +10,8 @@ import { PerformTaskRunAlertsService } from "~/v3/services/alerts/performTaskRun import { DefaultQueueManager } from "../concerns/queues.server"; import type { TriggerTaskRequest } from "../types"; import { runStore } from "~/v3/runStore.server"; +import { canMintV2Run } from "~/v3/runTableV2Status.server"; +import { env } from "~/env.server"; export type TriggerFailedTaskRequest = { /** The task identifier (e.g. 
"my-task") */ @@ -67,7 +69,19 @@ export class TriggerFailedTaskService { } async call(request: TriggerFailedTaskRequest): Promise { - const failedRunFriendlyId = RunId.generate().friendlyId; + // Mint the failed run on the same physical table the org's other runs use: + // a v2 org's failed run is a KSUID (-> task_run_v2), not a cuid in legacy + // TaskRun. Otherwise every trigger-time failure (queue limits, validation, + // payload errors) would land in the wrong table and, when it has a parent or + // batch, create an ongoing cross-table edge on the failure path. Mirrors the + // mint gate in triggerTask.server.ts. + const failedRunFriendlyId = ( + canMintV2Run(request.environment.organization.featureFlags, { + nativeRealtimeEnabled: env.REALTIME_BACKEND_NATIVE_ENABLED === "1", + }) + ? RunId.generateKsuid() + : RunId.generate() + ).friendlyId; const taskRunError: TaskRunError = { type: "INTERNAL_ERROR" as const, code: request.errorCode ?? TaskRunErrorCodes.UNSPECIFIED_ERROR, @@ -268,7 +282,25 @@ export class TriggerFailedTaskService { batch?: { id: string; index: number }; errorCode?: TaskRunErrorCodes; }): Promise { - const failedRunFriendlyId = RunId.generate().friendlyId; + // Keep the failed run on the org's table even on this degraded path. The + // caller couldn't fully resolve the environment, so load the org flags by id + // to decide; if even that fails, default to a legacy id (safe: RunStore + // routes by id format either way, and an unresolvable org is a rare edge). + let useV2RunTable = false; + try { + const org = await this.prisma.organization.findFirst({ + where: { id: opts.organizationId }, + select: { featureFlags: true }, + }); + useV2RunTable = canMintV2Run((org?.featureFlags as Record) ?? null, { + nativeRealtimeEnabled: env.REALTIME_BACKEND_NATIVE_ENABLED === "1", + }); + } catch { + // Leave useV2RunTable=false (legacy id). + } + const failedRunFriendlyId = ( + useV2RunTable ? 
RunId.generateKsuid() : RunId.generate() + ).friendlyId; try { // Best-effort parent run lookup for rootTaskRunId/depth diff --git a/apps/webapp/app/runEngine/services/triggerTask.server.ts b/apps/webapp/app/runEngine/services/triggerTask.server.ts index 89a938da8bf..370a996a1eb 100644 --- a/apps/webapp/app/runEngine/services/triggerTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerTask.server.ts @@ -25,6 +25,7 @@ import { logger } from "~/services/logger.server"; import { parseDelay } from "~/utils/delays"; import { handleMetadataPacket } from "~/utils/packets"; import { startSpan } from "~/v3/tracing.server"; +import { canMintV2Run } from "~/v3/runTableV2Status.server"; import type { TriggerTaskServiceOptions, TriggerTaskServiceResult, @@ -151,7 +152,19 @@ export class RunEngineTriggerTaskService { span.setAttribute("taskId", taskId); span.setAttribute("attempt", attempt); - const runFriendlyId = options?.runFriendlyId ?? RunId.generate().friendlyId; + // The single per-org cutover point: an opted-in org mints a KSUID id + // (routing the run to task_run_v2), everyone else keeps a legacy id + // (TaskRun). The flag is a pure in-memory read of the org's + // featureFlags already loaded on `environment` — no DB query on the + // trigger hot path. Downstream routing is by id format only. + const runFriendlyId = + options?.runFriendlyId ?? + (canMintV2Run(environment.organization.featureFlags, { + nativeRealtimeEnabled: env.REALTIME_BACKEND_NATIVE_ENABLED === "1", + }) + ? RunId.generateKsuid() + : RunId.generate() + ).friendlyId; const triggerRequest = { taskId, friendlyId: runFriendlyId, @@ -705,17 +718,24 @@ export class RunEngineTriggerTaskService { } }, ); - // Pipeline returned successfully — publish the claim if we held - // one. Waiters polling for our key resolve to this runId. 
- if (idempotencyClaim && result?.run?.friendlyId) { - await publishMollifierClaim({ - envId: idempotencyClaim.envId, - taskIdentifier: idempotencyClaim.taskIdentifier, - idempotencyKey: idempotencyClaim.idempotencyKey, - token: idempotencyClaim.token, - runId: result.run.friendlyId, - ttlSeconds: env.TRIGGER_MOLLIFIER_CLAIM_TTL_SECONDS, - }); + // Pipeline returned — resolve the claim if we held one. On success (a run + // with a friendlyId) publish it so waiters resolve to this runId; + // otherwise release it. Never leave a held claim unresolved on the success + // path: an orphaned claim would block concurrent waiters for the full + // safety-net window even though this request did not produce a run. + if (idempotencyClaim) { + if (result?.run?.friendlyId) { + await publishMollifierClaim({ + envId: idempotencyClaim.envId, + taskIdentifier: idempotencyClaim.taskIdentifier, + idempotencyKey: idempotencyClaim.idempotencyKey, + token: idempotencyClaim.token, + runId: result.run.friendlyId, + ttlSeconds: env.TRIGGER_MOLLIFIER_CLAIM_TTL_SECONDS, + }); + } else { + await releaseMollifierClaim(idempotencyClaim); + } } return result; } catch (err) { diff --git a/apps/webapp/app/services/metadata/updateMetadata.server.ts b/apps/webapp/app/services/metadata/updateMetadata.server.ts index 2af44d747bd..d1beba9c42d 100644 --- a/apps/webapp/app/services/metadata/updateMetadata.server.ts +++ b/apps/webapp/app/services/metadata/updateMetadata.server.ts @@ -354,18 +354,14 @@ export class UpdateMetadataService { metadata: true, metadataType: true, metadataVersion: true, - parentTaskRun: { - select: { - id: true, - status: true, - }, - }, - rootTaskRun: { - select: { - id: true, - status: true, - }, - }, + // Scalar parent/root pointers, NOT the parentTaskRun/rootTaskRun + // relations: a relation select is bound to one physical run table and + // resolves to null when the parent/root lives in the other table (a + // v2 child of a legacy parent in the mixed window). 
The scalar id is + // table-agnostic, and #ingestRunOperations only needs the id — the + // flusher routes by id format across both tables. + parentTaskRunId: true, + rootTaskRunId: true, }, }, this._prisma @@ -380,11 +376,11 @@ export class UpdateMetadataService { } if (body.parentOperations && body.parentOperations.length > 0) { - this.#ingestRunOperations(taskRun.parentTaskRun?.id ?? taskRun.id, body.parentOperations); + this.#ingestRunOperations(taskRun.parentTaskRunId ?? taskRun.id, body.parentOperations); } if (body.rootOperations && body.rootOperations.length > 0) { - this.#ingestRunOperations(taskRun.rootTaskRun?.id ?? taskRun.id, body.rootOperations); + this.#ingestRunOperations(taskRun.rootTaskRunId ?? taskRun.id, body.rootOperations); } const result = await this.#updateRunMetadata({ diff --git a/apps/webapp/app/services/runsBackfiller.server.ts b/apps/webapp/app/services/runsBackfiller.server.ts index 50e041ee64b..09386c9495c 100644 --- a/apps/webapp/app/services/runsBackfiller.server.ts +++ b/apps/webapp/app/services/runsBackfiller.server.ts @@ -41,6 +41,13 @@ export class RunsBackfillerService { span.setAttribute("cursor", cursor ?? ""); span.setAttribute("batchSize", batchSize ?? 0); + // Keyset on (createdAt, id). Runs now live across two physical tables + // (legacy TaskRun with cuid ids, task_run_v2 with ksuid ids), and `id` + // alone is not a valid order across them: cuid and ksuid sort in + // different ranges. RunStore merges the two tables only on a time-based + // key, so order by createdAt and tiebreak on id within a timestamp. + const keyset = cursor ? decodeBackfillCursor(cursor) : undefined; + const runs = await runStore.findRuns( { where: { @@ -51,11 +58,16 @@ export class RunsBackfillerService { status: { in: FINAL_RUN_STATUSES, }, - ...(cursor ? { id: { gt: cursor } } : {}), - }, - orderBy: { - id: "asc", + ...(keyset + ? 
/**
 * Separator between the two halves of a backfill cursor.
 *
 * `Date.prototype.toISOString()` never emits "_", so the FIRST "_" in a cursor
 * always marks the end of the timestamp; everything after it belongs to the
 * run id, even if the id itself contains further "_" characters.
 */
const BACKFILL_CURSOR_SEPARATOR = "_";

/**
 * Encode a `(createdAt, id)` keyset position as an opaque cursor string of the
 * form `<createdAt ISO-8601>_<run id>`.
 *
 * @param createdAt - `createdAt` of the last run in the batch.
 * @param id - Id of the last run in the batch (tiebreak within a timestamp).
 * @returns The cursor to hand back verbatim for the next batch.
 * @throws RangeError if `createdAt` is an invalid Date (from `toISOString`).
 */
export function encodeBackfillCursor(createdAt: Date, id: string): string {
  return `${createdAt.toISOString()}${BACKFILL_CURSOR_SEPARATOR}${id}`;
}

/**
 * Decode a cursor produced by {@link encodeBackfillCursor}.
 *
 * @param cursor - Cursor string of the form `<createdAt ISO-8601>_<run id>`.
 * @returns The `(createdAt, id)` keyset position.
 * @throws Error if the cursor has no separator, an unparseable timestamp, or an
 *   empty id. The message spells out the expected format so an operator who
 *   pasted a bad cursor can see what to supply.
 */
export function decodeBackfillCursor(cursor: string): { createdAt: Date; id: string } {
  const separatorIndex = cursor.indexOf(BACKFILL_CURSOR_SEPARATOR);

  if (separatorIndex !== -1) {
    const createdAt = new Date(cursor.slice(0, separatorIndex));
    const id = cursor.slice(separatorIndex + 1);

    // Both halves must be usable: a real timestamp and a non-empty id.
    if (!Number.isNaN(createdAt.getTime()) && id.length > 0) {
      return { createdAt, id };
    }
  }

  throw new Error(
    `RunsBackfillerService: malformed cursor "${cursor}" (expected "<createdAt ISO-8601>_<run id>")`
  );
}
ClickHouse already ranked `runIds`. An `IN (...)` hydration comes back + // unordered, and a single SQL `orderBy` can't span the two physical run + // tables (legacy TaskRun + task_run_v2), so restore ClickHouse's ranking + // in memory. + const runById = new Map(hydrated.map((run) => [run.id, run])); + let runs = runIds + .map((id) => runById.get(id)) + .filter((run): run is NonNullable => run !== undefined); + // ClickHouse is slightly delayed, so we're going to do in-memory status filtering too if (options.statuses && options.statuses.length > 0) { runs = runs.filter((run) => options.statuses!.includes(run.status)); diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 6b75b9ef903..2a51a9be8ea 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -16,6 +16,7 @@ export const FEATURE_FLAG = { computeMigrationFreePercentage: "computeMigrationFreePercentage", computeMigrationPaidPercentage: "computeMigrationPaidPercentage", computeMigrationRequireTemplate: "computeMigrationRequireTemplate", + runTableV2: "runTableV2", } as const; export const FeatureFlagCatalog = { @@ -43,6 +44,12 @@ export const FeatureFlagCatalog = { // When on, migrated orgs build their compute template in required mode at deploy // (fails the deploy on error) instead of shadow. Strict boolean (see above). [FEATURE_FLAG.computeMigrationRequireTemplate]: z.boolean(), + // Per-org cutover to the parallel task_run_v2 table. When on, new runs for the + // org mint a KSUID id (routing them to task_run_v2); off (the default) keeps + // minting legacy ids. Strict boolean (see above): coercing a stringified + // "false" to true would cut an org over by mistake, and runs created on v2 + // stay on v2. 
+ [FEATURE_FLAG.runTableV2]: z.boolean(), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; @@ -52,6 +59,11 @@ export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; export const GLOBAL_LOCKED_FLAGS: FeatureFlagKey[] = [ FEATURE_FLAG.defaultWorkerInstanceGroupId, FEATURE_FLAG.taskEventRepository, + // runTableV2 is resolved per-org only (`shouldUseV2RunTable` reads + // `Organization.featureFlags`, never the global FeatureFlag table), so a + // global toggle would be a silent no-op. Lock it on the global page to + // avoid that footgun; per-org control stays on the org dialog. + FEATURE_FLAG.runTableV2, ]; // Flags that are read-only on the org-level dialog. @@ -83,6 +95,45 @@ export function validatePartialFeatureFlags(values: Record) { return FeatureFlagCatalogSchema.partial().safeParse(values); } +/** + * Cross-field invariant on a RESOLVED org flag set: `runTableV2` may only be on + * when the org's `realtimeBackend` is "native". + * + * New v2 runs mint a KSUID id (routing them to task_run_v2) and are only + * observable in realtime on the native backend; Electric is bound to + * public."TaskRun", so a v2 run minted while the org is still on Electric is + * invisible in realtime. `shouldUseV2RunTable` already enforces this at read + * time, but this guard blocks the dangerous combination at WRITE time so it can + * never be configured, including the enable-race where `runTableV2` is flipped + * on before `realtimeBackend=native` has propagated past the realtime cache. + * + * Pass the FINAL resolved set (after any merge) so it also rejects turning + * `realtimeBackend` off/to "electric" while `runTableV2` is still on. 
/**
 * Checks the cross-field rule on a resolved org flag set: `runTableV2` may only
 * be enabled together with `realtimeBackend` set to "native".
 *
 * Both flags are read through their catalog schemas, so a value that fails its
 * schema is treated as "not enabled" / "not native" instead of raising.
 *
 * @param flags - The final (already merged) flag set to validate.
 * @returns `{ ok: true }` when `runTableV2` is not enabled, or when it is
 *   enabled and the backend is "native"; otherwise `{ ok: false, error }` with
 *   an operator-facing explanation.
 */
export function validateFeatureFlagInvariants(
  flags: Record<string, unknown>
): { ok: true } | { ok: false; error: string } {
  const v2Flag = FeatureFlagCatalog[FEATURE_FLAG.runTableV2].safeParse(
    flags[FEATURE_FLAG.runTableV2]
  );
  const v2Requested = v2Flag.success && v2Flag.data === true;
  if (!v2Requested) {
    // Nothing to constrain unless runTableV2 is a well-formed `true`.
    return { ok: true };
  }

  const backendFlag = FeatureFlagCatalog[FEATURE_FLAG.realtimeBackend].safeParse(
    flags[FEATURE_FLAG.realtimeBackend]
  );
  const servedByNative = backendFlag.success && backendFlag.data === "native";
  if (!servedByNative) {
    return {
      ok: false,
      error:
        'runTableV2 can only be enabled when realtimeBackend is "native". Set realtimeBackend="native" first (and let it propagate past the realtime cache), then enable runTableV2.',
    };
  }

  return { ok: true };
}
getIdempotencyClaimBuffer() : input.buffer; if (!buffer) { - // Mollifier disabled / buffer construction failed. Fall open — - // caller proceeds with the trigger pipeline (PG unique constraint - // backstop). The token is never read in this case (publish/release - // are buffer-null no-ops downstream), so we skip the default - // `randomUUID()` to keep the mollifier-OFF hot path allocation-free + // No claim backend at all — both the mollifier buffer and the + // standalone claim buffer are unavailable (the general Redis host is + // unconfigured). Fall open: the caller proceeds with the trigger + // pipeline (PG unique constraint backstop). The token is never read in + // this case (publish/release are buffer-null no-ops downstream), so we + // skip the default `randomUUID()` to keep this hot path allocation-free // for idempotency-keyed triggers — `triggerTask` is the // highest-throughput code path in the system. A test-injected // generator is still honoured for deterministic assertions. @@ -164,7 +165,7 @@ export async function publishClaim(input: { ttlSeconds?: number; buffer?: MollifierBuffer | null; }): Promise { - const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + const buffer = input.buffer === undefined ? getIdempotencyClaimBuffer() : input.buffer; if (!buffer) return; const ttlSeconds = input.ttlSeconds ?? DEFAULT_CLAIM_TTL_SECONDS; try { @@ -197,7 +198,7 @@ export async function releaseClaim(input: { token: string; buffer?: MollifierBuffer | null; }): Promise { - const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + const buffer = input.buffer === undefined ? 
/**
 * Build the standalone claim buffer that backs the pre-gate idempotency claim
 * when the mollifier buffer is not available. It connects to the general webapp
 * Redis (`REDIS_*` env vars) with an empty key prefix; TLS is enabled unless
 * `REDIS_TLS_DISABLED` is exactly "true".
 */
function initializeIdempotencyClaimBuffer(): MollifierBuffer {
  const host = env.REDIS_HOST;
  logger.debug("Initializing standalone idempotency-claim buffer", { host });

  // TLS is opt-out: only the literal string "true" disables it.
  const tlsOptions = env.REDIS_TLS_DISABLED === "true" ? {} : { tls: {} };

  const redisOptions = {
    keyPrefix: "",
    host,
    port: env.REDIS_PORT,
    username: env.REDIS_USERNAME,
    password: env.REDIS_PASSWORD,
    enableAutoPipelining: true,
    ...tlsOptions,
  };

  return new MollifierBuffer({ redisOptions });
}
/**
 * Resolve the buffer that backs the pre-gate idempotency claim.
 *
 * Preference order:
 *  1. the mollifier buffer, when `getMollifierBuffer()` returns one;
 *  2. a process-wide singleton claim-only buffer on the general Redis;
 *  3. `null` when `REDIS_HOST` is not set, in which case callers get no claim
 *     backend at all.
 */
export function getIdempotencyClaimBuffer(): MollifierBuffer | null {
  const mollifierBuffer = getMollifierBuffer();
  if (mollifierBuffer) {
    return mollifierBuffer;
  }

  return env.REDIS_HOST
    ? singleton("idempotencyClaimBuffer", initializeIdempotencyClaimBuffer)
    : null;
}
/**
 * Resolve a run's parent and root runs by their scalar ids, so RunStore can
 * route each lookup by id instead of relying on a table-bound relation select.
 *
 * Every lookup is filtered by `scope.runtimeEnvironmentId`, so a pointer to a
 * run in a different environment resolves to `null` rather than leaking it.
 *
 * When the parent and the root are the same run (any direct child of a root
 * run has `parentTaskRunId === rootTaskRunId`), a single lookup is issued and
 * its result is returned for both fields, instead of querying the same row
 * twice. In that case `parentTaskRun` and `rootTaskRun` are the same object.
 *
 * @param ids - The run's `parentTaskRunId` / `rootTaskRunId` (either may be null).
 * @param scope - Environment to scope the lookups to, plus an optional table scope
 *   forwarded to `runStore.findRun`.
 * @param select - The select the caller would have used on the relation.
 * @param client - Optional Prisma client / replica to read from.
 * @returns The hydrated parent and root, each `null` when its id is null or the
 *   lookup finds nothing.
 */
export async function hydrateParentAndRoot<S extends Prisma.TaskRunSelect>(
  ids: { parentTaskRunId: string | null; rootTaskRunId: string | null },
  scope: { runtimeEnvironmentId: string; tables?: FindRunTableScope },
  select: S,
  client?: ReadClient
): Promise<{
  parentTaskRun: Prisma.TaskRunGetPayload<{ select: S }> | null;
  rootTaskRun: Prisma.TaskRunGetPayload<{ select: S }> | null;
}> {
  type Hydrated = Prisma.TaskRunGetPayload<{ select: S }> | null;

  // One environment-scoped lookup by id; a null id short-circuits to null.
  const findById = (id: string | null) =>
    id
      ? runStore.findRun(
          { id, runtimeEnvironmentId: scope.runtimeEnvironmentId },
          { select, tables: scope.tables },
          client
        )
      : Promise.resolve(null);

  const parentLookup = findById(ids.parentTaskRunId);
  // Reuse the in-flight parent lookup when root and parent are the same run.
  const rootLookup =
    ids.rootTaskRunId === ids.parentTaskRunId ? parentLookup : findById(ids.rootTaskRunId);

  // Distinct lookups still run in parallel.
  const [parentTaskRun, rootTaskRun] = await Promise.all([parentLookup, rootLookup]);

  return {
    parentTaskRun: parentTaskRun as Hydrated,
    rootTaskRun: rootTaskRun as Hydrated,
  };
}
/**
 * Fetch a run's direct children: every run whose `parentTaskRunId` equals
 * `parentRunId` within `scope.runtimeEnvironmentId`.
 *
 * @param parentRunId - Id of the run whose children are wanted.
 * @param scope - Environment filter, plus an optional table scope forwarded
 *   to `runStore.findRuns` unchanged (may be undefined).
 * @param select - Fields to select on each child.
 * @param client - Optional Prisma client / replica to read from.
 * @returns The matching child runs, shaped by `select`.
 */
export async function hydrateChildRuns<S extends Prisma.TaskRunSelect>(
  parentRunId: string,
  scope: { runtimeEnvironmentId: string; tables?: FindRunTableScope },
  select: S,
  client?: ReadClient
): Promise<Prisma.TaskRunGetPayload<{ select: S }>[]> {
  const { runtimeEnvironmentId, tables } = scope;

  const childrenOfParent = {
    parentTaskRunId: parentRunId,
    runtimeEnvironmentId,
  };

  const children = runStore.findRuns({ where: childrenOfParent, select, tables }, client);

  return children as Promise<Prisma.TaskRunGetPayload<{ select: S }>[]>;
}
/**
 * Decide whether an org should mint runs for the `task_run_v2` table.
 *
 * Returns `true` only when ALL of the following hold, checked in this order:
 *  - `orgFeatureFlags` is a non-null object;
 *  - `options.nativeRealtimeEnabled` is true;
 *  - the org's `realtimeBackend` flag parses (via the flag catalog) to "native";
 *  - the org's `runTableV2` flag is present and parses to `true`.
 *
 * Any missing or schema-invalid value yields `false`.
 *
 * @param orgFeatureFlags - The org's raw feature-flag bag (untrusted shape).
 * @param options - Deployment-level inputs; see `ShouldUseV2RunTableOptions`.
 */
export function shouldUseV2RunTable(
  orgFeatureFlags: unknown,
  options: ShouldUseV2RunTableOptions
): boolean {
  const isFlagBag = typeof orgFeatureFlags === "object" && orgFeatureFlags !== null;
  if (!isFlagBag) {
    return false;
  }
  const flagBag = orgFeatureFlags as Record<string, unknown>;

  // Deployment-wide prerequisite comes before any per-org flag is consulted.
  if (!options.nativeRealtimeEnabled) {
    return false;
  }

  const realtimeBackend = FeatureFlagCatalog[FEATURE_FLAG.realtimeBackend].safeParse(
    flagBag[FEATURE_FLAG.realtimeBackend]
  );
  const servedByNative = realtimeBackend.success && realtimeBackend.data === "native";
  if (!servedByNative) {
    return false;
  }

  const rawOverride = flagBag[FEATURE_FLAG.runTableV2];
  if (rawOverride === undefined) {
    return false;
  }

  const v2Flag = FeatureFlagCatalog[FEATURE_FLAG.runTableV2].safeParse(rawOverride);
  if (!v2Flag.success) {
    return false;
  }
  return v2Flag.data;
}
/**
 * Create the cached `task_run_v2` status object and start its background poller.
 *
 * The returned object is mutated in place by the poller, so readers holding a
 * reference always see the last-known values.
 *
 * - Under `NODE_ENV === "test"` no poller is started and both fields stay false.
 * - `published` is only probed when `RUN_REPLICATION_CLICKHOUSE_URL` is set;
 *   otherwise it stays false.
 * - `hasRows` is probed regardless of replication config, because it scopes
 *   reads rather than minting. It is probed until it first becomes true and
 *   never again after that.
 * - The two probes fail independently: an error in one is logged and does not
 *   prevent the other from running.
 * - A refresh that is still in flight when the next tick fires is not doubled.
 */
function initialize(): RunTableV2Status {
  const state: RunTableV2Status = { published: false, hasRows: false };

  // No DB polling or timers at import time under tests.
  if (env.NODE_ENV === "test") {
    return state;
  }

  const replicationConfigured = Boolean(env.RUN_REPLICATION_CLICKHOUSE_URL);

  const warn = (probe: "published" | "hasRows", error: unknown) => {
    logger.warn("runTableV2Status refresh failed; keeping last-known status", {
      probe,
      error: error instanceof Error ? error.message : String(error),
    });
  };

  const probePublished = async () => {
    const published = await prisma.$queryRaw<Array<{ present: boolean }>>`
      SELECT EXISTS (
        SELECT 1 FROM pg_publication_tables
        WHERE pubname = ${env.RUN_REPLICATION_PUBLICATION_NAME}
        AND schemaname = 'public'
        AND tablename = 'task_run_v2'
      ) AS present`;
    state.published = published[0]?.present ?? false;
  };

  const probeHasRows = async () => {
    const hasRows = await prisma.$queryRaw<Array<{ present: boolean }>>`
      SELECT EXISTS (SELECT 1 FROM task_run_v2 LIMIT 1) AS present`;
    state.hasRows = hasRows[0]?.present ?? false;
  };

  let refreshing = false;
  const refresh = async () => {
    // Skip this tick if the previous refresh has not finished yet.
    if (refreshing) return;
    refreshing = true;
    try {
      if (replicationConfigured) {
        try {
          await probePublished();
        } catch (error) {
          warn("published", error);
        }
      }

      // hasRows only ever needs to flip to true once; stop probing after that.
      if (!state.hasRows) {
        try {
          await probeHasRows();
        } catch (error) {
          warn("hasRows", error);
        }
      }
    } finally {
      refreshing = false;
    }
  };

  void refresh();
  const timer = setInterval(() => void refresh(), REFRESH_INTERVAL_MS);
  // Do not keep the process alive just for this poller.
  timer.unref?.();

  return state;
}
*/ +export function isV2RunTablePublished(): boolean { + return status.published; +} + +/** + * Whether a v2 run could be relevant to a cross-table READ: native realtime is on + * (v2 is being minted now) OR `task_run_v2` already holds rows. Scope cross-table + * reads on this, not the native master switch alone, so turning native off cannot + * hide already-minted v2 runs. + */ +export function v2RunsMayExist(nativeRealtimeEnabled: boolean): boolean { + return nativeRealtimeEnabled || status.hasRows; +} + +/** + * Mint gate: mint a v2 (KSUID) run only when the org is cut over to v2 AND + * `task_run_v2` is in the ClickHouse publication, so a v2 run can never be + * silently lost from ClickHouse by being minted before the replication leader + * publishes the table. Fails safe to legacy until then; self-heals once published. + */ +export function canMintV2Run( + orgFeatureFlags: unknown, + options: ShouldUseV2RunTableOptions +): boolean { + return shouldUseV2RunTable(orgFeatureFlags, options) && isV2RunTablePublished(); +} diff --git a/apps/webapp/test/featureFlagInvariants.test.ts b/apps/webapp/test/featureFlagInvariants.test.ts new file mode 100644 index 00000000000..ff4c4d48ce0 --- /dev/null +++ b/apps/webapp/test/featureFlagInvariants.test.ts @@ -0,0 +1,44 @@ +import { describe, expect, it } from "vitest"; +import { validateFeatureFlagInvariants } from "~/v3/featureFlags"; + +describe("validateFeatureFlagInvariants (runTableV2 requires native realtime)", () => { + it("allows runTableV2 on when realtimeBackend is native", () => { + expect( + validateFeatureFlagInvariants({ runTableV2: true, realtimeBackend: "native" }).ok + ).toBe(true); + }); + + it("rejects runTableV2 on while realtimeBackend is electric", () => { + expect( + validateFeatureFlagInvariants({ runTableV2: true, realtimeBackend: "electric" }).ok + ).toBe(false); + }); + + it("rejects runTableV2 on while realtimeBackend is shadow", () => { + expect( + validateFeatureFlagInvariants({ runTableV2: 
true, realtimeBackend: "shadow" }).ok + ).toBe(false); + }); + + it("rejects runTableV2 on when realtimeBackend is unset (defaults to electric)", () => { + expect(validateFeatureFlagInvariants({ runTableV2: true }).ok).toBe(false); + }); + + it("allows runTableV2 off or absent regardless of backend", () => { + expect(validateFeatureFlagInvariants({ runTableV2: false }).ok).toBe(true); + expect( + validateFeatureFlagInvariants({ runTableV2: false, realtimeBackend: "electric" }).ok + ).toBe(true); + expect(validateFeatureFlagInvariants({}).ok).toBe(true); + expect(validateFeatureFlagInvariants({ realtimeBackend: "electric" }).ok).toBe(true); + }); + + it("ignores a stringified runTableV2 (strict boolean) and does not constrain", () => { + // runTableV2 is a strict z.boolean(); a stringified "true" fails the parse, + // so the invariant treats it as not-enabled (the write would be rejected by + // the flag schema itself before reaching here). + expect( + validateFeatureFlagInvariants({ runTableV2: "true", realtimeBackend: "electric" }).ok + ).toBe(true); + }); +}); diff --git a/apps/webapp/test/mollifierClaimResolution.test.ts b/apps/webapp/test/mollifierClaimResolution.test.ts index f61cda0d04e..e9115570af9 100644 --- a/apps/webapp/test/mollifierClaimResolution.test.ts +++ b/apps/webapp/test/mollifierClaimResolution.test.ts @@ -13,6 +13,11 @@ vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); const h = vi.hoisted(() => ({ buffer: null as unknown, orgFlag: true })); vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({ getMollifierBuffer: () => h.buffer, + // claimOrAwait/publishClaim/releaseClaim resolve their backend through + // getIdempotencyClaimBuffer (the mollifier buffer when enabled, else a + // standalone Redis claim buffer). In tests both resolve to the scripted + // buffer handle so the claim path is fully controllable. 
+ getIdempotencyClaimBuffer: () => h.buffer, })); // Stub `mollifierGate.server` so loading the concern doesn't drag in // `env.server` (which fails to parse without a populated environment in @@ -29,7 +34,14 @@ import type { TriggerTaskRequest } from "~/runEngine/types"; function makeConcern(prisma: { findFirst: () => Promise }) { return new IdempotencyKeyConcern( - { taskRun: { findFirst: prisma.findFirst } } as never, + { + taskRun: { findFirst: prisma.findFirst }, + // The cross-table existing-run lookup reads BOTH physical tables. These + // tests use legacy ids that never match a v2 row, so task_run_v2 always + // misses and findFirstAcrossTables returns the scripted taskRun result — + // keeping the per-call scripting on `prisma.findFirst` intact. + taskRunV2: { findFirst: async () => null }, + } as never, {} as never, // engine — unused on this path {} as never, // traceEventConcern — unused on this path ); diff --git a/apps/webapp/test/mollifierResetIdempotencyKey.test.ts b/apps/webapp/test/mollifierResetIdempotencyKey.test.ts index 4909087d70c..5f0abd81f65 100644 --- a/apps/webapp/test/mollifierResetIdempotencyKey.test.ts +++ b/apps/webapp/test/mollifierResetIdempotencyKey.test.ts @@ -22,6 +22,7 @@ import { ServiceValidationError } from "~/v3/services/baseService.server"; type FakePrisma = { taskRun: { updateMany: (...args: unknown[]) => Promise<{ count: number }> }; + taskRunV2: { updateMany: (...args: unknown[]) => Promise<{ count: number }> }; }; function makePrisma(pgCount: number): FakePrisma { @@ -29,6 +30,12 @@ function makePrisma(pgCount: number): FakePrisma { taskRun: { updateMany: vi.fn(async () => ({ count: pgCount })), }, + // clearIdempotencyKey(byPredicate) clears across BOTH physical run tables. + // These tests use a legacy key that only ever matches TaskRun, so + // task_run_v2 always clears nothing. 
+ taskRunV2: { + updateMany: vi.fn(async () => ({ count: 0 })), + }, }; } @@ -138,6 +145,12 @@ describe("ResetIdempotencyKeyService — buffer-outage handling", () => { return updateManyCalls === 1 ? { count: 0 } : { count: 1 }; }), }, + // task_run_v2 side of the both-tables byPredicate clear; never matches + // here, so it stays at 0 and the updateManyCalls assertion tracks only + // the legacy delegate. + taskRunV2: { + updateMany: vi.fn(async () => ({ count: 0 })), + }, }; const resetIdempotency = vi.fn(async () => ({ clearedRunId: null as string | null })); bufferMock.current = { resetIdempotency }; diff --git a/apps/webapp/test/oneTimeUseTokenClaim.test.ts b/apps/webapp/test/oneTimeUseTokenClaim.test.ts new file mode 100644 index 00000000000..9b8e78cd21a --- /dev/null +++ b/apps/webapp/test/oneTimeUseTokenClaim.test.ts @@ -0,0 +1,168 @@ +import { describe, expect, it, vi } from "vitest"; + +// Stub `~/db.server` before importing the concern — the real module eagerly +// calls `prisma.$connect()` at singleton construction. The concern under test +// receives its prisma via the constructor, and the one-time-token path below +// reaches the claim before any DB read, so the stub is never exercised. +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +// claimOrAwait resolves its backend through getIdempotencyClaimBuffer; script +// it via a hoisted handle so each test controls the claim outcome. +const h = vi.hoisted(() => ({ buffer: null as unknown, v2: true })); +vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({ + getMollifierBuffer: () => h.buffer, + getIdempotencyClaimBuffer: () => h.buffer, +})); +// v2 routing is gated on native realtime (deployment env switch + per-org +// `realtimeBackend` flag); that gate is covered by runTableV2.test.ts. Here we +// mock it so each test controls whether the org is cut over to v2, isolating +// the one-time-token claim logic from the gating mechanism. 
+vi.mock("~/v3/runTableV2.server", () => ({ + shouldUseV2RunTable: () => h.v2, +})); +// The one-time-token claim runs BEFORE the mollifier-flag resolve, but the +// concern still imports the gate module; stub it so loading doesn't pull in +// extra feature-flag wiring. +vi.mock("~/v3/mollifier/mollifierGate.server", () => ({ + makeResolveMollifierFlag: () => async () => false, +})); + +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { IdempotencyKeyConcern } from "~/runEngine/concerns/idempotencyKeys.server"; +import type { TriggerTaskRequest } from "~/runEngine/types"; + +function makeConcern() { + return new IdempotencyKeyConcern( + { + taskRun: { findFirst: async () => null }, + taskRunV2: { findFirst: async () => null }, + } as never, + {} as never, // engine — unused on this path + {} as never // traceEventConcern — unused on this path + ); +} + +function makeOtuRequest( + overrides: { + featureFlags?: Record<string, unknown>; + oneTimeUseToken?: string | undefined; + resumeParentOnCompletion?: boolean; + } = {} +): TriggerTaskRequest { + return { + taskId: "my-task", + environment: { + id: "env_a", + organizationId: "org_1", + organization: { featureFlags: overrides.featureFlags ?? { runTableV2: true } }, + }, + // No idempotencyKey on purpose — this is the path the per-table + // oneTimeUseToken unique constraint cannot cover across two tables. + options: { oneTimeUseToken: "oneTimeUseToken" in overrides ? overrides.oneTimeUseToken : "tok-1" }, + body: { + options: overrides.resumeParentOnCompletion ? 
{ resumeParentOnCompletion: true } : {}, + }, + } as unknown as TriggerTaskRequest; +} + +describe("IdempotencyKeyConcern · one-time-use token cross-table claim", () => { + it("v2 org: a one-time token with no idempotency key takes a claim keyed on the token", async () => { + const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const })); + h.buffer = { + claimIdempotency, + readClaim: vi.fn(async () => null), + } as unknown as MollifierBuffer; + + const result = await makeConcern().handleTriggerRequest(makeOtuRequest(), undefined); + + expect(result.isCached).toBe(false); + if (result.isCached === false) { + // The trigger pipeline must publish/release this claim. It is keyed on + // the namespaced token AND a reserved, task-independent slot — matching + // the task-independent oneTimeUseToken DB constraint, NOT request.taskId. + expect(result.claim?.idempotencyKey).toBe("otu:tok-1"); + expect(result.claim?.envId).toBe("env_a"); + expect(result.claim?.taskIdentifier).toBe("__one_time_use_token__"); + } + expect(claimIdempotency).toHaveBeenCalledTimes(1); + expect(claimIdempotency.mock.calls[0][0]).toMatchObject({ + idempotencyKey: "otu:tok-1", + taskIdentifier: "__one_time_use_token__", + }); + }); + + it("v2 org: a concurrent winner (claim resolved) rejects the second presentation as already-used", async () => { + // The winner committed a run under the token; the loser must be rejected + // exactly like the within-table P2002 path, NOT allowed to mint a duplicate + // into the other table. 
+ h.buffer = { + claimIdempotency: vi.fn(async () => ({ kind: "resolved", runId: "run_winner" })), + readClaim: vi.fn(async () => null), + } as unknown as MollifierBuffer; + + await expect( + makeConcern().handleTriggerRequest(makeOtuRequest(), undefined) + ).rejects.toThrow(/already been used/i); + }); + + it("org not cut over to v2: skips the token claim entirely (no Redis round-trip)", async () => { + h.v2 = false; + const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const })); + h.buffer = { + claimIdempotency, + readClaim: vi.fn(async () => null), + } as unknown as MollifierBuffer; + + try { + const result = await makeConcern().handleTriggerRequest(makeOtuRequest(), undefined); + expect(result.isCached).toBe(false); + if (result.isCached === false) { + expect(result.claim).toBeUndefined(); + } + expect(claimIdempotency).not.toHaveBeenCalled(); + } finally { + h.v2 = true; // restore for the other tests in this file + } + }); + + it("triggerAndWait one-time token IS claimed (v2 orgs serialise it like the keyed claim)", async () => { + const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const })); + h.buffer = { + claimIdempotency, + readClaim: vi.fn(async () => null), + } as unknown as MollifierBuffer; + + const result = await makeConcern().handleTriggerRequest( + makeOtuRequest({ resumeParentOnCompletion: true }), + undefined + ); + + expect(result.isCached).toBe(false); + if (result.isCached === false) { + // resumeParentOnCompletion is NOT excluded from the token claim: for a v2 + // org the cross-table dup hole is identical, and the loser is rejected + // (no cached-run waitpoint subtlety to avoid). 
+ expect(result.claim?.idempotencyKey).toBe("otu:tok-1"); + } + expect(claimIdempotency).toHaveBeenCalledTimes(1); + }); + + it("no one-time token: ordinary no-idempotency-key trigger is unaffected", async () => { + const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const })); + h.buffer = { + claimIdempotency, + readClaim: vi.fn(async () => null), + } as unknown as MollifierBuffer; + + const result = await makeConcern().handleTriggerRequest( + makeOtuRequest({ oneTimeUseToken: undefined }), + undefined + ); + + expect(result.isCached).toBe(false); + if (result.isCached === false) { + expect(result.claim).toBeUndefined(); + } + expect(claimIdempotency).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/test/runTableFkDriftGuard.test.ts b/apps/webapp/test/runTableFkDriftGuard.test.ts new file mode 100644 index 00000000000..df69c8274db --- /dev/null +++ b/apps/webapp/test/runTableFkDriftGuard.test.ts @@ -0,0 +1,98 @@ +import { readdirSync, readFileSync } from "node:fs"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { describe, expect, it } from "vitest"; + +// internal-packages/database/prisma/migrations, resolved from this test file +// (apps/webapp/test) up to the repo root. +const MIGRATIONS_DIR = join( + dirname(fileURLToPath(import.meta.url)), + "../../../internal-packages/database/prisma/migrations" +); + +// The migration that physically dropped every incoming foreign key to TaskRun, +// decoupling the run tables so a run can live in either TaskRun or task_run_v2. +const DROP_FKS_MIGRATION = "20260619120042_drop_taskrun_incoming_fks"; + +/** + * Guard against the Prisma FK-drift footgun for the parallel run tables. + * + * schema.prisma still declares the (deliberately dropped) incoming relations to + * TaskRun AND mirror relations to task_run_v2, so a routine `prisma migrate dev` + * for any unrelated change regenerates a migration that re-adds those foreign + * keys. 
Re-adding them is destructive: + * - a re-added TaskRun incoming FK silently re-couples the two tables, defeating + * the whole parallel-table design; and + * - any FK referencing task_run_v2 fails on existing legacy-pointing child rows + * and then rejects every cross-table child insert. + * + * Whoever generates a migration must strip these (the established practice). + * This test fails CI if an unstripped migration ever lands, so the parity can't + * silently drift back. + */ +describe("run-table FK-drift guard", () => { + const migrationDirs = readdirSync(MIGRATIONS_DIR, { withFileTypes: true }) + .filter((entry) => entry.isDirectory()) + .map((entry) => entry.name) + .sort(); + + const sqlOf = (name: string) => + readFileSync(join(MIGRATIONS_DIR, name, "migration.sql"), "utf8"); + + // A statement that ADDs a foreign key referencing `table`. Checked per + // statement (split on ;) so FOREIGN KEY in one statement can't pair with + // REFERENCES in a later one. The REFERENCES match is QUALIFICATION-AGNOSTIC: + // Prisma emits the schema-qualified form `REFERENCES "public"."TaskRun"` in + // every generated migration in this repo (including the implicit m2m join + // tables _WaitpointRunConnections / _TaskRunToTaskRunTag), so matching only + // the bare `"TaskRun"` would silently miss the real regeneration vector. + const addsForeignKeyReferencing = (sql: string, table: string) => + sql + .split(";") + .some( + (stmt) => + /FOREIGN KEY/i.test(stmt) && + new RegExp(`REFERENCES\\s+(?:"[A-Za-z0-9_]+"\\.)?"${table}"`, "i").test(stmt) + ); + + it("finds the migrations directory and the FK-drop migration", () => { + expect(migrationDirs.length).toBeGreaterThan(0); + expect(migrationDirs).toContain(DROP_FKS_MIGRATION); + }); + + it("the matcher catches both bare and schema-qualified REFERENCES forms", () => { + // Prisma actually emits the qualified form; both must be caught so the + // qualified form can never regress undetected. 
+ const qualifiedV2 = + 'ALTER TABLE "TaskRunAttempt" ADD CONSTRAINT "TaskRunAttempt_taskRunId_v2_fkey" FOREIGN KEY ("taskRunId") REFERENCES "public"."task_run_v2"("id") ON DELETE CASCADE;'; + const qualifiedM2M = + 'ALTER TABLE "_WaitpointRunConnections" ADD CONSTRAINT "_WaitpointRunConnections_A_fkey" FOREIGN KEY ("A") REFERENCES "public"."TaskRun"("id") ON DELETE CASCADE;'; + const bareTaskRun = + 'ALTER TABLE "TaskRunDependency" ADD CONSTRAINT "x_fkey" FOREIGN KEY ("taskRunId") REFERENCES "TaskRun"("id");'; + const unrelated = + 'ALTER TABLE "Foo" ADD CONSTRAINT "y_fkey" FOREIGN KEY ("barId") REFERENCES "public"."Bar"("id");'; + expect(addsForeignKeyReferencing(qualifiedV2, "task_run_v2")).toBe(true); + expect(addsForeignKeyReferencing(qualifiedM2M, "TaskRun")).toBe(true); + expect(addsForeignKeyReferencing(bareTaskRun, "TaskRun")).toBe(true); + expect(addsForeignKeyReferencing(unrelated, "TaskRun")).toBe(false); + expect(addsForeignKeyReferencing(unrelated, "task_run_v2")).toBe(false); + }); + + it("no migration EVER adds a foreign key referencing task_run_v2", () => { + const offenders = migrationDirs.filter((dir) => addsForeignKeyReferencing(sqlOf(dir), "task_run_v2")); + expect( + offenders, + `These migrations add a destructive FK referencing task_run_v2 (a child row can point at a legacy run, so the constraint fails on existing data): ${offenders.join(", ")}. Strip the *_v2_fkey constraints from the generated migration.` + ).toEqual([]); + }); + + it("no migration after the FK-drop re-adds an incoming foreign key to TaskRun", () => { + const dropIdx = migrationDirs.indexOf(DROP_FKS_MIGRATION); + const after = migrationDirs.slice(dropIdx + 1); + const offenders = after.filter((dir) => addsForeignKeyReferencing(sqlOf(dir), "TaskRun")); + expect( + offenders, + `These migrations re-add an incoming FK to TaskRun that was deliberately dropped (it re-couples the run tables): ${offenders.join(", ")}. 
Strip the TaskRun *_fkey constraints from the generated migration.` + ).toEqual([]); + }); +}); diff --git a/apps/webapp/test/runTableV2.test.ts b/apps/webapp/test/runTableV2.test.ts new file mode 100644 index 00000000000..7aa528b34a9 --- /dev/null +++ b/apps/webapp/test/runTableV2.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it } from "vitest"; +import { shouldUseV2RunTable } from "~/v3/runTableV2.server"; + +// v2 is gated on the org being served realtime by the NATIVE backend (Electric +// can't observe task_run_v2). That requires the deployment master switch +// (nativeRealtimeEnabled) AND the per-org `realtimeBackend` flag set to "native". +const NATIVE_ON = { nativeRealtimeEnabled: true }; +const NATIVE_OFF = { nativeRealtimeEnabled: false }; +const onNative = (extra: Record<string, unknown> = {}) => ({ realtimeBackend: "native", ...extra }); + +describe("shouldUseV2RunTable", () => { + it("defaults to false when the org has no flags", () => { + expect(shouldUseV2RunTable(null, NATIVE_ON)).toBe(false); + expect(shouldUseV2RunTable(undefined, NATIVE_ON)).toBe(false); + expect(shouldUseV2RunTable({}, NATIVE_ON)).toBe(false); + }); + + it("returns true only when runTableV2 is boolean true AND the org is on native realtime", () => { + expect(shouldUseV2RunTable(onNative({ runTableV2: true }), NATIVE_ON)).toBe(true); + expect(shouldUseV2RunTable(onNative({ runTableV2: false }), NATIVE_ON)).toBe(false); + }); + + it("requires the native realtime backend (Electric can't observe v2 runs)", () => { + // runTableV2 on, but the org is not on native realtime → no v2 (it would be + // realtime-invisible). 
+ expect(shouldUseV2RunTable({ runTableV2: true }, NATIVE_ON)).toBe(false); + expect(shouldUseV2RunTable({ runTableV2: true, realtimeBackend: "electric" }, NATIVE_ON)).toBe( + false + ); + expect(shouldUseV2RunTable({ runTableV2: true, realtimeBackend: "shadow" }, NATIVE_ON)).toBe( + false + ); + // On native per-org, but the deployment master switch is off → effectively + // still Electric → no v2. + expect(shouldUseV2RunTable(onNative({ runTableV2: true }), NATIVE_OFF)).toBe(false); + }); + + it("rejects a stringified flag value (strict boolean, no coercion)", () => { + // A stringified "false" must not coerce to true and cut the org over. + expect(shouldUseV2RunTable(onNative({ runTableV2: "true" }), NATIVE_ON)).toBe(false); + expect(shouldUseV2RunTable(onNative({ runTableV2: "false" }), NATIVE_ON)).toBe(false); + expect(shouldUseV2RunTable(onNative({ runTableV2: 1 }), NATIVE_ON)).toBe(false); + }); + + it("ignores unrelated flags and non-object inputs", () => { + expect(shouldUseV2RunTable(onNative({ mollifierEnabled: true }), NATIVE_ON)).toBe(false); + expect(shouldUseV2RunTable("runTableV2", NATIVE_ON)).toBe(false); + expect(shouldUseV2RunTable(42, NATIVE_ON)).toBe(false); + }); +}); diff --git a/apps/webapp/test/runTableV2Status.test.ts b/apps/webapp/test/runTableV2Status.test.ts new file mode 100644 index 00000000000..786a4dde43d --- /dev/null +++ b/apps/webapp/test/runTableV2Status.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from "vitest"; +import { canMintV2Run, v2RunsMayExist } from "~/v3/runTableV2Status.server"; + +// The module caches its status in a globalThis singleton ("runTableV2Status"). +// Under vitest (NODE_ENV=test) it skips the background poller entirely and +// initializes to { published:false, hasRows:false } — so no live DB query, no +// leaked interval, and nothing races these assertions. Mutate that cached +// object to exercise the gates deterministically. 
+function setStatus(published: boolean, hasRows: boolean) { + const singletons = (globalThis as any).__trigger_singletons; + // Force module init (the singleton is created on first getter call/import). + v2RunsMayExist(false); + singletons.runTableV2Status.published = published; + singletons.runTableV2Status.hasRows = hasRows; +} + +const CUTOVER_FLAGS = { realtimeBackend: "native", runTableV2: true }; + +describe("canMintV2Run (mint gate: org cut over AND task_run_v2 published)", () => { + it("mints v2 only when the org is cut over AND the table is published", () => { + setStatus(true, true); + expect(canMintV2Run(CUTOVER_FLAGS, { nativeRealtimeEnabled: true })).toBe(true); + }); + + it("fails safe to legacy when the org is cut over but the table is NOT published", () => { + setStatus(false, true); + expect(canMintV2Run(CUTOVER_FLAGS, { nativeRealtimeEnabled: true })).toBe(false); + }); + + it("stays legacy when the org is not cut over, even if published", () => { + setStatus(true, true); + expect( + canMintV2Run({ realtimeBackend: "electric", runTableV2: false }, { nativeRealtimeEnabled: true }) + ).toBe(false); + expect(canMintV2Run(CUTOVER_FLAGS, { nativeRealtimeEnabled: false })).toBe(false); + }); +}); + +describe("v2RunsMayExist (read scope: native on OR table has rows)", () => { + it("is true when native realtime is on (v2 being minted now)", () => { + setStatus(false, false); + expect(v2RunsMayExist(true)).toBe(true); + }); + + it("is true when task_run_v2 already has rows even with native OFF (rollback safety)", () => { + setStatus(false, true); + expect(v2RunsMayExist(false)).toBe(true); + }); + + it("is false only when native is off AND no v2 run has ever existed", () => { + setStatus(false, false); + expect(v2RunsMayExist(false)).toBe(false); + }); +}); diff --git a/apps/webapp/test/runsReplicationService.taskRunV2.test.ts b/apps/webapp/test/runsReplicationService.taskRunV2.test.ts new file mode 100644 index 00000000000..af11bf906a7 --- /dev/null +++ 
b/apps/webapp/test/runsReplicationService.taskRunV2.test.ts @@ -0,0 +1,257 @@ +import { ClickHouse } from "@internal/clickhouse"; +import { replicationContainerTest } from "@internal/testcontainers"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; +import { setTimeout } from "node:timers/promises"; +import { z } from "zod"; +import { RunsReplicationService } from "~/services/runsReplicationService.server"; +import { createInMemoryTracing } from "./utils/tracing"; +import { TestReplicationClickhouseFactory } from "./utils/testReplicationClickhouseFactory"; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunsReplicationService (task_run_v2)", () => { + replicationContainerTest( + "co-publishes task_run_v2 and streams its rows to the same ClickHouse table", + async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => { + // Both tables are in the publication; both need FULL identity so the + // delete transform can read the old row. INSERTs (this test) carry the + // full new tuple regardless, but we mirror the production setup. 
+ await prisma.$executeRawUnsafe(`ALTER TABLE public."TaskRun" REPLICA IDENTITY FULL;`); + await prisma.$executeRawUnsafe(`ALTER TABLE public."task_run_v2" REPLICA IDENTITY FULL;`); + + const clickhouse = new ClickHouse({ + url: clickhouseContainer.getConnectionUrl(), + name: "runs-replication", + compression: { request: true }, + logLevel: "warn", + }); + + const { tracer } = createInMemoryTracing(); + + const runsReplicationService = new RunsReplicationService({ + clickhouseFactory: new TestReplicationClickhouseFactory(clickhouse), + pgConnectionUrl: postgresContainer.getConnectionUri(), + serviceName: "runs-replication", + slotName: "task_runs_to_clickhouse_v1", + publicationName: "task_runs_to_clickhouse_v1_publication", + redisOptions, + maxFlushConcurrency: 1, + flushIntervalMs: 100, + flushBatchSize: 1, + leaderLockTimeoutMs: 5000, + leaderLockExtendIntervalMs: 1000, + ackIntervalSeconds: 5, + tracer, + logLevel: "warn", + }); + + await runsReplicationService.start(); + + try { + const organization = await prisma.organization.create({ + data: { title: "test", slug: "test" }, + }); + const project = await prisma.project.create({ + data: { + name: "test", + slug: "test", + organizationId: organization.id, + externalRef: "test", + }, + }); + const runtimeEnvironment = await prisma.runtimeEnvironment.create({ + data: { + slug: "test", + type: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + apiKey: "test", + pkApiKey: "test", + shortcode: "test", + }, + }); + + // A v2 run lives in task_run_v2, keyed by a KSUID id. 
+ const ksuid = RunId.generateKsuid(); + const run = await prisma.taskRunV2.create({ + data: { + id: ksuid.id, + friendlyId: ksuid.friendlyId, + taskIdentifier: "my-task", + payload: JSON.stringify({ foo: "bar" }), + payloadType: "application/json", + traceId: "v2trace", + spanId: "v2span", + queue: "test", + workerQueue: "us-east-1-next", + region: "us-east-1", + planType: "free", + runtimeEnvironmentId: runtimeEnvironment.id, + projectId: project.id, + organizationId: organization.id, + environmentType: "DEVELOPMENT", + engine: "V2", + }, + }); + + const queryRuns = clickhouse.reader.query({ + name: "runs-replication", + query: "SELECT * FROM trigger_dev.task_runs_v2 WHERE run_id = {runId: String}", + schema: z.any(), + params: z.object({ runId: z.string() }), + }); + + // ClickHouse replication is asynchronous: poll until the row lands + // (bounded) instead of a fixed sleep, which is flaky under lag variance. + let queryError: unknown = null; + let result: Array> | undefined; + const deadline = Date.now() + 10_000; + do { + [queryError, result] = await queryRuns({ runId: run.id }); + if (!queryError && result?.length === 1) break; + await setTimeout(200); + } while (Date.now() < deadline); + + expect(queryError).toBeNull(); + expect(result?.length).toBe(1); + expect(result?.[0]).toEqual( + expect.objectContaining({ + run_id: run.id, + friendly_id: run.friendlyId, + task_identifier: "my-task", + environment_id: runtimeEnvironment.id, + project_id: project.id, + organization_id: organization.id, + environment_type: "DEVELOPMENT", + engine: "V2", + }) + ); + } finally { + await runsReplicationService.stop(); + } + } + ); + + replicationContainerTest( + "streams a task_run_v2 DELETE with a complete old row (REPLICA IDENTITY FULL) so the tombstone carries org id", + async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => { + await prisma.$executeRawUnsafe(`ALTER TABLE public."TaskRun" REPLICA IDENTITY FULL;`); + // The migration sets this in 
production; the testcontainer builds via + // db push, so apply it here. Without FULL, the DELETE's old tuple is just + // the PK and organization_id below would be empty (tombstone dropped). + await prisma.$executeRawUnsafe(`ALTER TABLE public."task_run_v2" REPLICA IDENTITY FULL;`); + + const clickhouse = new ClickHouse({ + url: clickhouseContainer.getConnectionUrl(), + name: "runs-replication", + compression: { request: true }, + logLevel: "warn", + }); + + const { tracer } = createInMemoryTracing(); + + const runsReplicationService = new RunsReplicationService({ + clickhouseFactory: new TestReplicationClickhouseFactory(clickhouse), + pgConnectionUrl: postgresContainer.getConnectionUri(), + serviceName: "runs-replication", + slotName: "task_runs_to_clickhouse_v1", + publicationName: "task_runs_to_clickhouse_v1_publication", + redisOptions, + maxFlushConcurrency: 1, + flushIntervalMs: 100, + flushBatchSize: 1, + leaderLockTimeoutMs: 5000, + leaderLockExtendIntervalMs: 1000, + ackIntervalSeconds: 5, + tracer, + logLevel: "warn", + }); + + await runsReplicationService.start(); + + try { + const organization = await prisma.organization.create({ + data: { title: "test", slug: "test" }, + }); + const project = await prisma.project.create({ + data: { name: "test", slug: "test", organizationId: organization.id, externalRef: "test" }, + }); + const runtimeEnvironment = await prisma.runtimeEnvironment.create({ + data: { + slug: "test", + type: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + apiKey: "test", + pkApiKey: "test", + shortcode: "test", + }, + }); + + const ksuid = RunId.generateKsuid(); + const run = await prisma.taskRunV2.create({ + data: { + id: ksuid.id, + friendlyId: ksuid.friendlyId, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceId: "v2del", + spanId: "v2del", + queue: "test", + workerQueue: "us-east-1-next", + region: "us-east-1", + planType: "free", + runtimeEnvironmentId: 
runtimeEnvironment.id, + projectId: project.id, + organizationId: organization.id, + environmentType: "DEVELOPMENT", + engine: "V2", + }, + }); + + const latestRow = clickhouse.reader.query({ + name: "runs-replication", + query: + "SELECT run_id, organization_id, environment_id, _is_deleted FROM trigger_dev.task_runs_v2 WHERE run_id = {runId: String} ORDER BY _version DESC LIMIT 1", + schema: z.any(), + params: z.object({ runId: z.string() }), + }); + + // Wait for the INSERT to land. + let result: Array> | undefined; + let insertDeadline = Date.now() + 10_000; + do { + const [, rows] = await latestRow({ runId: run.id }); + result = rows; + if (result?.length === 1 && Number(result[0]._is_deleted) === 0) break; + await setTimeout(200); + } while (Date.now() < insertDeadline); + expect(result?.length).toBe(1); + + // Delete the v2 run and wait for the tombstone. + await prisma.taskRunV2.delete({ where: { id: run.id } }); + + const deleteDeadline = Date.now() + 10_000; + do { + const [, rows] = await latestRow({ runId: run.id }); + result = rows; + if (result?.length === 1 && Number(result[0]._is_deleted) === 1) break; + await setTimeout(200); + } while (Date.now() < deleteDeadline); + + // The tombstone must carry the full old row (org/env), not just the PK. 
+ expect(Number(result?.[0]?._is_deleted)).toBe(1); + expect(result?.[0]).toEqual( + expect.objectContaining({ + run_id: run.id, + organization_id: organization.id, + environment_id: runtimeEnvironment.id, + }) + ); + } finally { + await runsReplicationService.stop(); + } + } + ); +}); diff --git a/apps/webapp/test/updateMetadata.test.ts b/apps/webapp/test/updateMetadata.test.ts index b78a1a50a9f..7d5314a1d45 100644 --- a/apps/webapp/test/updateMetadata.test.ts +++ b/apps/webapp/test/updateMetadata.test.ts @@ -1,5 +1,6 @@ import { containerTest } from "@internal/testcontainers"; import { parsePacket } from "@trigger.dev/core/v3"; +import { isKsuidId, RunId } from "@trigger.dev/core/v3/isomorphic"; import { setTimeout } from "timers/promises"; import { describe } from "vitest"; import { PostgresRunStore } from "@internal/run-store"; @@ -1291,4 +1292,119 @@ describe("UpdateMetadataService.call", () => { service.stopFlushing(); } ); + + containerTest( + "routes parent metadata operations to a parent in the OTHER run table (cross-table hierarchy)", + async ({ prisma }) => { + const service = new UpdateMetadataService({ + prisma, + runStore: new PostgresRunStore({ prisma, readOnlyPrisma: prisma }), + flushIntervalMs: 100, + flushEnabled: true, + flushLoggingEnabled: true, + maximumSize: 1024 * 1024 * 1, + logLevel: "debug", + }); + + try { + const organization = await prisma.organization.create({ + data: { title: "test", slug: "test" }, + }); + const project = await prisma.project.create({ + data: { + name: "test", + slug: "test", + organizationId: organization.id, + externalRef: "test", + }, + }); + const runtimeEnvironment = await prisma.runtimeEnvironment.create({ + data: { + slug: "test", + type: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + apiKey: "test", + pkApiKey: "test", + shortcode: "test", + }, + }); + + // Legacy parent (cuid id) lives in TaskRun. 
This is the mixed-window + // hierarchy: an org flips runTableV2 on while a pre-flip parent is live, + // and its post-flip child mints a ksuid into task_run_v2. + const parentId = RunId.generate(); + expect(isKsuidId(parentId.id)).toBe(false); + const parentTaskRun = await prisma.taskRun.create({ + data: { + id: parentId.id, + friendlyId: parentId.friendlyId, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceId: "t", + spanId: "s", + queue: "test", + runtimeEnvironmentId: runtimeEnvironment.id, + projectId: project.id, + organizationId: organization.id, + environmentType: "DEVELOPMENT", + engine: "V2", + }, + }); + + // v2 child (ksuid id) lives in task_run_v2 and points at the legacy + // parent by the scalar parentTaskRunId (no cross-table FK). + const childId = RunId.generateKsuid(); + expect(isKsuidId(childId.id)).toBe(true); + await prisma.taskRunV2.create({ + data: { + id: childId.id, + friendlyId: childId.friendlyId, + taskIdentifier: "my-child-task", + payload: "{}", + payloadType: "application/json", + traceId: "t", + spanId: "s", + queue: "test", + runtimeEnvironmentId: runtimeEnvironment.id, + projectId: project.id, + organizationId: organization.id, + environmentType: "DEVELOPMENT", + engine: "V2", + parentTaskRunId: parentTaskRun.id, + }, + }); + + // The child applies metadata.parent operations. Pre-fix, the table-bound + // parentTaskRun relation resolved null (parent is in the OTHER table), so + // the ops fell back to the child's own id — corrupting the child and + // never touching the parent. + await service.call(childId.id, { + parentOperations: [ + { type: "set", key: "foo", value: "bar" }, + { type: "append", key: "bar", value: "baz" }, + ], + }); + + // Wait for the buffered operations to flush. + await setTimeout(1000); + + // The PARENT (in TaskRun) must have received the operations. 
+ const updatedParent = await prisma.taskRun.findFirst({ where: { id: parentTaskRun.id } }); + expect( + await parsePacket({ + data: updatedParent?.metadata ?? undefined, + dataType: updatedParent?.metadataType ?? "application/json", + }) + ).toEqual({ foo: "bar", bar: ["baz"] }); + + // The CHILD (in task_run_v2) must NOT have been polluted with parent ops. + const updatedChild = await prisma.taskRunV2.findFirst({ where: { id: childId.id } }); + expect(updatedChild?.metadata ?? null).toBeNull(); + } finally { + service.stopFlushing(); + } + } + ); }); diff --git a/apps/webapp/test/utils/replicationUtils.ts b/apps/webapp/test/utils/replicationUtils.ts index 358da0c2cf6..713bd242892 100644 --- a/apps/webapp/test/utils/replicationUtils.ts +++ b/apps/webapp/test/utils/replicationUtils.ts @@ -17,6 +17,10 @@ export async function setupClickhouseReplication({ redisOptions: RedisOptions; }) { await prisma.$executeRawUnsafe(`ALTER TABLE public."TaskRun" REPLICA IDENTITY FULL;`); + // task_run_v2 is co-published with TaskRun; it needs FULL identity too so + // UPDATE/DELETE WAL events carry the old row (the delete transform reads + // organizationId/environmentType off it). Mirrors the TaskRun line above. 
+ await prisma.$executeRawUnsafe(`ALTER TABLE public."task_run_v2" REPLICA IDENTITY FULL;`); const clickhouse = new ClickHouse({ url: clickhouseUrl, diff --git a/internal-packages/database/prisma/migrations/20260616151544_create_task_run_v2/migration.sql b/internal-packages/database/prisma/migrations/20260616151544_create_task_run_v2/migration.sql new file mode 100644 index 00000000000..22a8bcf2293 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260616151544_create_task_run_v2/migration.sql @@ -0,0 +1,121 @@ +-- CreateTable +CREATE TABLE "public"."task_run_v2" ( + "id" TEXT NOT NULL, + "number" INTEGER NOT NULL DEFAULT 0, + "friendlyId" TEXT NOT NULL, + "engine" "public"."RunEngineVersion" NOT NULL DEFAULT 'V1', + "status" "public"."TaskRunStatus" NOT NULL DEFAULT 'PENDING', + "statusReason" TEXT, + "idempotencyKey" TEXT, + "idempotencyKeyExpiresAt" TIMESTAMP(3), + "idempotencyKeyOptions" JSONB, + "debounce" JSONB, + "taskIdentifier" TEXT NOT NULL, + "isTest" BOOLEAN NOT NULL DEFAULT false, + "payload" TEXT NOT NULL, + "payloadType" TEXT NOT NULL DEFAULT 'application/json', + "context" JSONB, + "traceContext" JSONB, + "traceId" TEXT NOT NULL, + "spanId" TEXT NOT NULL, + "runtimeEnvironmentId" TEXT NOT NULL, + "environmentType" "public"."RuntimeEnvironmentType", + "projectId" TEXT NOT NULL, + "organizationId" TEXT, + "queue" TEXT NOT NULL, + "lockedQueueId" TEXT, + "masterQueue" TEXT NOT NULL DEFAULT 'main', + "region" TEXT, + "secondaryMasterQueue" TEXT, + "attemptNumber" INTEGER, + "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP(3) NOT NULL, + "runTags" TEXT[], + "taskVersion" TEXT, + "sdkVersion" TEXT, + "cliVersion" TEXT, + "startedAt" TIMESTAMP(3), + "executedAt" TIMESTAMP(3), + "completedAt" TIMESTAMP(3), + "machinePreset" TEXT, + "usageDurationMs" INTEGER NOT NULL DEFAULT 0, + "costInCents" DOUBLE PRECISION NOT NULL DEFAULT 0, + "baseCostInCents" DOUBLE PRECISION NOT NULL DEFAULT 0, + "lockedAt" 
TIMESTAMP(3), + "lockedById" TEXT, + "lockedToVersionId" TEXT, + "priorityMs" INTEGER NOT NULL DEFAULT 0, + "concurrencyKey" TEXT, + "delayUntil" TIMESTAMP(3), + "queuedAt" TIMESTAMP(3), + "ttl" TEXT, + "expiredAt" TIMESTAMP(3), + "maxAttempts" INTEGER, + "lockedRetryConfig" JSONB, + "oneTimeUseToken" TEXT, + "taskEventStore" TEXT NOT NULL DEFAULT 'taskEvent', + "queueTimestamp" TIMESTAMP(3), + "scheduleInstanceId" TEXT, + "scheduleId" TEXT, + "bulkActionGroupIds" TEXT[] DEFAULT ARRAY[]::TEXT[], + "logsDeletedAt" TIMESTAMP(3), + "replayedFromTaskRunFriendlyId" TEXT, + "rootTaskRunId" TEXT, + "parentTaskRunId" TEXT, + "parentTaskRunAttemptId" TEXT, + "batchId" TEXT, + "resumeParentOnCompletion" BOOLEAN NOT NULL DEFAULT false, + "depth" INTEGER NOT NULL DEFAULT 0, + "parentSpanId" TEXT, + "runChainState" JSONB, + "seedMetadata" TEXT, + "seedMetadataType" TEXT NOT NULL DEFAULT 'application/json', + "metadata" TEXT, + "metadataType" TEXT NOT NULL DEFAULT 'application/json', + "metadataVersion" INTEGER NOT NULL DEFAULT 1, + "annotations" JSONB, + "isWarmStart" BOOLEAN, + "output" TEXT, + "outputType" TEXT NOT NULL DEFAULT 'application/json', + "error" JSONB, + "planType" TEXT, + "maxDurationInSeconds" INTEGER, + "realtimeStreamsVersion" TEXT NOT NULL DEFAULT 'v1', + "realtimeStreams" TEXT[] DEFAULT ARRAY[]::TEXT[], + "streamBasinName" TEXT, + + CONSTRAINT "task_run_v2_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE UNIQUE INDEX "task_run_v2_friendlyId_key" ON "public"."task_run_v2"("friendlyId"); + +-- CreateIndex +CREATE INDEX "task_run_v2_parentTaskRunId_idx" ON "public"."task_run_v2"("parentTaskRunId"); + +-- CreateIndex +CREATE INDEX "task_run_v2_spanId_idx" ON "public"."task_run_v2"("spanId"); + +-- CreateIndex +CREATE INDEX "task_run_v2_parentSpanId_idx" ON "public"."task_run_v2"("parentSpanId"); + +-- CreateIndex +CREATE INDEX "task_run_v2_runTags_idx" ON "public"."task_run_v2" USING GIN ("runTags" array_ops); + +-- CreateIndex +CREATE INDEX 
"task_run_v2_runtimeEnvironmentId_batchId_idx" ON "public"."task_run_v2"("runtimeEnvironmentId", "batchId"); + +-- CreateIndex +CREATE INDEX "task_run_v2_runtimeEnvironmentId_createdAt_idx" ON "public"."task_run_v2"("runtimeEnvironmentId", "createdAt" DESC); + +-- CreateIndex +CREATE INDEX "task_run_v2_createdAt_idx" ON "public"."task_run_v2" USING BRIN ("createdAt"); + +-- CreateIndex +CREATE INDEX "task_run_v2_createdAt_id_idx" ON "public"."task_run_v2"("createdAt", "id"); + +-- CreateIndex +CREATE UNIQUE INDEX "task_run_v2_oneTimeUseToken_key" ON "public"."task_run_v2"("oneTimeUseToken"); + +-- CreateIndex +CREATE UNIQUE INDEX "task_run_v2_runtimeEnvironmentId_taskIdentifier_idempotency_key" ON "public"."task_run_v2"("runtimeEnvironmentId", "taskIdentifier", "idempotencyKey"); diff --git a/internal-packages/database/prisma/migrations/20260619120042_drop_taskrun_incoming_fks/migration.sql b/internal-packages/database/prisma/migrations/20260619120042_drop_taskrun_incoming_fks/migration.sql new file mode 100644 index 00000000000..9e7313aade9 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260619120042_drop_taskrun_incoming_fks/migration.sql @@ -0,0 +1,17 @@ +-- Drop all foreign key constraints that reference TaskRun.id from child tables +-- (no schema change, data intact). Integrity moves to app code so a child row +-- can reference a run in either TaskRun (legacy) or task_run_v2 (new) by scalar. 
+ALTER TABLE "public"."TaskRunAttempt" DROP CONSTRAINT IF EXISTS "TaskRunAttempt_taskRunId_fkey"; +ALTER TABLE "public"."TaskRunDependency" DROP CONSTRAINT IF EXISTS "TaskRunDependency_taskRunId_fkey"; +ALTER TABLE "public"."BatchTaskRunItem" DROP CONSTRAINT IF EXISTS "BatchTaskRunItem_taskRunId_fkey"; +ALTER TABLE "public"."Checkpoint" DROP CONSTRAINT IF EXISTS "Checkpoint_runId_fkey"; +ALTER TABLE "public"."CheckpointRestoreEvent" DROP CONSTRAINT IF EXISTS "CheckpointRestoreEvent_runId_fkey"; +ALTER TABLE "public"."ProjectAlert" DROP CONSTRAINT IF EXISTS "ProjectAlert_taskRunId_fkey"; +ALTER TABLE "public"."BulkActionItem" DROP CONSTRAINT IF EXISTS "BulkActionItem_sourceRunId_fkey"; +ALTER TABLE "public"."BulkActionItem" DROP CONSTRAINT IF EXISTS "BulkActionItem_destinationRunId_fkey"; +ALTER TABLE "public"."_TaskRunToTaskRunTag" DROP CONSTRAINT IF EXISTS "_TaskRunToTaskRunTag_A_fkey"; +ALTER TABLE "public"."TaskRunExecutionSnapshot" DROP CONSTRAINT IF EXISTS "TaskRunExecutionSnapshot_runId_fkey"; +ALTER TABLE "public"."Waitpoint" DROP CONSTRAINT IF EXISTS "Waitpoint_completedByTaskRunId_fkey"; +ALTER TABLE "public"."TaskRunWaitpoint" DROP CONSTRAINT IF EXISTS "TaskRunWaitpoint_taskRunId_fkey"; +ALTER TABLE "public"."_WaitpointRunConnections" DROP CONSTRAINT IF EXISTS "_WaitpointRunConnections_A_fkey"; +ALTER TABLE "public"."PlaygroundConversation" DROP CONSTRAINT IF EXISTS "PlaygroundConversation_runId_fkey"; diff --git a/internal-packages/database/prisma/migrations/20260622120000_task_run_v2_replica_identity_full/migration.sql b/internal-packages/database/prisma/migrations/20260622120000_task_run_v2_replica_identity_full/migration.sql new file mode 100644 index 00000000000..56f189efa99 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260622120000_task_run_v2_replica_identity_full/migration.sql @@ -0,0 +1,9 @@ +-- task_run_v2 is co-published to ClickHouse alongside TaskRun via logical +-- replication. 
Replication needs REPLICA IDENTITY FULL so UPDATE/DELETE WAL +-- events carry the full OLD row (organizationId, environmentType, ...) that the +-- ClickHouse transform requires. Without it, a v2 run DELETE ships only the +-- primary key, organizationId is undefined, and the run's ClickHouse +-- soft-delete tombstone is silently dropped (the deleted run lingers in +-- analytics). TaskRun is configured the same way; this pins it deterministically +-- for task_run_v2 rather than relying on an out-of-band ops step. +ALTER TABLE "public"."task_run_v2" REPLICA IDENTITY FULL; diff --git a/internal-packages/database/prisma/migrations/20260623090000_task_run_v2_covering_index/migration.sql b/internal-packages/database/prisma/migrations/20260623090000_task_run_v2_covering_index/migration.sql new file mode 100644 index 00000000000..e8f480e39a8 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20260623090000_task_run_v2_covering_index/migration.sql @@ -0,0 +1,15 @@ +-- Bring task_run_v2's run-list index to parity with TaskRun's +-- (TaskRun_runtimeEnvironmentId_createdAt_idx, added in migration +-- 20250611080322): add the INCLUDE (id) covering column and fillfactor 90 so the +-- dashboard run-list query keeps index-only scans and the same page packing once +-- v2 carries volume. Without this, v2 run-list reads do heap fetches the legacy +-- table avoids. +-- +-- task_run_v2 is empty until an org cuts over to v2 run ids (gated on the native +-- realtime backend), and this migration deploys before any opt-in, so the +-- DROP/CREATE is effectively instant and runs safely inside the migration +-- transaction (no CONCURRENTLY needed, unlike the original TaskRun migration +-- which ran against a populated table). 
+DROP INDEX IF EXISTS "public"."task_run_v2_runtimeEnvironmentId_createdAt_idx"; + +CREATE INDEX "task_run_v2_runtimeEnvironmentId_createdAt_idx" ON "public"."task_run_v2"("runtimeEnvironmentId", "createdAt" DESC) INCLUDE ("id") WITH (fillfactor = 90); diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma index bb80da3a7ec..95c5b4f3bca 100644 --- a/internal-packages/database/prisma/schema.prisma +++ b/internal-packages/database/prisma/schema.prisma @@ -366,6 +366,7 @@ model RuntimeEnvironment { backgroundWorkers BackgroundWorker[] backgroundWorkerTasks BackgroundWorkerTask[] taskRuns TaskRun[] + taskRunsV2 TaskRunV2[] @relation("taskRunsV2") taskQueues TaskQueue[] batchTaskRuns BatchTaskRun[] environmentVariableValues EnvironmentVariableValue[] @@ -453,6 +454,7 @@ model Project { backgroundWorkers BackgroundWorker[] backgroundWorkerTasks BackgroundWorkerTask[] taskRuns TaskRun[] + taskRunsV2 TaskRunV2[] @relation("taskRunsV2") runTags TaskRunTag[] taskQueues TaskQueue[] environmentVariables EnvironmentVariable[] @@ -560,6 +562,7 @@ model BackgroundWorker { tasks BackgroundWorkerTask[] attempts TaskRunAttempt[] lockedRuns TaskRun[] + lockedRunsV2 TaskRunV2[] @relation("lockedRunsV2") files BackgroundWorkerFile[] queues TaskQueue[] promptVersions PromptVersion[] @@ -695,6 +698,7 @@ model BackgroundWorkerTask { attempts TaskRunAttempt[] runs TaskRun[] + runsV2 TaskRunV2[] @relation("lockedRunsV2") queueConfig Json? retryConfig Json? @@ -742,7 +746,9 @@ model PlaygroundConversation { /// The current active run backing this conversation (null if no run yet) runId String? - run TaskRun? @relation(fields: [runId], references: [id], onDelete: SetNull, onUpdate: Cascade) + run TaskRun? @relation(fields: [runId], references: [id], onDelete: SetNull, onUpdate: Cascade, map: "PlaygroundConversation_runId_fkey") + /// Mirror relation to TaskRunV2 reusing the same runId scalar (FK stripped in prod) + runV2 TaskRunV2?
@relation("playgroundConversationsV2", fields: [runId], references: [id], onDelete: SetNull, onUpdate: Cascade, map: "PlaygroundConversation_runId_v2_fkey") /// The client data JSON used for this conversation clientData Json? @@ -1095,6 +1101,238 @@ model TaskRun { @@index([createdAt], type: Brin) } +/// Parallel mirror of TaskRun, mapped to the task_run_v2 table. +/// Copies TaskRun's scalar columns and declares V2 relation fields (with mirror +/// relations on the related models), but the create_task_run_v2 migration adds +/// no foreign-key constraints: FK id columns are plain scalars in the database +/// and integrity is enforced in app code, matching TaskRun's FK-free state. +model TaskRunV2 { + id String @id @default(cuid()) + + number Int @default(0) + friendlyId String @unique + + engine RunEngineVersion @default(V1) + + status TaskRunStatus @default(PENDING) + statusReason String? + + idempotencyKey String? + idempotencyKeyExpiresAt DateTime? + /// Stores the user-provided key and scope: { key: string, scope: "run" | "attempt" | "global" } + idempotencyKeyOptions Json? + + /// Debounce options: { key: string, delay: string, createdAt: Date } + debounce Json? + + taskIdentifier String + + isTest Boolean @default(false) + + payload String + payloadType String @default("application/json") + context Json? + traceContext Json? + + traceId String + spanId String + + runtimeEnvironment RuntimeEnvironment @relation("taskRunsV2", fields: [runtimeEnvironmentId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "task_run_v2_runtimeEnvironmentId_fkey") + runtimeEnvironmentId String + + environmentType RuntimeEnvironmentType? + + project Project @relation("taskRunsV2", fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "task_run_v2_projectId_fkey") + projectId String + + organizationId String? + + // The specific queue this run is in + queue String + // The queueId is set when the run is locked to a specific queue + lockedQueueId String?
+ + /// The main queue that this run is part of + workerQueue String @default("main") @map("masterQueue") + + /// User-facing geo region, stamped at trigger; workerQueue is where it actually ran. + region String? + + /// @deprecated + secondaryMasterQueue String? + + /// From engine v2+ this will be defined after a run has been dequeued (starting at 1) + attemptNumber Int? + + createdAt DateTime @default(now()) + updatedAt DateTime @updatedAt + + attempts TaskRunAttempt[] @relation("attemptsV2") + + /// Denormalized column that holds the raw tags + runTags String[] + + /// Denormalized version of the background worker task + taskVersion String? + sdkVersion String? + cliVersion String? + + checkpoints Checkpoint[] @relation("checkpointsV2") + + /// startedAt marks the point at which a run is dequeued from MarQS + startedAt DateTime? + /// executedAt is set when the first attempt is about to execute + executedAt DateTime? + completedAt DateTime? + machinePreset String? + + usageDurationMs Int @default(0) + costInCents Float @default(0) + baseCostInCents Float @default(0) + + lockedAt DateTime? + lockedBy BackgroundWorkerTask? @relation("lockedRunsV2", fields: [lockedById], references: [id], map: "task_run_v2_lockedById_fkey") + lockedById String? + + lockedToVersion BackgroundWorker? @relation("lockedRunsV2", fields: [lockedToVersionId], references: [id], map: "task_run_v2_lockedToVersionId_fkey") + lockedToVersionId String? + + /// The "priority" of the run. This is just a negative offset in ms for the queue timestamp + /// E.g. a value of 60_000 would put the run into the queue 60s ago. + priorityMs Int @default(0) + + concurrencyKey String? + + delayUntil DateTime? + queuedAt DateTime? + ttl String? + expiredAt DateTime? + maxAttempts Int? + lockedRetryConfig Json? + + /// optional token that can be used to authenticate the task run + oneTimeUseToken String? + + /// When this run is finished, the waitpoint will be marked as completed + associatedWaitpoint Waitpoint?
@relation("CompletingRunV2") + + /// If there are any blocked waitpoints, the run won't be executed + blockedByWaitpoints TaskRunWaitpoint[] @relation("taskRunWaitpointsV2") + + /// Where the logs are stored + taskEventStore String @default("taskEvent") + + queueTimestamp DateTime? + + batchItems BatchTaskRunItem[] @relation("batchItemsV2") + dependency TaskRunDependency? @relation("dependencyV2") + CheckpointRestoreEvent CheckpointRestoreEvent[] @relation("checkpointRestoreEventsV2") + executionSnapshots TaskRunExecutionSnapshot[] @relation("executionSnapshotsV2") + + alerts ProjectAlert[] @relation("alertsV2") + + scheduleInstanceId String? + scheduleId String? + + bulkActionGroupIds String[] @default([]) + + logsDeletedAt DateTime? + + replayedFromTaskRunFriendlyId String? + + /// This represents the original task that was triggered outside of a Trigger.dev task (self-relation: resolves only a root stored in task_run_v2; a legacy TaskRun root must be looked up via rootTaskRunId) + rootTaskRun TaskRunV2? @relation("TaskRootRunV2", fields: [rootTaskRunId], references: [id], onDelete: SetNull, onUpdate: NoAction, map: "task_run_v2_rootTaskRunId_fkey") + rootTaskRunId String? + + /// The root run will have a list of all the descendant runs, children, grand children, etc. Only descendants stored in task_run_v2 are included. + descendantRuns TaskRunV2[] @relation("TaskRootRunV2") + + /// The immediate parent run of this task run (self-relation: resolves only a parent stored in task_run_v2; a legacy TaskRun parent must be looked up via parentTaskRunId) + parentTaskRun TaskRunV2? @relation("TaskParentRunV2", fields: [parentTaskRunId], references: [id], onDelete: SetNull, onUpdate: NoAction, map: "task_run_v2_parentTaskRunId_fkey") + parentTaskRunId String? + + /// The immediate child runs of this task run (only children stored in task_run_v2) + childRuns TaskRunV2[] @relation("TaskParentRunV2") + + /// The immediate parent attempt of this task run + parentTaskRunAttempt TaskRunAttempt? @relation("TaskParentRunAttemptV2", fields: [parentTaskRunAttemptId], references: [id], onDelete: SetNull, onUpdate: NoAction, map: "task_run_v2_parentTaskRunAttemptId_fkey") + parentTaskRunAttemptId String? + + /// The batch run that this task run is a part of + batch BatchTaskRun?
@relation("batchRunsV2", fields: [batchId], references: [id], onDelete: SetNull, onUpdate: NoAction, map: "task_run_v2_batchId_fkey") + batchId String? + + /// whether or not the task run was created because of a triggerAndWait or batchTriggerAndWait + resumeParentOnCompletion Boolean @default(false) + + /// The depth of this task run in the task run hierarchy + depth Int @default(0) + + /// The span ID of the "trigger" span in the parent task run + parentSpanId String? + + /// Holds the state of the run chain for deadlock detection + runChainState Json? + + /// seed run metadata + seedMetadata String? + seedMetadataType String @default("application/json") + + /// Run metadata + metadata String? + metadataType String @default("application/json") + metadataVersion Int @default(1) + + /// Structured annotations: triggerSource, triggerAction, rootTriggerSource, rootScheduleId + annotations Json? + + /// Whether the latest attempt was a warm start. Null until first attempt starts. + isWarmStart Boolean? + + /// Run output + output String? + outputType String @default("application/json") + + /// Run error + error Json? + + /// Organization's billing plan type (cached for fallback when billing API fails) + planType String? + + maxDurationInSeconds Int? + + /// The version of the realtime streams implementation used by the run + realtimeStreamsVersion String @default("v1") + /// Store the stream keys that are being used by the run + realtimeStreams String[] @default([]) + /// S2 basin where this run's realtime streams live. Stamped at create + /// time from `Organization.streamBasinName` so reads can resolve the + /// basin without joining org. Null when the org has no per-org basin + /// (OSS, or pre-backfill); reads fall back to the global basin. + streamBasinName String?
+ + sourceBulkActionItems BulkActionItem[] @relation("SourceActionItemRunV2") + destinationBulkActionItems BulkActionItem[] @relation("DestinationActionItemRunV2") + + playgroundConversations PlaygroundConversation[] @relation("playgroundConversationsV2") + + @@unique([oneTimeUseToken]) + @@unique([runtimeEnvironmentId, taskIdentifier, idempotencyKey]) + // Finding child runs + @@index([parentTaskRunId]) + // Run page inspector + @@index([spanId]) + @@index([parentSpanId]) + // Finding runs in a batch + @@index([runTags(ops: ArrayOps)], type: Gin) + @@index([runtimeEnvironmentId, batchId]) + @@index([runtimeEnvironmentId, createdAt(sort: Desc)]) + @@index([createdAt], type: Brin) + // Keyset cursor for merged pagination across run tables + @@index([createdAt, id]) + @@map("task_run_v2") +} + model TaskRunTemplate { id String @id @default(cuid()) @@ -1215,7 +1453,9 @@ model TaskRunExecutionSnapshot { /// Run runId String - run TaskRun @relation(fields: [runId], references: [id]) + run TaskRun @relation(fields: [runId], references: [id], map: "TaskRunExecutionSnapshot_runId_fkey") + /// Mirror relation to TaskRunV2 reusing the same runId scalar (FK stripped in prod) + runV2 TaskRunV2? @relation("executionSnapshotsV2", fields: [runId], references: [id], map: "TaskRunExecutionSnapshot_runId_v2_fkey") runStatus TaskRunStatus // Batch @@ -1335,7 +1575,9 @@ model Waitpoint { /// If it's a RUN type waitpoint, this is the associated run completedByTaskRunId String? @unique - completedByTaskRun TaskRun? @relation("CompletingRun", fields: [completedByTaskRunId], references: [id], onDelete: SetNull) + completedByTaskRun TaskRun? @relation("CompletingRun", fields: [completedByTaskRunId], references: [id], onDelete: SetNull, map: "Waitpoint_completedByTaskRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same completedByTaskRunId scalar (FK stripped in prod) + completedByTaskRunV2 TaskRunV2? 
@relation("CompletingRunV2", fields: [completedByTaskRunId], references: [id], onDelete: SetNull, map: "Waitpoint_completedByTaskRunId_v2_fkey") /// If it's a DATETIME type waitpoint, this is the date. /// If it's a MANUAL waitpoint, this can be set as the `timeout`. @@ -1349,7 +1591,7 @@ model Waitpoint { blockingTaskRuns TaskRunWaitpoint[] /// All runs that have ever been blocked by this waitpoint, used for display purposes - connectedRuns TaskRun[] @relation("WaitpointRunConnections") + connectedRuns TaskRun[] @relation("WaitpointRunConnections") /// When a waitpoint is complete completedExecutionSnapshots TaskRunExecutionSnapshot[] @relation("completedWaitpoints") @@ -1400,7 +1642,9 @@ enum WaitpointStatus { model TaskRunWaitpoint { id String @id @default(cuid()) - taskRun TaskRun @relation(fields: [taskRunId], references: [id]) + taskRun TaskRun @relation(fields: [taskRunId], references: [id], map: "TaskRunWaitpoint_taskRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same taskRunId scalar (FK stripped in prod) + taskRunV2 TaskRunV2? @relation("taskRunWaitpointsV2", fields: [taskRunId], references: [id], map: "TaskRunWaitpoint_taskRunId_v2_fkey") taskRunId String waitpoint Waitpoint @relation(fields: [waitpointId], references: [id]) @@ -1564,7 +1808,7 @@ model TaskRunTag { friendlyId String @unique - runs TaskRun[] + runs TaskRun[] project Project @relation(fields: [projectId], references: [id], onDelete: Cascade, onUpdate: Cascade) projectId String @@ -1581,7 +1825,9 @@ model TaskRunDependency { id String @id @default(cuid()) /// The child run - taskRun TaskRun @relation(fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade) + taskRun TaskRun @relation(fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "TaskRunDependency_taskRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same taskRunId scalar (FK stripped in prod) + taskRunV2 TaskRunV2? 
@relation("dependencyV2", fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "TaskRunDependency_taskRunId_v2_fkey") taskRunId String @unique checkpointEvent CheckpointRestoreEvent? @relation(fields: [checkpointEventId], references: [id], onDelete: Cascade, onUpdate: Cascade) @@ -1629,7 +1875,9 @@ model TaskRunAttempt { friendlyId String @unique - taskRun TaskRun @relation("attempts", fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade) + taskRun TaskRun @relation("attempts", fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "TaskRunAttempt_taskRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same taskRunId scalar (FK stripped in prod) + taskRunV2 TaskRunV2? @relation("attemptsV2", fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "TaskRunAttempt_taskRunId_v2_fkey") taskRunId String backgroundWorker BackgroundWorker @relation(fields: [backgroundWorkerId], references: [id], onDelete: Cascade, onUpdate: Cascade) @@ -1666,6 +1914,7 @@ model TaskRunAttempt { CheckpointRestoreEvent CheckpointRestoreEvent[] alerts ProjectAlert[] childRuns TaskRun[] @relation("TaskParentRunAttempt") + childRunsV2 TaskRunV2[] @relation("TaskParentRunAttemptV2") @@unique([taskRunId, number]) @@index([taskRunId]) @@ -1867,6 +2116,7 @@ model BatchTaskRun { runtimeEnvironmentId String /// This only includes new runs, not idempotent runs. 
runs TaskRun[] + runsV2 TaskRunV2[] @relation("batchRunsV2") createdAt DateTime @default(now()) updatedAt DateTime @updatedAt @@ -1950,7 +2200,9 @@ model BatchTaskRunItem { batchTaskRun BatchTaskRun @relation(fields: [batchTaskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade) batchTaskRunId String - taskRun TaskRun @relation(fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade) + taskRun TaskRun @relation(fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "BatchTaskRunItem_taskRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same taskRunId scalar (FK stripped in prod) + taskRunV2 TaskRunV2? @relation("batchItemsV2", fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "BatchTaskRunItem_taskRunId_v2_fkey") taskRunId String taskRunAttempt TaskRunAttempt? @relation(fields: [taskRunAttemptId], references: [id], onDelete: SetNull, onUpdate: Cascade) @@ -2045,7 +2297,9 @@ model Checkpoint { events CheckpointRestoreEvent[] - run TaskRun @relation(fields: [runId], references: [id], onDelete: Cascade, onUpdate: Cascade) + run TaskRun @relation(fields: [runId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "Checkpoint_runId_fkey") + /// Mirror relation to TaskRunV2 reusing the same runId scalar (FK stripped in prod) + runV2 TaskRunV2? 
@relation("checkpointsV2", fields: [runId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "Checkpoint_runId_v2_fkey") runId String attempt TaskRunAttempt @relation(fields: [attemptId], references: [id], onDelete: Cascade, onUpdate: Cascade) @@ -2080,7 +2334,9 @@ model CheckpointRestoreEvent { checkpoint Checkpoint @relation(fields: [checkpointId], references: [id], onDelete: Cascade, onUpdate: Cascade) checkpointId String - run TaskRun @relation(fields: [runId], references: [id], onDelete: Cascade, onUpdate: Cascade) + run TaskRun @relation(fields: [runId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "CheckpointRestoreEvent_runId_fkey") + /// Mirror relation to TaskRunV2 reusing the same runId scalar (FK stripped in prod) + runV2 TaskRunV2? @relation("checkpointRestoreEventsV2", fields: [runId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "CheckpointRestoreEvent_runId_v2_fkey") runId String attempt TaskRunAttempt @relation(fields: [attemptId], references: [id], onDelete: Cascade, onUpdate: Cascade) @@ -2366,7 +2622,9 @@ model ProjectAlert { taskRunAttempt TaskRunAttempt? @relation(fields: [taskRunAttemptId], references: [id], onDelete: Cascade, onUpdate: Cascade) taskRunAttemptId String? - taskRun TaskRun? @relation(fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade) + taskRun TaskRun? @relation(fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "ProjectAlert_taskRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same taskRunId scalar (FK stripped in prod) + taskRunV2 TaskRunV2? @relation("alertsV2", fields: [taskRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "ProjectAlert_taskRunId_v2_fkey") taskRunId String? workerDeployment WorkerDeployment? 
@relation(fields: [workerDeploymentId], references: [id], onDelete: Cascade, onUpdate: Cascade) @@ -2547,11 +2805,15 @@ model BulkActionItem { status BulkActionItemStatus @default(PENDING) /// The run that is the source of the action, e.g. when replaying this is the original run - sourceRun TaskRun @relation("SourceActionItemRun", fields: [sourceRunId], references: [id], onDelete: Cascade, onUpdate: Cascade) + sourceRun TaskRun @relation("SourceActionItemRun", fields: [sourceRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "BulkActionItem_sourceRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same sourceRunId scalar (FK stripped in prod) + sourceRunV2 TaskRunV2? @relation("SourceActionItemRunV2", fields: [sourceRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "BulkActionItem_sourceRunId_v2_fkey") sourceRunId String /// The run that's a result of the action, this will be set when the run has been created - destinationRun TaskRun? @relation("DestinationActionItemRun", fields: [destinationRunId], references: [id], onDelete: Cascade, onUpdate: Cascade) + destinationRun TaskRun? @relation("DestinationActionItemRun", fields: [destinationRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "BulkActionItem_destinationRunId_fkey") + /// Mirror relation to TaskRunV2 reusing the same destinationRunId scalar (FK stripped in prod) + destinationRunV2 TaskRunV2? @relation("DestinationActionItemRunV2", fields: [destinationRunId], references: [id], onDelete: Cascade, onUpdate: Cascade, map: "BulkActionItem_destinationRunId_v2_fkey") destinationRunId String? error String? 
diff --git a/internal-packages/replication/src/client.ts b/internal-packages/replication/src/client.ts index 1a7ddb27236..562bccb5d6d 100644 --- a/internal-packages/replication/src/client.ts +++ b/internal-packages/replication/src/client.ts @@ -23,6 +23,14 @@ export interface LogicalReplicationClientOptions { * The table to replicate (for publication creation). */ table: string; + /** + * Additional tables to co-publish into the same publication. Their WAL + * events stream through the same `data` handler as `table`, so use this only + * when the extra tables share `table`'s row shape and downstream transform + * (e.g. a parallel clone table). On startup they are added to an existing + * publication via ALTER PUBLICATION ... ADD TABLE. + */ + additionalTables?: string[]; /** * The name of the replication slot to use. */ @@ -299,6 +307,8 @@ export class LogicalReplicationClient { startLsn, }); + await this.#warnOnWeakReplicaIdentity(); + const slotCreated = await this.#createSlot(); if (!slotCreated) { @@ -407,6 +417,15 @@ export class LogicalReplicationClient { return this; } + // The full set of tables this client publishes: the primary `table` plus any + // `additionalTables`. Order is stable so the publication's FOR TABLE clause is + // deterministic. + #allTables(): string[] { + return this.options.additionalTables + ? [this.options.table, ...this.options.additionalTables] + : [this.options.table]; + } + async #createPublication(): Promise<boolean> { if (!this.client) { this.events.emit("error", new LogicalReplicationClientError("Client not connected")); @@ -416,8 +435,10 @@ export class LogicalReplicationClient { const publicationExists = await this.#doesPublicationExist(); if (publicationExists) { - // Validate the existing publication is correctly configured - const validationError = await this.#validatePublicationConfiguration(); + // Reconcile the existing publication: add any configured table it is + // missing (e.g.
a clone table added after the publication was first + // created). Returns an error string only for unrecoverable mismatches. + const validationError = await this.#ensurePublicationConfiguration(); if (validationError) { this.logger.error("Publication exists but is misconfigured", { @@ -441,9 +462,13 @@ export class LogicalReplicationClient { return true; } + const tableList = this.#allTables() + .map((table) => `"${table}"`) + .join(", "); + const [createError] = await tryCatch( this.client.query( - `CREATE PUBLICATION "${this.options.publicationName}" FOR TABLE "${this.options.table}" ${ + `CREATE PUBLICATION "${this.options.publicationName}" FOR TABLE ${tableList} ${ this.options.publicationActions ? `WITH (publish = '${this.options.publicationActions.join(", ")}')` : "" @@ -483,32 +508,47 @@ export class LogicalReplicationClient { return res.rows[0].exists; } - async #validatePublicationConfiguration(): Promise<string | null> { + async #ensurePublicationConfiguration(): Promise<string | null> { if (!this.client) { - return "Cannot validate publication configuration: client not connected"; + return "Cannot ensure publication configuration: client not connected"; } - // Check if the publication has the correct table + // Which public tables the publication already carries. const tablesRes = await this.client.query( - `SELECT schemaname, tablename - FROM pg_publication_tables + `SELECT schemaname, tablename + FROM pg_publication_tables WHERE pubname = '${this.options.publicationName}';` ); - const tables = tablesRes.rows; - const expectedTable = this.options.table; - - // Check if the table is in the publication - const hasTable = tables.some( - (row) => row.tablename === expectedTable && row.schemaname === "public" + const currentTables = new Set( + tablesRes.rows + .filter((row) => row.schemaname === "public") + .map((row) => row.tablename as string) ); - if (!hasTable) { - if (tables.length === 0) { - return `Publication '${this.options.publicationName}' exists but has NO TABLES configured.
Expected table: "public.${expectedTable}". Run: ALTER PUBLICATION ${this.options.publicationName} ADD TABLE "${expectedTable}";`; - } else { - const tableList = tables.map((t) => `"${t.schemaname}"."${t.tablename}"`).join(", "); - return `Publication '${this.options.publicationName}' exists but does not include the required table "public.${expectedTable}". Current tables: ${tableList}. Run: ALTER PUBLICATION ${this.options.publicationName} ADD TABLE "${expectedTable}";`; + // Reconcile rather than reject: add any configured table the publication is + // missing. ALTER PUBLICATION ... ADD TABLE is online and leaves the slot + // position intact, so an existing publication can gain a table (e.g. + // task_run_v2 alongside TaskRun) without a drop/recreate. ADD TABLE on a + // table already published raises duplicate_object (42710); treat that as a + // benign race (another instance won) rather than a failure. + const missingTables = this.#allTables().filter((table) => !currentTables.has(table)); + + for (const table of missingTables) { + this.logger.info("Adding table to existing publication", { + name: this.options.name, + publicationName: this.options.publicationName, + table, + }); + + const [addError] = await tryCatch( + this.client.query( + `ALTER PUBLICATION "${this.options.publicationName}" ADD TABLE "${table}";` + ) + ); + + if (addError && (addError as { code?: string }).code !== "42710") { + return `Failed to add table "public.${table}" to publication '${this.options.publicationName}': ${addError.message}`; } } @@ -567,6 +607,60 @@ export class LogicalReplicationClient { return null; } + /** + * Warn (never fail) when a co-published table lacks REPLICA IDENTITY FULL while + * the publication emits UPDATE/DELETE. Under the default primary-key identity, + * a DELETE's WAL `old` tuple carries only the key, so a consumer that needs + * other columns of the deleted row (e.g. 
to build a ClickHouse soft-delete + * tombstone with organization/environment ids) silently loses them. This only + * surfaces a misconfiguration (a forgotten ops step or a db-push'd table); it + * never blocks startup. + */ + async #warnOnWeakReplicaIdentity(): Promise<void> { + if (!this.client) { + return; + } + + const publishesOldTuple = + !this.options.publicationActions || + this.options.publicationActions.includes("update") || + this.options.publicationActions.includes("delete"); + if (!publishesOldTuple) { + return; + } + + const tableList = this.#allTables() + .map((table) => `'${table}'`) + .join(", "); + + const [error, res] = await tryCatch( + this.client.query( + `SELECT c.relname, c.relreplident + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'public' AND c.relname IN (${tableList})` + ) + ); + if (error || !res) { + return; // best-effort diagnostic; never block startup + } + + for (const row of res.rows as Array<{ relname: string; relreplident: string }>) { + if (row.relreplident !== "f") { + this.logger.warn( + "Co-published table lacks REPLICA IDENTITY FULL; UPDATE/DELETE WAL events will omit non-key columns of the old row", + { + name: this.options.name, + publicationName: this.options.publicationName, + table: row.relname, + replicaIdentity: row.relreplident, + fix: `ALTER TABLE "public"."${row.relname}" REPLICA IDENTITY FULL;`, + } + ); + } + } + } + async #createSlot(): Promise<boolean> { if (!this.client) { this.events.emit("error", new LogicalReplicationClientError("Cannot create slot")); diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 977c94a8e83..dd311f70b9d 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -1427,6 +1427,7 @@ export class RunAttemptSystem { completedAt: true, taskEventStore: true,
parentTaskRunId: true, + runtimeEnvironmentId: true, delayUntil: true, updatedAt: true, runtimeEnvironment: { @@ -1439,11 +1440,6 @@ export class RunAttemptSystem { id: true, }, }, - childRuns: { - select: { - id: true, - }, - }, }, }, prisma @@ -1548,9 +1544,21 @@ export class RunAttemptSystem { //schedule the cancellation of all the child runs //it will call this function for each child, - //which will recursively cancel all children if they need to be - if (run.childRuns.length > 0) { - for (const childRun of run.childRuns) { + //which will recursively cancel all children if they need to be. + //Resolve children across BOTH run tables: a v2 parent can have a legacy + //cuid child (or vice versa) in the runTableV2 mixed window, and a + //childRuns relation select is bound to the parent's own table, so it + //would silently skip the cross-table children and leave them executing + //and holding concurrency after the parent is cancelled. + const childRuns = await this.$.runStore.findRuns( + { + where: { parentTaskRunId: runId, runtimeEnvironmentId: run.runtimeEnvironmentId }, + select: { id: true }, + }, + prisma + ); + if (childRuns.length > 0) { + for (const childRun of childRuns) { await this.$.worker.enqueue({ id: `cancelRun:${childRun.id}`, job: "cancelRun", diff --git a/internal-packages/run-engine/src/engine/tests/cancelling.test.ts b/internal-packages/run-engine/src/engine/tests/cancelling.test.ts index aecae7a2632..75253684818 100644 --- a/internal-packages/run-engine/src/engine/tests/cancelling.test.ts +++ b/internal-packages/run-engine/src/engine/tests/cancelling.test.ts @@ -1,5 +1,6 @@ import { containerTest, assertNonNullable } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; +import { isKsuidId, RunId } from "@trigger.dev/core/v3/isomorphic"; import { expect } from "vitest"; import { RunEngine } from "../index.js"; import { setTimeout } from "timers/promises"; @@ -227,6 +228,123 @@ describe("RunEngine cancelling", () => { } 
); + containerTest( + "Cancelling a parent cascades to a child in the OTHER run table (cross-table mixed window)", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const parentTask = "parent-task"; + const childTask = "child-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, [parentTask, childTask]); + + // Parent gets a cuid id (-> TaskRun); child gets a ksuid id + // (-> task_run_v2). This is exactly the hierarchy a runTableV2 flip + // creates while a pre-flip parent is still live. 
+ const parentId = RunId.generate(); + const childId = RunId.generateKsuid(); + + const parentRun = await engine.trigger( + { + number: 1, + friendlyId: parentId.friendlyId, + environment: authenticatedEnvironment, + taskIdentifier: parentTask, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "tp", + spanId: "sp", + workerQueue: "main", + queue: `task/${parentTask}`, + isTest: false, + tags: [], + }, + prisma + ); + + const childRun = await engine.trigger( + { + number: 1, + friendlyId: childId.friendlyId, + environment: authenticatedEnvironment, + taskIdentifier: childTask, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "tc", + spanId: "sc", + workerQueue: "main", + queue: `task/${childTask}`, + isTest: false, + tags: [], + parentTaskRunId: parentRun.id, + }, + prisma + ); + + // The hierarchy genuinely straddles the two physical run tables. + expect(isKsuidId(parentRun.id)).toBe(false); + expect(isKsuidId(childRun.id)).toBe(true); + + // Cancel the (queued) parent. Pre-fix, cancelRun read children through + // the table-bound childRuns relation, which cannot see the v2 child, so + // the cascade skipped it and it kept its place in the queue. Post-fix, + // the cross-table findRuns finds the child and cancels it too. + await engine.cancelRun({ + runId: parentRun.id, + completedAt: new Date(), + reason: "Cancelled by the user", + }); + + // The child cancellation is enqueued as a job; wait for the worker to process it + // (poll instead of a fixed sleep so the test isn't flaky under slow CI). 
+ let childData = await engine.getRunExecutionData({ runId: childRun.id }); + const deadline = Date.now() + 5_000; + while (childData?.run.status !== "CANCELED" && Date.now() < deadline) { + await setTimeout(50); + childData = await engine.getRunExecutionData({ runId: childRun.id }); + } + expect(childData?.run.status).toBe("CANCELED"); + } finally { + await engine.quit(); + } + } + ); + containerTest("Cancelling a run (not executing)", async ({ prisma, redisOptions }) => { //create environment const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); diff --git a/internal-packages/run-store/src/PostgresRunStore.test.ts b/internal-packages/run-store/src/PostgresRunStore.test.ts index 47876b70c8d..d8f92a70b97 100644 --- a/internal-packages/run-store/src/PostgresRunStore.test.ts +++ b/internal-packages/run-store/src/PostgresRunStore.test.ts @@ -1,6 +1,7 @@ import { postgresTest } from "@internal/testcontainers"; +import { isKsuidId, RunId } from "@trigger.dev/core/v3/isomorphic"; import type { PrismaClient } from "@trigger.dev/database"; -import { describe, expect } from "vitest"; +import { describe, expect, vi } from "vitest"; import { PostgresRunStore } from "./PostgresRunStore.js"; import type { CreateCancelledRunInput, CreateFailedRunInput, CreateRunInput } from "./types.js"; @@ -1772,3 +1773,1429 @@ describe("PostgresRunStore — read", () => { expect(found[0]?.payloadType).toBe("application/json"); }); }); + +describe("PostgresRunStore — table routing by id format", () => { + // Seed a run directly into one physical table, choosing the delegate by id + // format the same way the store does. Resolves once the row is written; it returns no value.
+ async function seedRoutedRun( + prisma: PrismaClient, + params: { + id: string; + friendlyId: string; + organizationId: string; + projectId: string; + runtimeEnvironmentId: string; + status?: string; + idempotencyKey?: string; + taskIdentifier?: string; + createdAt?: Date; + parentTaskRunId?: string; + rootTaskRunId?: string; + } + ) { + const delegate = isKsuidId(params.id) + ? (prisma.taskRunV2 as unknown as typeof prisma.taskRun) + : prisma.taskRun; + + await delegate.create({ + data: { + id: params.id, + engine: "V2", + status: (params.status as any) ?? "PENDING", + friendlyId: params.friendlyId, + runtimeEnvironmentId: params.runtimeEnvironmentId, + environmentType: "DEVELOPMENT", + organizationId: params.organizationId, + projectId: params.projectId, + taskIdentifier: params.taskIdentifier ?? "my-task", + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: `trace_${params.id}`, + spanId: `span_${params.id}`, + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + ...(params.idempotencyKey !== undefined && { idempotencyKey: params.idempotencyKey }), + ...(params.createdAt !== undefined && { createdAt: params.createdAt }), + ...(params.parentTaskRunId !== undefined && { parentTaskRunId: params.parentTaskRunId }), + ...(params.rootTaskRunId !== undefined && { rootTaskRunId: params.rootTaskRunId }), + }, + }); + } + + postgresTest( + "createRun with a cuid id lands a row in TaskRun and NOT in task_run_v2", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const cuid = RunId.generate(); + expect(isKsuidId(cuid.id)).toBe(false); + + await store.createRun({ + data: { + id: cuid.id, + engine: "V2", + status: "PENDING", + friendlyId: cuid.friendlyId, + runtimeEnvironmentId: environment.id, + environmentType: "DEVELOPMENT", + organizationId: organization.id, + projectId: 
project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + traceContext: {}, + traceId: "trace_cuid", + spanId: "span_cuid", + queue: "task/my-task", + isTest: false, + taskEventStore: "taskEvent", + depth: 0, + }, + snapshot: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Run was created", + runStatus: "PENDING", + environmentId: environment.id, + environmentType: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + }, + }); + + // cuid run is in TaskRun, not in task_run_v2. + const legacyRow = await prisma.taskRun.findUnique({ where: { id: cuid.id } }); + expect(legacyRow).not.toBeNull(); + const cuidInV2 = await prisma.taskRunV2.findUnique({ where: { id: cuid.id } }); + expect(cuidInV2).toBeNull(); + } + ); + + postgresTest( + "createRun routes a KSUID id to task_run_v2: the scalar row lands there and not in TaskRun", + async ({ prisma }) => { + // This test exercises the routing decision in isolation by writing the + // scalar row directly to the table `createRun` would pick for a KSUID + // `data.id`, then asserts the row landed in task_run_v2 and not in TaskRun. + // The full v2 create path (run + nested snapshot + waitpoint) is covered + // by the "v2 nested writes" suite below. 
+ const { organization, project, environment } = await seedEnvironment(prisma); + + const ksuid = RunId.generateKsuid(); + expect(isKsuidId(ksuid.id)).toBe(true); + + await seedRoutedRun(prisma, { + id: ksuid.id, + friendlyId: ksuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + const v2Row = await prisma.taskRunV2.findUnique({ where: { id: ksuid.id } }); + expect(v2Row).not.toBeNull(); + const ksuidInLegacy = await prisma.taskRun.findUnique({ where: { id: ksuid.id } }); + expect(ksuidInLegacy).toBeNull(); + } + ); + + postgresTest( + "findRun and updateMetadata route to task_run_v2 for a KSUID run and to TaskRun for a cuid run", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const ksuid = RunId.generateKsuid(); + const cuid = RunId.generate(); + + await seedRoutedRun(prisma, { + id: ksuid.id, + friendlyId: ksuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + await seedRoutedRun(prisma, { + id: cuid.id, + friendlyId: cuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + // By-id read finds each run in its own table. + const foundKsuid = await store.findRun({ id: ksuid.id }, { select: { id: true } }); + expect(foundKsuid?.id).toBe(ksuid.id); + const foundCuid = await store.findRun({ id: cuid.id }, { select: { id: true } }); + expect(foundCuid?.id).toBe(cuid.id); + + // By-id write (updateMetadata) lands in the correct table. 
+ const ksuidResult = await store.updateMetadata( + ksuid.id, + { + metadata: '{"routed":"v2"}', + metadataType: "application/json", + metadataVersion: { increment: 1 }, + updatedAt: new Date(), + }, + {} + ); + expect(ksuidResult.count).toBe(1); + + const cuidResult = await store.updateMetadata( + cuid.id, + { + metadata: '{"routed":"legacy"}', + metadataType: "application/json", + metadataVersion: { increment: 1 }, + updatedAt: new Date(), + }, + {} + ); + expect(cuidResult.count).toBe(1); + + // The write hit task_run_v2 for the KSUID run … + const v2Row = await prisma.taskRunV2.findUniqueOrThrow({ + where: { id: ksuid.id }, + select: { metadata: true }, + }); + expect(v2Row.metadata).toBe('{"routed":"v2"}'); + + // … and TaskRun for the cuid run. + const legacyRow = await prisma.taskRun.findUniqueOrThrow({ + where: { id: cuid.id }, + select: { metadata: true }, + }); + expect(legacyRow.metadata).toBe('{"routed":"legacy"}'); + } + ); + + postgresTest( + "findRun resolves a non-id predicate (idempotency key) against a run in either table", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // A KSUID run carrying an idempotency key lands in task_run_v2 … + const ksuid = RunId.generateKsuid(); + await seedRoutedRun(prisma, { + id: ksuid.id, + friendlyId: ksuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-v2", + taskIdentifier: "my-task", + }); + + // … and a cuid run carrying a different key lands in legacy TaskRun. 
+ const cuid = RunId.generate(); + await seedRoutedRun(prisma, { + id: cuid.id, + friendlyId: cuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-legacy", + taskIdentifier: "my-task", + }); + + // The lookup carries no id/friendlyId, so it must read BOTH tables — + // this is the mixed-window idempotency dedup. Miss either table and a + // reused key produces a duplicate run. + const v2Hit = await store.findRun({ + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-v2", + taskIdentifier: "my-task", + }); + expect(v2Hit?.id).toBe(ksuid.id); + + const legacyHit = await store.findRun({ + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-legacy", + taskIdentifier: "my-task", + }); + expect(legacyHit?.id).toBe(cuid.id); + + // A key in neither table returns null — no false dedup. + const miss = await store.findRun({ + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-missing", + taskIdentifier: "my-task", + }); + expect(miss).toBeNull(); + + // findRunOrThrow takes the same both-table path: it finds the v2 row … + const thrown = await store.findRunOrThrow({ + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-v2", + taskIdentifier: "my-task", + }); + expect(thrown.id).toBe(ksuid.id); + + // … and throws when neither table matches. + await expect( + store.findRunOrThrow({ + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-missing", + taskIdentifier: "my-task", + }) + ).rejects.toThrow(); + } + ); + + postgresTest( + "findRun tables:'legacy' skips the task_run_v2 query (idempotency hot-path scope)", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // A v2 (ksuid) run carrying an idempotency key — it lives only in + // task_run_v2. 
+ const ksuid = RunId.generateKsuid(); + await seedRoutedRun(prisma, { + id: ksuid.id, + friendlyId: ksuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-scope", + taskIdentifier: "my-task", + }); + + const where = { + runtimeEnvironmentId: environment.id, + idempotencyKey: "idem-scope", + taskIdentifier: "my-task", + }; + + // Default (both tables) and an explicit "both" find the v2 run. + expect((await store.findRun(where))?.id).toBe(ksuid.id); + expect( + (await store.findRun(where, { select: { id: true }, tables: "both" }))?.id + ).toBe(ksuid.id); + + // "legacy" scope skips task_run_v2 entirely, so the v2 run is NOT found. + // This is the hot-path optimisation for an org not cut over to v2: its + // runs only live in TaskRun, so the second (v2) query is always empty and + // can be skipped. (If a caller mis-scopes a genuinely-v2 org to legacy it + // would miss the run — hence it is gated on shouldUseV2RunTable upstream.) + expect(await store.findRun(where, { select: { id: true }, tables: "legacy" })).toBeNull(); + } + ); + + postgresTest( + "findRuns tables:'legacy' skips task_run_v2 (cross-table children hot-path scope)", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // A legacy (cuid) child and a v2 (ksuid) child of the same parent — the + // cross-table mixed-window hierarchy. 
+ const parentId = RunId.generate().id; + const legacyChild = RunId.generate(); + const v2Child = RunId.generateKsuid(); + for (const child of [legacyChild, v2Child]) { + await seedRoutedRun(prisma, { + id: child.id, + friendlyId: child.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + parentTaskRunId: parentId, + taskIdentifier: "my-task", + }); + } + + const where = { parentTaskRunId: parentId, runtimeEnvironmentId: environment.id }; + + // Default (both tables) returns both children — required once an org is on v2. + const both = (await store.findRuns({ where, select: { id: true } })) as { id: string }[]; + expect(new Set(both.map((r) => r.id))).toEqual(new Set([legacyChild.id, v2Child.id])); + + // "legacy" scope skips task_run_v2 — the hot-path optimisation for an org + // not cut over to v2 (no v2 children exist), so only the legacy child. + const legacy = (await store.findRuns({ + where, + select: { id: true }, + tables: "legacy", + })) as { id: string }[]; + expect(legacy.map((r) => r.id)).toEqual([legacyChild.id]); + } + ); + + postgresTest( + "clearIdempotencyKey fans out across both tables (byPredicate hits v2; byFriendlyIds partitions a mixed array)", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // byPredicate carries no id, so it must reach task_run_v2 to clear a v2 run. 
+ const v2Pred = RunId.generateKsuid(); + await seedRoutedRun(prisma, { + id: v2Pred.id, + friendlyId: v2Pred.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + idempotencyKey: "kp-v2", + taskIdentifier: "my-task", + }); + + const predResult = await store.clearIdempotencyKey({ + byPredicate: { + idempotencyKey: "kp-v2", + taskIdentifier: "my-task", + runtimeEnvironmentId: environment.id, + }, + }); + expect(predResult.count).toBe(1); + expect( + ( + await prisma.taskRunV2.findFirst({ + where: { id: v2Pred.id }, + select: { idempotencyKey: true }, + }) + )?.idempotencyKey + ).toBeNull(); + + // byFriendlyIds with a MIXED (ksuid + cuid) array must clear rows in BOTH + // physical tables — the partition + sum is the cross-table behaviour. + const v2F = RunId.generateKsuid(); + const legacyF = RunId.generate(); + await seedRoutedRun(prisma, { + id: v2F.id, + friendlyId: v2F.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + idempotencyKey: "kf-v2", + taskIdentifier: "my-task", + }); + await seedRoutedRun(prisma, { + id: legacyF.id, + friendlyId: legacyF.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + idempotencyKey: "kf-legacy", + taskIdentifier: "my-task", + }); + + const friendlyResult = await store.clearIdempotencyKey({ + byFriendlyIds: [v2F.friendlyId, legacyF.friendlyId], + }); + expect(friendlyResult.count).toBe(2); + expect( + ( + await prisma.taskRunV2.findFirst({ + where: { id: v2F.id }, + select: { idempotencyKey: true }, + }) + )?.idempotencyKey + ).toBeNull(); + expect( + ( + await prisma.taskRun.findFirst({ + where: { id: legacyF.id }, + select: { idempotencyKey: true }, + }) + )?.idempotencyKey + ).toBeNull(); + } + ); + + postgresTest( + "expireRunsBatch with a mixed array updates both tables and returns the combined count", + async ({ prisma }) => { + const { 
organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const ksuid = RunId.generateKsuid(); + const cuid = RunId.generate(); + + await seedRoutedRun(prisma, { + id: ksuid.id, + friendlyId: ksuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + await seedRoutedRun(prisma, { + id: cuid.id, + friendlyId: cuid.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + const now = new Date("2026-06-19T12:00:00.000Z"); + const error = { type: "STRING_ERROR" as const, raw: "Run expired because the TTL was reached" }; + + const count = await store.expireRunsBatch([ksuid.id, cuid.id], { error, now }); + + expect(count).toBe(2); + + const v2Row = await prisma.taskRunV2.findUniqueOrThrow({ + where: { id: ksuid.id }, + select: { status: true, completedAt: true, expiredAt: true }, + }); + expect(v2Row.status).toBe("EXPIRED"); + expect(v2Row.completedAt).toEqual(now); + expect(v2Row.expiredAt).toEqual(now); + + const legacyRow = await prisma.taskRun.findUniqueOrThrow({ + where: { id: cuid.id }, + select: { status: true, completedAt: true, expiredAt: true }, + }); + expect(legacyRow.status).toBe("EXPIRED"); + expect(legacyRow.completedAt).toEqual(now); + expect(legacyRow.expiredAt).toEqual(now); + } + ); + + postgresTest( + "findRuns (unordered) returns runs from BOTH TaskRun and task_run_v2 in one env", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // Two legacy (cuid) runs + two new (ksuid) runs in the SAME env. 
+ const legacyA = RunId.generate(); + const legacyB = RunId.generate(); + const v2A = RunId.generateKsuid(); + const v2B = RunId.generateKsuid(); + expect(isKsuidId(legacyA.id)).toBe(false); + expect(isKsuidId(v2A.id)).toBe(true); + + for (const run of [legacyA, legacyB, v2A, v2B]) { + await seedRoutedRun(prisma, { + id: run.id, + friendlyId: run.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + } + + const found = await store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true }, + }); + + // ALL four runs come back, regardless of which physical table they live in. + expect(found.map((r) => r.id).sort()).toEqual( + [legacyA.id, legacyB.id, v2A.id, v2B.id].sort() + ); + } + ); + + postgresTest( + "findRuns (ordered+limited) 2-way merges both tables to the globally-correct first N", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // Interleave createdAt across the two tables so a per-table take+slice + // would be WRONG: the newest run is in v2, the 2nd-newest in legacy, etc. 
+ // t5 (v2) > t4 (legacy) > t3 (v2) > t2 (legacy) > t1 (v2) > t0 (legacy) + const base = new Date("2026-06-01T00:00:00.000Z").getTime(); + const at = (i: number) => new Date(base + i * 60_000); + + const legacy0 = RunId.generate(); // t0 (oldest) + const v2_1 = RunId.generateKsuid(); // t1 + const legacy2 = RunId.generate(); // t2 + const v2_3 = RunId.generateKsuid(); // t3 + const legacy4 = RunId.generate(); // t4 + const v2_5 = RunId.generateKsuid(); // t5 (newest) + + const seeded: Array<{ id: string; friendlyId: string; t: number }> = [ + { id: legacy0.id, friendlyId: legacy0.friendlyId, t: 0 }, + { id: v2_1.id, friendlyId: v2_1.friendlyId, t: 1 }, + { id: legacy2.id, friendlyId: legacy2.friendlyId, t: 2 }, + { id: v2_3.id, friendlyId: v2_3.friendlyId, t: 3 }, + { id: legacy4.id, friendlyId: legacy4.friendlyId, t: 4 }, + { id: v2_5.id, friendlyId: v2_5.friendlyId, t: 5 }, + ]; + + for (const run of seeded) { + await seedRoutedRun(prisma, { + id: run.id, + friendlyId: run.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + createdAt: at(run.t), + }); + } + + const found = await store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true }, + orderBy: { createdAt: "desc" }, + take: 3, + }); + + // The globally-newest 3 — drawn from BOTH tables in true createdAt order, + // NOT three rows from one table. + expect(found.map((r) => r.id)).toEqual([v2_5.id, legacy4.id, v2_3.id]); + } + ); + + postgresTest( + "findRuns scoping: a run in another env is NOT returned from either table", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // A second env in the same project. 
+ const otherEnv = await prisma.runtimeEnvironment.create({ + data: { + type: "PREVIEW", + slug: "other", + projectId: project.id, + organizationId: organization.id, + apiKey: "tr_other_apikey", + pkApiKey: "pk_other_apikey", + shortcode: "other_short_code", + }, + }); + + // One legacy + one v2 run in the TARGET env. + const legacyTarget = RunId.generate(); + const v2Target = RunId.generateKsuid(); + // One legacy + one v2 run in the OTHER env (must never surface). + const legacyOther = RunId.generate(); + const v2Other = RunId.generateKsuid(); + + await seedRoutedRun(prisma, { + id: legacyTarget.id, + friendlyId: legacyTarget.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + await seedRoutedRun(prisma, { + id: v2Target.id, + friendlyId: v2Target.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + await seedRoutedRun(prisma, { + id: legacyOther.id, + friendlyId: legacyOther.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: otherEnv.id, + }); + await seedRoutedRun(prisma, { + id: v2Other.id, + friendlyId: v2Other.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: otherEnv.id, + }); + + const found = await store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true }, + }); + + // The same `where` fences BOTH tables: only the target env's runs come back. 
+ expect(found.map((r) => r.id).sort()).toEqual([legacyTarget.id, v2Target.id].sort()); + const foundIds = new Set(found.map((r) => r.id)); + expect(foundIds.has(legacyOther.id)).toBe(false); + expect(foundIds.has(v2Other.id)).toBe(false); + } + ); + + postgresTest( + "findRuns (include) returns hydrated relations from both tables", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const legacy = RunId.generate(); + const v2 = RunId.generateKsuid(); + + await seedRoutedRun(prisma, { + id: legacy.id, + friendlyId: legacy.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + await seedRoutedRun(prisma, { + id: v2.id, + friendlyId: v2.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + const found = await store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + include: { runtimeEnvironment: true }, + }); + + expect(found).toHaveLength(2); + // Both rows — legacy and v2 — carry the hydrated relation. + for (const run of found) { + expect(run.runtimeEnvironment).not.toBeNull(); + expect(run.runtimeEnvironment.id).toBe(environment.id); + expect(run.runtimeEnvironment.slug).toBe("dev"); + } + } + ); + + // NOTE: `findRuns(take, no orderBy)` across both tables used to cap the + // concatenation to `take` (non-deterministic — could drop one table's rows). + // It now throws (see the guard test below, next to the skip/cursor guards). 
+ + postgresTest( + "findRuns (ordered+limited) by id alone is rejected: id is not a total cross-table order", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const legacy = RunId.generate(); + await seedRoutedRun(prisma, { + id: legacy.id, + friendlyId: legacy.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + await expect( + store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true }, + orderBy: { id: "asc" }, + take: 10, + }) + ).rejects.toThrow(/total order/i); + } + ); + + postgresTest( + "findRuns (ordered+limited) rejects a Prisma cursor it cannot span across two tables", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const legacy = RunId.generate(); + await seedRoutedRun(prisma, { + id: legacy.id, + friendlyId: legacy.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + + await expect( + store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true }, + orderBy: { createdAt: "desc" }, + take: 5, + cursor: { id: legacy.id }, + }) + ).rejects.toThrow(/cursor/i); + } + ); + + postgresTest( + "findRuns rejects `skip` (offset pagination cannot span the two tables)", + async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + await expect( + store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true }, + skip: 10, + take: 5, + }) + ).rejects.toThrow(/skip/i); + } + ); + + postgresTest( + "findRuns rejects `take` without `orderBy` across both tables (non-deterministic cap)", 
+ async ({ prisma }) => { + const { environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // A both-table predicate (no id-list) with `take` but no `orderBy` would + // cap each table independently and silently drop one table's overflow. + await expect( + store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true }, + take: 5, + }) + ).rejects.toThrow(/take.*orderBy/i); + + // The same read WITH an `orderBy` is a valid bounded cross-table merge. + await expect( + store.findRuns({ + where: { runtimeEnvironmentId: environment.id }, + select: { id: true, createdAt: true }, + orderBy: { createdAt: "desc" }, + take: 5, + }) + ).resolves.toBeDefined(); + } + ); + + postgresTest( + "findRuns with an id-list partitions by id format and skips the table with no candidate ids", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const cuid = RunId.generate(); + const ksuid = RunId.generateKsuid(); + for (const r of [cuid, ksuid]) { + await seedRoutedRun(prisma, { + id: r.id, + friendlyId: r.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + } + + const ids = (rows: unknown) => + (rows as Array<{ id: string }>).map((r) => r.id).sort(); + + // Mixed list: both tables queried, both runs returned. + expect( + ids(await store.findRuns({ where: { id: { in: [cuid.id, ksuid.id] } }, select: { id: true } })) + ).toEqual([cuid.id, ksuid.id].sort()); + + // cuid-only list: the task_run_v2 query is skipped, legacy run still returned. 
+ const v2Spy = vi.spyOn(prisma.taskRunV2, "findMany"); + const legacyOnly = await store.findRuns({ where: { id: { in: [cuid.id] } }, select: { id: true } }); + expect(ids(legacyOnly)).toEqual([cuid.id]); + expect(v2Spy).not.toHaveBeenCalled(); + v2Spy.mockRestore(); + + // ksuid-only list: the TaskRun query is skipped, v2 run still returned. + const legacySpy = vi.spyOn(prisma.taskRun, "findMany"); + const v2Only = await store.findRuns({ where: { id: { in: [ksuid.id] } }, select: { id: true } }); + expect(ids(v2Only)).toEqual([ksuid.id]); + expect(legacySpy).not.toHaveBeenCalled(); + legacySpy.mockRestore(); + + // Empty list matches nothing. + expect(ids(await store.findRuns({ where: { id: { in: [] } }, select: { id: true } }))).toEqual([]); + } + ); + + postgresTest( + "findRuns with a single-format id-list + non-time orderBy + take orders natively without the cross-table guard", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // Two v2 (ksuid) runs. + const k1 = RunId.generateKsuid(); + const k2 = RunId.generateKsuid(); + for (const r of [k1, k2]) { + await seedRoutedRun(prisma, { + id: r.id, + friendlyId: r.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + } + + // An all-ksuid id-list addresses task_run_v2 alone, so ordering by `id` + // (or any non-time key) with `take` must NOT trip the cross-table + // time-key guard — id is a valid total order within a single table. + const rows = (await store.findRuns({ + where: { id: { in: [k1.id, k2.id] } }, + select: { id: true }, + orderBy: { id: "asc" }, + take: 10, + })) as Array<{ id: string }>; + expect(rows.map((r) => r.id)).toEqual([k1.id, k2.id].sort()); + + // Same for an all-cuid id-list (legacy table only). 
+ const c1 = RunId.generate(); + const c2 = RunId.generate(); + for (const r of [c1, c2]) { + await seedRoutedRun(prisma, { + id: r.id, + friendlyId: r.friendlyId, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + } + const legacyRows = (await store.findRuns({ + where: { id: { in: [c1.id, c2.id] } }, + select: { id: true }, + orderBy: { id: "desc" }, + take: 10, + })) as Array<{ id: string }>; + expect(legacyRows.map((r) => r.id)).toEqual([c1.id, c2.id].sort().reverse()); + } + ); + + postgresTest( + "merged keyset cursor enumerates every row exactly once at a tied createdAt across both tables (collation boundary)", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + // All four rows share the SAME createdAt, so pagination relies entirely on + // the id tiebreak. Hand-crafted ids straddle the collation divergence: a + // 27-char ksuid leading with an UPPERCASE letter routes to task_run_v2, + // and a lowercase cuid routes to TaskRun. Under the DB's en_US collation + // "c" < "Z", but by raw code unit "Z" < "c" — if the in-memory merge and + // the Postgres keyset disagree, a row is skipped or duplicated here. + const sameTime = new Date("2026-06-01T00:00:00.000Z"); + const seeds = [ + "Z" + "0".repeat(26), // ksuid -> task_run_v2 (uppercase lead) + "A" + "1".repeat(26), // ksuid -> task_run_v2 + "c" + "z".repeat(24), // cuid -> TaskRun (25 chars) + "c" + "a".repeat(24), // cuid -> TaskRun + ]; + for (const id of seeds) { + await seedRoutedRun(prisma, { + id, + friendlyId: `run_${id}`, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + createdAt: sameTime, + }); + } + + // Paginate exactly like runsBackfiller: orderBy [createdAt asc, id asc], + // take 1 (forces the tie boundary on every page), cursor = (createdAt, id). 
+ const seen: string[] = []; + let cursor: { createdAt: Date; id: string } | undefined; + for (let guard = 0; guard < 25; guard++) { + const page = (await store.findRuns({ + where: { + runtimeEnvironmentId: environment.id, + ...(cursor + ? { + OR: [ + { createdAt: { gt: cursor.createdAt } }, + { createdAt: cursor.createdAt, id: { gt: cursor.id } }, + ], + } + : {}), + }, + select: { id: true, createdAt: true }, + orderBy: [{ createdAt: "asc" }, { id: "asc" }], + take: 1, + })) as Array<{ id: string; createdAt: Date }>; + if (page.length === 0) break; + seen.push(page[0].id); + cursor = { createdAt: page[0].createdAt, id: page[0].id }; + } + + // Every seeded row enumerated exactly once: no skip, no duplicate. + expect(seen.slice().sort()).toEqual(seeds.slice().sort()); + expect(new Set(seen).size).toBe(seeds.length); + } + ); + + postgresTest( + "cross-table run hierarchy resolves parent by id and children by predicate across both tables", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + const base = { + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }; + + // Legacy cuid PARENT in TaskRun, v2 ksuid CHILD in task_run_v2 pointing at + // it (a hierarchy straddling a runTableV2 flip). This is what the + // presenters resolve via hydrateParentAndRoot / hydrateChildRuns. + const parent = RunId.generate(); + const child = RunId.generateKsuid(); + await seedRoutedRun(prisma, { ...base, id: parent.id, friendlyId: parent.friendlyId }); + await seedRoutedRun(prisma, { + ...base, + id: child.id, + friendlyId: child.friendlyId, + parentTaskRunId: parent.id, + rootTaskRunId: parent.id, + }); + + // child -> parent: by-id read routes to the legacy table. 
+ const resolvedParent = await store.findRun({ id: parent.id }, { select: { id: true } }); + expect(resolvedParent?.id).toBe(parent.id); + // parent -> children: a parentTaskRunId predicate spans both tables and + // finds the v2 child of the legacy parent. + const children = (await store.findRuns({ + where: { parentTaskRunId: parent.id }, + select: { id: true }, + })) as Array<{ id: string }>; + expect(children.map((c) => c.id)).toEqual([child.id]); + + // Mirror: ksuid parent in task_run_v2, cuid child in TaskRun. + const parent2 = RunId.generateKsuid(); + const child2 = RunId.generate(); + await seedRoutedRun(prisma, { ...base, id: parent2.id, friendlyId: parent2.friendlyId }); + await seedRoutedRun(prisma, { + ...base, + id: child2.id, + friendlyId: child2.friendlyId, + parentTaskRunId: parent2.id, + rootTaskRunId: parent2.id, + }); + const resolvedParent2 = await store.findRun({ id: parent2.id }, { select: { id: true } }); + expect(resolvedParent2?.id).toBe(parent2.id); + const children2 = (await store.findRuns({ + where: { parentTaskRunId: parent2.id }, + select: { id: true }, + })) as Array<{ id: string }>; + expect(children2.map((c) => c.id)).toEqual([child2.id]); + } + ); +}); + +describe("PostgresRunStore — v2 nested writes (run + related rows via nested Prisma create)", () => { + // `task_run_v2` is a full clone of `TaskRun` down to its relations, so the nested Prisma + // create/include used by createRun/lifecycle methods targets it unchanged via the runModel cast. + // The child->run foreign keys (TaskRunExecutionSnapshot.runId, Waitpoint.completedByTaskRunId, …) + // are dropped in production and by the testcontainer harness, so a child row can reference a run + // in EITHER physical table (TaskRun or task_run_v2) by plain scalar id without a FK violation. 
+ + function runAssociatedWaitpoint(params: { + id: string; + friendlyId: string; + projectId: string; + environmentId: string; + }) { + return { + id: params.id, + friendlyId: params.friendlyId, + type: "RUN" as const, + status: "PENDING" as const, + idempotencyKey: `idem_${params.id}`, + userProvidedIdempotencyKey: false, + projectId: params.projectId, + environmentId: params.environmentId, + }; + } + + postgresTest( + "createRun for a KSUID run lands the run in task_run_v2, creates its snapshot keyed to the v2 run id, and creates the associated waitpoint", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const ksuid = RunId.generateKsuid(); + expect(isKsuidId(ksuid.id)).toBe(true); + + const input: CreateRunInput = { + ...buildCreateRunInput({ + runId: ksuid.id, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }), + associatedWaitpoint: runAssociatedWaitpoint({ + id: "wp_v2_create_1", + friendlyId: "wp_v2_create_friendly_1", + projectId: project.id, + environmentId: environment.id, + }), + }; + input.data.friendlyId = ksuid.friendlyId; + + const run = await store.createRun(input); + + // Returns the TaskRunWithWaitpoint shape with the associated waitpoint included. + expect(run.id).toBe(ksuid.id); + expect(run.status).toBe("PENDING"); + expect(run.associatedWaitpoint).not.toBeNull(); + expect(run.associatedWaitpoint?.id).toBe("wp_v2_create_1"); + + // The run row landed in task_run_v2, not TaskRun. + const v2Row = await prisma.taskRunV2.findUnique({ where: { id: ksuid.id } }); + expect(v2Row).not.toBeNull(); + const legacyRow = await prisma.taskRun.findUnique({ where: { id: ksuid.id } }); + expect(legacyRow).toBeNull(); + + // The execution snapshot is keyed to the v2 run id (in the shared snapshot table). 
+ const snapshots = await prisma.taskRunExecutionSnapshot.findMany({ + where: { runId: ksuid.id }, + }); + expect(snapshots).toHaveLength(1); + expect(snapshots[0]?.executionStatus).toBe("RUN_CREATED"); + expect(snapshots[0]?.runStatus).toBe("PENDING"); + + // The waitpoint points back at the v2 run via the scalar FK column. + const waitpoint = await prisma.waitpoint.findUnique({ where: { id: "wp_v2_create_1" } }); + expect(waitpoint?.completedByTaskRunId).toBe(ksuid.id); + } + ); + + postgresTest( + "v2 lifecycle: startAttempt then completeAttemptSuccess creates the completion snapshot keyed to the v2 run id", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const ksuid = RunId.generateKsuid(); + + const input = buildCreateRunInput({ + runId: ksuid.id, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + input.data.friendlyId = ksuid.friendlyId; + + await store.createRun(input); + + const started = await store.startAttempt( + ksuid.id, + { attemptNumber: 1, isWarmStart: false }, + { select: { id: true, status: true, attemptNumber: true } } + ); + expect(started.status).toBe("EXECUTING"); + expect(started.attemptNumber).toBe(1); + + const completedAt = new Date("2026-06-19T11:00:00.000Z"); + const completed = await store.completeAttemptSuccess( + ksuid.id, + { + completedAt, + output: '{"ok":true}', + outputType: "application/json", + usageDurationMs: 250, + costInCents: 4, + snapshot: { + executionStatus: "FINISHED", + description: "Task completed successfully", + runStatus: "COMPLETED_SUCCESSFULLY", + attemptNumber: 1, + environmentId: environment.id, + environmentType: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + }, + }, + { select: { id: true, status: true, completedAt: true, usageDurationMs: true, costInCents: true } } + ); + + 
expect(completed.id).toBe(ksuid.id); + expect(completed.status).toBe("COMPLETED_SUCCESSFULLY"); + expect(completed.completedAt).toEqual(completedAt); + expect(completed.usageDurationMs).toBe(250); + expect(completed.costInCents).toBe(4); + + // The run row updated in task_run_v2. + const v2Row = await prisma.taskRunV2.findUniqueOrThrow({ + where: { id: ksuid.id }, + select: { status: true }, + }); + expect(v2Row.status).toBe("COMPLETED_SUCCESSFULLY"); + + // The completion snapshot is keyed to the v2 run id. + const finished = await prisma.taskRunExecutionSnapshot.findMany({ + where: { runId: ksuid.id, executionStatus: "FINISHED" }, + }); + expect(finished).toHaveLength(1); + expect(finished[0]?.runStatus).toBe("COMPLETED_SUCCESSFULLY"); + } + ); + + postgresTest( + "createFailedRun for a KSUID run lands the run in task_run_v2 and creates the associated waitpoint keyed to the v2 run id", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const ksuid = RunId.generateKsuid(); + const completedAt = new Date("2026-06-19T00:00:00.000Z"); + const error = { type: "STRING_ERROR", raw: "system failure" }; + + const input: CreateFailedRunInput = { + data: { + id: ksuid.id, + engine: "V2", + status: "SYSTEM_FAILURE", + friendlyId: ksuid.friendlyId, + runtimeEnvironmentId: environment.id, + environmentType: "DEVELOPMENT", + organizationId: organization.id, + projectId: project.id, + taskIdentifier: "my-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "trace_v2_failed", + spanId: "span_v2_failed", + queue: "task/my-task", + isTest: false, + completedAt, + error: error as unknown as import("@trigger.dev/database").Prisma.InputJsonObject, + depth: 0, + taskEventStore: "taskEvent", + }, + associatedWaitpoint: runAssociatedWaitpoint({ + id: "wp_v2_failed_1", + friendlyId: 
"wp_v2_failed_friendly_1", + projectId: project.id, + environmentId: environment.id, + }), + }; + + const run = await store.createFailedRun(input); + + expect(run.id).toBe(ksuid.id); + expect(run.status).toBe("SYSTEM_FAILURE"); + expect(run.associatedWaitpoint).not.toBeNull(); + expect(run.associatedWaitpoint?.id).toBe("wp_v2_failed_1"); + + const v2Row = await prisma.taskRunV2.findUnique({ where: { id: ksuid.id } }); + expect(v2Row).not.toBeNull(); + const legacyRow = await prisma.taskRun.findUnique({ where: { id: ksuid.id } }); + expect(legacyRow).toBeNull(); + + const waitpoint = await prisma.waitpoint.findUnique({ where: { id: "wp_v2_failed_1" } }); + expect(waitpoint?.completedByTaskRunId).toBe(ksuid.id); + } + ); + + postgresTest( + "createRun for a legacy cuid run with an associated waitpoint creates the run, its snapshot, and the waitpoint (regression: identical rows/shape)", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const cuid = RunId.generate(); + expect(isKsuidId(cuid.id)).toBe(false); + + const input: CreateRunInput = { + ...buildCreateRunInput({ + runId: cuid.id, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }), + associatedWaitpoint: runAssociatedWaitpoint({ + id: "wp_legacy_create_1", + friendlyId: "wp_legacy_create_friendly_1", + projectId: project.id, + environmentId: environment.id, + }), + }; + input.data.friendlyId = cuid.friendlyId; + + const run = await store.createRun(input); + + // Same TaskRunWithWaitpoint shape as before. + expect(run.id).toBe(cuid.id); + expect(run.status).toBe("PENDING"); + expect(run.associatedWaitpoint?.id).toBe("wp_legacy_create_1"); + + // Legacy run is in TaskRun, not task_run_v2. 
+ const legacyRow = await prisma.taskRun.findUnique({ where: { id: cuid.id } }); + expect(legacyRow).not.toBeNull(); + const v2Row = await prisma.taskRunV2.findUnique({ where: { id: cuid.id } }); + expect(v2Row).toBeNull(); + + // Snapshot keyed to the run, waitpoint linked back via the FK column. + const snapshots = await prisma.taskRunExecutionSnapshot.findMany({ + where: { runId: cuid.id }, + }); + expect(snapshots).toHaveLength(1); + expect(snapshots[0]?.executionStatus).toBe("RUN_CREATED"); + + const waitpoint = await prisma.waitpoint.findUnique({ where: { id: "wp_legacy_create_1" } }); + expect(waitpoint?.completedByTaskRunId).toBe(cuid.id); + + // The FK still being live for the legacy table proves the waitpoint really + // resolves to a TaskRun row (the regression path is unchanged). + const reloaded = await prisma.taskRun.findUniqueOrThrow({ + where: { id: cuid.id }, + include: { associatedWaitpoint: true }, + }); + expect(reloaded.associatedWaitpoint?.id).toBe("wp_legacy_create_1"); + } + ); + + postgresTest( + "createRun is atomic: a second create with the same id throws and leaves no dangling snapshot", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const cuid = RunId.generate(); + const input = buildCreateRunInput({ + runId: cuid.id, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + input.data.friendlyId = cuid.friendlyId; + + await store.createRun(input); + + const before = await prisma.taskRunExecutionSnapshot.count({ where: { runId: cuid.id } }); + expect(before).toBe(1); + + // A second createRun with the same id fails the unique-id insert and + // propagates the error. Because the run row and its snapshot are written by + // one nested Prisma create, the rollback leaves no extra snapshot behind. 
+ await expect(store.createRun(input)).rejects.toThrow(); + + const after = await prisma.taskRunExecutionSnapshot.count({ where: { runId: cuid.id } }); + expect(after).toBe(1); + } + ); + + postgresTest( + "lockRunToWorker for a KSUID run returns the run with runtimeEnvironment hydrated via include (no manual stitch)", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const ksuid = RunId.generateKsuid(); + expect(isKsuidId(ksuid.id)).toBe(true); + + const input = buildCreateRunInput({ + runId: ksuid.id, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + input.data.friendlyId = ksuid.friendlyId; + + await store.createRun(input); + + const backgroundWorker = await prisma.backgroundWorker.create({ + data: { + friendlyId: "worker_friendly_v2", + version: "20260601.1", + runtimeEnvironmentId: environment.id, + projectId: project.id, + contentHash: "abc123v2", + sdkVersion: "3.0.0", + cliVersion: "3.0.0", + metadata: {}, + }, + }); + + const workerTask = await prisma.backgroundWorkerTask.create({ + data: { + friendlyId: "task_friendly_v2", + slug: "my-task", + filePath: "src/my-task.ts", + exportName: "myTask", + workerId: backgroundWorker.id, + runtimeEnvironmentId: environment.id, + projectId: project.id, + }, + }); + + const queue = await prisma.taskQueue.create({ + data: { + friendlyId: "queue_friendly_v2", + name: "task/my-task", + runtimeEnvironmentId: environment.id, + projectId: project.id, + }, + }); + + const lockedAt = new Date("2026-06-19T13:00:00.000Z"); + const startedAt = new Date("2026-06-19T13:00:01.000Z"); + const snapshotId = "snap_lock_v2_1"; + + const locked = await store.lockRunToWorker(ksuid.id, { + lockedAt, + lockedById: workerTask.id, + lockedToVersionId: backgroundWorker.id, + lockedQueueId: queue.id, + startedAt, + baseCostInCents: 5, + machinePreset: 
"small-1x", + taskVersion: "20260601.1", + sdkVersion: "3.0.0", + cliVersion: "3.0.0", + maxDurationInSeconds: null, + snapshot: { + id: snapshotId, + previousSnapshotId: undefined, + environmentId: environment.id, + environmentType: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + completedWaitpointIds: [], + completedWaitpointOrder: [], + }, + }); + + expect(locked.status).toBe("DEQUEUED"); + // The relation is hydrated by the nested `include`, not stitched manually. + expect(locked.runtimeEnvironment).toBeDefined(); + expect(locked.runtimeEnvironment.id).toBe(environment.id); + + // The run row landed (and was updated) in task_run_v2. + const v2Row = await prisma.taskRunV2.findUniqueOrThrow({ + where: { id: ksuid.id }, + select: { status: true }, + }); + expect(v2Row.status).toBe("DEQUEUED"); + + // The dequeue snapshot is keyed to the v2 run id. + const snap = await prisma.taskRunExecutionSnapshot.findUnique({ where: { id: snapshotId } }); + expect(snap?.executionStatus).toBe("PENDING_EXECUTING"); + } + ); + + postgresTest( + "findRun with a runtimeEnvironment include resolves the relation for a KSUID run", + async ({ prisma }) => { + const { organization, project, environment } = await seedEnvironment(prisma); + + const store = new PostgresRunStore({ prisma, readOnlyPrisma: prisma }); + + const ksuid = RunId.generateKsuid(); + const input = buildCreateRunInput({ + runId: ksuid.id, + organizationId: organization.id, + projectId: project.id, + runtimeEnvironmentId: environment.id, + }); + input.data.friendlyId = ksuid.friendlyId; + + await store.createRun(input); + + const run = await store.findRun({ id: ksuid.id }, { include: { runtimeEnvironment: true } }); + + expect(run?.id).toBe(ksuid.id); + expect(run?.runtimeEnvironment).toBeDefined(); + expect(run?.runtimeEnvironment.id).toBe(environment.id); + } + ); +}); diff --git a/internal-packages/run-store/src/PostgresRunStore.ts b/internal-packages/run-store/src/PostgresRunStore.ts 
/**
 * Pulls a plain string-equality value out of a Prisma-style string filter.
 *
 * @param filter - A bare string, an object carrying a string `equals`
 *   property, or anything else.
 * @returns The bare string itself, or the string held in `equals`; otherwise
 *   `undefined` (non-objects, `null`, objects without a string `equals`).
 *   Other keys sitting alongside a string `equals` do not affect the result.
 */
function stringEquality(filter: unknown): string | undefined {
  if (typeof filter === "string") {
    return filter;
  }

  if (typeof filter !== "object" || filter === null) {
    return undefined;
  }

  const equalsValue = (filter as { equals?: unknown }).equals;
  return typeof equalsValue === "string" ? equalsValue : undefined;
}
this.prisma).taskRun` + * Each method is its original single-table Prisma statement with the run + * delegate routed through `runModel`. Methods write through `tx` when supplied * so callers can opt into an existing transaction. Errors (including unique - * constraint violations) propagate to the caller unchanged. + * constraint violations) propagate unchanged. */ export class PostgresRunStore implements RunStore { private readonly prisma: PrismaClient; @@ -44,13 +69,79 @@ export class PostgresRunStore implements RunStore { this.readOnlyPrisma = options.readOnlyPrisma; } + /** + * A run lives in exactly one physical table, chosen by the FORMAT of its id: + * a KSUID id (new) lives in `task_run_v2`, the legacy cuid id in `TaskRun`. + * `task_run_v2` is an identical clone of `TaskRun` down to its relations, so + * its delegate is cast to the `taskRun` delegate type to reuse the existing + * generic `select`/`include`/nested-write passthrough unchanged. + */ + private runModel(client: PrismaClientOrTransaction, idOrFriendlyId: string) { + return isKsuidId(idOrFriendlyId) + ? (client.taskRunV2 as unknown as typeof client.taskRun) + : client.taskRun; + } + + /** + * The routing key for a single-row read: the `{ id }` or `{ friendlyId }` + * value in the `where` clause. Both carry the same KSUID/cuid body and route + * to the same physical table. Returns `undefined` for a predicate that + * addresses no specific run (e.g. an idempotency-key lookup), which must read + * both tables rather than assume one. + */ + #routingKeyOf(where: Prisma.TaskRunWhereInput): string | undefined { + return typeof where.id === "string" + ? where.id + : typeof where.friendlyId === "string" + ? where.friendlyId + : undefined; + } + + /** + * Read a single row matching a non-id predicate from BOTH physical tables. + * A key-based predicate (idempotency key, "has this env any runs") can match + * a row in either table. 
/**
 * Read a single row matching a non-id predicate from the physical run tables.
 *
 * A key-based predicate (idempotency key, "does this env have any runs") can
 * match a row in either table, so by default both are queried in parallel with
 * the SAME `where` and projection, and the `task_run_v2` match wins when both
 * sides return one.
 *
 * @param prisma - Read client to query through.
 * @param where - Predicate, applied unchanged to each table.
 * @param args - Optional `select` / `include` projection.
 * @param tables - `"legacy"` skips the `task_run_v2` query entirely; `"both"`
 *   (default) reads both tables.
 * @returns The matching row, or `null` when neither table has one.
 */
async #findFirstAcrossTables(
  prisma: ReadClient,
  where: Prisma.TaskRunWhereInput,
  args: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude },
  tables: FindRunTableScope = "both"
): Promise<unknown> {
  // Build a fresh argument object per call so the two delegates never share one.
  const firstMatchIn = (model: typeof prisma.taskRun) => model.findFirst({ where, ...args });

  // Legacy-only scope: the caller knows no match can be in task_run_v2, so
  // this stays a single-table read.
  if (tables === "legacy") {
    return firstMatchIn(prisma.taskRun);
  }

  const [fromLegacy, fromV2] = await Promise.all([
    firstMatchIn(prisma.taskRun),
    firstMatchIn(prisma.taskRunV2 as unknown as typeof prisma.taskRun),
  ]);

  // Prefer the v2 row; fall back to the legacy row (which may itself be null).
  return fromV2 ?? fromLegacy;
}
this.prisma; - return client.taskRun.create({ + return this.runModel(client, params.data.id).create({ data: { ...params.data, executionSnapshots: { @@ -112,7 +203,7 @@ export class PostgresRunStore implements RunStore { ): Promise { const client = tx ?? this.prisma; - return client.taskRun.create({ + return this.runModel(client, params.data.id).create({ include: { associatedWaitpoint: true, }, @@ -135,7 +226,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "EXECUTING", @@ -162,7 +253,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "COMPLETED_SUCCESSFULLY", @@ -198,7 +289,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { machinePreset: data.machinePreset, @@ -216,7 +307,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "PENDING" }, select: args.select, @@ -230,7 +321,7 @@ export class PostgresRunStore implements RunStore { ): Promise { const prisma = tx ?? this.prisma; - await prisma.taskRun.update({ + await this.runModel(prisma, runId).update({ where: { id: runId }, data: { bulkActionGroupIds: { @@ -254,7 +345,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? 
this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "CANCELED", @@ -284,7 +375,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: data.status, @@ -305,7 +396,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "EXPIRED", @@ -342,15 +433,41 @@ export class PostgresRunStore implements RunStore { return 0; } - return prisma.$executeRaw` - UPDATE "TaskRun" - SET "status" = 'EXPIRED'::"TaskRunStatus", - "completedAt" = ${data.now}, - "expiredAt" = ${data.now}, - "updatedAt" = ${data.now}, - "error" = ${JSON.stringify(data.error)}::jsonb - WHERE "id" IN (${Prisma.join(runIds)}) - `; + // A run lives in exactly one table, chosen by its id format. The array may + // be mixed, so partition it and run the UPDATE once per non-empty partition + // on its own table, then sum the counts. 
+ const v2Ids = runIds.filter((id) => isKsuidId(id)); + const legacyIds = runIds.filter((id) => !isKsuidId(id)); + + const error = JSON.stringify(data.error); + + let count = 0; + + if (legacyIds.length > 0) { + count += await prisma.$executeRaw` + UPDATE "TaskRun" + SET "status" = 'EXPIRED'::"TaskRunStatus", + "completedAt" = ${data.now}, + "expiredAt" = ${data.now}, + "updatedAt" = ${data.now}, + "error" = ${error}::jsonb + WHERE "id" IN (${Prisma.join(legacyIds)}) + `; + } + + if (v2Ids.length > 0) { + count += await prisma.$executeRaw` + UPDATE "task_run_v2" + SET "status" = 'EXPIRED'::"TaskRunStatus", + "completedAt" = ${data.now}, + "expiredAt" = ${data.now}, + "updatedAt" = ${data.now}, + "error" = ${error}::jsonb + WHERE "id" IN (${Prisma.join(v2Ids)}) + `; + } + + return count; } async lockRunToWorker( @@ -360,7 +477,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "DEQUEUED", @@ -404,7 +521,7 @@ export class PostgresRunStore implements RunStore { include: { runtimeEnvironment: true, }, - }); + }) as Promise>; } async parkPendingVersion( @@ -415,7 +532,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "PENDING_VERSION", @@ -431,7 +548,7 @@ export class PostgresRunStore implements RunStore { ): Promise<{ count: number }> { const prisma = tx ?? this.prisma; - const result = await prisma.taskRun.updateMany({ + const result = await this.runModel(prisma, runId).updateMany({ where: { id: runId, status: "PENDING_VERSION" }, data: { status: "PENDING" }, }); @@ -446,7 +563,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? 
this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "WAITING_TO_RESUME" }, include: args.include, @@ -460,7 +577,7 @@ export class PostgresRunStore implements RunStore { ): Promise> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "EXECUTING" }, select: args.select, @@ -474,7 +591,7 @@ export class PostgresRunStore implements RunStore { ): Promise { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { delayUntil: data.delayUntil, @@ -504,7 +621,7 @@ export class PostgresRunStore implements RunStore { ): Promise { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data: { status: "PENDING", @@ -520,7 +637,7 @@ export class PostgresRunStore implements RunStore { ): Promise { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId }, data, include: { @@ -541,16 +658,17 @@ export class PostgresRunStore implements RunStore { tx?: PrismaClientOrTransaction ): Promise<{ count: number }> { const prisma = tx ?? this.prisma; + const model = this.runModel(prisma, runId); if (options.expectedMetadataVersion !== undefined) { - const result = await prisma.taskRun.updateMany({ + const result = await model.updateMany({ where: { id: runId, metadataVersion: options.expectedMetadataVersion }, data, }); return { count: result.count }; } - await prisma.taskRun.update({ + await model.update({ where: { id: runId }, data, }); @@ -564,7 +682,7 @@ export class PostgresRunStore implements RunStore { const prisma = tx ?? 
this.prisma; if (params.byId) { - const result = await prisma.taskRun.updateMany({ + const result = await this.runModel(prisma, params.byId.runId).updateMany({ where: { id: params.byId.runId, idempotencyKey: params.byId.idempotencyKey }, data: { idempotencyKey: null, idempotencyKeyExpiresAt: null }, }); @@ -572,23 +690,48 @@ export class PostgresRunStore implements RunStore { } if (params.byPredicate) { + // No run id to route by: a matching run could be in either table during + // the mixed window, so run the predicate against both and sum the counts. + const where = { + idempotencyKey: params.byPredicate.idempotencyKey, + taskIdentifier: params.byPredicate.taskIdentifier, + runtimeEnvironmentId: params.byPredicate.runtimeEnvironmentId, + }; + const data = { idempotencyKey: null, idempotencyKeyExpiresAt: null }; + + const [legacy, v2] = await Promise.all([ + prisma.taskRun.updateMany({ where, data }), + (prisma.taskRunV2 as unknown as typeof prisma.taskRun).updateMany({ where, data }), + ]); + + return { count: legacy.count + v2.count }; + } + + // byFriendlyIds — only clears idempotencyKey, not idempotencyKeyExpiresAt. + // The friendlyId carries the same KSUID/cuid body as the id, so it routes + // the same way; partition the (possibly mixed) array and sum the counts. 
+ const v2FriendlyIds = params.byFriendlyIds.filter((friendlyId) => isKsuidId(friendlyId)); + const legacyFriendlyIds = params.byFriendlyIds.filter((friendlyId) => !isKsuidId(friendlyId)); + + let count = 0; + + if (legacyFriendlyIds.length > 0) { const result = await prisma.taskRun.updateMany({ - where: { - idempotencyKey: params.byPredicate.idempotencyKey, - taskIdentifier: params.byPredicate.taskIdentifier, - runtimeEnvironmentId: params.byPredicate.runtimeEnvironmentId, - }, - data: { idempotencyKey: null, idempotencyKeyExpiresAt: null }, + where: { friendlyId: { in: legacyFriendlyIds } }, + data: { idempotencyKey: null }, }); - return { count: result.count }; + count += result.count; } - // byFriendlyIds — only clears idempotencyKey, not idempotencyKeyExpiresAt - const result = await prisma.taskRun.updateMany({ - where: { friendlyId: { in: params.byFriendlyIds } }, - data: { idempotencyKey: null }, - }); - return { count: result.count }; + if (v2FriendlyIds.length > 0) { + const result = await (prisma.taskRunV2 as unknown as typeof prisma.taskRun).updateMany({ + where: { friendlyId: { in: v2FriendlyIds } }, + data: { idempotencyKey: null }, + }); + count += result.count; + } + + return { count }; } async pushTags( @@ -599,7 +742,7 @@ export class PostgresRunStore implements RunStore { ): Promise<{ updatedAt: Date }> { const prisma = tx ?? this.prisma; - return prisma.taskRun.update({ + return this.runModel(prisma, runId).update({ where: { id: runId, runtimeEnvironmentId: where.runtimeEnvironmentId }, data: { runTags: { push: tags } }, select: { updatedAt: true }, @@ -613,7 +756,7 @@ export class PostgresRunStore implements RunStore { ): Promise { const prisma = tx ?? 
this.prisma; - await prisma.taskRun.update({ + await this.runModel(prisma, runId).update({ where: { id: runId }, data: { realtimeStreams: { push: streamId } }, }); @@ -621,12 +764,12 @@ export class PostgresRunStore implements RunStore { findRun( where: Prisma.TaskRunWhereInput, - args: { select: S }, + args: { select: S; tables?: FindRunTableScope }, client?: ReadClient ): Promise | null>; findRun( where: Prisma.TaskRunWhereInput, - args: { include: I }, + args: { include: I; tables?: FindRunTableScope }, client?: ReadClient ): Promise | null>; findRun( @@ -635,15 +778,22 @@ export class PostgresRunStore implements RunStore { ): Promise; async findRun( where: Prisma.TaskRunWhereInput, - argsOrClient?: { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude } | ReadClient, + argsOrClient?: + | { select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude; tables?: FindRunTableScope } + | ReadClient, client?: ReadClient ): Promise { - const { args, prisma } = this.#resolveReadArgs(argsOrClient, client); + const { args, prisma, tables } = this.#resolveReadArgs(argsOrClient, client); - return prisma.taskRun.findFirst({ - where, - ...args, - }); + const routingKey = this.#routingKeyOf(where); + if (routingKey !== undefined) { + // by id / friendlyId: the id format picks exactly one table, O(1). + return this.runModel(prisma, routingKey).findFirst({ where, ...args }); + } + + // Non-id predicate (e.g. idempotency-key dedup): the match can be in + // either table, so read both (unless the caller scopes to legacy-only). 
+ return this.#findFirstAcrossTables(prisma, where, args, tables); } findRunOrThrow( @@ -667,10 +817,19 @@ export class PostgresRunStore implements RunStore { ): Promise { const { args, prisma } = this.#resolveReadArgs(argsOrClient, client); - return prisma.taskRun.findFirstOrThrow({ - where, - ...args, - }); + const routingKey = this.#routingKeyOf(where); + if (routingKey !== undefined) { + return this.runModel(prisma, routingKey).findFirstOrThrow({ where, ...args }); + } + + // Non-id predicate: read both tables, then enforce the throw-on-miss + // contract ourselves (neither table's findFirstOrThrow could see the + // other's row). + const run = await this.#findFirstAcrossTables(prisma, where, args); + if (run === null || run === undefined) { + throw new Error("PostgresRunStore.findRunOrThrow: no run matched the predicate"); + } + return run; } findRuns( @@ -681,6 +840,7 @@ export class PostgresRunStore implements RunStore { take?: number; skip?: number; cursor?: Prisma.TaskRunWhereUniqueInput; + tables?: FindRunTableScope; }, client?: ReadClient ): Promise[]>; @@ -692,6 +852,7 @@ export class PostgresRunStore implements RunStore { take?: number; skip?: number; cursor?: Prisma.TaskRunWhereUniqueInput; + tables?: FindRunTableScope; }, client?: ReadClient ): Promise[]>; @@ -702,11 +863,12 @@ export class PostgresRunStore implements RunStore { take?: number; skip?: number; cursor?: Prisma.TaskRunWhereUniqueInput; + tables?: FindRunTableScope; }, client?: ReadClient ): Promise; async findRuns( - args: { + rawArgs: { where: Prisma.TaskRunWhereInput; select?: Prisma.TaskRunSelect; include?: Prisma.TaskRunInclude; @@ -714,12 +876,444 @@ export class PostgresRunStore implements RunStore { take?: number; skip?: number; cursor?: Prisma.TaskRunWhereUniqueInput; + tables?: FindRunTableScope; }, client?: ReadClient ): Promise { const prisma = client ?? 
this.readOnlyPrisma; - return prisma.taskRun.findMany(args); + // Split the table-scope hint out of the args that get spread into Prisma + // (which would reject an unknown `tables` field) before anything reads them. + const { tables: tableScope = "both", ...args } = rawArgs; + + // A run lives in exactly one physical table, chosen by its id format. An + // `id: { in: [...] }` predicate of a single id format addresses ONE table; + // any other predicate may span both `TaskRun` (legacy cuid) and + // `task_run_v2` (new ksuid). `task_run_v2` is an identical clone of + // `TaskRun` (same relation surface), so the SAME `args` (crucially the SAME + // `where`, the security scope) run unchanged against either delegate. + const legacyModel = prisma.taskRun; + const v2Model = prisma.taskRunV2 as unknown as typeof prisma.taskRun; + + const tablesForWhere = this.#tablesForWhere(args.where); + const queryLegacy = tablesForWhere.queryLegacy; + // A "legacy" scope hint (the caller knows no run can be in task_run_v2 — e.g. + // no v2 run exists anywhere because native realtime is off) skips the empty v2 + // query, the same hot-path optimisation findRun uses. It is meant for non-id + // predicates (parentTaskRunId, idempotencyKey); pairing it with a KSUID-only + // id predicate forces both tables off and returns [] (correct: a v2 id can't + // match a legacy-only read), but id reads already route by format, so the + // hint is redundant there rather than passed by real callers. + const queryV2 = tableScope === "legacy" ? false : tablesForWhere.queryV2; + + // No candidate table (e.g. an empty `id: { in: [] }`) → matches nothing. + if (!queryLegacy && !queryV2) { + return []; + } + + // Exactly one physical table is in play. There's no cross-table merge, so + // delegate to that table's `findMany` with the args verbatim: Postgres + // orders natively (ordering by any column, incl. 
`id`, is a valid total + // order WITHIN one table) and `skip`/`cursor`/`take` are all + // single-table-valid. Only the both-table path below needs the in-memory + // comparator/merge and its keyset restrictions. + if (queryLegacy !== queryV2) { + const model = queryLegacy ? legacyModel : v2Model; + return model.findMany(args as Prisma.TaskRunFindManyArgs); + } + + // BOTH tables in play. + // + // FORWARD-LOOKING (slow legacy->v2 migration, a later stage): that migration + // copies a run into task_run_v2 before operating on it, so a run can briefly + // live in BOTH tables. When that lands, the cross-table reads below (both the + // ordered #mergeOrdered path AND the unordered concat) must DEDUP BY id, + // keeping the canonical v2 copy, or a doubly-present run is returned twice. + // Dedup needs `id` forced into the projection (and stripped when the caller + // didn't select it), and the "v2 wins" policy is part of the copy protocol, + // so it belongs with the migration PR that introduces the overlap. Today + // createRun routes by id format, so no run is in both tables and concatenation + // is already duplicate-free. + // + // Offset pagination can't be expressed across two tables (applying `skip` to + // each skips N rows from its own result, not N from the merged result), so + // reject it rather than silently double-skip. + if (args.skip !== undefined) { + throw new Error( + "RunStore.findRuns: `skip` (offset pagination) is not supported across the legacy TaskRun " + + "and task_run_v2 tables. Use a where-based keyset (createdAt + id) instead." + ); + } + + const ordered = this.#normalizeOrderBy(args.orderBy); + + // Both tables are queried here (single-table reads were delegated earlier). + // A Prisma `cursor` addresses one row in one table, and a negative `take` + // (Prisma "last N") is meaningless across a 2-way merge — neither can span + // both tables. 
No caller pairs either with a cross-table read; reject + // loudly rather than silently returning a wrong or empty result. Keyset + // callers carry their cursor in `where`, which both per-table queries honor. + if (args.cursor !== undefined) { + throw new Error( + "RunStore.findRuns: a Prisma `cursor` cannot span both run tables. " + + "Use a where-based keyset (e.g. `where: { createdAt: { lt: X } }`) instead." + ); + } + if (typeof args.take === "number" && args.take < 0) { + throw new Error( + "RunStore.findRuns: a negative `take` (Prisma 'last N') is not supported across both run tables." + ); + } + // `take` without `orderBy` across BOTH tables is non-deterministic: each + // table is capped at `take` independently, then the two capped sets are + // concatenated, so once one table fills `take` the other table's rows are + // silently dropped. Reject it (like `skip`/`cursor` above) rather than + // return a result that may omit one table. Add an `orderBy` for a bounded + // cross-table merge, or scope the predicate to a single table. + if (args.take !== undefined && ordered.length === 0) { + throw new Error( + "RunStore.findRuns: `take` without `orderBy` is not supported across both run tables " + + "(each table is capped independently, so the cap is non-deterministic and may omit one " + + "table's rows). Add an `orderBy` for a bounded cross-table merge, or scope the predicate " + + "to a single table." + ); + } + + // ORDERED + LIMITED → bounded 2-way merge. + if (ordered.length > 0 && args.take !== undefined) { + const comparator = this.#buildCrossTableComparator(ordered); + + // The in-memory comparator reads the order keys off each row, so they + // MUST be in the projection. If the caller's `select` omits one, add it + // for the query and strip it from the output. (`include`/full-row already + // carry every scalar.) 
+ const { args: queryArgs, addedKeys } = this.#withOrderKeysSelected(args, ordered); + + // Take at most `take` from each table: the merged head of two ordered + // streams of length `take` is fully determined by their first `take` rows. + const perTableArgs = { ...queryArgs, take: args.take }; + + const [legacyRows, v2Rows] = (await Promise.all([ + queryLegacy ? legacyModel.findMany(perTableArgs) : Promise.resolve([]), + queryV2 ? v2Model.findMany(perTableArgs) : Promise.resolve([]), + ])) as [Array>, Array>]; + + const merged = this.#mergeOrdered(legacyRows, v2Rows, comparator, args.take); + return this.#stripAddedKeys(merged, addedKeys); + } + + // UNORDERED / NO-LIMIT → run the SAME args against both tables and + // concatenate. A run is in exactly one table, so concatenation is complete + // and has no duplicates. (`take` without `orderBy` was rejected above; + // `orderBy` + `take` took the bounded-merge branch above.) + // + // `orderBy` without `take` still needs the order keys projected so the + // whole-set re-sort below can read them. + const { args: queryArgs, addedKeys } = + ordered.length > 0 + ? this.#withOrderKeysSelected(args, ordered) + : { args, addedKeys: [] as string[] }; + + const [legacyRows, v2Rows] = (await Promise.all([ + queryLegacy ? legacyModel.findMany(queryArgs) : Promise.resolve([]), + queryV2 ? v2Model.findMany(queryArgs) : Promise.resolve([]), + ])) as [Array>, Array>]; + + let combined = legacyRows.concat(v2Rows); + + // `orderBy` without `take`: each table came back ordered, but the + // concatenation is not — re-sort the whole bounded set to honor the order. + if (ordered.length > 0) { + const comparator = this.#buildCrossTableComparator(ordered); + combined = combined.sort(comparator); + } + + return this.#stripAddedKeys(combined, addedKeys); + } + + /** + * Which physical tables a `findRuns` predicate can match. A run id encodes + * its table, so an `id: { in: [...] 
/**
 * Decide which physical tables a `findRuns` predicate can match.
 *
 * A run id encodes its table, so three predicate shapes narrow the search:
 * - `id: { in: [...] }` queries only the table(s) whose id format appears in
 *   the list; an empty list queries neither.
 * - `id` equality (string or `{ equals }`) queries the one matching table.
 * - `friendlyId` equality does the same after dropping a leading `run_`.
 * Every other predicate must consult both tables.
 *
 * @param where - The caller's predicate.
 * @returns Flags saying whether to query `TaskRun` and/or `task_run_v2`.
 */
#tablesForWhere(where: Prisma.TaskRunWhereInput): { queryLegacy: boolean; queryV2: boolean } {
  const tableForId = (rawId: string) =>
    isKsuidId(rawId)
      ? { queryLegacy: false, queryV2: true }
      : { queryLegacy: true, queryV2: false };

  const idFilter = where.id;
  const idList =
    typeof idFilter === "object" && idFilter !== null && "in" in idFilter
      ? (idFilter as { in?: unknown }).in
      : undefined;

  if (Array.isArray(idList)) {
    // Anything that is not a KSUID string (including non-strings) counts as legacy.
    const isV2Id = (id: unknown) => typeof id === "string" && isKsuidId(id);
    return {
      queryLegacy: idList.some((id) => !isV2Id(id)),
      queryV2: idList.some((id) => isV2Id(id)),
    };
  }

  const exactId = stringEquality(idFilter);
  if (exactId !== undefined) {
    return tableForId(exactId);
  }

  const exactFriendlyId = stringEquality(where.friendlyId);
  if (exactFriendlyId !== undefined) {
    const prefix = "run_";
    const idBody = exactFriendlyId.startsWith(prefix)
      ? exactFriendlyId.slice(prefix.length)
      : exactFriendlyId;
    return tableForId(idBody);
  }

  return { queryLegacy: true, queryV2: true };
}
/**
 * Make sure every column the in-memory merge/sort reads is in the projection.
 *
 * The cross-table comparator reads order-key VALUES (plus `id`, its tiebreak)
 * off each returned row. When the caller's `select` does not actually select
 * one of those columns, it is switched on for the query and reported in
 * `addedKeys` so it can be stripped from the output afterwards. A query with
 * `include`, or with neither `select` nor `include`, is returned untouched.
 *
 * @param args - The caller's `findRuns` arguments (without `tables`).
 * @param ordered - Normalized scalar order entries.
 * @returns The args to run (possibly with an augmented `select`) and the
 *   columns that were added purely for the merge.
 */
#withOrderKeysSelected(
  args: {
    where: Prisma.TaskRunWhereInput;
    select?: Prisma.TaskRunSelect;
    include?: Prisma.TaskRunInclude;
    orderBy?: Prisma.TaskRunOrderByWithRelationInput | Prisma.TaskRunOrderByWithRelationInput[];
    take?: number;
    skip?: number;
    cursor?: Prisma.TaskRunWhereUniqueInput;
  },
  ordered: Array<{ key: string; direction: "asc" | "desc" }>
): {
  args: typeof args;
  addedKeys: string[];
} {
  if (!args.select) {
    // include / full-row: nothing to add.
    return { args, addedKeys: [] };
  }

  // The merge always tiebreaks on `id`, so it must be readable too.
  const requiredKeys = new Set<string>([...ordered.map((entry) => entry.key), "id"]);

  const augmentedSelect: Record<string, unknown> = {
    ...(args.select as Record<string, unknown>),
  };
  const addedKeys: string[] = [];

  for (const key of requiredKeys) {
    // A key that is present but `false`/`undefined` is NOT selected, so test
    // the value rather than key presence. Otherwise `select: { id: false }`
    // would leave the comparator reading `undefined` for every row.
    if (!augmentedSelect[key]) {
      augmentedSelect[key] = true;
      addedKeys.push(key);
    }
  }

  if (addedKeys.length === 0) {
    return { args, addedKeys: [] };
  }

  return { args: { ...args, select: augmentedSelect as Prisma.TaskRunSelect }, addedKeys };
}

/**
 * Remove the columns that were added purely to drive the merge.
 *
 * Mutates the given rows in place and returns the same array.
 *
 * @param rows - Rows returned by the per-table queries.
 * @param addedKeys - Columns reported by `#withOrderKeysSelected`.
 */
#stripAddedKeys(
  rows: Array<Record<string, unknown>>,
  addedKeys: string[]
): Array<Record<string, unknown>> {
  if (addedKeys.length === 0) {
    return rows;
  }

  for (const row of rows) {
    for (const key of addedKeys) {
      delete row[key];
    }
  }

  return rows;
}
/**
 * Normalize the optional `orderBy` (single object or array) into a flat list
 * of single-key order entries, preserving precedence.
 *
 * Entries whose value is `undefined` are skipped, so a conditionally-built
 * clause such as `{ createdAt: flag ? "desc" : undefined }` behaves the same
 * on the both-table path as on the single-table path, where the args go to
 * Prisma verbatim.
 *
 * @param orderBy - The caller's `orderBy`, if any.
 * @returns The ordered entries; an empty array means "no ordering requested".
 * @throws Error when an entry's value is anything other than `"asc"`,
 *   `"desc"` or `undefined` (e.g. a relation/nested sort), since that cannot
 *   be compared in memory across the two tables.
 */
#normalizeOrderBy(
  orderBy:
    | Prisma.TaskRunOrderByWithRelationInput
    | Prisma.TaskRunOrderByWithRelationInput[]
    | undefined
): Array<{ key: string; direction: "asc" | "desc" }> {
  if (orderBy === undefined) {
    return [];
  }

  const list = Array.isArray(orderBy) ? orderBy : [orderBy];
  const entries: Array<{ key: string; direction: "asc" | "desc" }> = [];

  for (const clause of list) {
    for (const [key, value] of Object.entries(clause)) {
      // An explicitly-undefined field is "not specified", not a nested sort.
      if (value === undefined) {
        continue;
      }

      // Only scalar `{ field: "asc" | "desc" }` entries are mergeable in
      // memory; flag anything else rather than mis-order across the tables.
      if (value === "asc" || value === "desc") {
        entries.push({ key, direction: value });
      } else {
        throw new Error(
          `RunStore.findRuns: cannot merge across tables on a non-scalar orderBy key "${key}". ` +
            "Ordered+limited cross-table reads must order by a scalar column (a time/createdAt field, with id as a tiebreak)."
        );
      }
    }
  }

  return entries;
}
/**
 * Build the comparator used to merge or sort rows from both run tables.
 *
 * The requested order must contain at least one time-based key (see
 * `#isTimeOrderKey`). `id` alone is rejected: cuid and ksuid ids live in
 * different id spaces, so ordering the union by `id` is meaningless. When the
 * caller did not order by `id`, it is appended as a final tiebreak using the
 * direction of the leading key, so rows sharing a timestamp compare
 * deterministically.
 *
 * @param ordered - Normalized scalar order entries, in precedence order.
 * @returns A comparator over rows that carry every order key.
 * @throws Error when no time-based key is present in `ordered`.
 */
#buildCrossTableComparator(
  ordered: Array<{ key: string; direction: "asc" | "desc" }>
): (a: Record<string, unknown>, b: Record<string, unknown>) => number {
  if (!ordered.some(({ key }) => this.#isTimeOrderKey(key))) {
    const keys = ordered.map((entry) => entry.key).join(", ");
    throw new Error(
      `RunStore.findRuns: ordered+limited read orders by [${keys}], which is not a valid total order across the ` +
        "legacy TaskRun (cuid) and task_run_v2 (ksuid) tables. Order by a time/createdAt column (id may follow as a tiebreak)."
    );
  }

  const alreadyOrdersById = ordered.some(({ key }) => key === "id");
  const sortKeys: Array<{ key: string; direction: "asc" | "desc" }> = alreadyOrdersById
    ? [...ordered]
    : [...ordered, { key: "id", direction: ordered[0].direction }];

  return (a, b) => {
    for (const { key, direction } of sortKeys) {
      const ascending = this.#compareValues(a[key], b[key]);
      if (ascending === 0) {
        continue;
      }
      // `#compareValues` is ascending; flip it for a descending key.
      return direction === "asc" ? ascending : -ascending;
    }
    return 0;
  };
}

/**
 * Whether a column name is one of the time-based columns accepted as a
 * cross-table order key.
 *
 * @param key - Column name from the caller's `orderBy`.
 */
#isTimeOrderKey(key: string): boolean {
  switch (key) {
    case "createdAt":
    case "updatedAt":
    case "completedAt":
    case "startedAt":
    case "queuedAt":
    case "lockedAt":
    case "delayUntil":
    case "expiredAt":
      return true;
    default:
      return false;
  }
}
/**
 * Ascending comparison of two scalar order values.
 *
 * - Identical values (`===`) compare equal.
 * - `null`/`undefined` compares GREATER than any present value, so it lands
 *   last in ascending order. `#buildCrossTableComparator` negates the result
 *   for descending keys, which puts it first there.
 *   NOTE(review): this appears to match PostgreSQL's default null placement
 *   (last for ASC, first for DESC); confirm against the per-table ORDER BY.
 * - Two `Date`s compare by epoch milliseconds; two numbers by difference.
 * - Everything else is stringified and compared with
 *   `localeCompare(..., "en-US")`.
 *
 * Why `localeCompare("en-US")` for strings (ids): the merge runs in memory,
 * but the keyset continuation (`id > cursor`) for the next page is evaluated
 * by Postgres. If the two orderings disagree, a tied-createdAt boundary that
 * straddles both tables can skip or duplicate a row. Per the original author,
 * the id columns inherit the database collation, which on Trigger.dev Cloud
 * is en_US.utf8. Its ordering of the id charset [0-9A-Za-z] was verified
 * (over every base62 2-gram) to match `localeCompare("en-US")` but NOT raw
 * code-unit order (e.g. "c" < "Z" under en_US, yet "Z" < "c" by code unit).
 *
 * CAVEAT (self-hosters): this hard-codes the en_US assumption. A database with
 * a different collation ("C"/"POSIX" byte order, or another locale) can
 * disagree and skip or duplicate a run at that narrow boundary. The
 * collation-independent fix is `COLLATE "C"` on the id in both the per-table
 * keyset ORDER BY and this comparator. It is deferred because it needs the
 * keyset written as raw SQL rather than a Prisma `orderBy`.
 *
 * @param a - Left value.
 * @param b - Right value.
 * @returns Negative, zero or positive, for ascending order.
 */
#compareValues(a: unknown, b: unknown): number {
  if (a === b) {
    return 0;
  }

  // Missing values go after present ones; a missing `a` is checked first.
  if (a == null) {
    return 1;
  }
  if (b == null) {
    return -1;
  }

  const bothDates = a instanceof Date && b instanceof Date;
  if (bothDates) {
    return (a as Date).getTime() - (b as Date).getTime();
  }

  const bothNumbers = typeof a === "number" && typeof b === "number";
  if (bothNumbers) {
    return (a as number) - (b as number);
  }

  return String(a).localeCompare(String(b), "en-US");
}
/**
 * Two-way merge of two already-ordered row streams into the first `take` rows
 * of their combined order.
 *
 * Both inputs are expected to be ordered by the same keys the `comparator`
 * uses. A single linear pass repeatedly takes the smaller head (the left one
 * on a tie) and stops after `take` rows or when both inputs are exhausted.
 *
 * @param left - Ordered rows from one table.
 * @param right - Ordered rows from the other table.
 * @param comparator - Ordering over rows; `<= 0` means "left goes first".
 * @param take - Maximum number of rows to return.
 * @returns A new array holding the merged head.
 */
#mergeOrdered(
  left: Array<Record<string, unknown>>,
  right: Array<Record<string, unknown>>,
  comparator: (a: Record<string, unknown>, b: Record<string, unknown>) => number,
  take: number
): Array<Record<string, unknown>> {
  const merged: Array<Record<string, unknown>> = [];
  let leftIndex = 0;
  let rightIndex = 0;

  while (merged.length < take) {
    const hasLeft = leftIndex < left.length;
    const hasRight = rightIndex < right.length;
    if (!hasLeft && !hasRight) {
      break;
    }

    // The comparator is consulted only when both streams still have a head.
    const pickLeft =
      hasLeft && (!hasRight || comparator(left[leftIndex], right[rightIndex]) <= 0);

    merged.push(pickLeft ? left[leftIndex++] : right[rightIndex++]);
  }

  return merged;
}
"both", }; } @@ -756,6 +1359,7 @@ export class PostgresRunStore implements RunStore { return { args: {}, prisma: (argsOrClient as ReadClient | undefined) ?? this.readOnlyPrisma, + tables: "both", }; } } diff --git a/internal-packages/run-store/src/types.ts b/internal-packages/run-store/src/types.ts index 319ef187814..a64476cce32 100644 --- a/internal-packages/run-store/src/types.ts +++ b/internal-packages/run-store/src/types.ts @@ -232,6 +232,18 @@ export type ClearIdempotencyKeyInput = export type TaskRunWithWaitpoint = TaskRun & { associatedWaitpoint: Waitpoint | null }; +/** + * Which physical run tables a non-id `findRun` predicate should read. + * + * Defaults to `"both"` (the safe cross-table behaviour). A caller that KNOWS + * the run can only be in the legacy table — e.g. the idempotency-key dedup for + * an org that is not cut over to `task_run_v2` — can pass `"legacy"` to skip the + * second (empty) `task_run_v2` query and keep the trigger hot path single-table. + * Only meaningful for non-id predicates; id/friendlyId reads already route to + * exactly one table by id format. 
+ */ +export type FindRunTableScope = "both" | "legacy"; + export interface RunStore { // Create createRun(params: CreateRunInput, tx?: PrismaClientOrTransaction): Promise; @@ -332,12 +344,12 @@ export interface RunStore { // Read findRun( where: Prisma.TaskRunWhereInput, - args: { select: S }, + args: { select: S; tables?: FindRunTableScope }, client?: ReadClient ): Promise | null>; findRun( where: Prisma.TaskRunWhereInput, - args: { include: I }, + args: { include: I; tables?: FindRunTableScope }, client?: ReadClient ): Promise | null>; findRun(where: Prisma.TaskRunWhereInput, client?: ReadClient): Promise; @@ -362,6 +374,7 @@ export interface RunStore { take?: number; skip?: number; cursor?: Prisma.TaskRunWhereUniqueInput; + tables?: FindRunTableScope; }, client?: ReadClient ): Promise[]>; @@ -373,6 +386,7 @@ export interface RunStore { take?: number; skip?: number; cursor?: Prisma.TaskRunWhereUniqueInput; + tables?: FindRunTableScope; }, client?: ReadClient ): Promise[]>; @@ -383,6 +397,7 @@ export interface RunStore { take?: number; skip?: number; cursor?: Prisma.TaskRunWhereUniqueInput; + tables?: FindRunTableScope; }, client?: ReadClient ): Promise; diff --git a/internal-packages/testcontainers/src/utils.ts b/internal-packages/testcontainers/src/utils.ts index 4183e85b40b..9cbaeb04ce6 100644 --- a/internal-packages/testcontainers/src/utils.ts +++ b/internal-packages/testcontainers/src/utils.ts @@ -2,6 +2,7 @@ import { createClient } from "@clickhouse/client"; import { PostgreSqlContainer, StartedPostgreSqlContainer } from "@testcontainers/postgresql"; import { RedisContainer, StartedRedisContainer } from "@testcontainers/redis"; import { tryCatch } from "@trigger.dev/core"; +import { PrismaClient } from "@trigger.dev/database"; import Redis from "ioredis"; import path from "path"; import { isDebug } from "std-env"; @@ -48,9 +49,50 @@ export async function pushDatabaseSchema(databaseUrl: string) { } ); + await dropRunForeignKeys(databaseUrl); + return result; 
}

/**
 * Production drops every foreign key that sits on, or points at, the run tables (`TaskRun` and
 * `task_run_v2`) — a run's id is just a scalar that may live in either physical table, so the FKs
 * can't be enforced. `prisma db push` doesn't know that: it recreates a constraint for every
 * relation still declared in schema.prisma, so the template DB ends up with run FKs production
 * doesn't have. That makes tests diverge — e.g. inserting a child row (a `TaskRunExecutionSnapshot`
 * whose `runId` is a `task_run_v2` id) trips a `..._runId_fkey -> TaskRun` constraint that doesn't
 * exist in prod. So after the push we strip those FKs to match production exactly.
 *
 * This is done dynamically (rather than naming each constraint) so any relation added to the schema
 * later has its test-only run FK stripped automatically. It only removes FK constraints, so it
 * cannot corrupt valid data — it makes the template DB strictly more faithful to production.
 *
 * @param databaseUrl - Connection string of the database whose run-table foreign keys are dropped.
 * @returns Resolves once the constraints are dropped and the temporary client is disconnected.
 */
async function dropRunForeignKeys(databaseUrl: string) {
  // Dedicated client pointed at `databaseUrl`; it is disconnected in `finally` below.
  const prisma = new PrismaClient({
    datasources: { db: { url: databaseUrl } },
  });

  try {
    // One anonymous PL/pgSQL block: select every foreign-key constraint (`contype = 'f'`) whose
    // owning table (`conrelid`) or referenced table (`confrelid`) is a run table, then drop each.
    // The statement is a fixed literal — nothing from `databaseUrl` is interpolated into the SQL.
    // `%I` quotes the constraint name as an identifier; the table name is inserted with `%s`
    // because it comes from `regclass::text`.
    // NOTE(review): relies on Postgres rendering regclass text already quoted where needed — confirm.
    await prisma.$executeRawUnsafe(`
DO $$
DECLARE r record;
BEGIN
  FOR r IN
    SELECT conrelid::regclass::text AS tbl, conname
    FROM pg_constraint
    WHERE contype = 'f'
      AND (confrelid IN ('"TaskRun"'::regclass, 'task_run_v2'::regclass)
        OR conrelid IN ('"TaskRun"'::regclass, 'task_run_v2'::regclass))
  LOOP
    EXECUTE format('ALTER TABLE %s DROP CONSTRAINT %I', r.tbl, r.conname);
  END LOOP;
END $$;
`);
  } finally {
    // Always release the connection, including when the DO block fails.
    await prisma.$disconnect();
  }
}

/**
 * Caps each container's CPU/memory to approximate the 2-core CI runner locally (for timing + flake
 * reproduction). Set TESTCONTAINERS_CPU (cores per container, e.g.
// Specs for the KSUID id helpers in friendlyId: the `isKsuidId` discriminator,
// the `generateKsuid` minter, and their agreement through `RunId`.
import { describe, it, expect } from "vitest";
import {
  fromFriendlyId,
  generateKsuid,
  isKsuidId,
  RunId,
  toFriendlyId,
} from "./friendlyId.js";

// Character class a KSUID body is allowed to contain.
const BASE62 = /^[0-9A-Za-z]+$/;

describe("isKsuidId", () => {
  it("is true for a freshly minted ksuid and its friendlyId", () => {
    const { id, friendlyId } = RunId.generateKsuid();

    // Both the bare body and the prefixed friendlyId must be recognised.
    expect(isKsuidId(id)).toBe(true);
    expect(isKsuidId(friendlyId)).toBe(true);
  });

  it("is false for a legacy cuid id and its friendlyId", () => {
    const { id, friendlyId } = RunId.generate();

    // sanity: legacy cuid is 25 chars
    expect(id.length).toBe(25);
    expect(isKsuidId(id)).toBe(false);
    expect(isKsuidId(friendlyId)).toBe(false);
  });

  it("is false for empty, prefix-only, and malformed input", () => {
    expect(isKsuidId("")).toBe(false);
    expect(isKsuidId("run_")).toBe(false);

    // 27 chars but contains a non-base62 char (`-`)
    const twentySevenWithDash = `${"a".repeat(26)}-`;
    expect(twentySevenWithDash).toHaveLength(27);
    expect(isKsuidId(twentySevenWithDash)).toBe(false);
    expect(isKsuidId(`run_${twentySevenWithDash}`)).toBe(false);
  });

  it("is false for a 26-char and a 28-char body", () => {
    // Off-by-one on either side of the 27-char length must be rejected,
    // with and without the friendlyId prefix.
    expect("a".repeat(26)).toHaveLength(26);
    expect(isKsuidId("a".repeat(26))).toBe(false);
    expect(isKsuidId("a".repeat(28))).toBe(false);
    expect(isKsuidId(`run_${"a".repeat(26)}`)).toBe(false);
    expect(isKsuidId(`run_${"a".repeat(28)}`)).toBe(false);
  });
});

describe("generateKsuid", () => {
  it("produces a 27-char base62 body", () => {
    const id = generateKsuid();

    expect(id).toHaveLength(27);
    expect(id).toMatch(BASE62);
  });

  it("produces unique ids across calls", () => {
    // A Set collapses duplicates, so its size equals the number of distinct ids.
    const ids = new Set(Array.from({ length: 100 }, () => generateKsuid()));

    expect(ids.size).toBe(100);
  });

  it("round-trips through toFriendlyId / fromFriendlyId", () => {
    const id = generateKsuid();
    const friendlyId = toFriendlyId("run", id);

    expect(friendlyId).toBe(`run_${id}`);
    expect(fromFriendlyId(friendlyId)).toBe(id);

    // Same round-trip through the RunId wrapper.
    const generated = RunId.generateKsuid();
    expect(generated.friendlyId).toBe(`run_${generated.id}`);
    expect(RunId.fromFriendlyId(generated.friendlyId)).toBe(generated.id);
  });

  it("is time-ordered: a later timestamp sorts after an earlier one", () => {
    // The timestamp lives in the high bytes, so a larger timestamp encodes to a
    // lexicographically-greater (left-padded, fixed-width) base62 string.
    // `Date.now` is swapped for fixed clocks and restored in `finally`, so a
    // failing expectation cannot leak the stub into other tests.
    const realNow = Date.now;
    try {
      Date.now = () => 1_500_000_000_000;
      const earlier = generateKsuid();
      Date.now = () => 1_500_000_100_000;
      const later = generateKsuid();

      expect(later > earlier).toBe(true);
      expect(isKsuidId(earlier)).toBe(true);
      expect(isKsuidId(later)).toBe(true);
    } finally {
      Date.now = realNow;
    }
  });
});

describe("isKsuidId and the minter agree", () => {
  it("isKsuidId(generateKsuid().id) === true and isKsuidId(generate().id) === false", () => {
    expect(isKsuidId(RunId.generateKsuid().id)).toBe(true);
    expect(isKsuidId(RunId.generate().id)).toBe(false);
  });
});
// KSUID epoch (2014-05-13T16:53:20Z) — seconds offset applied to the unix timestamp.
const KSUID_EPOCH = 1_400_000_000;
const KSUID_TIMESTAMP_BYTES = 4;
const KSUID_PAYLOAD_BYTES = 16;
const KSUID_TOTAL_BYTES = KSUID_TIMESTAMP_BYTES + KSUID_PAYLOAD_BYTES;
const KSUID_STRING_LENGTH = 27;
const BASE62_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

/**
 * Encode raw bytes as base62, left-padded to the given length.
 *
 * @param bytes - Big-endian byte sequence to encode.
 * @param length - Minimum output width; shorter encodings are left-padded with
 *   the alphabet's zero digit. Longer encodings are returned unpadded.
 * @returns The base62 text of `bytes`.
 */
function base62Encode(bytes: Uint8Array, length: number): string {
  // Big-endian base-256 -> base-62 conversion (repeated division).
  const digits = Array.from(bytes);
  let result = "";

  while (digits.length > 0) {
    let remainder = 0;
    const quotient: number[] = [];

    for (let i = 0; i < digits.length; i++) {
      const acc = (digits[i] ?? 0) + remainder * 256;
      const q = Math.floor(acc / 62);
      remainder = acc % 62;

      // Skip leading zeros so the quotient shrinks and the loop terminates.
      if (quotient.length > 0 || q > 0) {
        quotient.push(q);
      }
    }

    // `remainder` is always in [0, 61], so this index is always valid.
    result = BASE62_ALPHABET.charAt(remainder) + result;
    digits.length = 0;
    digits.push(...quotient);
  }

  return result.padStart(length, BASE62_ALPHABET.charAt(0));
}

/**
 * Mint a KSUID body: a 27-char, base62, time-ordered identifier.
 *
 * Layout: 4-byte big-endian uint32 timestamp (seconds since the KSUID epoch)
 * + 16 random bytes = 20 bytes, base62-encoded and left-padded to 27 chars.
 *
 * Isomorphic: relies only on `globalThis.crypto.getRandomValues` for randomness.
 *
 * @returns A freshly generated KSUID body (no entity prefix).
 */
export function generateKsuid(): string {
  const bytes = new Uint8Array(KSUID_TOTAL_BYTES);

  // Timestamp goes in the high (leading) bytes so later ids encode to larger strings.
  const timestamp = Math.floor(Date.now() / 1000) - KSUID_EPOCH;
  bytes[0] = (timestamp >>> 24) & 0xff;
  bytes[1] = (timestamp >>> 16) & 0xff;
  bytes[2] = (timestamp >>> 8) & 0xff;
  bytes[3] = timestamp & 0xff;

  // Fill the remaining payload bytes in place with random data.
  globalThis.crypto.getRandomValues(bytes.subarray(KSUID_TIMESTAMP_BYTES));

  return base62Encode(bytes, KSUID_STRING_LENGTH);
}

/**
 * Pure string discriminator: is this id (or friendlyId) a KSUID-format body?
 *
 * Strips a leading `"<prefix>_"` if present (everything up to and including the
 * first underscore), then tests the body for the KSUID shape
 * (`KSUID_STRING_LENGTH` chars, base62). The 25-char legacy cuid and any
 * malformed input return false. Never throws — including for non-string values
 * that reach it from untyped callers.
 *
 * @param idOrFriendlyId - A bare id body or a prefixed friendlyId.
 * @returns `true` only when the body is exactly a KSUID-shaped string.
 */
export function isKsuidId(idOrFriendlyId: string): boolean {
  // Runtime guard: a truthy non-string (number, object, ...) has no `indexOf`
  // and would otherwise throw, contradicting the "never throws" contract.
  if (typeof idOrFriendlyId !== "string" || idOrFriendlyId.length === 0) {
    return false;
  }

  const underscoreIndex = idOrFriendlyId.indexOf("_");
  const body =
    underscoreIndex === -1 ? idOrFriendlyId : idOrFriendlyId.slice(underscoreIndex + 1);

  // Length comes from the shared constant; the pattern only checks the charset,
  // so the width is defined in exactly one place.
  return body.length === KSUID_STRING_LENGTH && /^[0-9A-Za-z]+$/.test(body);
}
UNION (not UNION ALL) so that if a future copy step leaves a + // run briefly in both tables under the same id, the identical clones collapse + // to one row and DISTINCT ON stays unambiguous. (Raw join in an ops script, + // not a by-id RunStore read.) const runInfo = await prisma.$queryRaw< Array<{ runId: string; @@ -214,7 +219,11 @@ async function main() { r."queue", r."concurrencyKey" FROM "TaskRunExecutionSnapshot" s - INNER JOIN "TaskRun" r ON r.id = s."runId" + INNER JOIN ( + SELECT id, "organizationId", "projectId", "runtimeEnvironmentId", "taskIdentifier", "queue", "concurrencyKey" FROM "TaskRun" WHERE id = ANY(${runIds}) + UNION + SELECT id, "organizationId", "projectId", "runtimeEnvironmentId", "taskIdentifier", "queue", "concurrencyKey" FROM task_run_v2 WHERE id = ANY(${runIds}) + ) r ON r.id = s."runId" WHERE s."runId" = ANY(${runIds}) AND s."isValid" = true ORDER BY s."runId", s."createdAt" DESC