From 9580f6cb076b300f0b67a62b8d94dbb8f738df66 Mon Sep 17 00:00:00 2001
From: Scott Roy <scroy@meta.com>
Date: Tue, 2 Jun 2026 10:26:36 -0700
Subject: [PATCH 1/2] Gate CUDA CI jobs on changed paths and full-run decision

---
 .github/workflows/cuda-perf.yml | 53 ++++++++++++++++++-
 .github/workflows/cuda.yml      | 91 ++++++++++++++++++++++++++++++---
 2 files changed, 136 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index ada2fb9e696..7b24dcbbdde 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -32,8 +32,31 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
 
+permissions:
+  contents: read
+
 jobs:
+  changed-files:  # Reusable-workflow call; the gates below read its `changed-files` output.
+    name: Get changed files
+    uses: ./.github/workflows/_get-changed-files.yml
+    with:
+      include-push-diff: true  # NOTE(review): presumably also diffs push events; confirm in the callee.
+
+  run-decision:  # Reusable-workflow call; `is-full-run == 'true'` bypasses the path checks in the gates below.
+    name: CI run decision
+    uses: ./.github/workflows/_ci-run-decision.yml
+
   set-parameters:
+    needs: [changed-files, run-decision]
+    # Path-filtered: mirrors the workflow-level pull_request `paths:`
+    # filter so push commits that don't touch perf-relevant paths skip
+    # the benchmark jobs on non-sampled commits. Sampling preserves the
+    # perf time-series at every 4th commit (previously every commit).
+    if: |
+      contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
+      needs.run-decision.outputs.is-full-run == 'true'
     runs-on: ubuntu-22.04
     outputs:
       benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
@@ -145,9 +168,24 @@ jobs:
   benchmark-cuda:
     name: benchmark-cuda
     needs:
+      - changed-files
+      - run-decision
       - set-parameters
       - export-models
-    if: always()
+    # Repeat the set-parameters gate here. set-parameters and
+    # export-models cascade-skip when the gate evaluates false, but
+    # `always()` on its own would still start benchmark-cuda in that
+    # case. `always()` is kept so that benchmark-cuda runs even when
+    # some export-models matrix cells fail; the explicit gate limits
+    # that to runs where the gate itself is open.
+    if: |
+      always() &&
+      (
+        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
+        needs.run-decision.outputs.is-full-run == 'true'
+      )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -316,8 +354,19 @@ jobs:
 
   upload-benchmark-results:
     needs:
+      - changed-files
+      - run-decision
       - benchmark-cuda
-    if: always()
+    # Same gate as benchmark-cuda — skip the upload when the gate
+    # closed (no benchmarks ran).
+    if: |
+      always() &&
+      (
+        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
+        needs.run-decision.outputs.is-full-run == 'true'
+      )
     runs-on: ubuntu-22.04
     environment: upload-benchmark-results
     permissions:
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index f19b937994f..5972d064fc4 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -26,8 +26,30 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
   cancel-in-progress: false
 
+permissions:
+  contents: read
+
 jobs:
+  changed-files:  # Reusable-workflow call; the gates below read its `changed-files` output.
+    name: Get changed files
+    uses: ./.github/workflows/_get-changed-files.yml
+    with:
+      include-push-diff: true  # NOTE(review): presumably also diffs push events; confirm in the callee.
+
+  run-decision:  # Reusable-workflow call; `is-full-run == 'true'` bypasses the path checks in the gates below.
+    name: CI run decision
+    uses: ./.github/workflows/_ci-run-decision.yml
+
   test-cuda-builds:
+    needs: [changed-files, run-decision]
+    # Path-filtered: mirrors the workflow-level pull_request `paths:`
+    # filter so push commits that don't touch CUDA-relevant paths skip
+    # this job on non-sampled commits.
+    if: |
+      contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+      contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+      contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+      needs.run-decision.outputs.is-full-run == 'true'
     strategy:
       fail-fast: false
       matrix:
@@ -55,9 +77,19 @@ jobs:
 
   # This job will fail if any of the CUDA versions fail
   check-all-cuda-builds:
-    needs: test-cuda-builds
+    needs: [changed-files, run-decision, test-cuda-builds]
     runs-on: ubuntu-latest
-    if: always()
+    # Run only if the test-cuda-builds matrix actually ran (i.e. the same
+    # path/sample gate as test-cuda-builds itself). Otherwise this job would
+    # fire on every commit with needs.test-cuda-builds.result == 'skipped'.
+    if: |
+      always() &&
+      (
+        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        needs.run-decision.outputs.is-full-run == 'true'
+      )
     steps:
       - name: Check if all CUDA builds succeeded
         run: |
@@ -71,6 +103,12 @@ jobs:
 
   test-models-cuda:
     name: test-models-cuda
+    needs: [changed-files, run-decision]
+    if: |
+      contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+      contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+      contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+      needs.run-decision.outputs.is-full-run == 'true'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -106,6 +144,12 @@ jobs:
 
   unittest-cuda:
     name: unittest-cuda
+    needs: [changed-files, run-decision]
+    if: |
+      contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+      contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+      contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+      needs.run-decision.outputs.is-full-run == 'true'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -154,8 +198,19 @@ jobs:
 
   export-model-cuda-artifact:
     name: export-model-cuda-artifact
-    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
-    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available).
+    # Path-filtered on push: mirrors the workflow-level pull_request `paths:`
+    # filter so push commits that don't touch CUDA-relevant paths skip
+    # this job on non-sampled commits.
+    needs: [changed-files, run-decision]
+    if: |
+      (github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request') &&
+      (
+        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        needs.run-decision.outputs.is-full-run == 'true'
+      )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -300,7 +355,20 @@ jobs:
 
   test-model-cuda-e2e:
     name: test-model-cuda-e2e
-    needs: export-model-cuda-artifact
+    # Same path filter as export-model-cuda-artifact above. Also explicitly
+    # gated on the export job succeeding: that job can be skipped (e.g. fork
+    # PR), and this job must not run then because the artifact it downloads
+    # was never produced. NOTE(review): GitHub's docs say a skipped need
+    # already skips dependents by default, so this check may be redundant.
+    needs: [changed-files, export-model-cuda-artifact, run-decision]
+    if: |
+      needs.export-model-cuda-artifact.result == 'success' &&
+      (
+        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        needs.run-decision.outputs.is-full-run == 'true'
+      )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -417,8 +485,19 @@ jobs:
 
   test-cuda-pybind:
     name: test-cuda-pybind
-    needs: export-model-cuda-artifact
     # This job downloads models exported by export-model-cuda-artifact and runs them using pybind.
+    # Same gating as test-model-cuda-e2e — explicit success-check on the
+    # export job so a skipped export (fork PR, non-sampled push, no path
+    # match) auto-skips this job too.
+    needs: [changed-files, export-model-cuda-artifact, run-decision]
+    if: |
+      needs.export-model-cuda-artifact.result == 'success' &&
+      (
+        contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
+        contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
+        contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        needs.run-decision.outputs.is-full-run == 'true'
+      )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write

From 3e19cd74d29ad1eafaeea08b6c5362cef60e339c Mon Sep 17 00:00:00 2001
From: Scott Roy <scroy@meta.com>
Date: Tue, 2 Jun 2026 10:34:51 -0700
Subject: [PATCH 2/2] Add shared CI scripts to CUDA workflow path filters and gates

---
 .github/workflows/cuda-perf.yml |  8 ++++++++
 .github/workflows/cuda.yml      | 24 ++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/.github/workflows/cuda-perf.yml b/.github/workflows/cuda-perf.yml
index 7b24dcbbdde..1bb9b62be65 100644
--- a/.github/workflows/cuda-perf.yml
+++ b/.github/workflows/cuda-perf.yml
@@ -12,6 +12,8 @@ on:
       - .github/workflows/cuda-perf.yml
       - .ci/scripts/cuda_benchmark.py
       - .ci/scripts/cuda_perf_prompts/**
+      - .ci/scripts/export_model_artifact.sh
+      - .ci/scripts/test_model_e2e.sh
   workflow_dispatch:
     inputs:
       models:
@@ -56,6 +58,8 @@ jobs:
       contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
       contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
       contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
       needs.run-decision.outputs.is-full-run == 'true'
     runs-on: ubuntu-22.04
     outputs:
@@ -184,6 +188,8 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
         contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
         contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -365,6 +371,8 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda-perf.yml') ||
         contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_benchmark.py') ||
         contains(needs.changed-files.outputs.changed-files, '.ci/scripts/cuda_perf_prompts') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
     runs-on: ubuntu-22.04
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 5972d064fc4..eafdc3807f7 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -20,6 +20,9 @@ on:
       - .github/workflows/cuda.yml
       - backends/cuda/**
       - backends/aoti/**
+      - .ci/scripts/test-cuda-build.sh
+      - .ci/scripts/export_model_artifact.sh
+      - .ci/scripts/test_model_e2e.sh
   workflow_dispatch:
 
 concurrency:
@@ -49,6 +52,9 @@ jobs:
       contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
       contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
       contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
       needs.run-decision.outputs.is-full-run == 'true'
     strategy:
       fail-fast: false
@@ -88,6 +94,9 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
     steps:
@@ -108,6 +117,9 @@ jobs:
       contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
       contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
       contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
       needs.run-decision.outputs.is-full-run == 'true'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
@@ -149,6 +161,9 @@ jobs:
       contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
       contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
       contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
       needs.run-decision.outputs.is-full-run == 'true'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
@@ -209,6 +224,9 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -367,6 +385,9 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -496,6 +517,9 @@ jobs:
         contains(needs.changed-files.outputs.changed-files, 'backends/cuda') ||
         contains(needs.changed-files.outputs.changed-files, 'backends/aoti') ||
         contains(needs.changed-files.outputs.changed-files, '.github/workflows/cuda.yml') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test-cuda-build.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/export_model_artifact.sh') ||
+        contains(needs.changed-files.outputs.changed-files, '.ci/scripts/test_model_e2e.sh') ||
         needs.run-decision.outputs.is-full-run == 'true'
       )
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main