From eaf04ec7029e9d4a428a33a44e11c7e6a74780fa Mon Sep 17 00:00:00 2001
From: Gregory Comer <gjcomer@meta.com>
Date: Thu, 25 Jun 2026 14:11:21 -0700
Subject: [PATCH] Revert pthreadpool a56dcd7 update (backout D104906625 +
 dependent D108226589) (#20521)

Summary:

Revert the update to the third-party pthreadpool library, as it is suspected to have regressed performance on wearable PLLM/FLLM: S678566.

Differential Revision: D109744957
---
 extension/llm/custom_ops/op_sdpa_impl.h |  3 ---
 kernels/optimized/blas/CPUBlas.cpp      | 20 --------------------
 kernels/optimized/blas/CPUBlas.h        | 17 -----------------
 kernels/optimized/lib_defs.bzl          |  7 ++-----
 4 files changed, 2 insertions(+), 45 deletions(-)

diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h
index 8b923673a08..73c5ccf707f 100644
--- a/extension/llm/custom_ops/op_sdpa_impl.h
+++ b/extension/llm/custom_ops/op_sdpa_impl.h
@@ -805,9 +805,6 @@ void cpu_flash_attention(
       is_reduced_type ? reinterpret_cast<scalar_t*>(buf_reduced) : nullptr;
 
   auto compute_lambda = [&](int64_t begin, int64_t end) {
-    // Blocks are parallelized over the threadpool; keep each block's gemms
-    // single-threaded so an OpenMP-threaded BLAS doesn't nest a second layer.
-    ::executorch::cpublas::SingleThreadedGemmGuard gemm_guard;
     int64_t i = 0, j = 0, k = 0;
     data_index_init(begin, i, batchSize, j, num_head, k, qSlice);
     int ompIdx = torch::executor::get_thread_num();
diff --git a/kernels/optimized/blas/CPUBlas.cpp b/kernels/optimized/blas/CPUBlas.cpp
index 4d4baefb9e3..51a4f1ca26b 100644
--- a/kernels/optimized/blas/CPUBlas.cpp
+++ b/kernels/optimized/blas/CPUBlas.cpp
@@ -23,13 +23,6 @@ extern "C" void zgemm_(char *transa, char *transb, int *m, int *n, int *k, void
 #endif // ET_BUILD_FOR_APPLE
 #endif // ET_BUILD_WITH_BLAS
 
-#ifdef ET_CPUBLAS_MKL_OMP
-// MKL's thread-local thread-count setter. The C name aliases the Fortran
-// by-reference entry point in this MKL build, so the argument is int*. Only
-// referenced when linked against OpenMP MKL, so the strong ref always resolves.
-extern "C" int mkl_set_num_threads_local(int* nt);
-#endif // ET_CPUBLAS_MKL_OMP
-
 namespace executorch {
 namespace cpublas {
 
@@ -37,19 +30,6 @@ using executorch::aten::BFloat16;
 using executorch::aten::complex;
 using executorch::aten::Half;
 
-SingleThreadedGemmGuard::SingleThreadedGemmGuard() : prev_num_threads_(0) {
-#ifdef ET_CPUBLAS_MKL_OMP
-  int one = 1;
-  prev_num_threads_ = mkl_set_num_threads_local(&one);
-#endif // ET_CPUBLAS_MKL_OMP
-}
-
-SingleThreadedGemmGuard::~SingleThreadedGemmGuard() {
-#ifdef ET_CPUBLAS_MKL_OMP
-  mkl_set_num_threads_local(&prev_num_threads_);
-#endif // ET_CPUBLAS_MKL_OMP
-}
-
 #ifdef ET_BUILD_WITH_BLAS
 #ifdef ET_BUILD_FOR_APPLE
 inline CBLAS_TRANSPOSE to_cblas_transpose(TransposeType trans) {
diff --git a/kernels/optimized/blas/CPUBlas.h b/kernels/optimized/blas/CPUBlas.h
index baa25a97f4b..28bf68ad750 100644
--- a/kernels/optimized/blas/CPUBlas.h
+++ b/kernels/optimized/blas/CPUBlas.h
@@ -23,23 +23,6 @@ enum class TransposeType {
   ConjTranspose,
 };
 
-// Forces gemm() in its scope to run single-threaded when this library is built
-// against OpenMP-threaded MKL (-DET_CPUBLAS_MKL_OMP), so a gemm called from
-// inside a threadpool parallel region doesn't nest a second OpenMP team. No-op
-// for any other BLAS backend.
-class SingleThreadedGemmGuard {
- public:
-  SingleThreadedGemmGuard();
-  ~SingleThreadedGemmGuard();
-  SingleThreadedGemmGuard(const SingleThreadedGemmGuard&) = delete;
-  SingleThreadedGemmGuard& operator=(const SingleThreadedGemmGuard&) = delete;
-  SingleThreadedGemmGuard(SingleThreadedGemmGuard&&) = delete;
-  SingleThreadedGemmGuard& operator=(SingleThreadedGemmGuard&&) = delete;
-
- private:
-  [[maybe_unused]] int prev_num_threads_;
-};
-
 // clang-format off
 void normalize_last_dims(
     TransposeType transa, TransposeType transb,
diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl
index 42068890800..928fc44635d 100644
--- a/kernels/optimized/lib_defs.bzl
+++ b/kernels/optimized/lib_defs.bzl
@@ -175,7 +175,7 @@ def define_libs(is_fbcode=False):
         "//executorch/extension/threadpool:threadpool",
     ]
 
-    for libblas_name, mkl_dep, mkl_omp_define in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp", ["-DET_CPUBLAS_MKL_OMP"]), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl", [])]:
+    for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]:
         # Merge platform-specific kwargs
         platform_kwargs = get_apple_framework_deps_kwargs(is_fbcode)
         if not is_fbcode:
@@ -217,10 +217,7 @@ def define_libs(is_fbcode=False):
             }),
             header_namespace = "executorch/kernels/optimized",
             visibility = ["PUBLIC"],
-            preprocessor_flags = get_preprocessor_flags() + select({
-                ":linux-x86_64": mkl_omp_define,
-                "DEFAULT": [],
-            }),
+            preprocessor_flags = get_preprocessor_flags(),
             fbobjc_exported_preprocessor_flags = [
                 "-DET_BUILD_WITH_BLAS",
                 "-DET_BUILD_FOR_APPLE",