From eaf04ec7029e9d4a428a33a44e11c7e6a74780fa Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Thu, 25 Jun 2026 14:11:21 -0700 Subject: [PATCH] Revert pthreadpool a56dcd7 update (backout D104906625 + dependent D108226589) (#20521) Summary: Revert the update to the third-party pthreadpool library, as it is suspected to have regressed performance on wearable PLLM/FLLM: S678566. Differential Revision: D109744957 --- extension/llm/custom_ops/op_sdpa_impl.h | 3 --- kernels/optimized/blas/CPUBlas.cpp | 20 -------------------- kernels/optimized/blas/CPUBlas.h | 17 ----------------- kernels/optimized/lib_defs.bzl | 7 ++----- 4 files changed, 2 insertions(+), 45 deletions(-) diff --git a/extension/llm/custom_ops/op_sdpa_impl.h b/extension/llm/custom_ops/op_sdpa_impl.h index 8b923673a08..73c5ccf707f 100644 --- a/extension/llm/custom_ops/op_sdpa_impl.h +++ b/extension/llm/custom_ops/op_sdpa_impl.h @@ -805,9 +805,6 @@ void cpu_flash_attention( is_reduced_type ? reinterpret_cast(buf_reduced) : nullptr; auto compute_lambda = [&](int64_t begin, int64_t end) { - // Blocks are parallelized over the threadpool; keep each block's gemms - // single-threaded so an OpenMP-threaded BLAS doesn't nest a second layer. - ::executorch::cpublas::SingleThreadedGemmGuard gemm_guard; int64_t i = 0, j = 0, k = 0; data_index_init(begin, i, batchSize, j, num_head, k, qSlice); int ompIdx = torch::executor::get_thread_num(); diff --git a/kernels/optimized/blas/CPUBlas.cpp b/kernels/optimized/blas/CPUBlas.cpp index 4d4baefb9e3..51a4f1ca26b 100644 --- a/kernels/optimized/blas/CPUBlas.cpp +++ b/kernels/optimized/blas/CPUBlas.cpp @@ -23,13 +23,6 @@ extern "C" void zgemm_(char *transa, char *transb, int *m, int *n, int *k, void #endif // ET_BUILD_FOR_APPLE #endif // ET_BUILD_WITH_BLAS -#ifdef ET_CPUBLAS_MKL_OMP -// MKL's thread-local thread-count setter. The C name aliases the Fortran -// by-reference entry point in this MKL build, so the argument is int*. Only -// referenced when linked against OpenMP MKL, so the strong ref always resolves. -extern "C" int mkl_set_num_threads_local(int* nt); -#endif // ET_CPUBLAS_MKL_OMP - namespace executorch { namespace cpublas { @@ -37,19 +30,6 @@ using executorch::aten::BFloat16; using executorch::aten::complex; using executorch::aten::Half; -SingleThreadedGemmGuard::SingleThreadedGemmGuard() : prev_num_threads_(0) { -#ifdef ET_CPUBLAS_MKL_OMP - int one = 1; - prev_num_threads_ = mkl_set_num_threads_local(&one); -#endif // ET_CPUBLAS_MKL_OMP -} - -SingleThreadedGemmGuard::~SingleThreadedGemmGuard() { -#ifdef ET_CPUBLAS_MKL_OMP - mkl_set_num_threads_local(&prev_num_threads_); -#endif // ET_CPUBLAS_MKL_OMP -} - #ifdef ET_BUILD_WITH_BLAS #ifdef ET_BUILD_FOR_APPLE inline CBLAS_TRANSPOSE to_cblas_transpose(TransposeType trans) { diff --git a/kernels/optimized/blas/CPUBlas.h b/kernels/optimized/blas/CPUBlas.h index baa25a97f4b..28bf68ad750 100644 --- a/kernels/optimized/blas/CPUBlas.h +++ b/kernels/optimized/blas/CPUBlas.h @@ -23,23 +23,6 @@ enum class TransposeType { ConjTranspose, }; -// Forces gemm() in its scope to run single-threaded when this library is built -// against OpenMP-threaded MKL (-DET_CPUBLAS_MKL_OMP), so a gemm called from -// inside a threadpool parallel region doesn't nest a second OpenMP team. No-op -// for any other BLAS backend. -class SingleThreadedGemmGuard { - public: - SingleThreadedGemmGuard(); - ~SingleThreadedGemmGuard(); - SingleThreadedGemmGuard(const SingleThreadedGemmGuard&) = delete; - SingleThreadedGemmGuard& operator=(const SingleThreadedGemmGuard&) = delete; - SingleThreadedGemmGuard(SingleThreadedGemmGuard&&) = delete; - SingleThreadedGemmGuard& operator=(SingleThreadedGemmGuard&&) = delete; - - private: - [[maybe_unused]] int prev_num_threads_; -}; - // clang-format off void normalize_last_dims( TransposeType transa, TransposeType transb, diff --git a/kernels/optimized/lib_defs.bzl b/kernels/optimized/lib_defs.bzl index 42068890800..928fc44635d 100644 --- a/kernels/optimized/lib_defs.bzl +++ b/kernels/optimized/lib_defs.bzl @@ -175,7 +175,7 @@ def define_libs(is_fbcode=False): "//executorch/extension/threadpool:threadpool", ] - for libblas_name, mkl_dep, mkl_omp_define in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp", ["-DET_CPUBLAS_MKL_OMP"]), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl", [])]: + for libblas_name, mkl_dep in [("libblas", "fbsource//third-party/mkl:mkl_lp64_omp"), ("libblas_mkl_noomp", "fbsource//third-party/mkl:mkl")]: # Merge platform-specific kwargs platform_kwargs = get_apple_framework_deps_kwargs(is_fbcode) if not is_fbcode: @@ -217,10 +217,7 @@ def define_libs(is_fbcode=False): }), header_namespace = "executorch/kernels/optimized", visibility = ["PUBLIC"], - preprocessor_flags = get_preprocessor_flags() + select({ - ":linux-x86_64": mkl_omp_define, - "DEFAULT": [], - }), + preprocessor_flags = get_preprocessor_flags(), fbobjc_exported_preprocessor_flags = [ "-DET_BUILD_WITH_BLAS", "-DET_BUILD_FOR_APPLE",