From 4cfa2b82d7afc202bb941a6d52c7383c4dafa0c2 Mon Sep 17 00:00:00 2001
From: Youngsik Yang <vacu9708@gmail.com>
Date: Mon, 8 Jun 2026 10:41:19 +0900
Subject: [PATCH 1/2] portable: accumulate in fp32 for Half/BFloat16 in softmax
 and log_softmax
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem:
Softmax and log_softmax accumulated exp(x - max) in the tensor dtype.
For BFloat16, the running sum saturates around 256 — adding 1.0 stops
changing the total — so a uniform softmax over N=512 elements outputs
~1/256 instead of 1/512.

Changes:
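Accumulate the exp-sum in float for Half/BFloat16 by threading an ACC
type through the map-reduce calls. The x - max subtraction, the exp, and
the final normalization (division for softmax, subtraction for
log_softmax) are also evaluated in ACC; only loads and stores remain in
the tensor dtype. For Float and Double inputs ACC is the tensor dtype
itself, so their behavior is unchanged.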

Continues the fp32-accumulation work in #19117.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 kernels/portable/cpu/op_log_softmax.cpp | 20 ++++++++++++++++----
 kernels/portable/cpu/op_softmax.cpp     | 22 ++++++++++++++++++----
 kernels/test/op_log_softmax_test.cpp    | 13 +++++++++++++
 kernels/test/op_softmax_test.cpp        | 13 +++++++++++++
 4 files changed, 60 insertions(+), 8 deletions(-)
diff --git a/kernels/portable/cpu/op_log_softmax.cpp b/kernels/portable/cpu/op_log_softmax.cpp
index 69d54410725..1fa7a903e7f 100644
--- a/kernels/portable/cpu/op_log_softmax.cpp
+++ b/kernels/portable/cpu/op_log_softmax.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <type_traits>
 
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -42,8 +43,16 @@ Tensor& log_softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
 
+  // For half-precision inputs, the exp-sum is accumulated in float to avoid
+  // saturation (BFloat16 saturates near 256, Half near 2048). Matches ATen's
+  // acc_type behavior. See also op_grid_sampler_2d.cpp.
   ET_SWITCH_FLOATHBF16_TYPES(
       in.scalar_type(), ctx, "_log_softmax.out", CTYPE, [&]() {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
         CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -61,11 +70,12 @@ Tensor& log_softmax_out(
                   size,
                   stride);
 
-              CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
+              ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
-                    return std::exp(val_in - max_in);
+                    return std::exp(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
                   },
-                  [](const CTYPE mapped_in, CTYPE val_accum) {
+                  [](const ACC mapped_in, ACC val_accum) {
                     return val_accum + mapped_in;
                   },
                   in_data + base,
@@ -75,7 +85,9 @@ Tensor& log_softmax_out(
 
               apply_unary_map_fn(
                   [max_in, temp_sum](const CTYPE val_in) {
-                    return val_in - max_in - temp_sum;
+                    return static_cast<CTYPE>(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in) -
+                        temp_sum);
                   },
                   in_data + base,
                   out_data + base,
diff --git a/kernels/portable/cpu/op_softmax.cpp b/kernels/portable/cpu/op_softmax.cpp
index 56ffa3c296c..81f3087ddf5 100644
--- a/kernels/portable/cpu/op_softmax.cpp
+++ b/kernels/portable/cpu/op_softmax.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <type_traits>
 
 #include <executorch/kernels/portable/cpu/util/activation_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
@@ -42,8 +43,16 @@ Tensor& softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
 
+  // For half-precision inputs, the exp-sum is accumulated in float to avoid
+  // saturation (BFloat16 saturates near 256, Half near 2048). Matches ATen's
+  // acc_type behavior. See also op_grid_sampler_2d.cpp.
   ET_SWITCH_FLOATHBF16_TYPES(
       in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
         CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
@@ -61,11 +70,12 @@ Tensor& softmax_out(
                   size,
                   stride);
 
-              const CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
+              const ACC temp_sum = apply_unary_map_reduce_fn<CTYPE, ACC>(
                   [max_in](const CTYPE val_in) {
-                    return std::exp(val_in - max_in);
+                    return std::exp(
+                        static_cast<ACC>(val_in) - static_cast<ACC>(max_in));
                   },
-                  [](const CTYPE mapped_in, CTYPE val_accum) {
+                  [](const ACC mapped_in, ACC val_accum) {
                     return val_accum + mapped_in;
                   },
                   in_data + base,
@@ -74,7 +84,11 @@ Tensor& softmax_out(
 
               apply_unary_map_fn(
                   [max_in, temp_sum](const CTYPE val_in) {
-                    return std::exp(val_in - max_in) / temp_sum;
+                    return static_cast<CTYPE>(
+                        std::exp(
+                            static_cast<ACC>(val_in) -
+                            static_cast<ACC>(max_in)) /
+                        temp_sum);
                   },
                   in_data + base,
                   out_data + base,
diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp
index 88a8660faf6..22fee57f627 100644
--- a/kernels/test/op_log_softmax_test.cpp
+++ b/kernels/test/op_log_softmax_test.cpp
@@ -369,6 +369,19 @@ TEST_F(OpLogSoftmaxOutTest, SimpleGeneratedCase) {
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
 
+TEST_F(OpLogSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // All-zero input of length N=512, so every output should be -log(512).
+  // Without fp32 accumulation the BFloat16 exp-sum saturates at ~256 and the
+  // output is ~-log(256); that error is log(2) ~= 0.693, far above atol=1e-1.
+  constexpr int N = 512;
+  Tensor x = tf.zeros({1, N});
+  Tensor out = tf.zeros({1, N});
+  op_log_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out);
+  Tensor expected = tf.full({1, N}, -std::log(static_cast<float>(N)));
+  EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-1);
+}
+
 TEST_F(OpLogSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
   TensorFactory<ScalarType::Float> tf;
 
diff --git a/kernels/test/op_softmax_test.cpp b/kernels/test/op_softmax_test.cpp
index 3f515bb4dcc..5f9b57d2d25 100644
--- a/kernels/test/op_softmax_test.cpp
+++ b/kernels/test/op_softmax_test.cpp
@@ -251,6 +251,19 @@ TEST_F(OpSoftmaxOutTest, SimpleGeneratedCase) {
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
 
+TEST_F(OpSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // All-zero input of length N=512, so every output should be 1/512.
+  // Without fp32 accumulation the BFloat16 exp-sum saturates at ~256 and the
+  // output is ~1/256; that error is 1/512 ~= 0.00195, above atol=1e-3.
+  constexpr int N = 512;
+  Tensor x = tf.zeros({1, N});
+  Tensor out = tf.zeros({1, N});
+  op_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out);
+  Tensor expected = tf.full({1, N}, 1.0f / N);
+  EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-3);
+}
+
 TEST_F(OpSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) {
   TensorFactory<ScalarType::Float> tf;
 

From 98d2f3946bf71cf862d7fda5377900e65f71d990 Mon Sep 17 00:00:00 2001
From: Youngsik Yang <vacu9708@gmail.com>
Date: Mon, 8 Jun 2026 11:00:35 +0900
Subject: [PATCH 2/2] portable: accumulate in fp32 for Half/BFloat16 in mean
 and sum

Problem:
The fast-path and generic reduction loops in mean.out and sum.IntList_out
accumulated the running sum in the tensor dtype. For BFloat16, the sum
saturates around 256, so a mean over N=512 all-ones elements gives 0.5
instead of 1.0, and summing 512 all-ones elements gives 256 instead of
512.

Changes:
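Accumulate in float for Half/BFloat16 by promoting the loop accumulator
to ACC in both the fast path and the generic path. The fast path derives
ACC from the input dtype; the generic path derives it from the output
dtype. The final result is cast back to the tensor dtype on store. All
other dtypes keep accumulating in their own type.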

Continues the fp32-accumulation work in #19117.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 kernels/portable/cpu/op_mean.cpp | 31 +++++++++++++++++++++++--------
 kernels/portable/cpu/op_sum.cpp  | 32 +++++++++++++++++++++-----------
 kernels/test/op_mean_test.cpp    | 29 +++++++++++++++++++++++++++++
 kernels/test/op_sum_test.cpp     | 29 +++++++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp
index 78ae41bb75d..9d905753250 100644
--- a/kernels/portable/cpu/op_mean.cpp
+++ b/kernels/portable/cpu/op_mean.cpp
@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -58,17 +60,24 @@ Tensor& mean_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "mean.out";
+      // For half-precision inputs, accumulate in float to avoid saturation.
+      // Matches ATen's acc_type behavior.
       ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-        const CTYPE denom = static_cast<CTYPE>(reduce_size);
+        const ACC denom = static_cast<ACC>(reduce_size);
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc / denom;
+          out_data[i] = static_cast<CTYPE>(acc / denom);
         }
       });
       return out;
@@ -83,19 +92,25 @@ Tensor& mean_dim_out(
   static constexpr const char op_name[] = "mean.out";
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      using ACC = std::conditional_t<
+          std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+              std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+          float,
+          CTYPE_OUT>;
       CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
       const size_t num = get_reduced_dim_product(in, dim_list);
       const bool success = parallel_for_each_reduce_over_dim_list_output_index(
           in, dim_list, out, [&](const auto begin, const auto end) {
             for (const auto out_ix : c10::irange(begin, end)) {
-              CTYPE_OUT sum = 0;
+              ACC sum = 0;
               if (plan.has_value()) {
-                sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                sum = plan->execute<CTYPE_IN, ACC>(
+                    [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                    [](ACC outv, ACC acc) { return acc + outv; },
                     out_ix);
               }
-              out_data[out_ix] = sum / static_cast<float>(num);
+              out_data[out_ix] =
+                  static_cast<CTYPE_OUT>(sum / static_cast<float>(num));
             }
           });
       ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
index f2d750d93b7..194d909a9e3 100644
--- a/kernels/portable/cpu/op_sum.cpp
+++ b/kernels/portable/cpu/op_sum.cpp
@@ -7,6 +7,8 @@
  */
 #include <c10/util/irange.h>
 
+#include <type_traits>
+
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
@@ -60,16 +62,23 @@ Tensor& sum_dim_out(
 
       // @lint-ignore CLANGTIDY facebook-hte-CArray
       static constexpr const char op_name[] = "sum.IntList_out";
+      // For half-precision inputs, accumulate in float to avoid saturation.
+      // Matches ATen's acc_type behavior. See also op_grid_sampler_2d.cpp.
       ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+        using ACC = std::conditional_t<
+            std::is_same_v<CTYPE, executorch::aten::Half> ||
+                std::is_same_v<CTYPE, executorch::aten::BFloat16>,
+            float,
+            CTYPE>;
         const CTYPE* in_data = in.const_data_ptr<CTYPE>();
         CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
         for (int64_t i = 0; i < outer_size; i++) {
           const CTYPE* row = in_data + i * reduce_size;
-          CTYPE acc = 0;
+          ACC acc = 0;
           for (int64_t j = 0; j < reduce_size; j++) {
             acc += row[j];
           }
-          out_data[i] = acc;
+          out_data[i] = static_cast<CTYPE>(acc);
         }
       });
       return out;
@@ -108,23 +117,24 @@ Tensor& sum_dim_out(
     ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
       ET_SWITCH_REALHBBF16_TYPES(
           out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+            using ACC = std::conditional_t<
+                std::is_same_v<CTYPE_OUT, executorch::aten::Half> ||
+                    std::is_same_v<CTYPE_OUT, executorch::aten::BFloat16>,
+                float,
+                CTYPE_OUT>;
             CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
             const bool success =
                 parallel_for_each_reduce_over_dim_list_output_index(
                     in, dim_list, out, [&](const auto begin, const auto end) {
                       for (const auto out_ix : c10::irange(begin, end)) {
-                        CTYPE_OUT sum = 0;
+                        ACC sum = 0;
                         if (plan.has_value()) {
-                          sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                              [](CTYPE_IN v) {
-                                return static_cast<CTYPE_OUT>(v);
-                              },
-                              [](CTYPE_OUT outv, CTYPE_OUT acc) {
-                                return acc + outv;
-                              },
+                          sum = plan->execute<CTYPE_IN, ACC>(
+                              [](CTYPE_IN v) { return static_cast<ACC>(v); },
+                              [](ACC outv, ACC acc) { return acc + outv; },
                               out_ix);
                         }
-                        out_data[out_ix] = sum;
+                        out_data[out_ix] = static_cast<CTYPE_OUT>(sum);
                       }
                     });
             ET_KERNEL_CHECK_MSG(
diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp
index 6633ab9c3c7..c2571929742 100644
--- a/kernels/test/op_mean_test.cpp
+++ b/kernels/test/op_mean_test.cpp
@@ -263,6 +263,35 @@ void OpMeanOutTest::
   test_mean_dim_out_bool<ScalarType::Double>();
 }
 
+TEST_F(OpMeanOutTest, BFloat16GenericPathAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // All-ones {512, 1} input reduced over dim=0, which is not the last dim,
+  // so the generic path is taken. Without fp32 accumulation the BFloat16 sum
+  // saturates at ~256, giving a mean of 256/512 = 0.5 instead of 1.0.
+  constexpr int N = 512;
+  Tensor x = tf.ones({N, 1});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 0;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
+TEST_F(OpMeanOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // All-ones {1, 512} input reduced over its last dim. Without fp32
+  // accumulation the BFloat16 sum saturates at ~256: mean 0.5, not 1.0.
+  constexpr int N = 512;
+  Tensor x = tf.ones({1, N});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 1;
+  op_mean_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, 1.0f);
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
 TEST_F(OpMeanOutTest, InvalidDimensionListDies) {
   ET_SKIP_IF(
       torch::executor::testing::SupportedFeatures::get()->is_aten,
diff --git a/kernels/test/op_sum_test.cpp b/kernels/test/op_sum_test.cpp
index 18c71b1080b..ff71bcdb383 100644
--- a/kernels/test/op_sum_test.cpp
+++ b/kernels/test/op_sum_test.cpp
@@ -307,6 +307,35 @@ class OpSumOutTest : public OperatorTest {
   }
 };
 
+TEST_F(OpSumOutTest, BFloat16GenericPathAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // All-ones {512, 1} input reduced over dim=0, which is not the last dim,
+  // so the generic path is taken. Without fp32 accumulation the BFloat16 sum
+  // saturates at ~256; the expected 512 = 2^9 is exactly representable.
+  constexpr int N = 512;
+  Tensor x = tf.ones({N, 1});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 0;
+  op_sum_intlist_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, static_cast<float>(N));
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
+TEST_F(OpSumOutTest, BFloat16LargeDimAccumulatesInFloat) {
+  TensorFactory<ScalarType::BFloat16> tf;
+  // All-ones {1, 512} input reduced over its last dim. Without fp32
+  // accumulation the BFloat16 sum saturates at ~256 instead of reaching 512.
+  constexpr int N = 512;
+  Tensor x = tf.ones({1, N});
+  Tensor out = tf.zeros({1});
+  int64_t dim = 1;
+  op_sum_intlist_out(
+      x, ArrayRef<int64_t>{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out);
+  Tensor expected = tf.full({1}, static_cast<float>(N));
+  EXPECT_TENSOR_CLOSE(out, expected);
+}
+
 TEST_F(OpSumOutTest, InvalidDimensionListDies) {
   ET_SKIP_IF(
       torch::executor::testing::SupportedFeatures::get()->is_aten,