diff --git a/.gitignore b/.gitignore index 02dcea02026..84dec789cce 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +.gitignore +my_contrib +executorch_overview.html + # System files .DS_Store diff --git a/kernels/portable/cpu/op_log_softmax.cpp b/kernels/portable/cpu/op_log_softmax.cpp index 69d54410725..1fa7a903e7f 100644 --- a/kernels/portable/cpu/op_log_softmax.cpp +++ b/kernels/portable/cpu/op_log_softmax.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -42,8 +43,16 @@ Tensor& log_softmax_out( // Adjust for negative dim dim = dim < 0 ? dim + nonzero_dim(in) : dim; + // For half-precision inputs, the exp-sum is accumulated in float to avoid + // saturation (BFloat16 saturates near 256, Half near 2048). Matches ATen's + // acc_type behavior. See also op_grid_sampler_2d.cpp. ET_SWITCH_FLOATHBF16_TYPES( in.scalar_type(), ctx, "_log_softmax.out", CTYPE, [&]() { + using ACC = std::conditional_t< + std::is_same_v || + std::is_same_v, + float, + CTYPE>; const CTYPE* const in_data = in.const_data_ptr(); CTYPE* const out_data = out.mutable_data_ptr(); @@ -61,11 +70,12 @@ Tensor& log_softmax_out( size, stride); - CTYPE temp_sum = apply_unary_map_reduce_fn( + ACC temp_sum = apply_unary_map_reduce_fn( [max_in](const CTYPE val_in) { - return std::exp(val_in - max_in); + return std::exp( + static_cast(val_in) - static_cast(max_in)); }, - [](const CTYPE mapped_in, CTYPE val_accum) { + [](const ACC mapped_in, ACC val_accum) { return val_accum + mapped_in; }, in_data + base, @@ -75,7 +85,9 @@ Tensor& log_softmax_out( apply_unary_map_fn( [max_in, temp_sum](const CTYPE val_in) { - return val_in - max_in - temp_sum; + return static_cast( + static_cast(val_in) - static_cast(max_in) - + temp_sum); }, in_data + base, out_data + base, diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp index 78ae41bb75d..9d905753250 100644 --- a/kernels/portable/cpu/op_mean.cpp +++ b/kernels/portable/cpu/op_mean.cpp @@ -7,6 +7,8 @@ */ #include +#include + #include #include #include @@ -58,17 +60,24 @@ Tensor& mean_dim_out( // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "mean.out"; + // For half-precision inputs, accumulate in float to avoid saturation. + // Matches ATen's acc_type behavior. ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { + using ACC = std::conditional_t< + std::is_same_v || + std::is_same_v, + float, + CTYPE>; const CTYPE* in_data = in.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); - const CTYPE denom = static_cast(reduce_size); + const ACC denom = static_cast(reduce_size); for (int64_t i = 0; i < outer_size; i++) { const CTYPE* row = in_data + i * reduce_size; - CTYPE acc = 0; + ACC acc = 0; for (int64_t j = 0; j < reduce_size; j++) { acc += row[j]; } - out_data[i] = acc / denom; + out_data[i] = static_cast(acc / denom); } }); return out; @@ -83,19 +92,25 @@ Tensor& mean_dim_out( static constexpr const char op_name[] = "mean.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + using ACC = std::conditional_t< + std::is_same_v || + std::is_same_v, + float, + CTYPE_OUT>; CTYPE_OUT* out_data = out.mutable_data_ptr(); const size_t num = get_reduced_dim_product(in, dim_list); const bool success = parallel_for_each_reduce_over_dim_list_output_index( in, dim_list, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - CTYPE_OUT sum = 0; + ACC sum = 0; if (plan.has_value()) { - sum = plan->execute( - [](CTYPE_IN v) { return static_cast(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + sum = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](ACC outv, ACC acc) { return acc + outv; }, out_ix); } - out_data[out_ix] = sum / static_cast(num); + out_data[out_ix] = + static_cast(sum / static_cast(num)); } }); ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); diff --git a/kernels/portable/cpu/op_softmax.cpp b/kernels/portable/cpu/op_softmax.cpp index 56ffa3c296c..81f3087ddf5 100644 --- a/kernels/portable/cpu/op_softmax.cpp +++ b/kernels/portable/cpu/op_softmax.cpp @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -42,8 +43,16 @@ Tensor& softmax_out( // Adjust for negative dim dim = dim < 0 ? dim + nonzero_dim(in) : dim; + // For half-precision inputs, the exp-sum is accumulated in float to avoid + // saturation (BFloat16 saturates near 256, Half near 2048). Matches ATen's + // acc_type behavior. See also op_grid_sampler_2d.cpp. ET_SWITCH_FLOATHBF16_TYPES( in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() { + using ACC = std::conditional_t< + std::is_same_v || + std::is_same_v, + float, + CTYPE>; const CTYPE* const in_data = in.const_data_ptr(); CTYPE* const out_data = out.mutable_data_ptr(); @@ -61,11 +70,12 @@ Tensor& softmax_out( size, stride); - const CTYPE temp_sum = apply_unary_map_reduce_fn( + const ACC temp_sum = apply_unary_map_reduce_fn( [max_in](const CTYPE val_in) { - return std::exp(val_in - max_in); + return std::exp( + static_cast(val_in) - static_cast(max_in)); }, - [](const CTYPE mapped_in, CTYPE val_accum) { + [](const ACC mapped_in, ACC val_accum) { return val_accum + mapped_in; }, in_data + base, @@ -74,7 +84,11 @@ Tensor& softmax_out( apply_unary_map_fn( [max_in, temp_sum](const CTYPE val_in) { - return std::exp(val_in - max_in) / temp_sum; + return static_cast( + std::exp( + static_cast(val_in) - + static_cast(max_in)) / + temp_sum); }, in_data + base, out_data + base, diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp index f2d750d93b7..194d909a9e3 100644 --- a/kernels/portable/cpu/op_sum.cpp +++ b/kernels/portable/cpu/op_sum.cpp @@ -7,6 +7,8 @@ */ #include +#include + #include #include #include @@ -60,16 +62,23 @@ Tensor& sum_dim_out( // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "sum.IntList_out"; + // For half-precision inputs, accumulate in float to avoid saturation. + // Matches ATen's acc_type behavior. See also op_grid_sampler_2d.cpp. ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { + using ACC = std::conditional_t< + std::is_same_v || + std::is_same_v, + float, + CTYPE>; const CTYPE* in_data = in.const_data_ptr(); CTYPE* out_data = out.mutable_data_ptr(); for (int64_t i = 0; i < outer_size; i++) { const CTYPE* row = in_data + i * reduce_size; - CTYPE acc = 0; + ACC acc = 0; for (int64_t j = 0; j < reduce_size; j++) { acc += row[j]; } - out_data[i] = acc; + out_data[i] = static_cast(acc); } }); return out; @@ -108,23 +117,24 @@ Tensor& sum_dim_out( ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { ET_SWITCH_REALHBBF16_TYPES( out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + using ACC = std::conditional_t< + std::is_same_v || + std::is_same_v, + float, + CTYPE_OUT>; CTYPE_OUT* out_data = out.mutable_data_ptr(); const bool success = parallel_for_each_reduce_over_dim_list_output_index( in, dim_list, out, [&](const auto begin, const auto end) { for (const auto out_ix : c10::irange(begin, end)) { - CTYPE_OUT sum = 0; + ACC sum = 0; if (plan.has_value()) { - sum = plan->execute( - [](CTYPE_IN v) { - return static_cast(v); - }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { - return acc + outv; - }, + sum = plan->execute( + [](CTYPE_IN v) { return static_cast(v); }, + [](ACC outv, ACC acc) { return acc + outv; }, out_ix); } - out_data[out_ix] = sum; + out_data[out_ix] = static_cast(sum); } }); ET_KERNEL_CHECK_MSG( diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp index 88a8660faf6..22fee57f627 100644 --- a/kernels/test/op_log_softmax_test.cpp +++ b/kernels/test/op_log_softmax_test.cpp @@ -369,6 +369,19 @@ TEST_F(OpLogSoftmaxOutTest, SimpleGeneratedCase) { EXPECT_TENSOR_CLOSE(out, expected_result); } +TEST_F(OpLogSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) { + TensorFactory tf; + // N=512: without fp32 accumulation, the exp-sum saturates at BFloat16's + // precision limit (~256), so the output is ~-log(256) instead of -log(512). + // atol=1e-1 can catch pre-fix error: |log(512) - log(256)| = log(2) + constexpr int N = 512; + Tensor x = tf.zeros({1, N}); + Tensor out = tf.zeros({1, N}); + op_log_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out); + Tensor expected = tf.full({1, N}, -std::log(static_cast(N))); + EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-1); +} + TEST_F(OpLogSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) { TensorFactory tf; diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp index 6633ab9c3c7..c2571929742 100644 --- a/kernels/test/op_mean_test.cpp +++ b/kernels/test/op_mean_test.cpp @@ -263,6 +263,35 @@ void OpMeanOutTest:: test_mean_dim_out_bool(); } +TEST_F(OpMeanOutTest, BFloat16GenericPathAccumulatesInFloat) { + TensorFactory tf; + // Reducing dim=0 of {512, 1} is not the last dim, so the generic path is + // taken. Without fp32 accumulation the sum saturates at ~256, giving + // 256/512 = 0.5 instead of 1.0. + constexpr int N = 512; + Tensor x = tf.ones({N, 1}); + Tensor out = tf.zeros({1}); + int64_t dim = 0; + op_mean_out( + x, ArrayRef{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out); + Tensor expected = tf.full({1}, 1.0f); + EXPECT_TENSOR_CLOSE(out, expected); +} + +TEST_F(OpMeanOutTest, BFloat16LargeDimAccumulatesInFloat) { + TensorFactory tf; + // N=512, all-ones input: without fp32 accumulation the sum saturates at + // ~256 in BFloat16, giving 256/512 = 0.5 instead of 1.0. + constexpr int N = 512; + Tensor x = tf.ones({1, N}); + Tensor out = tf.zeros({1}); + int64_t dim = 1; + op_mean_out( + x, ArrayRef{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out); + Tensor expected = tf.full({1}, 1.0f); + EXPECT_TENSOR_CLOSE(out, expected); +} + TEST_F(OpMeanOutTest, InvalidDimensionListDies) { ET_SKIP_IF( torch::executor::testing::SupportedFeatures::get()->is_aten, diff --git a/kernels/test/op_softmax_test.cpp b/kernels/test/op_softmax_test.cpp index 3f515bb4dcc..5f9b57d2d25 100644 --- a/kernels/test/op_softmax_test.cpp +++ b/kernels/test/op_softmax_test.cpp @@ -251,6 +251,19 @@ TEST_F(OpSoftmaxOutTest, SimpleGeneratedCase) { EXPECT_TENSOR_CLOSE(out, expected_result); } +TEST_F(OpSoftmaxOutTest, BFloat16LargeDimAccumulatesInFloat) { + TensorFactory tf; + // N=512: without fp32 accumulation the exp-sum saturates at BFloat16's + // precision limit (~256), so the output is ~1/256 instead of 1/512. + // 1e-3 is tight enough to catch pre-fix error: |1/256 - 1/512| ≈ 0.00195 + constexpr int N = 512; + Tensor x = tf.zeros({1, N}); + Tensor out = tf.zeros({1, N}); + op_softmax_out(x, /*dim=*/1, /*half_to_float=*/false, out); + Tensor expected = tf.full({1, N}, 1.0f / N); + EXPECT_TENSOR_CLOSE_WITH_TOL(out, expected, /*rtol=*/1e-5, /*atol=*/1e-3); +} + TEST_F(OpSoftmaxOutTest, DynamicShapeUpperBoundSameAsExpected) { TensorFactory tf; diff --git a/kernels/test/op_sum_test.cpp b/kernels/test/op_sum_test.cpp index 18c71b1080b..ff71bcdb383 100644 --- a/kernels/test/op_sum_test.cpp +++ b/kernels/test/op_sum_test.cpp @@ -307,6 +307,35 @@ class OpSumOutTest : public OperatorTest { } }; +TEST_F(OpSumOutTest, BFloat16GenericPathAccumulatesInFloat) { + TensorFactory tf; + // Reducing dim=0 of {512, 1} is not the last dim, so the generic path is + // taken. Without fp32 accumulation the sum saturates at ~256 instead of + // 512. 512 = 2^9 is exactly representable in BFloat16. + constexpr int N = 512; + Tensor x = tf.ones({N, 1}); + Tensor out = tf.zeros({1}); + int64_t dim = 0; + op_sum_intlist_out( + x, ArrayRef{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out); + Tensor expected = tf.full({1}, static_cast(N)); + EXPECT_TENSOR_CLOSE(out, expected); +} + +TEST_F(OpSumOutTest, BFloat16LargeDimAccumulatesInFloat) { + TensorFactory tf; + // N=512, all-ones input: without fp32 accumulation the sum saturates at + // ~256 in BFloat16 instead of 512. + constexpr int N = 512; + Tensor x = tf.ones({1, N}); + Tensor out = tf.zeros({1}); + int64_t dim = 1; + op_sum_intlist_out( + x, ArrayRef{&dim, 1}, /*keepdim=*/false, /*dtype=*/{}, out); + Tensor expected = tf.full({1}, static_cast(N)); + EXPECT_TENSOR_CLOSE(out, expected); +} + TEST_F(OpSumOutTest, InvalidDimensionListDies) { ET_SKIP_IF( torch::executor::testing::SupportedFeatures::get()->is_aten,