From e71227e1b1cf189d11e9ea5602eade4a83a495dd Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 5 May 2026 14:06:57 -0400 Subject: [PATCH 1/3] feat: dynamic batch_bool masks + avx_128 / avx2_128 mask overloads Adds runtime batch_bool mask overloads of xsimd::load_masked and xsimd::store_masked across AVX, AVX2, AVX-512, SSE, SVE, RVV, and NEON; generic common-path fallback collapsed to a whole-vector select. SVE compile-time masked load/store forwarded through the runtime path so the per-lane predicate is correct on SVE wider than 128 bits. Adds arch-specific runtime-mask overloads of load_masked / store_masked for the avx_128 and avx2_128 arches so they inherit the hardware predicated load/store path on x86. Squashed from: b57a7667 feat: add runtime batch_bool mask overloads for load_masked/store_masked d5f21c70 feat: add runtime batch_bool mask overloads for avx_128 / avx2_128 --- docs/source/api/data_transfer.rst | 12 +- .../xsimd/arch/common/xsimd_common_memory.hpp | 35 +++++ include/xsimd/arch/xsimd_avx.hpp | 33 +++++ include/xsimd/arch/xsimd_avx2.hpp | 22 ++++ include/xsimd/arch/xsimd_avx2_128.hpp | 39 ++++++ include/xsimd/arch/xsimd_avx_128.hpp | 31 +++++ include/xsimd/arch/xsimd_common_fwd.hpp | 4 + include/xsimd/arch/xsimd_rvv.hpp | 24 ++++ include/xsimd/arch/xsimd_sve.hpp | 43 +++++- include/xsimd/types/xsimd_api.hpp | 64 +++++++++ include/xsimd/types/xsimd_batch.hpp | 28 +++- include/xsimd/types/xsimd_utils.hpp | 10 ++ test/test_load_store.cpp | 124 ++++++++++++------ 13 files changed, 422 insertions(+), 47 deletions(-) diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index 73f9256fd..0195e6c51 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -12,7 +12,7 @@ Data Transfers From memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`load` | load values from memory (optionally masked) | +| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -32,7 +32,7 @@ From a scalar: To memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`store` | store values to memory (optionally masked) | +| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -84,3 +84,11 @@ The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd + +.. rubric:: Footnotes + +.. [#m] Masked ``load`` / ``store`` come in two flavours. The + :cpp:class:`batch_bool_constant` overload encodes the mask in the type and + is resolved at compile time. The runtime :cpp:class:`batch_bool` overload + accepts a mask computed at runtime. Prefer the compile-time mask whenever + the selection is known at compile time. diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index bd2d14f93..de76d512a 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -437,6 +437,23 @@ namespace xsimd return detail::load_masked_common(mem, mask, cvt, mode, detail::masked_memory_uses_fp_bitcast {}); } + template + XSIMD_INLINE batch + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + // Per-lane validity contract: only active lanes of ``mem`` are + // required to be addressable. An unconditional whole-vector load + // would touch inactive lanes and trip ASan/Valgrind on partial + // buffers, so stay scalar. Arches with hardware predicated loads + // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single + // intrinsic that suppresses inactive-lane reads in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer; + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mask.get(i) ? mem[i] : T(0); + return batch::load_aligned(buffer.data()); + } + template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment mode, requires_arch) noexcept @@ -444,6 +461,24 @@ namespace xsimd detail::store_masked_common(mem, src, mask, mode, detail::masked_memory_uses_fp_bitcast {}); } + template + XSIMD_INLINE void + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + // Per-lane validity contract (matches native masked-store APIs): + // only active lanes of ``mem`` are touched. A load+select+store + // RMW would both read and write inactive bytes, breaking that + // contract — stay scalar. Arches with hardware predicated stores + // override this with a single intrinsic that suppresses inactive + // lanes in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array src_buf; + src.store_aligned(src_buf.data()); + for (std::size_t i = 0; i < size; ++i) + if (mask.get(i)) + mem[i] = src_buf[i]; + } + template XSIMD_INLINE batch load_stream(T_in const* mem, convert cvt, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index a542d3f31..2c92cd3ca 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -987,6 +987,23 @@ namespace xsimd } } + // Runtime-mask load for float/double on AVX. Both aligned_mode and + // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_ps(mem, _mm256_castps_si256(mask)); + } + + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask)); + } + // load_masked (single overload for float/double) template ::value>> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept @@ -1070,6 +1087,22 @@ namespace xsimd } } + // Runtime-mask store for float/double on AVX. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. + template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + detail::maskstore(mem, mask, src); + } + + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + detail::maskstore(mem, mask, src); + } + // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 5cb47f908..ce61a362f 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -174,6 +174,17 @@ namespace xsimd return bitwise_cast(r); } + // Runtime-mask load for 32/64-bit integers on AVX2; narrower widths fall + // back to the scalar common path. Aligned and unaligned share the same + // intrinsic — masked-off lanes do not fault regardless of alignment. + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload(reinterpret_cast(mem), __m256i(mask)); + } + // store_masked namespace detail { @@ -232,6 +243,17 @@ namespace xsimd store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } + // Runtime-mask store for 32/64-bit integers on AVX2; narrower widths fall + // back to the scalar common path. Same fault-suppression semantics as the + // masked loads above; alignment mode is irrelevant. + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + detail::maskstore(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + } + // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index c0f119e4e..0d33c64fe 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -137,6 +137,45 @@ namespace xsimd return _mm_maskstore_epi64(reinterpret_cast(mem), mask.as_batch(), src); } + // Runtime-mask path for 32/64-bit integers; narrower widths fall back to + // the common scalar path. Aligned and unaligned share the same intrinsic + // — masked-off lanes do not fault regardless of alignment. + namespace detail + { + XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept + { + return _mm_maskload_epi32(mem, mask); + } + XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept + { + return _mm_maskload_epi64(mem, mask); + } + XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept + { + _mm_maskstore_epi32(mem, mask, src); + } + XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept + { + _mm_maskstore_epi64(mem, mask, src); + } + } + + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload_128(reinterpret_cast(mem), __m128i(mask)); + } + + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + detail::maskstore_128(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + } + // gather template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 07dafd78b..29931998c 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,6 +115,22 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } + // Runtime-mask load for float/double on AVX-128. Both aligned_mode and + // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_ps(mem, _mm_castps_si128(mask)); + } + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_pd(mem, _mm_castpd_si128(mask)); + } + // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept @@ -128,6 +144,21 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } + // Runtime-mask store for float/double on AVX-128. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. + template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_ps(mem, _mm_castps_si128(mask), src); + } + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_pd(mem, _mm_castpd_si128(mask), src); + } + // swizzle (dynamic mask) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index 8c4818176..ab37e7912 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -88,8 +88,12 @@ namespace xsimd XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; + template + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; + template + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 483f5e28f..5983f525d 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -409,6 +409,11 @@ namespace xsimd { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + // Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0, + // no memory access is performed for inactive lanes (page-fault safe). + XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*)) + // Masked store: inactive lanes are not written. + XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec)) } template = 0> @@ -423,6 +428,16 @@ namespace xsimd return load_aligned(src, convert(), rvv {}); } + // load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init + // passthrough so inactive lanes read as 0, matching xsimd's contract. + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + const auto zero = detail_rvv::rvvmv_splat(proj_t {}); + return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast(mem)); + } + // load_complex namespace detail_rvv { @@ -500,6 +515,15 @@ namespace xsimd store_aligned(dst, src, rvv {}); } + // store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes + // are not written (page-fault safe). + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + detail_rvv::rvvse_m(mask, reinterpret_cast(mem), src); + } + /****************** * scatter/gather * ******************/ diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 2a46947bf..871f3b7da 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -101,11 +101,28 @@ namespace xsimd return load_aligned(src, convert(), sve {}); } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept + // load_masked (compile-time mask): build a runtime predicate from + // the constant mask and reuse the runtime-mask path. ``pmask`` only + // constructs a 128-bit chunk predicate (svdupq_b{8,16,32,64}), which + // is replication-based and does not correctly express a per-lane + // mask on SVE wider than 128 bits — going through ``as_batch_bool`` + // gives the right predicate for every vector width. ``int32``/ + // ``int64``/``uint32``/``uint64`` are excluded so the common-arch + // dispatchers that reinterpret to ``float``/``double`` win partial + // ordering (otherwise we'd be ambiguous with ``requires_arch``). + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode m, requires_arch) noexcept { - return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); + return load_masked(mem, mask.as_batch_bool(), convert {}, m, sve {}); + } + + // load_masked (runtime mask) + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return svld1(mask, reinterpret_cast const*>(mem)); } // load_complex @@ -141,6 +158,24 @@ namespace xsimd store_aligned(dst, src, sve {}); } + // store_masked (compile-time mask): forward to the runtime-mask + // path for the same reason as load_masked above; same exclusion of + // 32/64-bit integers to defer to the common dispatchers. + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode m, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), m, sve {}); + } + + // store_masked (runtime mask) + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + svst1(mask, reinterpret_cast*>(mem), src); + } + // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 043f7e38a..b1ec3e58f 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1551,6 +1551,27 @@ namespace xsimd return batch::load(ptr, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Prefer the \c batch_bool_constant overload + * whenever the mask is known at compile time. + * @param ptr the memory buffer to read + * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -1571,6 +1592,16 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2712,6 +2743,28 @@ namespace xsimd val.store(mem, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion + * is performed: \c mem must point to \c T. Prefer the \c + * batch_bool_constant overload whenever the mask is known at compile time. + * @param mem the memory buffer to write to + * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2732,6 +2785,17 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 59f87d2bb..1b97ea105 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -168,9 +168,12 @@ namespace xsimd template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode) const noexcept; + /** \brief Runtime-mask store; see xsimd::store(T*, batch const&, batch_bool, Mode). */ + template + XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; @@ -180,9 +183,12 @@ namespace xsimd XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; + /** \brief Runtime-mask load; see xsimd::load(T const*, batch_bool, Mode). */ + template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(T const* mem, batch_bool mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; @@ -756,6 +762,14 @@ namespace xsimd } } + template + template + XSIMD_INLINE batch batch::load(T const* mem, batch_bool mask, Mode mode) noexcept + { + detail::static_check_supported_config(); + return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); + } + template template XSIMD_INLINE void batch::store(U* mem, @@ -779,6 +793,16 @@ namespace xsimd } } + template + template + XSIMD_INLINE void batch::store(T* mem, + batch_bool mask, + Mode mode) const noexcept + { + detail::static_check_supported_config(); + kernel::store_masked(mem, *this, mask, mode, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp index 5dbab8551..e198c0966 100644 --- a/include/xsimd/types/xsimd_utils.hpp +++ b/include/xsimd/types/xsimd_utils.hpp @@ -457,6 +457,16 @@ namespace xsimd template using complex_batch_type_t = typename complex_batch_type::type; + + namespace details + { + // Returns a bitmask with the lowest \c size bits set. Used by masked + // load/store fast paths to detect "all lanes active". + inline constexpr uint64_t full_mask(std::size_t size) noexcept + { + return size >= 64 ? ~uint64_t(0) : ((uint64_t(1) << size) - 1); + } + } } #endif diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index c353976e5..794458154 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -23,6 +23,7 @@ struct load_store_test { using batch_type = B; using value_type = typename B::value_type; + using batch_bool_type = typename B::batch_bool_type; using index_type = typename xsimd::as_integer_t; template using allocator = xsimd::default_allocator; @@ -65,22 +66,6 @@ struct load_store_test static constexpr bool get(std::size_t index, std::size_t size) noexcept { return index >= (size / 2); } }; - struct mask_first_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index < (size > 2 ? size / 3 : std::size_t(1)); - } - }; - - struct mask_last_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index >= size - (size > 2 ? size / 3 : std::size_t(1)); - } - }; - struct mask_even { static constexpr bool get(std::size_t index, std::size_t) noexcept { return (index % 2) == 0; } @@ -104,6 +89,41 @@ struct load_store_test static constexpr bool get(std::size_t, std::size_t) noexcept { return true; } }; + template + static batch_bool_type make_runtime_mask() noexcept + { + uint64_t bits = 0; + for (std::size_t i = 0; i < size; ++i) + { + if (Generator::get(i, size)) + { + bits |= uint64_t(1) << i; + } + } + return batch_bool_type::from_mask(bits); + } + + struct compile_time_mask + { + static constexpr const char* tag = ""; + template + static auto make() noexcept + -> decltype(xsimd::make_batch_bool_constant()) + { + return xsimd::make_batch_bool_constant(); + } + }; + + struct runtime_mask + { + static constexpr const char* tag = " runtime"; + template + static batch_bool_type make() noexcept + { + return make_runtime_mask(); + } + }; + int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; @@ -366,16 +386,22 @@ struct load_store_test template void run_mask_tests(const V& v, const std::string& name, batch_type& b, const array_type& expected, std::true_type) { - run_load_mask_pattern(v, name, b, expected, " masked none"); - run_load_mask_pattern(v, name, b, expected, " masked first element"); - run_load_mask_pattern(v, name, b, expected, " masked first half"); - run_load_mask_pattern(v, name, b, expected, " masked last half"); - run_load_mask_pattern(v, name, b, expected, " masked first N"); - run_load_mask_pattern(v, name, b, expected, " masked last N"); - run_load_mask_pattern(v, name, b, expected, " masked even elements"); - run_load_mask_pattern(v, name, b, expected, " masked odd elements"); - run_load_mask_pattern(v, name, b, expected, " masked pseudo random"); - run_load_mask_pattern(v, name, b, expected, " masked all elements"); + run_load_mask_patterns(v, name, b, expected); + run_load_mask_patterns(v, name, b, expected); + } + + template + void run_load_mask_patterns(const V& v, const std::string& name, batch_type& b, const array_type& expected) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_load_mask_pattern(v, name, b, expected, p + " none"); + run_load_mask_pattern(v, name, b, expected, p + " first element"); + run_load_mask_pattern(v, name, b, expected, p + " first half"); + run_load_mask_pattern(v, name, b, expected, p + " last half"); + run_load_mask_pattern(v, name, b, expected, p + " even elements"); + run_load_mask_pattern(v, name, b, expected, p + " odd elements"); + run_load_mask_pattern(v, name, b, expected, p + " pseudo random"); + run_load_mask_pattern(v, name, b, expected, p + " all elements"); } template @@ -383,10 +409,10 @@ struct load_store_test { } - template + template void run_load_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label) { - constexpr auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); array_type expected_masked { 0 }; for (std::size_t i = 0; i < size; ++i) @@ -403,10 +429,10 @@ struct load_store_test CHECK_BATCH_EQ(b, expected_masked); } - template + template void run_store_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) { - auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); for (std::size_t i = 0; i < size; ++i) { expected_masked[i] = Generator::get(i, size) ? v[i] : value_type(); @@ -424,15 +450,21 @@ struct load_store_test template void run_store_mask_tests(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, std::true_type) { - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first element"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked even elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked odd elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked pseudo random"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked all elements"); + run_store_mask_patterns(v, name, b, res, expected_masked); + run_store_mask_patterns(v, name, b, res, expected_masked); + } + + template + void run_store_mask_patterns(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first element"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " last half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " even elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " odd elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " pseudo random"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " all elements"); } template @@ -452,6 +484,7 @@ struct load_store_test V sentinel_expected(size, sentinel); auto zero_mask = xsimd::make_batch_bool_constant(); + auto runtime_zero_mask = make_runtime_mask(); std::fill(res.begin(), res.end(), sentinel); b.store(res.data(), zero_mask, xsimd::aligned_mode()); INFO(name, " masked none aligned store"); @@ -469,6 +502,19 @@ struct load_store_test CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) { return v == sentinel; })); + std::fill(res.begin(), res.end(), sentinel); + xsimd::store(res.data(), b, runtime_zero_mask, xsimd::aligned_mode()); + INFO(name, " runtime masked none aligned store"); + CHECK_VECTOR_EQ(res, sentinel_expected); + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store(scratch_ptr, b, runtime_zero_mask, xsimd::unaligned_mode()); + INFO(name, " runtime masked none unaligned store"); + std::copy(scratch_ptr, scratch_ptr + scratch_slice.size(), scratch_slice.begin()); + CHECK_VECTOR_EQ(scratch_slice, sentinel_expected); + CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) + { return v == sentinel; })); + run_store_mask_tests(v, name, b, res, expected_masked, std::true_type {}); } From 262f5a721ddf9b697ee4fe69509f6258665bebc1 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 5 May 2026 14:42:57 -0400 Subject: [PATCH 2/3] refactor: trim masked load/store comments and consolidate AVX2-128 helpers Shorten verbose comments around masked load/store paths, drop the sizeof(int)/sizeof(long long) static_asserts (intrinsic boundaries now reinterpret_cast at the call site), and collapse the four maskload_128/maskstore_128 detail overloads into two XSIMD_IF_CONSTEXPR- dispatched templates. Public surface unchanged. --- .github/workflows/doxygen.yml | 4 +- .../xsimd/arch/common/xsimd_common_memory.hpp | 16 +- include/xsimd/arch/xsimd_avx.hpp | 15 +- include/xsimd/arch/xsimd_avx2.hpp | 108 +++-------- include/xsimd/arch/xsimd_avx2_128.hpp | 106 ++++------- include/xsimd/arch/xsimd_avx512f.hpp | 17 ++ include/xsimd/arch/xsimd_avx512vl_128.hpp | 180 +++++++++--------- include/xsimd/arch/xsimd_avx512vl_256.hpp | 179 +++++++++-------- include/xsimd/arch/xsimd_avx_128.hpp | 7 +- include/xsimd/arch/xsimd_isa.hpp | 13 +- include/xsimd/types/xsimd_utils.hpp | 10 - 11 files changed, 282 insertions(+), 373 deletions(-) diff --git a/.github/workflows/doxygen.yml b/.github/workflows/doxygen.yml index 00826b921..9d749cf36 100644 --- a/.github/workflows/doxygen.yml +++ b/.github/workflows/doxygen.yml @@ -9,6 +9,8 @@ jobs: steps: - uses: actions/checkout@v6 - name: Install dependencies - run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme + run: | + sudo apt-get update + sudo apt-get install -y doxygen python3-breathe python3-sphinx-rtd-theme - name: Render run: make -C docs diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index de76d512a..ae1478a83 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -441,12 +441,8 @@ namespace xsimd XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - // Per-lane validity contract: only active lanes of ``mem`` are - // required to be addressable. An unconditional whole-vector load - // would touch inactive lanes and trip ASan/Valgrind on partial - // buffers, so stay scalar. Arches with hardware predicated loads - // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single - // intrinsic that suppresses inactive-lane reads in hardware. + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated loads override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array buffer; for (std::size_t i = 0; i < size; ++i) @@ -465,12 +461,8 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - // Per-lane validity contract (matches native masked-store APIs): - // only active lanes of ``mem`` are touched. A load+select+store - // RMW would both read and write inactive bytes, breaking that - // contract — stay scalar. Arches with hardware predicated stores - // override this with a single intrinsic that suppresses inactive - // lanes in hardware. + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated stores override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array src_buf; src.store_aligned(src_buf.data()); diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 2c92cd3ca..3191fc922 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -987,9 +987,7 @@ namespace xsimd } } - // Runtime-mask load for float/double on AVX. Both aligned_mode and - // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault - // on masked-off lanes, so partial loads across page boundaries are safe. + // Runtime-mask load (float/double). template XSIMD_INLINE batch load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept @@ -1036,12 +1034,8 @@ namespace xsimd // store_masked namespace detail { - // True when batch_bool is the legacy VEX vector mask, i.e. it is stored - // in the same register as the data (__m256 / __m256d) rather than in an EVEX - // k-register (__mmask8) as on the avx512vl architectures. The _mm256_cast*_si256 - // path below is only well-formed for the vector-mask representation. This names - // no architecture — it tests the mask's representation, in the spirit of - // detail::masked_memory_uses_fp_bitcast. + // True when batch_bool shares the data register (__m256/__m256d) rather + // than an EVEX k-register; the _mm256_cast*_si256 path below needs the former. template using uses_vector_mask = std::is_same::register_type, typename batch::register_type>; @@ -1087,8 +1081,7 @@ namespace xsimd } } - // Runtime-mask store for float/double on AVX. Same fault-suppression - // semantics as the masked loads above; alignment mode is irrelevant. + // Runtime-mask store (float/double). template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index ce61a362f..5cebad342 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -117,18 +117,34 @@ namespace xsimd } } - // load_masked - // AVX2 low-level helpers (operate on raw SIMD registers) + // load_masked / store_masked: AVX2 has _mm256_maskload/maskstore_epi{32,64}; + // 8/16-bit integers fall back to the common scalar path. namespace detail { - XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept + template + XSIMD_INLINE __m256i maskload(T const* mem, __m256i mask) noexcept { - return _mm256_maskload_epi32(mem, mask); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_maskload_epi32(reinterpret_cast(mem), mask); + } + else + { + return _mm256_maskload_epi64(reinterpret_cast(mem), mask); + } } - XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept + template + XSIMD_INLINE void maskstore(T* mem, __m256i mask, __m256i src) noexcept { - return _mm256_maskload_epi64(reinterpret_cast(mem), mask); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); + } + else + { + _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); + } } XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept @@ -137,72 +153,22 @@ namespace xsimd } } - // single templated implementation for integer masked loads (32/64-bit) template - XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); - using int_t = std::conditional_t; - // Use the raw register-level maskload helpers for the remaining cases. - return detail::maskload(reinterpret_cast(mem), mask.as_batch()); - } - - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); - } - - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); - return bitwise_cast(r); + return detail::maskload(mem, mask.as_batch()); } - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); - } - - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); - return bitwise_cast(r); - } - - // Runtime-mask load for 32/64-bit integers on AVX2; narrower widths fall - // back to the scalar common path. Aligned and unaligned share the same - // intrinsic — masked-off lanes do not fault regardless of alignment. template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - return detail::maskload(reinterpret_cast(mem), __m256i(mask)); - } - - // store_masked - namespace detail - { - template - XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept - { - _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); - } - - template - XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept - { - _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); - } + return detail::maskload(mem, __m256i(mask)); } template ::value && (sizeof(T) >= 4)>> + typename = std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; @@ -225,33 +191,15 @@ namespace xsimd } else { - detail::maskstore(mem, mask.as_batch(), src); + detail::maskstore(mem, mask.as_batch(), src); } } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - const auto s32 = bitwise_cast(src); - store_masked(reinterpret_cast(mem), s32, batch_bool_constant {}, Mode {}, avx2 {}); - } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - const auto s64 = bitwise_cast(src); - store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); - } - - // Runtime-mask store for 32/64-bit integers on AVX2; narrower widths fall - // back to the scalar common path. Same fault-suppression semantics as the - // masked loads above; alignment mode is irrelevant. template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - detail::maskstore(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + detail::maskstore(mem, __m256i(mask), __m256i(src)); } // load_stream diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 0d33c64fe..a55a1b729 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -89,91 +89,65 @@ namespace xsimd } } - // load_masked — native 128-bit integer masked loads. Tagged on avx2_128 - // because the vpmaskmov* intrinsics require AVX2; an AVX1-only build routes - // integer masked memory through the float path in xsimd_common_memory.hpp. - // Any arch with a native masked path provides its own exact-tag overload that - // out-ranks this one, so no cross-arch exclusion is needed here. - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return _mm_maskload_epi32(mem, mask.as_batch()); - } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return _mm_maskload_epi32(reinterpret_cast(mem), mask.as_batch()); - } - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return _mm_maskload_epi64(reinterpret_cast(mem), mask.as_batch()); - } - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // load_masked / store_masked: native 128-bit integer masked memory. + // Tagged on avx2_128 because vpmaskmov* needs AVX2; an AVX1-only build + // routes integer masked memory through the float path in + // xsimd_common_memory.hpp. 8/16-bit fall back to the common scalar path. + namespace detail { - return _mm_maskload_epi64(reinterpret_cast(mem), mask.as_batch()); - } + template + XSIMD_INLINE __m128i maskload_avx2_128(T const* mem, __m128i mask) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_maskload_epi32(reinterpret_cast(mem), mask); + } + else + { + return _mm_maskload_epi64(reinterpret_cast(mem), mask); + } + } - // store_masked — native 128-bit integer masked stores (see load note above). - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - return _mm_maskstore_epi32(mem, mask.as_batch(), src); - } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - return _mm_maskstore_epi32(reinterpret_cast(mem), mask.as_batch(), src); - } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - return _mm_maskstore_epi64(reinterpret_cast(mem), mask.as_batch(), src); + template + XSIMD_INLINE void maskstore_avx2_128(T* mem, __m128i mask, __m128i src) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm_maskstore_epi32(reinterpret_cast(mem), mask, src); + } + else + { + _mm_maskstore_epi64(reinterpret_cast(mem), mask, src); + } + } } - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64(reinterpret_cast(mem), mask.as_batch(), src); + return detail::maskload_avx2_128(mem, mask.as_batch()); } - // Runtime-mask path for 32/64-bit integers; narrower widths fall back to - // the common scalar path. Aligned and unaligned share the same intrinsic - // — masked-off lanes do not fault regardless of alignment. - namespace detail + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept - { - return _mm_maskload_epi32(mem, mask); - } - XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept - { - return _mm_maskload_epi64(mem, mask); - } - XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept - { - _mm_maskstore_epi32(mem, mask, src); - } - XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept - { - _mm_maskstore_epi64(mem, mask, src); - } + detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src)); } template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - return detail::maskload_128(reinterpret_cast(mem), __m128i(mask)); + return detail::maskload_avx2_128(mem, __m128i(mask)); } template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - using int_t = std::conditional_t; - detail::maskstore_128(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + detail::maskstore_avx2_128(mem, __m128i(mask), __m128i(src)); } // gather diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index cc057eacf..db8817868 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -354,6 +354,23 @@ namespace xsimd } } + // Runtime-mask load/store: same native k-register path as the constant + // overloads above, minus the compile-time half-forwarding. 8/16-bit + // elements fall back to the common scalar path. + template = 4)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return detail::load_masked(mem, mask.mask(), Mode {}); + } + + template = 4)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + detail::store_masked(mem, src, mask.mask(), Mode {}); + } + // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp index 855870af3..ed1d1cd17 100644 --- a/include/xsimd/arch/xsimd_avx512vl_128.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -26,6 +26,14 @@ namespace xsimd namespace detail { + // Defined in xsimd_avx512f.hpp. This header is included before it so + // that avx512f.hpp's masked load/store forwarder can resolve the + // avx512vl_128 overloads below by ordinary lookup; forward-declare + // the two helpers it borrows from there. + XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept; + template + XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N]); + template XSIMD_INLINE batch_bool compare_int_avx512vl_128(batch const& self, batch const& other) noexcept { @@ -188,125 +196,111 @@ namespace xsimd return _mm_abs_epi64(self); } - // Per-type masked load/store — partial ordering picks these over the - // avx2 bridges this arch inherits. Unsigned overloads reinterpret to - // the signed EVEX intrinsic. - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Masked load/store: native 128-bit EVEX predication shared by the + // constant (batch_bool_constant) and runtime (batch_bool) overloads. + // Partial ordering picks the avx512vl_128 tag over the avx2_128 bridges + // this arch inherits — crucial because the k-register mask cannot feed + // the avx2 vpmaskmov path. 8/16-bit elements fall back to the common + // scalar path. Unsigned element types reinterpret to the signed EVEX + // intrinsic. + namespace detail { - XSIMD_IF_CONSTEXPR(std::is_same::value) + // One core per native register type; signed and unsigned integrals + // share an overload (the EVEX intrinsic is sign-agnostic). Mode + // selects aligned vs unaligned. + template = 0> + XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_load_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm_maskz_load_epi32((__mmask8)m, mem); + else + return _mm_maskz_loadu_epi32((__mmask8)m, mem); } - else + template = 0> + XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm_maskz_load_epi64((__mmask8)m, mem); + else + return _mm_maskz_loadu_epi64((__mmask8)m, mem); } - } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); - } - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE __m128 maskload128(float const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_load_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm_maskz_load_ps((__mmask8)m, mem); + else + return _mm_maskz_loadu_ps((__mmask8)m, mem); } - else + template + XSIMD_INLINE __m128d maskload128(double const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm_maskz_load_pd((__mmask8)m, mem); + else + return _mm_maskz_loadu_pd((__mmask8)m, mem); } - } - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); - } - template - XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + + template = 0> + XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept { - return _mm_maskz_load_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm_mask_store_epi32(mem, (__mmask8)m, src); + else + _mm_mask_storeu_epi32(mem, (__mmask8)m, src); } - else + template = 0> + XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm_mask_store_epi64(mem, (__mmask8)m, src); + else + _mm_mask_storeu_epi64(mem, (__mmask8)m, src); } - } - template - XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE void maskstore128(float* mem, __m128 src, uint64_t m, Mode) noexcept { - return _mm_maskz_load_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm_mask_store_ps(mem, (__mmask8)m, src); + else + _mm_mask_storeu_ps(mem, (__mmask8)m, src); } - else + template + XSIMD_INLINE void maskstore128(double* mem, __m128d src, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm_mask_store_pd(mem, (__mmask8)m, src); + else + _mm_mask_storeu_pd(mem, (__mmask8)m, src); } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_epi32(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_epi32(mem, mask.mask(), src); - } - } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); - } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_epi64(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_epi64(mem, mask.mask(), src); - } + return detail::maskload128(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); + return detail::maskload128(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_ps(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_ps(mem, mask.mask(), src); - } + detail::maskstore128(mem, src, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_pd(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_pd(mem, mask.mask(), src); - } + detail::maskstore128(mem, src, mask.mask(), Mode {}); } // max diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index c0b4a568e..bb4614552 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -26,6 +26,14 @@ namespace xsimd namespace detail { + // Defined in xsimd_avx512f.hpp. This header is included before it so + // that avx512f.hpp's masked load/store forwarder can resolve the + // avx512vl_256 overloads below by ordinary lookup; forward-declare + // the two helpers it borrows from there. + XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept; + template + XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N]); + template XSIMD_INLINE batch_bool compare_int_avx512vl_256(batch const& self, batch const& other) noexcept { @@ -188,125 +196,110 @@ namespace xsimd return _mm256_abs_epi64(self); } - // Per-type masked load/store — partial ordering picks these over the - // avx2 bridges this arch inherits. Unsigned overloads reinterpret to - // the signed EVEX intrinsic. - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Masked load/store: native 256-bit EVEX predication shared by the + // constant (batch_bool_constant) and runtime (batch_bool) overloads. + // Partial ordering picks the avx512vl_256 tag over the avx2 bridges this + // arch inherits — crucial because the k-register mask cannot feed the + // avx2 vpmaskmov path. 8/16-bit elements fall back to the common scalar + // path. Unsigned element types reinterpret to the signed EVEX intrinsic. + namespace detail { - XSIMD_IF_CONSTEXPR(std::is_same::value) + // One core per native register type; signed and unsigned integrals + // share an overload (the EVEX intrinsic is sign-agnostic). Mode + // selects aligned vs unaligned. + template = 0> + XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm256_maskz_load_epi32((__mmask8)m, mem); + else + return _mm256_maskz_loadu_epi32((__mmask8)m, mem); } - else + template = 0> + XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm256_maskz_load_epi64((__mmask8)m, mem); + else + return _mm256_maskz_loadu_epi64((__mmask8)m, mem); } - } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); - } - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE __m256 maskload256(float const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm256_maskz_load_ps((__mmask8)m, mem); + else + return _mm256_maskz_loadu_ps((__mmask8)m, mem); } - else + template + XSIMD_INLINE __m256d maskload256(double const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + return _mm256_maskz_load_pd((__mmask8)m, mem); + else + return _mm256_maskz_loadu_pd((__mmask8)m, mem); } - } - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); - } - template - XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + + template = 0> + XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm256_mask_store_epi32(mem, (__mmask8)m, src); + else + _mm256_mask_storeu_epi32(mem, (__mmask8)m, src); } - else + template = 0> + XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm256_mask_store_epi64(mem, (__mmask8)m, src); + else + _mm256_mask_storeu_epi64(mem, (__mmask8)m, src); } - } - template - XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE void maskstore256(float* mem, __m256 src, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm256_mask_store_ps(mem, (__mmask8)m, src); + else + _mm256_mask_storeu_ps(mem, (__mmask8)m, src); } - else + template + XSIMD_INLINE void maskstore256(double* mem, __m256d src, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + _mm256_mask_store_pd(mem, (__mmask8)m, src); + else + _mm256_mask_storeu_pd(mem, (__mmask8)m, src); } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_epi32(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); - } - } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); - } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_epi64(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); - } + return detail::maskload256(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); + return detail::maskload256(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_ps(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_ps(mem, mask.mask(), src); - } + detail::maskstore256(mem, src, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_pd(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_pd(mem, mask.mask(), src); - } + detail::maskstore256(mem, src, mask.mask(), Mode {}); } // max diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 29931998c..e534ceba0 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,9 +115,7 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } - // Runtime-mask load for float/double on AVX-128. Both aligned_mode and - // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault - // on masked-off lanes, so partial loads across page boundaries are safe. + // Runtime-mask load (float/double). template XSIMD_INLINE batch load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept @@ -144,8 +142,7 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } - // Runtime-mask store for float/double on AVX-128. Same fault-suppression - // semantics as the masked loads above; alignment mode is irrelevant. + // Runtime-mask store (float/double). template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 06edfa98f..ff9dc381d 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -74,14 +74,23 @@ #include "./xsimd_fma3_avx2.hpp" #endif +#if XSIMD_WITH_AVX512VL +// The 128/256-bit AVX512VL sub-arches derive from the AVX2 lineage (not AVX512F) +// and carry the k-register masked load/store overloads. avx512f.hpp's masked +// load/store forwards to the 256-bit sized-batch arch (avx512vl_256) via an +// unqualified dependent call, which clang only resolves through ordinary lookup +// at the point of definition (ADL cannot reach xsimd::kernel from xsimd-namespace +// arguments). The sub-arch overloads must therefore be declared beforehand. +#include "./xsimd_avx512vl_128.hpp" +#include "./xsimd_avx512vl_256.hpp" +#endif + #if XSIMD_WITH_AVX512F #include "./xsimd_avx512f.hpp" #endif #if XSIMD_WITH_AVX512VL #include "./xsimd_avx512vl.hpp" -#include "./xsimd_avx512vl_128.hpp" -#include "./xsimd_avx512vl_256.hpp" #endif #if XSIMD_WITH_AVX512DQ diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp index e198c0966..5dbab8551 100644 --- a/include/xsimd/types/xsimd_utils.hpp +++ b/include/xsimd/types/xsimd_utils.hpp @@ -457,16 +457,6 @@ namespace xsimd template using complex_batch_type_t = typename complex_batch_type::type; - - namespace details - { - // Returns a bitmask with the lowest \c size bits set. Used by masked - // load/store fast paths to detect "all lanes active". - inline constexpr uint64_t full_mask(std::size_t size) noexcept - { - return size >= 64 ? ~uint64_t(0) : ((uint64_t(1) << size) - 1); - } - } } #endif From 6e282690dfbac356e24b4f77e1e695537449ba3d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 10 Jun 2026 14:54:29 -0400 Subject: [PATCH 3/3] perf: native AVX512BW masked load/store for 8/16-bit integers 8/16-bit int masked load/store on AVX512BW previously fell through to the branchy common scalar fallback because xsimd_avx512bw.hpp had no load_masked/store_masked overloads. Add four requires_arch overloads (runtime batch_bool + compile-time batch_bool_constant, load + store) constrained to sizeof(T)==1||2, emitting the native vmovdqu8 / vmovdqu16 predicated moves (2 instructions, no branch). The size branch lives only in the runtime overloads; the constant overloads delegate via mask.as_batch_bool(), which also avoids batch_bool_constant::mask() (return type int) truncating a 64-lane int8 compile-time mask. 32/64-bit stays on the avx512f path; SSE/AVX2 8/16-bit scalar fallback is hardware-forced and unchanged. --- docs/source/api/data_transfer.rst | 4 +- .../xsimd/arch/common/xsimd_common_memory.hpp | 4 +- include/xsimd/arch/xsimd_avx2.hpp | 4 ++ include/xsimd/arch/xsimd_avx512bw.hpp | 47 +++++++++++++++++++ include/xsimd/arch/xsimd_avx512f.hpp | 3 +- include/xsimd/arch/xsimd_avx512vl_128.hpp | 32 +++++++++++++ include/xsimd/arch/xsimd_avx512vl_256.hpp | 32 +++++++++++++ include/xsimd/arch/xsimd_isa.hpp | 7 ++- 8 files changed, 124 insertions(+), 9 deletions(-) diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index 0195e6c51..38b4ab390 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -90,5 +90,5 @@ The following empty types are used for tag dispatching: .. [#m] Masked ``load`` / ``store`` come in two flavours. The :cpp:class:`batch_bool_constant` overload encodes the mask in the type and is resolved at compile time. The runtime :cpp:class:`batch_bool` overload - accepts a mask computed at runtime. Prefer the compile-time mask whenever - the selection is known at compile time. + accepts a mask computed at runtime. For performance reasons, prefer the + compile-time mask whenever possible. diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index ae1478a83..a77b5aa1c 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -442,7 +442,7 @@ namespace xsimd load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { // Scalar fallback: only active lanes are touched. Arches with - // hardware predicated loads override this. + // hardware predicated loads should override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array buffer; for (std::size_t i = 0; i < size; ++i) @@ -462,7 +462,7 @@ namespace xsimd store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { // Scalar fallback: only active lanes are touched. Arches with - // hardware predicated stores override this. + // hardware predicated stores should override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array src_buf; src.store_aligned(src_buf.data()); diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 5cebad342..abac721b0 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -126,10 +126,12 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { + static_assert(sizeof(int) == 4, "_mm256_maskload_epi32 requires a 4-byte int"); return _mm256_maskload_epi32(reinterpret_cast(mem), mask); } else { + static_assert(sizeof(long long) == 8, "_mm256_maskload_epi64 requires an 8-byte long long"); return _mm256_maskload_epi64(reinterpret_cast(mem), mask); } } @@ -139,10 +141,12 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { + static_assert(sizeof(int) == 4, "_mm256_maskstore_epi32 requires a 4-byte int"); _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); } else { + static_assert(sizeof(long long) == 8, "_mm256_maskstore_epi64 requires an 8-byte long long"); _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); } } diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 57894a831..f6a78366c 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -378,6 +378,53 @@ namespace xsimd } } + // load_masked / store_masked: native vmovdqu8 / vmovdqu16 predication for + // 8/16-bit, replacing the common scalar fallback. No aligned masked 8/16 + // intrinsic exists and masked moves never fault, so loadu fits both modes. + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_maskz_loadu_epi8((__mmask64)mask.mask(), mem); + } + else + { + return _mm512_maskz_loadu_epi16((__mmask32)mask.mask(), mem); + } + } + + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + _mm512_mask_storeu_epi8((void*)mem, (__mmask64)mask.mask(), src); + } + else + { + _mm512_mask_storeu_epi16((void*)mem, (__mmask32)mask.mask(), src); + } + } + + // Constant masks reuse the runtime overloads; as_batch_bool() also avoids + // batch_bool_constant::mask() truncating a 64-lane int8 mask to int. + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return load_masked(mem, mask.as_batch_bool(), convert {}, Mode {}, avx512bw {}); + } + + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), Mode {}, avx512bw {}); + } + // max template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index db8817868..ca1b38079 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -356,7 +356,8 @@ namespace xsimd // Runtime-mask load/store: same native k-register path as the constant // overloads above, minus the compile-time half-forwarding. 8/16-bit - // elements fall back to the common scalar path. + // elements are handled natively by avx512bw (vmovdqu8 / vmovdqu16); + // without AVX512BW they fall back to the common scalar path. template = 4)>> XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp index ed1d1cd17..26c49b562 100644 --- a/include/xsimd/arch/xsimd_avx512vl_128.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -212,66 +212,98 @@ namespace xsimd XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm_maskz_load_epi32((__mmask8)m, mem); + } else + { return _mm_maskz_loadu_epi32((__mmask8)m, mem); + } } template = 0> XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm_maskz_load_epi64((__mmask8)m, mem); + } else + { return _mm_maskz_loadu_epi64((__mmask8)m, mem); + } } template XSIMD_INLINE __m128 maskload128(float const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm_maskz_load_ps((__mmask8)m, mem); + } else + { return _mm_maskz_loadu_ps((__mmask8)m, mem); + } } template XSIMD_INLINE __m128d maskload128(double const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm_maskz_load_pd((__mmask8)m, mem); + } else + { return _mm_maskz_loadu_pd((__mmask8)m, mem); + } } template = 0> XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm_mask_store_epi32(mem, (__mmask8)m, src); + } else + { _mm_mask_storeu_epi32(mem, (__mmask8)m, src); + } } template = 0> XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm_mask_store_epi64(mem, (__mmask8)m, src); + } else + { _mm_mask_storeu_epi64(mem, (__mmask8)m, src); + } } template XSIMD_INLINE void maskstore128(float* mem, __m128 src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm_mask_store_ps(mem, (__mmask8)m, src); + } else + { _mm_mask_storeu_ps(mem, (__mmask8)m, src); + } } template XSIMD_INLINE void maskstore128(double* mem, __m128d src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm_mask_store_pd(mem, (__mmask8)m, src); + } else + { _mm_mask_storeu_pd(mem, (__mmask8)m, src); + } } } diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index bb4614552..9731574c8 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -211,66 +211,98 @@ namespace xsimd XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm256_maskz_load_epi32((__mmask8)m, mem); + } else + { return _mm256_maskz_loadu_epi32((__mmask8)m, mem); + } } template = 0> XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm256_maskz_load_epi64((__mmask8)m, mem); + } else + { return _mm256_maskz_loadu_epi64((__mmask8)m, mem); + } } template XSIMD_INLINE __m256 maskload256(float const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm256_maskz_load_ps((__mmask8)m, mem); + } else + { return _mm256_maskz_loadu_ps((__mmask8)m, mem); + } } template XSIMD_INLINE __m256d maskload256(double const* mem, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { return _mm256_maskz_load_pd((__mmask8)m, mem); + } else + { return _mm256_maskz_loadu_pd((__mmask8)m, mem); + } } template = 0> XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm256_mask_store_epi32(mem, (__mmask8)m, src); + } else + { _mm256_mask_storeu_epi32(mem, (__mmask8)m, src); + } } template = 0> XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm256_mask_store_epi64(mem, (__mmask8)m, src); + } else + { _mm256_mask_storeu_epi64(mem, (__mmask8)m, src); + } } template XSIMD_INLINE void maskstore256(float* mem, __m256 src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm256_mask_store_ps(mem, (__mmask8)m, src); + } else + { _mm256_mask_storeu_ps(mem, (__mmask8)m, src); + } } template XSIMD_INLINE void maskstore256(double* mem, __m256d src, uint64_t m, Mode) noexcept { XSIMD_IF_CONSTEXPR(std::is_same::value) + { _mm256_mask_store_pd(mem, (__mmask8)m, src); + } else + { _mm256_mask_storeu_pd(mem, (__mmask8)m, src); + } } } diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index ff9dc381d..320e3b311 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -81,18 +81,17 @@ // unqualified dependent call, which clang only resolves through ordinary lookup // at the point of definition (ADL cannot reach xsimd::kernel from xsimd-namespace // arguments). The sub-arch overloads must therefore be declared beforehand. +// clang-format off #include "./xsimd_avx512vl_128.hpp" #include "./xsimd_avx512vl_256.hpp" +#include "./xsimd_avx512vl.hpp" +// clang-format on #endif #if XSIMD_WITH_AVX512F #include "./xsimd_avx512f.hpp" #endif -#if XSIMD_WITH_AVX512VL -#include "./xsimd_avx512vl.hpp" -#endif - #if XSIMD_WITH_AVX512DQ #include "./xsimd_avx512dq.hpp" #endif