diff --git a/.github/workflows/doxygen.yml b/.github/workflows/doxygen.yml index 00826b921..9d749cf36 100644 --- a/.github/workflows/doxygen.yml +++ b/.github/workflows/doxygen.yml @@ -9,6 +9,8 @@ jobs: steps: - uses: actions/checkout@v6 - name: Install dependencies - run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme + run: | + sudo apt-get update + sudo apt-get install -y doxygen python3-breathe python3-sphinx-rtd-theme - name: Render run: make -C docs diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index 73f9256fd..38b4ab390 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -12,7 +12,7 @@ Data Transfers From memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`load` | load values from memory (optionally masked) | +| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -32,7 +32,7 @@ From a scalar: To memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`store` | store values to memory (optionally masked) | +| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -84,3 +84,11 @@ The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd + +.. rubric:: Footnotes + +.. [#m] Masked ``load`` / ``store`` come in two flavours. The + :cpp:class:`batch_bool_constant` overload encodes the mask in the type and + is resolved at compile time. The runtime :cpp:class:`batch_bool` overload + accepts a mask computed at runtime. For performance reasons, prefer the + compile-time mask whenever possible. diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index bd2d14f93..a77b5aa1c 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -437,6 +437,19 @@ namespace xsimd return detail::load_masked_common(mem, mask, cvt, mode, detail::masked_memory_uses_fp_bitcast {}); } + template + XSIMD_INLINE batch + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated loads should override this. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer; + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mask.get(i) ? mem[i] : T(0); + return batch::load_aligned(buffer.data()); + } + template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment mode, requires_arch) noexcept @@ -444,6 +457,20 @@ namespace xsimd detail::store_masked_common(mem, src, mask, mode, detail::masked_memory_uses_fp_bitcast {}); } + template + XSIMD_INLINE void + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + // Scalar fallback: only active lanes are touched. Arches with + // hardware predicated stores should override this. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array src_buf; + src.store_aligned(src_buf.data()); + for (std::size_t i = 0; i < size; ++i) + if (mask.get(i)) + mem[i] = src_buf[i]; + } + template XSIMD_INLINE batch load_stream(T_in const* mem, convert cvt, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index a542d3f31..3191fc922 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -987,6 +987,21 @@ namespace xsimd } } + // Runtime-mask load (float/double). + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_ps(mem, _mm256_castps_si256(mask)); + } + + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask)); + } + // load_masked (single overload for float/double) template ::value>> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept @@ -1019,12 +1034,8 @@ namespace xsimd // store_masked namespace detail { - // True when batch_bool is the legacy VEX vector mask, i.e. it is stored - // in the same register as the data (__m256 / __m256d) rather than in an EVEX - // k-register (__mmask8) as on the avx512vl architectures. The _mm256_cast*_si256 - // path below is only well-formed for the vector-mask representation. This names - // no architecture — it tests the mask's representation, in the spirit of - // detail::masked_memory_uses_fp_bitcast. + // True when batch_bool shares the data register (__m256/__m256d) rather + // than an EVEX k-register; the _mm256_cast*_si256 path below needs the former. template using uses_vector_mask = std::is_same::register_type, typename batch::register_type>; @@ -1070,6 +1081,21 @@ namespace xsimd } } + // Runtime-mask store (float/double). + template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + detail::maskstore(mem, mask, src); + } + + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + detail::maskstore(mem, mask, src); + } + // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 5cb47f908..abac721b0 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -117,18 +117,38 @@ namespace xsimd } } - // load_masked - // AVX2 low-level helpers (operate on raw SIMD registers) + // load_masked / store_masked: AVX2 has _mm256_maskload/maskstore_epi{32,64}; + // 8/16-bit integers fall back to the common scalar path. namespace detail { - XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept + template + XSIMD_INLINE __m256i maskload(T const* mem, __m256i mask) noexcept { - return _mm256_maskload_epi32(mem, mask); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + static_assert(sizeof(int) == 4, "_mm256_maskload_epi32 requires a 4-byte int"); + return _mm256_maskload_epi32(reinterpret_cast(mem), mask); + } + else + { + static_assert(sizeof(long long) == 8, "_mm256_maskload_epi64 requires an 8-byte long long"); + return _mm256_maskload_epi64(reinterpret_cast(mem), mask); + } } - XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept + template + XSIMD_INLINE void maskstore(T* mem, __m256i mask, __m256i src) noexcept { - return _mm256_maskload_epi64(reinterpret_cast(mem), mask); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + static_assert(sizeof(int) == 4, "_mm256_maskstore_epi32 requires a 4-byte int"); + _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); + } + else + { + static_assert(sizeof(long long) == 8, "_mm256_maskstore_epi64 requires an 8-byte long long"); + _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); + } } XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept @@ -137,61 +157,22 @@ namespace xsimd } } - // single templated implementation for integer masked loads (32/64-bit) template - XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); - using int_t = std::conditional_t; - // Use the raw register-level maskload helpers for the remaining cases. - return detail::maskload(reinterpret_cast(mem), mask.as_batch()); + return detail::maskload(mem, mask.as_batch()); } - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); - } - - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); - return bitwise_cast(r); - } - - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); - } - - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); - return bitwise_cast(r); - } - - // store_masked - namespace detail - { - template - XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept - { - _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src); - } - - template - XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept - { - _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); - } + return detail::maskload(mem, __m256i(mask)); } template ::value && (sizeof(T) >= 4)>> + typename = std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; @@ -214,22 +195,15 @@ namespace xsimd } else { - detail::maskstore(mem, mask.as_batch(), src); + detail::maskstore(mem, mask.as_batch(), src); } } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - const auto s32 = bitwise_cast(src); - store_masked(reinterpret_cast(mem), s32, batch_bool_constant {}, Mode {}, avx2 {}); - } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - const auto s64 = bitwise_cast(src); - store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); + detail::maskstore(mem, __m256i(mask), __m256i(src)); } // load_stream diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index c0f119e4e..a55a1b729 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -89,52 +89,65 @@ namespace xsimd } } - // load_masked — native 128-bit integer masked loads. Tagged on avx2_128 - // because the vpmaskmov* intrinsics require AVX2; an AVX1-only build routes - // integer masked memory through the float path in xsimd_common_memory.hpp. - // Any arch with a native masked path provides its own exact-tag overload that - // out-ranks this one, so no cross-arch exclusion is needed here. - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // load_masked / store_masked: native 128-bit integer masked memory. + // Tagged on avx2_128 because vpmaskmov* needs AVX2; an AVX1-only build + // routes integer masked memory through the float path in + // xsimd_common_memory.hpp. 8/16-bit fall back to the common scalar path. + namespace detail { - return _mm_maskload_epi32(mem, mask.as_batch()); - } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return _mm_maskload_epi32(reinterpret_cast(mem), mask.as_batch()); - } - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return _mm_maskload_epi64(reinterpret_cast(mem), mask.as_batch()); - } - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - return _mm_maskload_epi64(reinterpret_cast(mem), mask.as_batch()); + template + XSIMD_INLINE __m128i maskload_avx2_128(T const* mem, __m128i mask) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_maskload_epi32(reinterpret_cast(mem), mask); + } + else + { + return _mm_maskload_epi64(reinterpret_cast(mem), mask); + } + } + + template + XSIMD_INLINE void maskstore_avx2_128(T* mem, __m128i mask, __m128i src) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm_maskstore_epi32(reinterpret_cast(mem), mask, src); + } + else + { + _mm_maskstore_epi64(reinterpret_cast(mem), mask, src); + } + } } - // store_masked — native 128-bit integer masked stores (see load note above). - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - return _mm_maskstore_epi32(mem, mask.as_batch(), src); + return detail::maskload_avx2_128(mem, mask.as_batch()); } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi32(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src)); } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64(reinterpret_cast(mem), mask.as_batch(), src); + return detail::maskload_avx2_128(mem, __m128i(mask)); } - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - return _mm_maskstore_epi64(reinterpret_cast(mem), mask.as_batch(), src); + detail::maskstore_avx2_128(mem, __m128i(mask), __m128i(src)); } // gather diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 57894a831..f6a78366c 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -378,6 +378,53 @@ namespace xsimd } } + // load_masked / store_masked: native vmovdqu8 / vmovdqu16 predication for + // 8/16-bit, replacing the common scalar fallback. No aligned masked 8/16 + // intrinsic exists and masked moves never fault, so loadu fits both modes. + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_maskz_loadu_epi8((__mmask64)mask.mask(), mem); + } + else + { + return _mm512_maskz_loadu_epi16((__mmask32)mask.mask(), mem); + } + } + + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + _mm512_mask_storeu_epi8((void*)mem, (__mmask64)mask.mask(), src); + } + else + { + _mm512_mask_storeu_epi16((void*)mem, (__mmask32)mask.mask(), src); + } + } + + // Constant masks reuse the runtime overloads; as_batch_bool() also avoids + // batch_bool_constant::mask() truncating a 64-lane int8 mask to int. + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return load_masked(mem, mask.as_batch_bool(), convert {}, Mode {}, avx512bw {}); + } + + template ::value && (sizeof(T) == 1 || sizeof(T) == 2)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), Mode {}, avx512bw {}); + } + // max template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index cc057eacf..ca1b38079 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -354,6 +354,24 @@ namespace xsimd } } + // Runtime-mask load/store: same native k-register path as the constant + // overloads above, minus the compile-time half-forwarding. 8/16-bit + // elements are handled natively by avx512bw (vmovdqu8 / vmovdqu16); + // without AVX512BW they fall back to the common scalar path. + template = 4)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return detail::load_masked(mem, mask.mask(), Mode {}); + } + + template = 4)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + detail::store_masked(mem, src, mask.mask(), Mode {}); + } + // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp index 855870af3..26c49b562 100644 --- a/include/xsimd/arch/xsimd_avx512vl_128.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -26,6 +26,14 @@ namespace xsimd namespace detail { + // Defined in xsimd_avx512f.hpp. This header is included before it so + // that avx512f.hpp's masked load/store forwarder can resolve the + // avx512vl_128 overloads below by ordinary lookup; forward-declare + // the two helpers it borrows from there. + XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept; + template + XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N]); + template XSIMD_INLINE batch_bool compare_int_avx512vl_128(batch const& self, batch const& other) noexcept { @@ -188,125 +196,143 @@ namespace xsimd return _mm_abs_epi64(self); } - // Per-type masked load/store — partial ordering picks these over the - // avx2 bridges this arch inherits. Unsigned overloads reinterpret to - // the signed EVEX intrinsic. - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Masked load/store: native 128-bit EVEX predication shared by the + // constant (batch_bool_constant) and runtime (batch_bool) overloads. + // Partial ordering picks the avx512vl_128 tag over the avx2_128 bridges + // this arch inherits — crucial because the k-register mask cannot feed + // the avx2 vpmaskmov path. 8/16-bit elements fall back to the common + // scalar path. Unsigned element types reinterpret to the signed EVEX + // intrinsic. + namespace detail { - XSIMD_IF_CONSTEXPR(std::is_same::value) + // One core per native register type; signed and unsigned integrals + // share an overload (the EVEX intrinsic is sign-agnostic). Mode + // selects aligned vs unaligned. + template = 0> + XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_load_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi32((__mmask8)m, mem); + } + else + { + return _mm_maskz_loadu_epi32((__mmask8)m, mem); + } } - else + template = 0> + XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi64((__mmask8)m, mem); + } + else + { + return _mm_maskz_loadu_epi64((__mmask8)m, mem); + } } - } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); - } - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE __m128 maskload128(float const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_load_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_ps((__mmask8)m, mem); + } + else + { + return _mm_maskz_loadu_ps((__mmask8)m, mem); + } } - else + template + XSIMD_INLINE __m128d maskload128(double const* mem, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_pd((__mmask8)m, mem); + } + else + { + return _mm_maskz_loadu_pd((__mmask8)m, mem); + } } - } - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); - } - template - XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + + template = 0> + XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept { - return _mm_maskz_load_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi32(mem, (__mmask8)m, src); + } + else + { + _mm_mask_storeu_epi32(mem, (__mmask8)m, src); + } } - else + template = 0> + XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi64(mem, (__mmask8)m, src); + } + else + { + _mm_mask_storeu_epi64(mem, (__mmask8)m, src); + } } - } - template - XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE void maskstore128(float* mem, __m128 src, uint64_t m, Mode) noexcept { - return _mm_maskz_load_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_ps(mem, (__mmask8)m, src); + } + else + { + _mm_mask_storeu_ps(mem, (__mmask8)m, src); + } } - else + template + XSIMD_INLINE void maskstore128(double* mem, __m128d src, uint64_t m, Mode) noexcept { - return _mm_maskz_loadu_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_pd(mem, (__mmask8)m, src); + } + else + { + _mm_mask_storeu_pd(mem, (__mmask8)m, src); + } } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_epi32(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_epi32(mem, mask.mask(), src); - } - } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); + return detail::maskload128(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_epi64(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_epi64(mem, mask.mask(), src); - } + return detail::maskload128(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); + detail::maskstore128(mem, src, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_ps(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_ps(mem, mask.mask(), src); - } - } - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm_mask_store_pd(mem, mask.mask(), src); - } - else - { - _mm_mask_storeu_pd(mem, mask.mask(), src); - } + detail::maskstore128(mem, src, mask.mask(), Mode {}); } // max diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index c0b4a568e..9731574c8 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -26,6 +26,14 @@ namespace xsimd namespace detail { + // Defined in xsimd_avx512f.hpp. This header is included before it so + // that avx512f.hpp's masked load/store forwarder can resolve the + // avx512vl_256 overloads below by ordinary lookup; forward-declare + // the two helpers it borrows from there. + XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept; + template + XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N]); + template XSIMD_INLINE batch_bool compare_int_avx512vl_256(batch const& self, batch const& other) noexcept { @@ -188,125 +196,142 @@ namespace xsimd return _mm256_abs_epi64(self); } - // Per-type masked load/store — partial ordering picks these over the - // avx2 bridges this arch inherits. Unsigned overloads reinterpret to - // the signed EVEX intrinsic. - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Masked load/store: native 256-bit EVEX predication shared by the + // constant (batch_bool_constant) and runtime (batch_bool) overloads. + // Partial ordering picks the avx512vl_256 tag over the avx2 bridges this + // arch inherits — crucial because the k-register mask cannot feed the + // avx2 vpmaskmov path. 8/16-bit elements fall back to the common scalar + // path. Unsigned element types reinterpret to the signed EVEX intrinsic. + namespace detail { - XSIMD_IF_CONSTEXPR(std::is_same::value) + // One core per native register type; signed and unsigned integrals + // share an overload (the EVEX intrinsic is sign-agnostic). Mode + // selects aligned vs unaligned. + template = 0> + XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi32((__mmask8)m, mem); + } + else + { + return _mm256_maskz_loadu_epi32((__mmask8)m, mem); + } } - else + template = 0> + XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_epi32(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi64((__mmask8)m, mem); + } + else + { + return _mm256_maskz_loadu_epi64((__mmask8)m, mem); + } } - } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); - } - template - XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE __m256 maskload256(float const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_ps((__mmask8)m, mem); + } + else + { + return _mm256_maskz_loadu_ps((__mmask8)m, mem); + } } - else + template + XSIMD_INLINE __m256d maskload256(double const* mem, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_epi64(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_pd((__mmask8)m, mem); + } + else + { + return _mm256_maskz_loadu_pd((__mmask8)m, mem); + } } - } - template - XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); - } - template - XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + + template = 0> + XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi32(mem, (__mmask8)m, src); + } + else + { + _mm256_mask_storeu_epi32(mem, (__mmask8)m, src); + } } - else + template = 0> + XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_ps(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi64(mem, (__mmask8)m, src); + } + else + { + _mm256_mask_storeu_epi64(mem, (__mmask8)m, src); + } } - } - template - XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) + template + XSIMD_INLINE void maskstore256(float* mem, __m256 src, uint64_t m, Mode) noexcept { - return _mm256_maskz_load_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_ps(mem, (__mmask8)m, src); + } + else + { + _mm256_mask_storeu_ps(mem, (__mmask8)m, src); + } } - else + template + XSIMD_INLINE void maskstore256(double* mem, __m256d src, uint64_t m, Mode) noexcept { - return _mm256_maskz_loadu_pd(mask.mask(), mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_pd(mem, (__mmask8)m, src); + } + else + { + _mm256_mask_storeu_pd(mem, (__mmask8)m, src); + } } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_epi32(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); - } - } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); + return detail::maskload256(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_epi64(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); - } + return detail::maskload256(mem, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); + detail::maskstore256(mem, src, mask.mask(), Mode {}); } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_ps(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_ps(mem, mask.mask(), src); - } - } - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + + template ::value && (sizeof(T) == 4 || sizeof(T) == 8)>> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - _mm256_mask_store_pd(mem, mask.mask(), src); - } - else - { - _mm256_mask_storeu_pd(mem, mask.mask(), src); - } + detail::maskstore256(mem, src, mask.mask(), Mode {}); } // max diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 07dafd78b..e534ceba0 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,6 +115,20 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } + // Runtime-mask load (float/double). + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_ps(mem, _mm_castps_si128(mask)); + } + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_pd(mem, _mm_castpd_si128(mask)); + } + // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept @@ -128,6 +142,20 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } + // Runtime-mask store (float/double). + template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_ps(mem, _mm_castps_si128(mask), src); + } + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_pd(mem, _mm_castpd_si128(mask), src); + } + // swizzle (dynamic mask) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index 8c4818176..ab37e7912 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -88,8 +88,12 @@ namespace xsimd XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; + template + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; + template + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 06edfa98f..320e3b311 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -74,14 +74,22 @@ #include "./xsimd_fma3_avx2.hpp" #endif -#if XSIMD_WITH_AVX512F -#include "./xsimd_avx512f.hpp" -#endif - #if XSIMD_WITH_AVX512VL -#include "./xsimd_avx512vl.hpp" +// The 128/256-bit AVX512VL sub-arches derive from the AVX2 lineage (not AVX512F) +// and carry the k-register masked load/store overloads. avx512f.hpp's masked +// load/store forwards to the 256-bit sized-batch arch (avx512vl_256) via an +// unqualified dependent call, which clang only resolves through ordinary lookup +// at the point of definition (ADL cannot reach xsimd::kernel from xsimd-namespace +// arguments). The sub-arch overloads must therefore be declared beforehand. +// clang-format off #include "./xsimd_avx512vl_128.hpp" #include "./xsimd_avx512vl_256.hpp" +#include "./xsimd_avx512vl.hpp" +// clang-format on +#endif + +#if XSIMD_WITH_AVX512F +#include "./xsimd_avx512f.hpp" #endif #if XSIMD_WITH_AVX512DQ diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 483f5e28f..5983f525d 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -409,6 +409,11 @@ namespace xsimd { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + // Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0, + // no memory access is performed for inactive lanes (page-fault safe). + XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*)) + // Masked store: inactive lanes are not written. + XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec)) } template = 0> @@ -423,6 +428,16 @@ namespace xsimd return load_aligned(src, convert(), rvv {}); } + // load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init + // passthrough so inactive lanes read as 0, matching xsimd's contract. + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + const auto zero = detail_rvv::rvvmv_splat(proj_t {}); + return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast(mem)); + } + // load_complex namespace detail_rvv { @@ -500,6 +515,15 @@ namespace xsimd store_aligned(dst, src, rvv {}); } + // store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes + // are not written (page-fault safe). + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + detail_rvv::rvvse_m(mask, reinterpret_cast(mem), src); + } + /****************** * scatter/gather * ******************/ diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 2a46947bf..871f3b7da 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -101,11 +101,28 @@ namespace xsimd return load_aligned(src, convert(), sve {}); } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept + // load_masked (compile-time mask): build a runtime predicate from + // the constant mask and reuse the runtime-mask path. ``pmask`` only + // constructs a 128-bit chunk predicate (svdupq_b{8,16,32,64}), which + // is replication-based and does not correctly express a per-lane + // mask on SVE wider than 128 bits — going through ``as_batch_bool`` + // gives the right predicate for every vector width. ``int32``/ + // ``int64``/``uint32``/``uint64`` are excluded so the common-arch + // dispatchers that reinterpret to ``float``/``double`` win partial + // ordering (otherwise we'd be ambiguous with ``requires_arch``). + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode m, requires_arch) noexcept { - return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); + return load_masked(mem, mask.as_batch_bool(), convert {}, m, sve {}); + } + + // load_masked (runtime mask) + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return svld1(mask, reinterpret_cast const*>(mem)); } // load_complex @@ -141,6 +158,24 @@ namespace xsimd store_aligned(dst, src, sve {}); } + // store_masked (compile-time mask): forward to the runtime-mask + // path for the same reason as load_masked above; same exclusion of + // 32/64-bit integers to defer to the common dispatchers. + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode m, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), m, sve {}); + } + + // store_masked (runtime mask) + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + svst1(mask, reinterpret_cast*>(mem), src); + } + // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 043f7e38a..b1ec3e58f 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1551,6 +1551,27 @@ namespace xsimd return batch::load(ptr, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Prefer the \c batch_bool_constant overload + * whenever the mask is known at compile time. + * @param ptr the memory buffer to read + * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -1571,6 +1592,16 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2712,6 +2743,28 @@ namespace xsimd val.store(mem, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion + * is performed: \c mem must point to \c T. Prefer the \c + * batch_bool_constant overload whenever the mask is known at compile time. + * @param mem the memory buffer to write to + * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2732,6 +2785,17 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /// \overload + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 59f87d2bb..1b97ea105 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -168,9 +168,12 @@ namespace xsimd template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode) const noexcept; + /** \brief Runtime-mask store; see xsimd::store(T*, batch const&, batch_bool, Mode). */ + template + XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; @@ -180,9 +183,12 @@ namespace xsimd XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; + /** \brief Runtime-mask load; see xsimd::load(T const*, batch_bool, Mode). */ + template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(T const* mem, batch_bool mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; @@ -756,6 +762,14 @@ namespace xsimd } } + template + template + XSIMD_INLINE batch batch::load(T const* mem, batch_bool mask, Mode mode) noexcept + { + detail::static_check_supported_config(); + return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); + } + template template XSIMD_INLINE void batch::store(U* mem, @@ -779,6 +793,16 @@ namespace xsimd } } + template + template + XSIMD_INLINE void batch::store(T* mem, + batch_bool mask, + Mode mode) const noexcept + { + detail::static_check_supported_config(); + kernel::store_masked(mem, *this, mask, mode, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index c353976e5..794458154 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -23,6 +23,7 @@ struct load_store_test { using batch_type = B; using value_type = typename B::value_type; + using batch_bool_type = typename B::batch_bool_type; using index_type = typename xsimd::as_integer_t; template using allocator = xsimd::default_allocator; @@ -65,22 +66,6 @@ struct load_store_test static constexpr bool get(std::size_t index, std::size_t size) noexcept { return index >= (size / 2); } }; - struct mask_first_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index < (size > 2 ? size / 3 : std::size_t(1)); - } - }; - - struct mask_last_n - { - static constexpr bool get(std::size_t index, std::size_t size) noexcept - { - return index >= size - (size > 2 ? size / 3 : std::size_t(1)); - } - }; - struct mask_even { static constexpr bool get(std::size_t index, std::size_t) noexcept { return (index % 2) == 0; } @@ -104,6 +89,41 @@ struct load_store_test static constexpr bool get(std::size_t, std::size_t) noexcept { return true; } }; + template + static batch_bool_type make_runtime_mask() noexcept + { + uint64_t bits = 0; + for (std::size_t i = 0; i < size; ++i) + { + if (Generator::get(i, size)) + { + bits |= uint64_t(1) << i; + } + } + return batch_bool_type::from_mask(bits); + } + + struct compile_time_mask + { + static constexpr const char* tag = ""; + template + static auto make() noexcept + -> decltype(xsimd::make_batch_bool_constant()) + { + return xsimd::make_batch_bool_constant(); + } + }; + + struct runtime_mask + { + static constexpr const char* tag = " runtime"; + template + static batch_bool_type make() noexcept + { + return make_runtime_mask(); + } + }; + int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; @@ -366,16 +386,22 @@ struct load_store_test template void run_mask_tests(const V& v, const std::string& name, batch_type& b, const array_type& expected, std::true_type) { - run_load_mask_pattern(v, name, b, expected, " masked none"); - run_load_mask_pattern(v, name, b, expected, " masked first element"); - run_load_mask_pattern(v, name, b, expected, " masked first half"); - run_load_mask_pattern(v, name, b, expected, " masked last half"); - run_load_mask_pattern(v, name, b, expected, " masked first N"); - run_load_mask_pattern(v, name, b, expected, " masked last N"); - run_load_mask_pattern(v, name, b, expected, " masked even elements"); - run_load_mask_pattern(v, name, b, expected, " masked odd elements"); - run_load_mask_pattern(v, name, b, expected, " masked pseudo random"); - run_load_mask_pattern(v, name, b, expected, " masked all elements"); + run_load_mask_patterns(v, name, b, expected); + run_load_mask_patterns(v, name, b, expected); + } + + template + void run_load_mask_patterns(const V& v, const std::string& name, batch_type& b, const array_type& expected) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_load_mask_pattern(v, name, b, expected, p + " none"); + run_load_mask_pattern(v, name, b, expected, p + " first element"); + run_load_mask_pattern(v, name, b, expected, p + " first half"); + run_load_mask_pattern(v, name, b, expected, p + " last half"); + run_load_mask_pattern(v, name, b, expected, p + " even elements"); + run_load_mask_pattern(v, name, b, expected, p + " odd elements"); + run_load_mask_pattern(v, name, b, expected, p + " pseudo random"); + run_load_mask_pattern(v, name, b, expected, p + " all elements"); } template @@ -383,10 +409,10 @@ struct load_store_test { } - template + template void run_load_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label) { - constexpr auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); array_type expected_masked { 0 }; for (std::size_t i = 0; i < size; ++i) @@ -403,10 +429,10 @@ struct load_store_test CHECK_BATCH_EQ(b, expected_masked); } - template + template void run_store_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) { - auto mask = xsimd::make_batch_bool_constant(); + const auto mask = MaskKind::template make(); for (std::size_t i = 0; i < size; ++i) { expected_masked[i] = Generator::get(i, size) ? v[i] : value_type(); @@ -424,15 +450,21 @@ struct load_store_test template void run_store_mask_tests(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, std::true_type) { - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first element"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last half"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked first N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked last N"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked even elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked odd elements"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked pseudo random"); - run_store_mask_pattern(v, name, b, res, expected_masked, " masked all elements"); + run_store_mask_patterns(v, name, b, res, expected_masked); + run_store_mask_patterns(v, name, b, res, expected_masked); + } + + template + void run_store_mask_patterns(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked) + { + const std::string p = std::string(MaskKind::tag) + " masked"; + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first element"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " first half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " last half"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " even elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " odd elements"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " pseudo random"); + run_store_mask_pattern(v, name, b, res, expected_masked, p + " all elements"); } template @@ -452,6 +484,7 @@ struct load_store_test V sentinel_expected(size, sentinel); auto zero_mask = xsimd::make_batch_bool_constant(); + auto runtime_zero_mask = make_runtime_mask(); std::fill(res.begin(), res.end(), sentinel); b.store(res.data(), zero_mask, xsimd::aligned_mode()); INFO(name, " masked none aligned store"); @@ -469,6 +502,19 @@ struct load_store_test CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) { return v == sentinel; })); + std::fill(res.begin(), res.end(), sentinel); + xsimd::store(res.data(), b, runtime_zero_mask, xsimd::aligned_mode()); + INFO(name, " runtime masked none aligned store"); + CHECK_VECTOR_EQ(res, sentinel_expected); + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store(scratch_ptr, b, runtime_zero_mask, xsimd::unaligned_mode()); + INFO(name, " runtime masked none unaligned store"); + std::copy(scratch_ptr, scratch_ptr + scratch_slice.size(), scratch_slice.begin()); + CHECK_VECTOR_EQ(scratch_slice, sentinel_expected); + CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) + { return v == sentinel; })); + run_store_mask_tests(v, name, b, res, expected_masked, std::true_type {}); }