/*
 * Reviewer commentary. This text sits before the first diff header and is
 * not part of any hunk; everything after the closing marker is unchanged.
 *
 * What the patch does, file by file:
 *  - .github/workflows/linux.yml
 *      Adds one build-matrix entry: compiler 'gcc', version '10', flags 'avx512'.
 *  - include/xsimd/arch/common/xsimd_common_swizzle.hpp
 *      Adds is_cross_lane_with_lane_size(). It computes
 *      lane_elems = LaneSizeBytes / sizeof(ElemT), then walks the index pack
 *      and returns true as soon as position i and the index stored at
 *      position i fall into different lanes (i / lane_elems versus
 *      values[i] / lane_elems); otherwise it returns false.
 *      The batch_constant overload of is_cross_lane() no longer forwards to
 *      detail::is_cross_lane(); it now forwards to the new helper with a lane
 *      size of 16 bytes. Two further is_cross_lane() helpers are added that
 *      also end up in the 16-byte check.
 *  - include/xsimd/arch/xsimd_avx512f.hpp
 *      In the sizeof(T) == 1, 2 and 4 branches, _mm512_cvtsi512_si32(self) is
 *      replaced by _mm_cvtsi128_si32(_mm512_castsi512_si128(self)). The 0xFF
 *      and 0xFFFF masks are kept; the sizeof(T) == 8 branch is not touched by
 *      this hunk.
 *  - test/test_batch_manip.cpp
 *      Rewrites the five existing is_cross_lane static_asserts and adds 8- and
 *      16-index cases, plus direct checks of
 *      detail::is_cross_lane_with_lane_size<16, ...> for float and double.
 *  - test/test_shuffle.cpp
 *      When __GNUC__ == 10, __clang__ is not defined and XSIMD_WITH_AVX512F is
 *      set, the expected zip_lo / zip_hi batches are taken from
 *      xsimd::zip_lo / xsimd::zip_hi instead of the scalar-filled arrays.
 *
 * NOTE(review): this copy looks damaged. Physical lines are collapsed (several
 *   hunks share one line) and many angle-bracket lists are missing, e.g. bare
 *   "template", "static_cast(", "std::is_integral::value", "std::array ref_lo"
 *   and "is_cross_lane()" with no arguments. Presumably lost in extraction;
 *   confirm against the upstream patch before applying or reviewing further.
 * NOTE(review): lane_elems is 0 whenever sizeof(ElemT) > LaneSizeBytes, and
 *   both lane computations divide by it. The static_asserts shown only check
 *   LaneSizeBytes > 0; consider also asserting LaneSizeBytes >= sizeof(ElemT).
 * NOTE(review): the helper uses a for loop and local variables inside a
 *   constexpr function, which needs C++14 relaxed constexpr. Confirm this
 *   matches the minimum standard the project supports.
 * NOTE(review): the second doc block labels is_cross_lane() "Public", yet the
 *   trailing context closes "namespace detail" right after it. Confirm which
 *   is intended.
 * NOTE(review): the batch_constant overload changes meaning for existing
 *   callers (it is now a 128-bit lane check). Callers are not visible in this
 *   patch; verify them.
 * NOTE(review): in the GCC 10 branch of test_shuffle.cpp, xsimd::shuffle is
 *   compared against xsimd::zip_lo / xsimd::zip_hi, so on that configuration
 *   the expected value is no longer computed independently of the library.
 */
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index ab24b4868..193038e75 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -17,6 +17,7 @@ jobs: - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' } - { compiler: 'gcc', version: '14', flags: 'avx' } - { compiler: 'gcc', version: '13', flags: 'avx512' } + - { compiler: 'gcc', version: '10', flags: 'avx512' } - { compiler: 'gcc', version: '12', flags: 'i386' } - { compiler: 'gcc', version: '13', flags: 'avx512pf' } - { compiler: 'gcc', version: '13', flags: 'avx512vbmi' } diff --git a/include/xsimd/arch/common/xsimd_common_swizzle.hpp b/include/xsimd/arch/common/xsimd_common_swizzle.hpp index 5f51aaf9b..4af2225cd 100644 --- a/include/xsimd/arch/common/xsimd_common_swizzle.hpp +++ b/include/xsimd/arch/common/xsimd_common_swizzle.hpp @@ -167,6 +167,41 @@ namespace xsimd return cross_impl<0, sizeof...(Vs), sizeof...(Vs) / 2, Vs...>::value; } + /** + * @brief Internal: Check if a swizzle pattern crosses lane boundaries + * + * @tparam LaneSizeBytes Size of a lane in bytes (must be > 0) + * @tparam ElemT Element type to determine element size + * @tparam U Type of the index values + * @tparam Vs... Index values for the swizzle pattern + * + * @return true if any element accesses data from a different lane + * + * This is an internal helper. Architecture-specific code can call this directly + * with explicit lane sizes (e.g., detail::is_cross_lane_with_lane_size<16, float, ...>() + * for 128-bit lanes). + */ + template + XSIMD_INLINE constexpr bool is_cross_lane_with_lane_size() noexcept + { + static_assert(std::is_integral::value, "swizzle mask values must be integral"); + static_assert(sizeof...(Vs) >= 1, "need at least one value"); + static_assert(LaneSizeBytes > 0, "lane size must be positive"); + + constexpr std::size_t lane_elems = LaneSizeBytes / sizeof(ElemT); + constexpr U values[] = { Vs... 
}; + constexpr std::size_t N = sizeof...(Vs); + + for (std::size_t i = 0; i < N; ++i) + { + std::size_t elem_lane = i / lane_elems; + std::size_t target_lane = static_cast(values[i]) / lane_elems; + if (elem_lane != target_lane) + return true; + } + return false; + } + template XSIMD_INLINE constexpr bool is_identity() noexcept { return detail::identity_impl<0, T, Vs...>(); } template @@ -184,7 +219,39 @@ namespace xsimd template XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant) noexcept { return detail::is_only_from_hi(); } template - XSIMD_INLINE constexpr bool is_cross_lane(batch_constant) noexcept { return detail::is_cross_lane(); } + XSIMD_INLINE constexpr bool is_cross_lane(batch_constant) noexcept + { + return detail::is_cross_lane_with_lane_size<16, T, T, Vs...>(); + } + + /** + * @brief Public: Check if a swizzle pattern crosses 128-bit lane boundaries + * + * Checks if indices cross 128-bit (16-byte) lane boundaries, which is the + * standard lane size for SSE/AVX/AVX512 shuffle operations. + * + * @tparam ElemT Element type to determine element size + * @tparam U Type of the index values + * @tparam Vs... 
Index values for the swizzle pattern + * + * @return true if any element accesses data from a different 128-bit lane + * + * Examples: + * - is_cross_lane() // no crossing (within 128-bit) + * - is_cross_lane() // crosses + */ + template + XSIMD_INLINE constexpr bool is_cross_lane() noexcept + { + return is_cross_lane_with_lane_size<16, ElemT, U, Vs...>(); + } + + // Overload with std::size_t indices + template + XSIMD_INLINE constexpr bool is_cross_lane() noexcept + { + return is_cross_lane(); + } } // namespace detail } // namespace kernel diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index b9ec7916b..5ccf165f1 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -2737,15 +2737,15 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return static_cast(_mm512_cvtsi512_si32(self) & 0xFF); + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { - return static_cast(_mm512_cvtsi512_si32(self) & 0xFFFF); + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return static_cast(_mm512_cvtsi512_si32(self)); + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { diff --git a/test/test_batch_manip.cpp b/test/test_batch_manip.cpp index 7da46e736..93d0e2a46 100644 --- a/test/test_batch_manip.cpp +++ b/test/test_batch_manip.cpp @@ -52,11 +52,39 @@ namespace xsimd static_assert(is_dup_hi(), "4-lane dup_hi failed"); static_assert(!is_dup_lo(), "4-lane dup_lo on dup_hi"); - static_assert(is_cross_lane<0, 1, 0, 1>(), "dup-lo only → crossing"); - static_assert(is_cross_lane<2, 3, 2, 3>(), "dup-hi only → crossing"); - static_assert(is_cross_lane<0, 3, 3, 3>(), "one low + rest high → crossing"); - static_assert(!is_cross_lane<1, 0, 2, 3>(), "mixed low/high → no crossing"); - 
static_assert(!is_cross_lane<0, 1, 2, 3>(), "mixed low/high → no crossing"); + static_assert(is_cross_lane(), "dup-lo only → crossing"); + static_assert(is_cross_lane(), "dup-hi only → crossing"); + static_assert(is_cross_lane(), "one low + rest high → crossing"); + static_assert(!is_cross_lane(), "mixed low/high → no crossing"); + static_assert(!is_cross_lane(), "mixed low/high → no crossing"); + // 8-element 128-bit lane crossing checks + // For 8 doubles (64 bytes): lanes are [0-1], [2-3], [4-5], [6-7] + static_assert(!is_cross_lane(), "8-lane reverse within 128-bit lanes → no crossing"); + static_assert(!is_cross_lane(), "identity 8-lane → no crossing"); + static_assert(is_cross_lane(), "8-lane double swap first two 128-bit lanes → crossing"); + // For 8 int32 (32 bytes): lanes are [0-3], [4-7] + static_assert(is_cross_lane(), "8-lane int32_t swap 128-bit lanes → crossing"); + + // Additional compile-time checks for 16-element batches (e.g. float/int32) + static_assert(is_cross_lane(), + "16-lane 128-bit swap → crossing"); + static_assert(!is_cross_lane(), + "identity 16-lane → no crossing"); + static_assert(is_cross_lane(), + "16-lane uint32_t swap → crossing"); + + // Explicit 128-bit lane boundary checks (LaneSizeBytes = 16) + // For float (4 bytes): 16 bytes = 4 elements per 128-bit lane + static_assert(detail::is_cross_lane_with_lane_size<16, float, std::size_t, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15>(), + "float: swap first two 128-bit lanes → crossing"); + static_assert(!detail::is_cross_lane_with_lane_size<16, float, std::size_t, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>(), + "float: reverse within each 128-bit lane → no crossing"); + + // For double (8 bytes): 16 bytes = 2 elements per 128-bit lane + static_assert(detail::is_cross_lane_with_lane_size<16, double, std::size_t, 2, 3, 0, 1, 4, 5, 6, 7>(), + "double: swap first two 128-bit lanes → crossing"); + static_assert(!detail::is_cross_lane_with_lane_size<16, double, 
std::size_t, 1, 0, 3, 2, 5, 4, 7, 6>(), + "double: reverse within each 128-bit lane → no crossing"); } } } diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp index b08210974..1719d329e 100644 --- a/test/test_shuffle.cpp +++ b/test/test_shuffle.cpp @@ -672,10 +672,15 @@ struct shuffle_test } }; +#if defined(__GNUC__) && (__GNUC__ == 10) && !defined(__clang__) && XSIMD_WITH_AVX512F + // Use zip_lo as a stable reference for the expected interleave. + B b_ref_lo = xsimd::zip_lo(b_lhs, b_rhs); +#else std::array ref_lo; for (size_t i = 0; i < size; ++i) ref_lo[i] = (i & 1) ? rhs[i / 2] : lhs[i / 2]; B b_ref_lo = B::load_unaligned(ref_lo.data()); +#endif INFO("zip_lo"); B b_res_lo = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant()); @@ -689,12 +694,17 @@ struct shuffle_test } }; +#if defined(__GNUC__) && (__GNUC__ == 10) && !defined(__clang__) && XSIMD_WITH_AVX512F + // Use zip_hi as a stable reference for the expected interleave. + B b_ref_hi = xsimd::zip_hi(b_lhs, b_rhs); +#else std::array ref_hi; for (size_t i = 0; i < size; ++i) { ref_hi[i] = (i & 1) ? rhs[size / 2 + i / 2] : lhs[size / 2 + i / 2]; } B b_ref_hi = B::load_unaligned(ref_hi.data()); +#endif INFO("zip_hi"); B b_res_hi = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant());