Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ jobs:
- { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' }
- { compiler: 'gcc', version: '14', flags: 'avx' }
- { compiler: 'gcc', version: '13', flags: 'avx512' }
- { compiler: 'gcc', version: '10', flags: 'avx512' }
- { compiler: 'gcc', version: '12', flags: 'i386' }
- { compiler: 'gcc', version: '13', flags: 'avx512pf' }
- { compiler: 'gcc', version: '13', flags: 'avx512vbmi' }
Expand Down
69 changes: 68 additions & 1 deletion include/xsimd/arch/common/xsimd_common_swizzle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,41 @@ namespace xsimd
return cross_impl<0, sizeof...(Vs), sizeof...(Vs) / 2, Vs...>::value;
}

/**
 * @brief Internal: Check if a swizzle pattern crosses lane boundaries
 *
 * Output slot @c i belongs to lane <tt>i / lane_elems</tt>, where
 * <tt>lane_elems = LaneSizeBytes / sizeof(ElemT)</tt>. The pattern is reported
 * as lane-crossing as soon as one slot's source index <tt>Vs[i]</tt> falls in a
 * different lane than the slot itself.
 *
 * @tparam LaneSizeBytes Size of a lane in bytes (must be > 0 and large enough
 *                       to hold at least one ElemT)
 * @tparam ElemT Element type; only its size is used
 * @tparam U Integral type of the index values
 * @tparam Vs... Index values for the swizzle pattern (at least one)
 *
 * @return true if any element accesses data from a different lane
 *
 * @note Indices are converted to std::size_t before the lane is computed, so
 *       a negative index of a signed U wraps to a very large value rather than
 *       being rejected.
 *
 * This is an internal helper. Architecture-specific code can call this directly
 * with explicit lane sizes (e.g., detail::is_cross_lane_with_lane_size<16, float, ...>()
 * for 128-bit lanes).
 */
template <std::size_t LaneSizeBytes, typename ElemT, typename U, U... Vs>
XSIMD_INLINE constexpr bool is_cross_lane_with_lane_size() noexcept
{
    static_assert(std::is_integral<U>::value, "swizzle mask values must be integral");
    static_assert(sizeof...(Vs) >= 1, "need at least one value");
    static_assert(LaneSizeBytes > 0, "lane size must be positive");
    // Without this guard lane_elems would be 0 and the divisions below would
    // divide by zero.
    static_assert(sizeof(ElemT) <= LaneSizeBytes, "lane size must hold at least one element");

    constexpr std::size_t lane_elems = LaneSizeBytes / sizeof(ElemT);
    constexpr U values[] = { Vs... };
    constexpr std::size_t N = sizeof...(Vs);

    for (std::size_t i = 0; i < N; ++i)
    {
        // Lane of the destination slot vs. lane of the element it reads.
        const std::size_t elem_lane = i / lane_elems;
        const std::size_t target_lane = static_cast<std::size_t>(values[i]) / lane_elems;
        if (elem_lane != target_lane)
        {
            return true;
        }
    }
    return false;
}

/**
 * @brief Pack-based entry point that forwards to detail::identity_impl.
 *
 * The index pack is handed over unchanged, with 0 as the leading template
 * argument (presumably the starting position — confirm against
 * identity_impl).
 *
 * @tparam T Type of the index values
 * @tparam Vs... Index values of the pattern
 *
 * @return whatever detail::identity_impl<0, T, Vs...>() yields
 */
template <typename T, T... Vs>
XSIMD_INLINE constexpr bool is_identity() noexcept
{
    return detail::identity_impl<0, T, Vs...>();
}
template <typename T, T... Vs>
Expand All @@ -184,7 +219,39 @@ namespace xsimd
/**
 * @brief batch_constant overload of is_only_from_hi.
 *
 * The argument is used only for template deduction: its value pack is
 * unpacked and forwarded to the pack-based detail::is_only_from_hi<T, Vs...>().
 *
 * @tparam T Type of the index values
 * @tparam A Architecture tag carried by the batch_constant (not used here)
 * @tparam Vs... Index values held by the batch_constant
 *
 * @return the result of detail::is_only_from_hi<T, Vs...>()
 */
template <typename T, class A, T... Vs>
XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept
{
    return detail::is_only_from_hi<T, Vs...>();
}
template <typename T, class A, T... Vs>
// Earlier one-line form of this overload (shown alongside its replacement in
// this diff view): forwarded only the index pack to detail::is_cross_lane.
XSIMD_INLINE constexpr bool is_cross_lane(batch_constant<T, A, Vs...>) noexcept { return detail::is_cross_lane<Vs...>(); }
// Replacement form: delegates to the lane-size-aware helper with a fixed
// 16-byte lane. T is passed twice, once as the element type (only its size is
// used) and once as the index type.
XSIMD_INLINE constexpr bool is_cross_lane(batch_constant<T, A, Vs...>) noexcept
{
return detail::is_cross_lane_with_lane_size<16, T, T, Vs...>();
}

/**
 * @brief Check if a swizzle pattern crosses 128-bit lane boundaries
 *
 * Convenience wrapper around is_cross_lane_with_lane_size with the lane size
 * fixed to 16 bytes (128 bits), the lane width used by SSE/AVX/AVX512 shuffle
 * operations.
 *
 * @tparam ElemT Element type; determines how many elements fit in one lane
 * @tparam U Type of the index values
 * @tparam Vs... Index values for the swizzle pattern
 *
 * @return true if any element accesses data from a different 128-bit lane
 *
 * Examples:
 * - is_cross_lane<float, 0, 1, 2, 3, 4, 5, 6, 7>()
 *   // no crossing: every index stays in its own 128-bit lane
 * - is_cross_lane<float, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15>()
 *   // crosses: the first two 128-bit lanes are swapped
 */
template <typename ElemT, typename U, U... Vs>
XSIMD_INLINE constexpr bool is_cross_lane() noexcept
{
    // 128-bit lane expressed in bytes.
    constexpr std::size_t lane_bytes = 16;
    return is_cross_lane_with_lane_size<lane_bytes, ElemT, U, Vs...>();
}

/**
 * @brief Convenience overload of is_cross_lane for std::size_t indices.
 *
 * Lets callers omit the index type: the pack is forwarded to the
 * <ElemT, U, Vs...> overload with U fixed to std::size_t.
 *
 * @tparam ElemT Element type to determine element size
 * @tparam Vs... Index values for the swizzle pattern
 *
 * @return the result of is_cross_lane<ElemT, std::size_t, Vs...>()
 */
template <typename ElemT, std::size_t... Vs>
XSIMD_INLINE constexpr bool is_cross_lane() noexcept
{
    using index_type = std::size_t;
    return is_cross_lane<ElemT, index_type, Vs...>();
}

} // namespace detail
} // namespace kernel
Expand Down
6 changes: 3 additions & 3 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2737,15 +2737,15 @@ namespace xsimd
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFF);
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return static_cast<T>(_mm512_cvtsi512_si32(self) & 0xFFFF);
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return static_cast<T>(_mm512_cvtsi512_si32(self));
return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
Expand Down
38 changes: 33 additions & 5 deletions test/test_batch_manip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,39 @@ namespace xsimd
static_assert(is_dup_hi<std::uint32_t, 2, 3, 2, 3>(), "4-lane dup_hi failed");
static_assert(!is_dup_lo<std::uint32_t, 2, 3, 2, 3>(), "4-lane dup_lo on dup_hi");

static_assert(is_cross_lane<0, 1, 0, 1>(), "dup-lo only → crossing");
static_assert(is_cross_lane<2, 3, 2, 3>(), "dup-hi only → crossing");
static_assert(is_cross_lane<0, 3, 3, 3>(), "one low + rest high → crossing");
static_assert(!is_cross_lane<1, 0, 2, 3>(), "mixed low/high → no crossing");
static_assert(!is_cross_lane<0, 1, 2, 3>(), "mixed low/high → no crossing");
static_assert(is_cross_lane<double, 0, 1, 0, 1>(), "dup-lo only → crossing");
static_assert(is_cross_lane<double, 2, 3, 2, 3>(), "dup-hi only → crossing");
static_assert(is_cross_lane<double, 0, 3, 3, 3>(), "one low + rest high → crossing");
static_assert(!is_cross_lane<double, 1, 0, 2, 3>(), "mixed low/high → no crossing");
static_assert(!is_cross_lane<double, 0, 1, 2, 3>(), "mixed low/high → no crossing");
// 8-element 128-bit lane crossing checks
// For 8 doubles (64 bytes): lanes are [0-1], [2-3], [4-5], [6-7]
static_assert(!is_cross_lane<double, 1, 0, 3, 2, 5, 4, 7, 6>(), "8-lane reverse within 128-bit lanes → no crossing");
static_assert(!is_cross_lane<double, 0, 1, 2, 3, 4, 5, 6, 7>(), "identity 8-lane → no crossing");
static_assert(is_cross_lane<double, 2, 3, 0, 1, 4, 5, 6, 7>(), "8-lane double swap first two 128-bit lanes → crossing");
// For 8 int32 (32 bytes): lanes are [0-3], [4-7]
static_assert(is_cross_lane<std::int32_t, 4, 5, 6, 7, 0, 1, 2, 3>(), "8-lane int32_t swap 128-bit lanes → crossing");

// Additional compile-time checks for 16-element batches (e.g. float/int32)
static_assert(is_cross_lane<float, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7>(),
"16-lane 128-bit swap → crossing");
static_assert(!is_cross_lane<float, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>(),
"identity 16-lane → no crossing");
static_assert(is_cross_lane<std::uint32_t, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7>(),
"16-lane uint32_t swap → crossing");

// Explicit 128-bit lane boundary checks (LaneSizeBytes = 16)
// For float (4 bytes): 16 bytes = 4 elements per 128-bit lane
static_assert(detail::is_cross_lane_with_lane_size<16, float, std::size_t, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15>(),
"float: swap first two 128-bit lanes → crossing");
static_assert(!detail::is_cross_lane_with_lane_size<16, float, std::size_t, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>(),
"float: reverse within each 128-bit lane → no crossing");

// For double (8 bytes): 16 bytes = 2 elements per 128-bit lane
static_assert(detail::is_cross_lane_with_lane_size<16, double, std::size_t, 2, 3, 0, 1, 4, 5, 6, 7>(),
"double: swap first two 128-bit lanes → crossing");
static_assert(!detail::is_cross_lane_with_lane_size<16, double, std::size_t, 1, 0, 3, 2, 5, 4, 7, 6>(),
"double: reverse within each 128-bit lane → no crossing");
}
}
}
Expand Down
10 changes: 10 additions & 0 deletions test/test_shuffle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -672,10 +672,15 @@ struct shuffle_test
}
};

#if defined(__GNUC__) && (__GNUC__ == 10) && !defined(__clang__) && XSIMD_WITH_AVX512F
// Use zip_lo as a stable reference for the expected interleave.
B b_ref_lo = xsimd::zip_lo(b_lhs, b_rhs);
#else
std::array<value_type, size> ref_lo;
for (size_t i = 0; i < size; ++i)
ref_lo[i] = (i & 1) ? rhs[i / 2] : lhs[i / 2];
B b_ref_lo = B::load_unaligned(ref_lo.data());
#endif

INFO("zip_lo");
B b_res_lo = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant<mask_type, zip_lo_generator, arch_type>());
Expand All @@ -689,12 +694,17 @@ struct shuffle_test
}
};

#if defined(__GNUC__) && (__GNUC__ == 10) && !defined(__clang__) && XSIMD_WITH_AVX512F
// Use zip_hi as a stable reference for the expected interleave.
B b_ref_hi = xsimd::zip_hi(b_lhs, b_rhs);
#else
std::array<value_type, size> ref_hi;
for (size_t i = 0; i < size; ++i)
{
ref_hi[i] = (i & 1) ? rhs[size / 2 + i / 2] : lhs[size / 2 + i / 2];
}
B b_ref_hi = B::load_unaligned(ref_hi.data());
#endif

INFO("zip_hi");
B b_res_hi = xsimd::shuffle(b_lhs, b_rhs, xsimd::make_batch_constant<mask_type, zip_hi_generator, arch_type>());
Expand Down