Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions include/xsimd/config/xsimd_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@
* @defgroup xsimd_config_macro Instruction Set Detection
*/

/**
* @ingroup xsimd_config_macro
*
* Set to 1 if the target is the x86 architecture family.
*/
#if defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
#define XSIMD_TARGET_X86 1
#else
#define XSIMD_TARGET_X86 0
#endif

/**
* @ingroup xsimd_config_macro
*
Expand Down
166 changes: 37 additions & 129 deletions include/xsimd/config/xsimd_cpuid.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
#ifndef XSIMD_CPUID_HPP
#define XSIMD_CPUID_HPP

#include <algorithm>
#include <cstring>
#include "../types/xsimd_all_registers.hpp"
#include "../xsimd_cpu_features_x86.hpp"
#include "xsimd_inline.hpp"

#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector))
#include <asm/hwcap.h>
Expand All @@ -25,13 +26,6 @@

#endif

#if defined(_MSC_VER)
// Contains the definition of __cpuidex
#include <intrin.h>
#endif

#include "../types/xsimd_all_registers.hpp"

namespace xsimd
{
namespace detail
Expand All @@ -40,7 +34,7 @@ namespace xsimd
{

#define ARCH_FIELD_EX(arch, field_name) \
unsigned field_name; \
unsigned field_name = 0; \
XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; }

#define ARCH_FIELD_EX_REUSE(arch, field_name) \
Expand Down Expand Up @@ -90,8 +84,6 @@ namespace xsimd

XSIMD_INLINE supported_arch() noexcept
{
memset(this, 0, sizeof(supported_arch));

#if XSIMD_WITH_WASM
wasm = 1;
#endif
Expand Down Expand Up @@ -122,138 +114,54 @@ namespace xsimd
#endif
rvv = bool(getauxval(AT_HWCAP) & HWCAP_V);
#endif

#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)

auto get_xcr0_low = []() noexcept
{
uint32_t xcr0;

#if defined(_MSC_VER) && _MSC_VER >= 1400

xcr0 = (uint32_t)_xgetbv(0);

#elif defined(__GNUC__)

__asm__(
"xorl %%ecx, %%ecx\n"
"xgetbv\n"
: "=a"(xcr0)
:
#if defined(__i386__)
: "ecx", "edx"
#else
: "rcx", "rdx"
#endif
);

#else /* _MSC_VER < 1400 */
#error "_MSC_VER < 1400 is not supported"
#endif /* _MSC_VER && _MSC_VER >= 1400 */
return xcr0;
};

auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
{

#if defined(_MSC_VER)
__cpuidex(reg, level, count);

#elif defined(__INTEL_COMPILER)
__cpuid(reg, level);

#elif defined(__GNUC__) || defined(__clang__)

#if defined(__i386__) && defined(__PIC__)
// %ebx may be the PIC register
__asm__("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t"
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));

#else
__asm__("cpuid\n\t"
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
: "0"(level), "2"(count));
#endif

#else
#error "Unsupported configuration"
#endif
};

int regs1[4];

get_cpuid(regs1, 0x1);

// OS can explicitly disable the usage of SSE/AVX extensions
// by setting an appropriate flag in CR0 register
//
// https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html
const auto cpuid = xsimd::x86_cpu_id::read();
auto xcr0 = xsimd::x86_xcr0::make_false();

unsigned sse_state_os_enabled = 1;
bool sse_enabled = true;
// AVX and AVX512 strictly require OSXSAVE to be enabled by the OS.
// If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1),
// AVX state won't be preserved across context switches, so AVX cannot be used.
unsigned avx_state_os_enabled = 0;
unsigned avx512_state_os_enabled = 0;
bool avx_enabled = false;
bool avx512_enabled = false;

// OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit
// 18] to enable XSETBV/XGETBV instructions to access XCR0 and
// to support processor extended state management using
// XSAVE/XRSTOR.
bool osxsave = regs1[2] >> 27 & 1;
if (osxsave)
if (cpuid.osxsave())
{
xcr0 = xsimd::x86_xcr0::read();

uint32_t xcr0 = get_xcr0_low();

sse_state_os_enabled = xcr0 >> 1 & 1;
avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled;
avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled;
sse_enabled = xcr0.sse_enabled();
avx_enabled = xcr0.avx_enabled();
avx512_enabled = xcr0.avx512_enabled();
}

sse2 = regs1[3] >> 26 & sse_state_os_enabled;
sse3 = regs1[2] >> 0 & sse_state_os_enabled;
ssse3 = regs1[2] >> 9 & sse_state_os_enabled;
sse4_1 = regs1[2] >> 19 & sse_state_os_enabled;
sse4_2 = regs1[2] >> 20 & sse_state_os_enabled;
fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled;

avx = regs1[2] >> 28 & avx_state_os_enabled;
fma3_avx = avx && fma3_sse42;

int regs8[4];
get_cpuid(regs8, 0x80000001);
fma4 = regs8[2] >> 16 & avx_state_os_enabled;

// sse4a = regs[2] >> 6 & 1;
sse2 = cpuid.sse2() && sse_enabled;
sse3 = cpuid.sse3() && sse_enabled;
ssse3 = cpuid.ssse3() && sse_enabled;
sse4_1 = cpuid.sse4_1() && sse_enabled;
sse4_2 = cpuid.sse4_2() && sse_enabled;
fma3_sse42 = cpuid.fma3() && sse_enabled;

// xop = regs[2] >> 11 & 1;

int regs7[4];
get_cpuid(regs7, 0x7);
avx2 = regs7[1] >> 5 & avx_state_os_enabled;

int regs7a[4];
get_cpuid(regs7a, 0x7, 0x1);
avxvnni = regs7a[0] >> 4 & avx_state_os_enabled;
// sse4a not implemented in cpu_id yet
// xop not implemented in cpu_id yet

avx = cpuid.avx() && avx_enabled;
fma3_avx = avx && fma3_sse42;
fma4 = cpuid.fma4() && avx_enabled;
avx2 = cpuid.avx2() && avx_enabled;
avxvnni = cpuid.avxvnni() && avx_enabled;
fma3_avx2 = avx2 && fma3_sse42;

avx512f = regs7[1] >> 16 & avx512_state_os_enabled;
avx512cd = regs7[1] >> 28 & avx512_state_os_enabled;
avx512dq = regs7[1] >> 17 & avx512_state_os_enabled;
avx512bw = regs7[1] >> 30 & avx512_state_os_enabled;
avx512er = regs7[1] >> 27 & avx512_state_os_enabled;
avx512pf = regs7[1] >> 26 & avx512_state_os_enabled;
avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled;
avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled;
avx512vbmi2 = regs7[2] >> 6 & avx512_state_os_enabled;
avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled;
avx512f = cpuid.avx512f() && avx512_enabled;
avx512cd = cpuid.avx512cd() && avx512_enabled;
avx512dq = cpuid.avx512dq() && avx512_enabled;
avx512bw = cpuid.avx512bw() && avx512_enabled;
avx512er = cpuid.avx512er() && avx512_enabled;
avx512pf = cpuid.avx512pf() && avx512_enabled;
avx512ifma = cpuid.avx512ifma() && avx512_enabled;
avx512vbmi = cpuid.avx512vbmi() && avx512_enabled;
avx512vbmi2 = cpuid.avx512vbmi2() && avx512_enabled;
avx512vnni_bw = cpuid.avx512vnni_bw() && avx512_enabled;
avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw;
#endif
}
};
} // namespace detail
Expand Down
2 changes: 2 additions & 0 deletions include/xsimd/types/xsimd_common_arch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#ifndef XSIMD_COMMON_ARCH_HPP
#define XSIMD_COMMON_ARCH_HPP

#include <cstddef>

#include "../config/xsimd_config.hpp"

/**
Expand Down
2 changes: 2 additions & 0 deletions include/xsimd/types/xsimd_register.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include <type_traits>

#include "../config/xsimd_inline.hpp"

namespace xsimd
{
namespace types
Expand Down
41 changes: 41 additions & 0 deletions include/xsimd/utils/bits.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
***************************************************************************/

#ifndef XSIMD_CPUID_UTILS_HPP
#define XSIMD_CPUID_UTILS_HPP

namespace xsimd
{
namespace utils
{
template <typename I>
constexpr I make_bit_mask(I bit)
{
return static_cast<I>(I { 1 } << bit);
}

template <typename I, typename... Args>
constexpr I make_bit_mask(I bit, Args... bits)
{
// TODO(C++17): Use fold expression
return make_bit_mask<I>(bit) | make_bit_mask<I>(static_cast<I>(bits)...);
}

template <int... Bits, typename I>
constexpr bool bit_is_set(I value)
{
constexpr I mask = make_bit_mask<I>(static_cast<I>(Bits)...);
return (value & mask) == mask;
}
}
}

#endif
Loading