From 9f62704a448178cf9c08c51eb3fdde6dcb2be8b0 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sat, 22 Jun 2024 16:22:29 +0200 Subject: [PATCH] Move x86 CPUID code from cpuid.hpp to src/x86/cpuid.cpp --- CMakeLists.txt | 33 ++- ChangeLog | 8 + cmake/auto_vectorization.cmake | 4 +- cmake/libatomic.cmake | 5 +- cmake/multiarch_avx512_vbmi2.cmake | 14 +- cmake/multiarch_x86_popcnt.cmake | 53 ++++ include/primesieve/CpuInfo.hpp | 3 +- include/primesieve/Erat.hpp | 4 +- include/primesieve/PrimeGenerator.hpp | 20 +- include/primesieve/ctz.hpp | 157 ++++++++++ include/primesieve/intrinsics.hpp | 345 ---------------------- include/primesieve/popcnt.hpp | 214 ++++++++++++++ scripts/build_clang_multiarch_win_x64.bat | 2 +- src/CpuInfo.cpp | 33 +-- src/PrimeGenerator.cpp | 14 +- src/app/main.cpp | 19 +- src/popcount.cpp | 2 +- src/x86/cpuid.cpp | 125 ++++++++ test/CMakeLists.txt | 2 +- test/cpuid.cpp | 53 +--- 20 files changed, 647 insertions(+), 463 deletions(-) create mode 100644 cmake/multiarch_x86_popcnt.cmake create mode 100644 include/primesieve/ctz.hpp delete mode 100644 include/primesieve/intrinsics.hpp create mode 100644 include/primesieve/popcnt.hpp create mode 100644 src/x86/cpuid.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 84bc037c7..ab9275260 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ if(NOT isMultiConfig AND NOT CMAKE_BUILD_TYPE) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(ENABLE_ASSERT "ENABLE_ASSERT") + list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_ASSERT") endif() # primesieve binary source files ##################################### @@ -82,6 +82,17 @@ set(LIB_SRC src/api-c.cpp src/RiemannR.cpp src/SievingPrimes.cpp) +# Check if compiler supports CPU multiarch ########################### + +if(WITH_MULTIARCH) + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") + + if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2) + set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp) + endif() +endif() + # Required includes ################################################## include(GNUInstallDirs) @@ -107,12 +118,6 @@ if(WITH_AUTO_VECTORIZATION) include("${PROJECT_SOURCE_DIR}/cmake/auto_vectorization.cmake") endif() -# Check if compiler supports x64 multiarch ########################### - -if(WITH_MULTIARCH) - include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") -endif() - # libprimesieve (shared library) ##################################### find_package(Threads REQUIRED QUIET) @@ -120,13 +125,13 @@ find_package(Threads REQUIRED QUIET) if(BUILD_SHARED_LIBS) add_library(libprimesieve SHARED ${LIB_SRC}) set_target_properties(libprimesieve PROPERTIES OUTPUT_NAME primesieve) - target_link_libraries(libprimesieve PRIVATE Threads::Threads ${LIBATOMIC}) + target_link_libraries(libprimesieve PRIVATE Threads::Threads ${PRIMESIEVE_LINK_LIBRARIES}) string(REPLACE "." 
";" SOVERSION_LIST ${PRIMESIEVE_SOVERSION}) list(GET SOVERSION_LIST 0 PRIMESIEVE_SOVERSION_MAJOR) set_target_properties(libprimesieve PROPERTIES SOVERSION ${PRIMESIEVE_SOVERSION_MAJOR}) set_target_properties(libprimesieve PROPERTIES VERSION ${PRIMESIEVE_SOVERSION}) - target_compile_options(libprimesieve PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG}) - target_compile_definitions(libprimesieve PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}") + target_compile_options(libprimesieve PRIVATE ${PRIMESIEVE_COMPILE_OPTIONS}) + target_compile_definitions(libprimesieve PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) if(WIN32_MSVC_COMPATIBLE) # On Windows the shared library will be named primesieve.dll @@ -162,9 +167,9 @@ endif() if(BUILD_STATIC_LIBS) add_library(libprimesieve-static STATIC ${LIB_SRC}) set_target_properties(libprimesieve-static PROPERTIES OUTPUT_NAME primesieve) - target_link_libraries(libprimesieve-static PRIVATE Threads::Threads ${LIBATOMIC}) - target_compile_options(libprimesieve-static PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG}) - target_compile_definitions(libprimesieve-static PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}") + target_link_libraries(libprimesieve-static PRIVATE Threads::Threads ${PRIMESIEVE_LINK_LIBRARIES}) + target_compile_options(libprimesieve-static PRIVATE ${PRIMESIEVE_COMPILE_OPTIONS}) + target_compile_definitions(libprimesieve-static PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) if(WITH_MSVC_CRT_STATIC) set_target_properties(libprimesieve-static PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded") @@ -219,7 +224,7 @@ endif() if(BUILD_PRIMESIEVE) add_executable(primesieve ${BIN_SRC}) target_link_libraries(primesieve primesieve::primesieve Threads::Threads) - target_compile_definitions(primesieve PRIVATE "${ENABLE_ASSERT}") + target_compile_definitions(primesieve PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) target_compile_features(primesieve PRIVATE cxx_auto_type) install(TARGETS primesieve DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/ChangeLog b/ChangeLog index 4a07fa953..ded64abb2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Changes in version 12.4, 22/06/2024 +=================================== + +* Move x86 CPUID code from cpuid.hpp to src/x86/cpuid.cpp. +* multiarch_x86_popcnt.cmake: Detect x86 POPCNT support. +* CMakeLists.txt: Use CMake list for all compile time definitions. +* CMakeLists.txt: Use CMake list for all link libraries. 
+
 Changes in version 12.3, 15/04/2024
 ===================================
 
diff --git a/cmake/auto_vectorization.cmake b/cmake/auto_vectorization.cmake
index 84897cc45..4a243f70c 100644
--- a/cmake/auto_vectorization.cmake
+++ b/cmake/auto_vectorization.cmake
@@ -16,7 +16,7 @@ check_cxx_compiler_flag(-ftree-vectorize ftree_vectorize)
 cmake_pop_check_state()
 
 if(ftree_vectorize)
-  set(FTREE_VECTORIZE_FLAG "-ftree-vectorize")
+  list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-ftree-vectorize")
 
   cmake_push_check_state()
   set(CMAKE_REQUIRED_FLAGS -Werror)
@@ -24,6 +24,6 @@ if(ftree_vectorize)
   cmake_pop_check_state()
 
   if(fvect_cost_model)
-    set(FVECT_COST_MODEL_FLAG "-fvect-cost-model=dynamic")
+    list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-fvect-cost-model=dynamic")
   endif()
 endif()
diff --git a/cmake/libatomic.cmake b/cmake/libatomic.cmake
index 333926f2b..acd7ca993 100644
--- a/cmake/libatomic.cmake
+++ b/cmake/libatomic.cmake
@@ -57,7 +57,10 @@ if(NOT atomic64)
     }"
     atomic64_with_libatomic)
 
-    if (NOT atomic64_with_libatomic)
+    if(atomic64_with_libatomic)
+        list(APPEND PRIMESIEVE_LINK_LIBRARIES "${LIBATOMIC}")
+    else()
+        set(LIBATOMIC "")
         message(FATAL_ERROR "Failed to compile std::atomic, libatomic likely not found!")
     endif()
 endif()
diff --git a/cmake/multiarch_avx512_vbmi2.cmake b/cmake/multiarch_avx512_vbmi2.cmake
index f56d5e933..c0b7c0e96 100644
--- a/cmake/multiarch_avx512_vbmi2.cmake
+++ b/cmake/multiarch_avx512_vbmi2.cmake
@@ -7,7 +7,7 @@ include(CheckCXXSourceCompiles)
 include(CMakePushCheckState)
 
 cmake_push_check_state()
-set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include")
+set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")
 
 check_cxx_source_compiles("
   // GCC/Clang function multiversioning for AVX512 is not needed if
@@ -20,19 +20,19 @@ check_cxx_source_compiles("
     Error: AVX512VBMI2 multiarch not needed!
   #endif
 
-  #include <primesieve/cpu_supports_avx512_vbmi2.hpp>
+  #include <src/x86/cpuid.cpp>
   #include <immintrin.h>
   #include <stdint.h>
 
   class PrimeGenerator {
   public:
     __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
-    void fillNextPrimes_avx512(uint64_t* primes64);
+    void fillNextPrimes_avx512_vbmi2(uint64_t* primes64);
     void fillNextPrimes_default(uint64_t* primes64);
     void fillNextPrimes(uint64_t* primes64)
     {
-      if (cpu_supports_avx512_vbmi2)
-        fillNextPrimes_avx512(primes64);
+      if (primesieve::has_cpuid_avx512_vbmi2())
+        fillNextPrimes_avx512_vbmi2(primes64);
       else
         fillNextPrimes_default(primes64);
     }
@@ -44,7 +44,7 @@ check_cxx_source_compiles("
   }
 
   __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
-  void PrimeGenerator::fillNextPrimes_avx512(uint64_t* primes64)
+  void PrimeGenerator::fillNextPrimes_avx512_vbmi2(uint64_t* primes64)
   {
     __m512i bytes_0_to_7 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
     __m512i base = _mm512_set1_epi64(123);
@@ -64,7 +64,7 @@ check_cxx_source_compiles("
 " multiarch_avx512_vbmi2)
 
 if(multiarch_avx512_vbmi2)
-  set(ENABLE_MULTIARCH_AVX512 "ENABLE_MULTIARCH_AVX512")
+  list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_VBMI2")
 endif()
 
 cmake_pop_check_state()
diff --git a/cmake/multiarch_x86_popcnt.cmake b/cmake/multiarch_x86_popcnt.cmake
new file mode 100644
index 000000000..707a8e470
--- /dev/null
+++ b/cmake/multiarch_x86_popcnt.cmake
@@ -0,0 +1,53 @@
+# On x86 CPUs we need to enable the use of cpuid.cpp.
+# If cpuid.cpp compiles we assume it is an x86 CPU.
+
+include(CheckCXXSourceCompiles)
+include(CMakePushCheckState)
+
+cmake_push_check_state()
+set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")
+
+check_cxx_source_compiles("
+  // Enable CPUID for POPCNT on x86 and x86-64 CPUs.
+  // This is required because not all x86 and x86-64 CPUs
+  // support the POPCNT instruction.
+  #if !(defined(__x86_64__) || \
+        defined(__i386__) || \
+        defined(_M_X64) || \
+        defined(_M_IX86))
+    Error: x86 POPCNT multiarch not needed!
+  #endif
+
+  // Both GCC and Clang (even Clang on Windows) define the __POPCNT__
+  // macro if the user compiles with -mpopcnt. The __POPCNT__
+  // macro is even defined if the user compiles with other flags
+  // such as -mavx or -march=native.
+  #if defined(__POPCNT__)
+    Error: x86 POPCNT multiarch not needed!
+
+  // The MSVC compiler does not support a POPCNT macro, but if the user
+  // compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines
+  // the __AVX__ macro and POPCNT is also supported.
+  #elif defined(_MSC_VER) && defined(__AVX__)
+    Error: x86 POPCNT multiarch not needed!
+  #endif
+
+  #include <src/x86/cpuid.cpp>
+  #include <iostream>
+
+  int main()
+  {
+    if (primesieve::has_cpuid_popcnt())
+      std::cout << \"CPU supports POPCNT!\" << std::endl;
+    else
+      std::cout << \"CPU does not support POPCNT!\" << std::endl;
+
+    return 0;
+  }
+" multiarch_x86_popcnt)
+
+if(multiarch_x86_popcnt)
+  list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_x86_POPCNT")
+endif()
+
+cmake_pop_check_state()
diff --git a/include/primesieve/CpuInfo.hpp b/include/primesieve/CpuInfo.hpp
index 426cc6323..5ccd1835a 100644
--- a/include/primesieve/CpuInfo.hpp
+++ b/include/primesieve/CpuInfo.hpp
@@ -1,7 +1,7 @@
 ///
 /// @file CpuInfo.hpp
 ///
-/// Copyright (C) 2023 Kim Walisch,
+/// Copyright (C) 2024 Kim Walisch,
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -22,7 +22,6 @@ class CpuInfo
 public:
   CpuInfo();
   bool hasCpuName() const;
-  bool hasAVX512() const;
   bool hasLogicalCpuCores() const;
   bool hasL1Cache() const;
   bool hasL2Cache() const;
diff --git a/include/primesieve/Erat.hpp b/include/primesieve/Erat.hpp
index bfd952863..32c0f221d 100644
--- a/include/primesieve/Erat.hpp
+++ b/include/primesieve/Erat.hpp
@@ -1,7 +1,7 @@
 ///
 /// @file Erat.hpp
 ///
-/// Copyright (C) 2023 Kim Walisch,
+/// Copyright (C) 2024 Kim Walisch,
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -15,8 +15,8 @@
 #include "EratMedium.hpp"
 #include "EratBig.hpp"
 #include "macros.hpp"
-#include "intrinsics.hpp"
 #include "Vector.hpp"
+#include "ctz.hpp"
 
 #include <stdint.h>
diff --git a/include/primesieve/PrimeGenerator.hpp b/include/primesieve/PrimeGenerator.hpp
index 59f7567d9..aa0cecdc6 100644
--- a/include/primesieve/PrimeGenerator.hpp
+++ b/include/primesieve/PrimeGenerator.hpp
@@ -27,9 +27,9 @@
       defined(__AVX512VBMI__) && \
       defined(__AVX512VBMI2__) && \
       __has_include(<immintrin.h>)
-  #define ENABLE_AVX512
+  #define ENABLE_AVX512_VBMI2
 
-#elif defined(ENABLE_MULTIARCH_AVX512) && \
+#elif defined(ENABLE_MULTIARCH_AVX512_VBMI2) && \
       __has_include(<immintrin.h>)
   #include "cpu_supports_avx512_vbmi2.hpp"
   #define ENABLE_DEFAULT
@@ -50,11 +50,11 @@ class PrimeGenerator : public Erat
   ALWAYS_INLINE void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size)
   {
-    #if defined(ENABLE_AVX512)
-      fillNextPrimes_avx512(primes, size);
-    #elif defined(ENABLE_MULTIARCH_AVX512)
+    #if defined(ENABLE_AVX512_VBMI2)
+      fillNextPrimes_avx512_vbmi2(primes, size);
+    #elif defined(ENABLE_MULTIARCH_AVX512_VBMI2)
       if (cpu_supports_avx512_vbmi2)
-        fillNextPrimes_avx512(primes, size);
+        fillNextPrimes_avx512_vbmi2(primes, size);
       else
         fillNextPrimes_default(primes, size);
     #else
@@ -68,13 +68,13 @@
     void fillNextPrimes_default(Vector<uint64_t>& primes, std::size_t* size);
 #endif
 
-#if defined(ENABLE_AVX512) || \
-    defined(ENABLE_MULTIARCH_AVX512)
+#if defined(ENABLE_AVX512_VBMI2) || \
+    defined(ENABLE_MULTIARCH_AVX512_VBMI2)
 
-  #if defined(ENABLE_MULTIARCH_AVX512)
+  #if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
     __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
   #endif
-  void fillNextPrimes_avx512(Vector<uint64_t>& primes, std::size_t* size);
+  void fillNextPrimes_avx512_vbmi2(Vector<uint64_t>& primes, std::size_t* size);
 
 #endif
diff --git a/include/primesieve/ctz.hpp b/include/primesieve/ctz.hpp
new file mode 100644
index 000000000..06bf5eeee
--- /dev/null
+++ b/include/primesieve/ctz.hpp
@@ -0,0 +1,157 @@
+///
+/// @file ctz.hpp
+/// @brief Count the number of trailing zeros.
+///
+/// Copyright (C) 2024 Kim Walisch,
+///
+/// This file is distributed under the BSD License. See the COPYING
+/// file in the top level directory.
+///
+
+#ifndef CTZ_HPP
+#define CTZ_HPP
+
+#include "macros.hpp"
+#include <stdint.h>
+
+// GCC/Clang & MSVC
+#if defined(__x86_64__) || \
+    defined(_M_X64)
+  #define IS_X64
+#endif
+
+// On x64 CPUs:
+// GCC & Clang enable TZCNT with -mbmi.
+// MSVC enables TZCNT with /arch:AVX2 or later.
+#if defined(__BMI__) || \
+    (defined(_MSC_VER) && defined(__AVX2__))
+  #define HAS_TZCNT
+#endif
+
+// In 2022 std::countr_zero(x) generates good assembly for
+// most compilers & CPU architectures, except for:
+// 1) GCC & Clang on x64 without __BMI__.
+// 2) MSVC on x64 without __AVX2__.
+// Hence on x64 CPUs we only use std::countr_zero(x) if
+// the compiler generates the TZCNT instruction.
+#if defined(HAS_CPP20_BIT_HEADER) && \
+    (defined(HAS_TZCNT) || !defined(IS_X64))
+
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+namespace {
+
+inline int ctz64(uint64_t x)
+{
+  // No undefined behavior, std::countr_zero(0) = 64
+  return std::countr_zero(x);
+}
+
+} // namespace
+
+#elif (defined(__GNUC__) || \
+       defined(__clang__)) && \
+       defined(__x86_64__)
+
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+namespace {
+
+inline uint64_t ctz64(uint64_t x)
+{
+#if defined(HAS_TZCNT)
+  // No undefined behavior, TZCNT(0) = 64
+  __asm__("tzcnt %1, %0" : "=r"(x) : "r"(x));
+  return x;
+#else
+  // REP BSF uses the TZCNT instruction on x64 CPUs with the BMI1
+  // instruction set (>= 2013) and the BSF instruction on older x64
+  // CPUs. BSF(0) is undefined behavior, it leaves the destination
+  // register unmodified. Fortunately, it is possible to avoid this
+  // undefined behavior by always setting the destination register
+  // to the same value before executing BSF(0). This works on all
+  // AMD & Intel CPUs since the i586 (from 1993), the Linux kernel
+  // also relies on this behavior, see this Linux commit:
+  // https://github.com/torvalds/linux/commit/ca3d30cc02f780f68771087040ce935add6ba2b7
+  //
+  // The constraint "0" for input operand 1 says that it must occupy
+  // the same location as output operand 0. Hence the assembly below
+  // uses the same input & output register. This ensures that
+  // BSF(0) = 0, hence there is no undefined behavior. However, you
+  // cannot rely on ctz64(0) = 0 since TZCNT(0) = 64.
+  __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x));
+  ASSERT(x <= 64);
+  return x;
+#endif
+}
+
+} // namespace
+
+#elif (defined(__GNUC__) || \
+       defined(__clang__)) && \
+       defined(__aarch64__)
+
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+namespace {
+
+inline uint64_t ctz64(uint64_t x)
+{
+  // No undefined behavior, CTZ(0) = 64.
+  // ARM64 has no CTZ instruction, so we have to emulate it.
+  __asm__("rbit %0, %1 \n\t"
+          "clz %0, %0  \n\t"
+          : "=r" (x)
+          : "r" (x));
+
+  return x;
+}
+
+} // namespace
+
+#elif defined(_MSC_VER) && \
+      defined(HAS_TZCNT) && \
+      __has_include(<immintrin.h>)
+
+#include <immintrin.h>
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+// This allows us to generate the TZCNT instruction for MSVC
+// without C++20 support, hence without std::countr_zero(x).
+// No undefined behavior, _tzcnt_u64(0) = 64.
+#define ctz64(x) _tzcnt_u64(x)
+
+#elif defined(__GNUC__) || \
+      __has_builtin(__builtin_ctzl)
+
+#define HAS_CTZ64
+
+namespace {
+
+inline int ctz64(uint64_t x)
+{
+  // __builtin_ctz(0) is undefined behavior,
+  // we don't define CTZ64_SUPPORTS_ZERO.
+  ASSERT(x != 0);
+
+#if __cplusplus >= 201703L
+  if constexpr(sizeof(int) >= sizeof(uint64_t))
+    return __builtin_ctz(x);
+  else if constexpr(sizeof(long) >= sizeof(uint64_t))
+    return __builtin_ctzl(x);
+  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
+    return __builtin_ctzll(x);
+#else
+  return __builtin_ctzll(x);
+#endif
+}
+
+} // namespace
+
+#endif
+
+#endif // CTZ_HPP
diff --git a/include/primesieve/intrinsics.hpp b/include/primesieve/intrinsics.hpp
deleted file mode 100644
index 3ce0725d1..000000000
--- a/include/primesieve/intrinsics.hpp
+++ /dev/null
@@ -1,345 +0,0 @@
-///
-/// @file intrinsics.hpp
-/// @brief Wrappers for compiler intrinsics.
-///
-/// Copyright (C) 2024 Kim Walisch,
-///
-/// This file is distributed under the BSD License. See the COPYING
-/// file in the top level directory.
-///
-
-#ifndef INTRINSICS_HPP
-#define INTRINSICS_HPP
-
-#include "cpu_supports_popcnt.hpp"
-#include "macros.hpp"
-
-#include <stdint.h>
-
-namespace {
-
-/// This uses fewer arithmetic operations than any other known
-/// implementation on machines with fast multiplication.
-/// It uses 12 arithmetic operations, one of which is a multiply.
-/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
-///
-inline uint64_t popcnt64_bitwise(uint64_t x)
-{
-  uint64_t m1 = 0x5555555555555555ll;
-  uint64_t m2 = 0x3333333333333333ll;
-  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
-  uint64_t h01 = 0x0101010101010101ll;
-
-  x -= (x >> 1) & m1;
-  x = (x & m2) + ((x >> 2) & m2);
-  x = (x + (x >> 4)) & m4;
-
-  return (x * h01) >> 56;
-}
-
-} // namespace
-
-// GCC & Clang
-#if defined(__GNUC__) || \
-    __has_builtin(__builtin_popcountl)
-
-// CPUID is only enabled on x86 and x86-64 CPUs
-// if the user compiles without -mpopcnt.
-#if defined(ENABLE_CPUID_POPCNT)
-#if defined(__x86_64__)
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-  // On my AMD EPYC 7642 CPU using GCC 12 this runtime
-  // check incurs an overall overhead of about 1%.
-  if_likely(cpu_supports_popcnt)
-  {
-    __asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
-    return x;
-  }
-  else
-  {
-    // On x86 and x64 CPUs when using the GCC compiler
-    // __builtin_popcount*(x) is slow (not inlined function call)
-    // when compiling without -mpopcnt. Therefore we avoid
-    // using __builtin_popcount*(x) here.
-    return popcnt64_bitwise(x);
-  }
-}
-
-} // namespace
-
-#elif defined(__i386__)
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-  if_likely(cpu_supports_popcnt)
-  {
-    uint32_t x0 = uint32_t(x);
-    uint32_t x1 = uint32_t(x >> 32);
-    __asm__("popcnt %1, %0" : "=r"(x0) : "r"(x0));
-    __asm__("popcnt %1, %0" : "=r"(x1) : "r"(x1));
-    return x0 + x1;
-  }
-  else
-  {
-    // On x86 and x64 CPUs when using the GCC compiler
-    // __builtin_popcount*(x) is slow (not inlined function call)
-    // when compiling without -mpopcnt. Therefore we avoid
-    // using __builtin_popcount*(x) here.
-    return popcnt64_bitwise(x);
-  }
-}
-
-} // namespace
-
-#endif // i386
-
-#else // GCC & Clang (no CPUID, not x86)
-
-namespace {
-
-inline int popcnt64(uint64_t x)
-{
-#if __cplusplus >= 201703L
-  if constexpr(sizeof(int) >= sizeof(uint64_t))
-    return __builtin_popcount(x);
-  else if constexpr(sizeof(long) >= sizeof(uint64_t))
-    return __builtin_popcountl(x);
-  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
-    return __builtin_popcountll(x);
-#else
-  return __builtin_popcountll(x);
-#endif
-}
-
-} // namespace
-
-#endif // GCC & Clang
-
-#elif defined(_MSC_VER) && \
-      defined(_M_X64) && \
-      __has_include(<intrin.h>)
-
-#include <intrin.h>
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-#if defined(HAS_POPCNT)
-  return __popcnt64(x);
-#elif defined(ENABLE_CPUID_POPCNT)
-  if_likely(cpu_supports_popcnt)
-    return __popcnt64(x);
-  else
-    return popcnt64_bitwise(x);
-#else
-  return popcnt64_bitwise(x);
-#endif
-}
-
-} // namespace
-
-#elif defined(_MSC_VER) && \
-      defined(_M_IX86) && \
-      __has_include(<intrin.h>)
-
-#include <intrin.h>
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-#if defined(HAS_POPCNT)
-  return __popcnt(uint32_t(x)) +
-         __popcnt(uint32_t(x >> 32));
-#elif defined(ENABLE_CPUID_POPCNT)
-  if_likely(cpu_supports_popcnt)
-    return __popcnt(uint32_t(x)) +
-           __popcnt(uint32_t(x >> 32));
-  else
-    return popcnt64_bitwise(x);
-#else
-  return popcnt64_bitwise(x);
-#endif
-}
-
-} // namespace
-
-#elif __cplusplus >= 202002L && \
-      __has_include(<bit>)
-
-#include <bit>
-
-namespace {
-
-/// We only use the C++ standard library as a fallback if there
-/// are no compiler intrinsics available for POPCNT.
-/// Compiler intrinsics often generate faster assembly.
-inline int popcnt64(uint64_t x)
-{
-  return std::popcount(x);
-}
-
-} // namespace
-
-#else
-
-namespace {
-
-/// Portable (but slow) popcount algorithm
-inline uint64_t popcnt64(uint64_t x)
-{
-  return popcnt64_bitwise(x);
-}
-
-} // namespace
-
-#endif // popcnt64()
-
-// GCC/Clang & MSVC
-#if defined(__x86_64__) || \
-    defined(_M_X64)
-  #define IS_X64
-#endif
-
-// On x64 CPUs:
-// GCC & Clang enable TZCNT with -mbmi.
-// MSVC enables TZCNT with /arch:AVX2 or later.
-#if defined(__BMI__) || \
-    (defined(_MSC_VER) && defined(__AVX2__))
-  #define HAS_TZCNT
-#endif
-
-// In 2022 std::countr_zero(x) generates good assembly for
-// most compilers & CPU architectures, except for:
-// 1) GCC & Clang on x64 without __BMI__.
-// 2) MSVC on x64 without __AVX2__.
-// Hence on x64 CPUs we only use std::countr_zero(x) if
-// the compiler generates the TZCNT instruction.
-#if defined(HAS_CPP20_BIT_HEADER) && \
-    (defined(HAS_TZCNT) || !defined(IS_X64))
-
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-namespace {
-
-inline int ctz64(uint64_t x)
-{
-  // No undefined behavior, std::countr_zero(0) = 64
-  return std::countr_zero(x);
-}
-
-} // namespace
-
-#elif (defined(__GNUC__) || \
-       defined(__clang__)) && \
-       defined(__x86_64__)
-
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-namespace {
-
-inline uint64_t ctz64(uint64_t x)
-{
-#if defined(HAS_TZCNT)
-  // No undefined behavior, TZCNT(0) = 64
-  __asm__("tzcnt %1, %0" : "=r"(x) : "r"(x));
-  return x;
-#else
-  // REP BSF uses the TZCNT instruction on x64 CPUs with the BMI1
-  // instruction set (>= 2013) and the BSF instruction on older x64
-  // CPUs. BSF(0) is undefined behavior, it leaves the destination
-  // register unmodified. Fortunately, it is possible to avoid this
-  // undefined behavior by always setting the destination register
-  // to the same value before executing BSF(0). This works on all
-  // AMD & Intel CPUs since the i586 (from 1993), the Linux kernel
-  // also relies on this behavior, see this Linux commit:
-  // https://github.com/torvalds/linux/commit/ca3d30cc02f780f68771087040ce935add6ba2b7
-  //
-  // The constraint "0" for input operand 1 says that it must occupy
-  // the same location as output operand 0. Hence the assembly below
-  // uses the same input & output register. This ensures that
-  // BSF(0) = 0, hence there is no undefined behavior. However, you
-  // cannot rely on ctz64(0) = 0 since TZCNT(0) = 64.
-  __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x));
-  ASSERT(x <= 64);
-  return x;
-#endif
-}
-
-} // namespace
-
-#elif (defined(__GNUC__) || \
-       defined(__clang__)) && \
-       defined(__aarch64__)
-
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-namespace {
-
-inline uint64_t ctz64(uint64_t x)
-{
-  // No undefined behavior, CTZ(0) = 64.
-  // ARM64 has no CTZ instruction, we have to emulate it.
-  __asm__("rbit %0, %1 \n\t"
-          "clz %0, %0  \n\t"
-          : "=r" (x)
-          : "r" (x));
-
-  return x;
-}
-
-} // namespace
-
-#elif defined(_MSC_VER) && \
-      defined(HAS_TZCNT) && \
-      __has_include(<immintrin.h>)
-
-#include <immintrin.h>
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-// This allows us to generate the TZCNT instruction for MSVC
-// without C++20 support, hence without std::countr_zero(x).
-// No undefined behavior, _tzcnt_u64(0) = 64.
-#define ctz64(x) _tzcnt_u64(x)
-
-#elif defined(__GNUC__) || \
-      __has_builtin(__builtin_ctzl)
-
-#define HAS_CTZ64
-
-namespace {
-
-inline int ctz64(uint64_t x)
-{
-  // __builtin_ctz(0) is undefined behavior,
-  // we don't define CTZ64_SUPPORTS_ZERO.
-  ASSERT(x != 0);
-
-#if __cplusplus >= 201703L
-  if constexpr(sizeof(int) >= sizeof(uint64_t))
-    return __builtin_ctz(x);
-  else if constexpr(sizeof(long) >= sizeof(uint64_t))
-    return __builtin_ctzl(x);
-  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
-    return __builtin_ctzll(x);
-#else
-  return __builtin_ctzll(x);
-#endif
-}
-
-} // namespace
-
-#endif
-
-#endif // INTRINSICS_HPP
diff --git a/include/primesieve/popcnt.hpp b/include/primesieve/popcnt.hpp
new file mode 100644
index 000000000..c81b9f389
--- /dev/null
+++ b/include/primesieve/popcnt.hpp
@@ -0,0 +1,214 @@
+///
+/// @file popcnt.hpp
+/// @brief Functions to count the number of 1 bits inside
+///        a 64-bit variable.
+///
+/// Copyright (C) 2024 Kim Walisch,
+///
+/// This file is distributed under the BSD License. See the COPYING
+/// file in the top level directory.
+///
+
+#ifndef POPCNT_HPP
+#define POPCNT_HPP
+
+#include "macros.hpp"
+#include <stdint.h>
+
+#if defined(ENABLE_MULTIARCH_x86_POPCNT)
+  #include "cpu_supports_popcnt.hpp"
+#endif
+
+namespace {
+
+/// This uses fewer arithmetic operations than any other known
+/// implementation on machines with fast multiplication.
+/// It uses 12 arithmetic operations, one of which is a multiply.
+/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
+///
+inline uint64_t popcnt64_bitwise(uint64_t x)
+{
+  uint64_t m1 = 0x5555555555555555ll;
+  uint64_t m2 = 0x3333333333333333ll;
+  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
+  uint64_t h01 = 0x0101010101010101ll;
+
+  x -= (x >> 1) & m1;
+  x = (x & m2) + ((x >> 2) & m2);
+  x = (x + (x >> 4)) & m4;
+
+  return (x * h01) >> 56;
+}
+
+} // namespace
+
+// GCC & Clang
+#if defined(__GNUC__) || \
+    __has_builtin(__builtin_popcountl)
+
+// CPUID is only enabled on x86 and x86-64 CPUs
+// if the user compiles without -mpopcnt.
+#if defined(ENABLE_MULTIARCH_x86_POPCNT)
+#if defined(__x86_64__)
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+  // On my AMD EPYC 7642 CPU using GCC 12 this runtime
+  // check incurs an overall overhead of about 1%.
+  if_likely(cpu_supports_popcnt)
+  {
+    __asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
+    return x;
+  }
+  else
+  {
+    // On x86 and x64 CPUs when using the GCC compiler
+    // __builtin_popcount*(x) is slow (not inlined function call)
+    // when compiling without -mpopcnt. Therefore we avoid
+    // using __builtin_popcount*(x) here.
+    return popcnt64_bitwise(x);
+  }
+}
+
+} // namespace
+
+#elif defined(__i386__)
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+  if_likely(cpu_supports_popcnt)
+  {
+    uint32_t x0 = uint32_t(x);
+    uint32_t x1 = uint32_t(x >> 32);
+    __asm__("popcnt %1, %0" : "=r"(x0) : "r"(x0));
+    __asm__("popcnt %1, %0" : "=r"(x1) : "r"(x1));
+    return x0 + x1;
+  }
+  else
+  {
+    // On x86 and x64 CPUs when using the GCC compiler
+    // __builtin_popcount*(x) is slow (not inlined function call)
+    // when compiling without -mpopcnt. Therefore we avoid
+    // using __builtin_popcount*(x) here.
+    return popcnt64_bitwise(x);
+  }
+}
+
+} // namespace
+
+#endif // i386
+
+#else // GCC & Clang (no CPUID, not x86)
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+#if __cplusplus >= 201703L
+  if constexpr(sizeof(int) >= sizeof(uint64_t))
+    return (uint64_t) __builtin_popcount(x);
+  else if constexpr(sizeof(long) >= sizeof(uint64_t))
+    return (uint64_t) __builtin_popcountl(x);
+  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
+    return (uint64_t) __builtin_popcountll(x);
+#else
+  return (uint64_t) __builtin_popcountll(x);
+#endif
+}
+
+} // namespace
+
+#endif // GCC & Clang
+
+#elif defined(_MSC_VER) && \
+      defined(_M_X64) && \
+      __has_include(<intrin.h>)
+
+#include <intrin.h>
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+#if defined(__POPCNT__) || \
+    defined(__AVX__)
+  return __popcnt64(x);
+
+#elif defined(ENABLE_MULTIARCH_x86_POPCNT)
+  if_likely(cpu_supports_popcnt)
+    return __popcnt64(x);
+  else
+    return popcnt64_bitwise(x);
+
+#else
+  return popcnt64_bitwise(x);
+#endif
+}
+
+} // namespace
+
+#elif defined(_MSC_VER) && \
+      defined(_M_IX86) && \
+      __has_include(<intrin.h>)
+
+#include <intrin.h>
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+#if defined(__POPCNT__) || \
+    defined(__AVX__)
+  return __popcnt(uint32_t(x)) +
+         __popcnt(uint32_t(x >> 32));
+
+#elif defined(ENABLE_MULTIARCH_x86_POPCNT)
+  if_likely(cpu_supports_popcnt)
+    return __popcnt(uint32_t(x)) +
+           __popcnt(uint32_t(x >> 32));
+  else
+    return popcnt64_bitwise(x);
+
+#else
+  return popcnt64_bitwise(x);
+#endif
}
+
+} // namespace
+
+#elif __cplusplus >= 202002L && \
+      __has_include(<bit>)
+
+#include <bit>
+
+namespace {
+
+/// We only use the C++ standard library as a fallback if there
+/// are no compiler intrinsics available for POPCNT.
+/// Compiler intrinsics often generate faster assembly.
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + return std::popcount(x); +} + +} // namespace + +#else + +namespace { + +/// Portable (but slow) popcount algorithm +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + return popcnt64_bitwise(x); +} + +} // namespace + +#endif + +#endif // POPCNT_HPP diff --git a/scripts/build_clang_multiarch_win_x64.bat b/scripts/build_clang_multiarch_win_x64.bat index d7b53231d..0e1204506 100644 --- a/scripts/build_clang_multiarch_win_x64.bat +++ b/scripts/build_clang_multiarch_win_x64.bat @@ -1 +1 @@ -clang++ -I../include -O3 -DNDEBUG -DENABLE_MULTIARCH_AVX512 ../src/*.cpp ../src/app/*.cpp -o primesieve.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" +clang++ -I../include -O3 -DNDEBUG -DENABLE_MULTIARCH_AVX512_VBMI2 ../src/*.cpp ../src/app/*.cpp -o primesieve.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" diff --git a/src/CpuInfo.cpp b/src/CpuInfo.cpp index 365a24396..15a33e9a1 100644 --- a/src/CpuInfo.cpp +++ b/src/CpuInfo.cpp @@ -39,20 +39,12 @@ #define APPLE_SYSCTL #endif -#if defined(__i386__) || \ - defined(__x86_64__) || \ - defined(_M_IX86) || \ - defined(_M_X64) - #include - #include - #define HAS_CPUID -#endif - #if defined(_WIN32) #include #include +#include #include #include @@ -62,25 +54,29 @@ std::string getCpuName() { std::string cpuName; -#if defined(HAS_CPUID) +#if defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64) + // Get the CPU name using CPUID. // Example: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz // https://en.wikipedia.org/wiki/CPUID int cpuInfo[4] = { 0, 0, 0, 0 }; - run_cpuid(0x80000000, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000000, 0); std::vector vect; // check if CPU name is supported if ((unsigned) cpuInfo[0] >= 0x80000004u) { - run_cpuid(0x80000002, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000002, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_cpuid(0x80000003, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000003, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_cpuid(0x80000004, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000004, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); vect.push_back(0); @@ -755,15 +751,6 @@ std::string CpuInfo::cpuName() const } } -bool CpuInfo::hasAVX512() const -{ - #if defined(HAS_CPUID) - return cpu_supports_avx512_vbmi2; - #else - return false; - #endif -} - size_t CpuInfo::logicalCpuCores() const { return logicalCpuCores_; diff --git a/src/PrimeGenerator.cpp b/src/PrimeGenerator.cpp index 3be1e0a1b..5406b298b 100644 --- a/src/PrimeGenerator.cpp +++ b/src/PrimeGenerator.cpp @@ -27,16 +27,16 @@ #include #include #include +#include #include -#include #include #include #include #include -#if defined(ENABLE_AVX512) || \ - defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_AVX512_VBMI2) || \ + defined(ENABLE_MULTIARCH_AVX512_VBMI2) #include #endif @@ -458,8 +458,8 @@ void PrimeGenerator::fillNextPrimes_default(Vector& primes, std::size_ #endif -#if defined(ENABLE_AVX512) || \ - defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_AVX512_VBMI2) || \ + defined(ENABLE_MULTIARCH_AVX512_VBMI2) /// This algorithm converts 1 bits from the sieve array into primes /// using AVX512. The algorithm is a modified version of the AVX512 @@ -474,10 +474,10 @@ void PrimeGenerator::fillNextPrimes_default(Vector& primes, std::size_ /// benchmarks this algorithm ran about 10% faster than the default /// fillNextPrimes() algorithm which uses __builtin_ctzll(). 
 ///
-#if defined(ENABLE_MULTIARCH_AVX512)
+#if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
   __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
 #endif
-void PrimeGenerator::fillNextPrimes_avx512(Vector<uint64_t>& primes, std::size_t* size)
+void PrimeGenerator::fillNextPrimes_avx512_vbmi2(Vector<uint64_t>& primes, std::size_t* size)
 {
   *size = 0;
diff --git a/src/app/main.cpp b/src/app/main.cpp
index 6b25675da..80f1ea5b7 100644
--- a/src/app/main.cpp
+++ b/src/app/main.cpp
@@ -35,6 +35,16 @@
 #include
 #include
 
+#if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
+
+namespace primesieve {
+
+bool has_cpuid_avx512_vbmi2();
+
+} // namespace
+
+#endif
+
 void help(int exitCode);
 void version();
 void stressTest(const CmdOptions& opts);
@@ -223,14 +233,9 @@ void cpuInfo()
   else
     std::cout << "Logical CPU cores: unknown" << std::endl;
 
-  // Enable on x86 CPUs
-  #if defined(__x86_64__) || \
-      defined(__i386__) || \
-      defined(_M_X64) || \
-      defined(_M_IX86) || \
-      defined(__AVX512F__)
+  #if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
 
-  if (cpu.hasAVX512())
+  if (primesieve::has_cpuid_avx512_vbmi2())
     std::cout << "Has AVX512: yes" << std::endl;
   else
     std::cout << "Has AVX512: no" << std::endl;
diff --git a/src/popcount.cpp b/src/popcount.cpp
index 93eab08ce..adede0c35 100644
--- a/src/popcount.cpp
+++ b/src/popcount.cpp
@@ -8,7 +8,7 @@
 /// file in the top level directory.
 ///
 
-#include <primesieve/intrinsics.hpp>
+#include <primesieve/popcnt.hpp>
 
 #include
 #include
diff --git a/src/x86/cpuid.cpp b/src/x86/cpuid.cpp
new file mode 100644
index 000000000..b571057c9
--- /dev/null
+++ b/src/x86/cpuid.cpp
@@ -0,0 +1,125 @@
+///
+/// @file cpuid.cpp
+/// @brief CPUID for x86 and x86-64 CPUs.
+///
+/// Copyright (C) 2024 Kim Walisch,
+///
+/// This file is distributed under the BSD License. See the COPYING
+/// file in the top level directory.
+///
+
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+  #include <intrin.h>
+  #include <immintrin.h>
+#endif
+
+// CPUID bits documentation:
+// https://en.wikipedia.org/wiki/CPUID
+
+// %ebx bit flags
+#define bit_AVX512F (1 << 16)
+
+// %ecx bit flags
+#define bit_AVX512VBMI (1 << 1)
+#define bit_AVX512VBMI2 (1 << 6)
+#define bit_POPCNT (1 << 23)
+
+// xgetbv bit flags
+#define XSTATE_SSE (1 << 1)
+#define XSTATE_YMM (1 << 2)
+#define XSTATE_ZMM (7 << 5)
+
+namespace {
+
+void run_cpuid(int eax, int ecx, int* abcd)
+{
+#if defined(_MSC_VER)
+  __cpuidex(abcd, eax, ecx);
+#else
+  int ebx = 0;
+  int edx = 0;
+
+  #if defined(__i386__) && \
+      defined(__PIC__)
+    // In case of PIC under 32-bit EBX cannot be clobbered
+    asm volatile("movl %%ebx, %%edi;"
+                 "cpuid;"
+                 "xchgl %%ebx, %%edi;"
+                 : "+a" (eax),
+                   "=D" (ebx),
+                   "+c" (ecx),
+                   "=d" (edx));
+  #else
+    asm volatile("cpuid"
+                 : "+a" (eax),
+                   "+b" (ebx),
+                   "+c" (ecx),
+                   "=d" (edx));
+  #endif
+
+  abcd[0] = eax;
+  abcd[1] = ebx;
+  abcd[2] = ecx;
+  abcd[3] = edx;
+#endif
+}
+
+// Get Value of Extended Control Register
+uint64_t get_xcr0()
+{
+#if defined(_MSC_VER)
+  return _xgetbv(0);
+#else
+  uint32_t eax;
+  uint32_t edx;
+
+  asm volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
+  return eax | (uint64_t(edx) << 32);
+#endif
+}
+
+} // namespace
+
+namespace primesieve {
+
+bool has_cpuid_popcnt()
+{
+  int abcd[4];
+  run_cpuid(1, 0, abcd);
+  return (abcd[2] & bit_POPCNT) == bit_POPCNT;
+}
+
+bool has_cpuid_avx512_vbmi2()
+{
+  int abcd[4];
+
+  run_cpuid(1, 0, abcd);
+
+  int osxsave_mask = (1 << 27);
+
+  // Ensure OS supports extended processor state management
+  if ((abcd[2] & osxsave_mask) != osxsave_mask)
+    return false;
+
+  uint64_t ymm_mask = XSTATE_SSE | XSTATE_YMM;
+  uint64_t zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM;
+  uint64_t xcr0 = get_xcr0();
+
+  // Check AVX OS support
+  if ((xcr0 & ymm_mask) != ymm_mask)
+    return false;
+
+  // Check AVX512 OS support
+  if ((xcr0 & zmm_mask) != zmm_mask)
+    return false;
+
+  run_cpuid(7, 0, abcd);
+
+  // PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2
+  return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
+          (abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2));
+}
+
+} // namespace
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 190ed9e7e..77fb8845c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,6 +5,6 @@ foreach(file ${files})
   get_filename_component(binary_name ${file} NAME_WE)
   add_executable(${binary_name} ${file})
   target_link_libraries(${binary_name} primesieve::primesieve)
-  target_compile_definitions(${binary_name} PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}")
+  target_compile_definitions(${binary_name} PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS})
   add_test(NAME ${binary_name} COMMAND ${binary_name})
 endforeach()
diff --git a/test/cpuid.cpp b/test/cpuid.cpp
index 64c0d9361..6ed4b85c1 100644
--- a/test/cpuid.cpp
+++ b/test/cpuid.cpp
@@ -8,7 +8,10 @@
 /// file in the top level directory.
 ///
 
-#include <primesieve/intrinsics.hpp>
+#if defined(ENABLE_MULTIARCH_x86_POPCNT)
+  #include <primesieve/cpu_supports_popcnt.hpp>
+#endif
+
 #include <iostream>
 
 int main()
 {
@@ -18,52 +21,22 @@ int main()
   #if defined(__x86_64__) || \
       defined(__i386__) || \
      defined(_M_X64) || \
      defined(_M_IX86)
 
-  #if defined(__POPCNT__)
-    #if defined(HAS_POPCNT)
-      std::cout << "OK: __POPCNT__ and HAS_POPCNT are defined!" << std::endl;
-    #else
-      std::cerr << "Error: HAS_POPCNT must be defined if __POPCNT__ is defined!" << std::endl;
-      return 1;
-    #endif
-  #endif
-
-  #if defined(__AVX__)
-    #if defined(HAS_POPCNT)
-      std::cout << "OK: __AVX__ and HAS_POPCNT are defined!" << std::endl;
-    #else
-      std::cerr << "Error: HAS_POPCNT must be defined if __AVX__ is defined!" << std::endl;
-      return 1;
-    #endif
+  #if defined(__POPCNT__) && defined(ENABLE_MULTIARCH_x86_POPCNT)
+    std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __POPCNT__ is defined!" << std::endl;
+    return 1;
   #endif
 
-  #if defined(__AVX2__)
-    #if defined(HAS_POPCNT)
-      std::cout << "OK: __AVX2__ and HAS_POPCNT are defined!" << std::endl;
-    #else
-      std::cerr << "Error: HAS_POPCNT must be defined if __AVX2__ is defined!" << std::endl;
-      return 1;
-    #endif
-  #endif
+  #if defined(_MSC_VER) && \
+      !defined(__POPCNT__)
 
-  #if defined(HAS_POPCNT)
-    #if !defined(ENABLE_CPUID_POPCNT)
-      std::cout << "OK: HAS_POPCNT is defined but ENABLE_CPUID_POPCNT is not defined!" << std::endl;
-    #else
-      std::cerr << "Error: ENABLE_CPUID_POPCNT must not be defined if HAS_POPCNT is defined!" << std::endl;
-      return 1;
+    #if defined(__AVX__) && defined(ENABLE_MULTIARCH_x86_POPCNT)
+      std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __AVX__ is defined!" << std::endl;
+      return 1;
     #endif
-  #endif
-
-  #if !defined(HAS_POPCNT)
-    #if defined(ENABLE_CPUID_POPCNT)
-      std::cout << "OK: HAS_POPCNT is not defined but ENABLE_CPUID_POPCNT is defined!" << std::endl;
-    #else
-      std::cerr << "Error: ENABLE_CPUID_POPCNT must be defined if HAS_POPCNT is not defined!" << std::endl;
-      return 1;
+    #if defined(__AVX2__) && defined(ENABLE_MULTIARCH_x86_POPCNT)
+      std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __AVX2__ is defined!" << std::endl;
+      return 1;
     #endif
   #endif
 
-  #if defined(ENABLE_CPUID_POPCNT)
+  #if defined(ENABLE_MULTIARCH_x86_POPCNT)
    std::cout << "CPU supports POPCNT: " << (cpu_supports_popcnt ? "yes" : "no") << std::endl;
  #endif
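
A note on the runtime dispatch that popcnt.hpp performs when ENABLE_MULTIARCH_x86_POPCNT is defined: the CPUID result is cached once, and the hot path only tests a bool before choosing between the POPCNT instruction and the bitwise fallback. The following is a minimal standalone sketch of that pattern, assuming GCC/Clang on x86-64; detect_popcnt() and cpu_has_popcnt are illustrative names, not primesieve API (the library uses cpu_supports_popcnt from cpu_supports_popcnt.hpp).

```cpp
// Minimal sketch of CPUID-based POPCNT dispatch (GCC/Clang, x86-64 assumed).
#include <stdint.h>
#include <iostream>

#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))

namespace {

bool detect_popcnt()
{
  // CPUID leaf 1: the POPCNT feature flag is ECX bit 23.
  uint32_t eax = 1, ebx = 0, ecx = 0, edx = 0;
  __asm__ __volatile__("cpuid"
                       : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));
  return (ecx & (1u << 23)) != 0;
}

// Evaluated once at program startup, so the hot path reads a bool.
const bool cpu_has_popcnt = detect_popcnt();

uint64_t popcnt64_bitwise(uint64_t x)
{
  // Same portable fallback as popcnt.hpp.
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;
  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;
  return (x * h01) >> 56;
}

uint64_t popcnt64(uint64_t x)
{
  if (cpu_has_popcnt)
  {
    // Emit POPCNT directly, even when built without -mpopcnt.
    __asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
    return x;
  }
  return popcnt64_bitwise(x);
}

} // namespace

int main()
{
  std::cout << popcnt64(0x00FF00FF00FF00FFull) << std::endl; // prints 32
  return 0;
}

#else
int main() { return 0; } // sketch only targets x86-64 GCC/Clang
#endif
```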
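The REP BSF zero-input trick documented in ctz.hpp can be observed in isolation. A minimal sketch, assuming GCC or Clang on x86-64; ctz64_rep_bsf() is a hypothetical name used for illustration:

```cpp
// Demo of the BSF/TZCNT zero-input semantics described in ctz.hpp.
#include <stdint.h>
#include <iostream>

#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))

static uint64_t ctz64_rep_bsf(uint64_t x)
{
  // The "0" matching constraint pins input and output to the same
  // register: on pre-BMI1 CPUs BSF(0) leaves the destination register
  // (already holding 0) unmodified, while on BMI1 CPUs the identical
  // REP BSF encoding executes as TZCNT and yields 64.
  __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x));
  return x;
}

int main()
{
  std::cout << ctz64_rep_bsf(40) << std::endl; // 3 (40 = 0b101000)
  std::cout << ctz64_rep_bsf(0)  << std::endl; // 0 (BSF) or 64 (TZCNT)
  return 0;
}

#else
int main() { return 0; } // sketch only targets x86-64 GCC/Clang
#endif
```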
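Finally, the multiarch mechanism that multiarch_avx512_vbmi2.cmake probes for (the GCC/Clang target attribute plus a runtime CPU check) follows the general pattern below. This is a sketch assuming GCC or Clang on x86-64; sum_avx512()/sum_default()/sum() are illustrative names, and for brevity the dispatch trusts __builtin_cpu_supports("avx512f"), whereas primesieve's src/x86/cpuid.cpp additionally verifies OS support via XGETBV as shown in the patch.

```cpp
// Sketch of target-attribute multiversioning with runtime dispatch.
#include <stdint.h>
#include <stddef.h>
#include <iostream>

#if defined(__x86_64__) && defined(__GNUC__)
#include <immintrin.h>

__attribute__ ((target ("avx512f")))
uint64_t sum_avx512(const uint64_t* v, size_t n)
{
  // AVX512F intrinsics compile here even without -mavx512f because the
  // target attribute enables them for this one function only.
  __m512i acc = _mm512_setzero_si512();
  size_t i = 0;
  for (; i + 8 <= n; i += 8)
    acc = _mm512_add_epi64(acc, _mm512_loadu_si512(&v[i]));
  uint64_t lanes[8];
  _mm512_storeu_si512(lanes, acc);
  uint64_t sum = 0;
  for (int j = 0; j < 8; j++)
    sum += lanes[j];   // horizontal reduction of the 8 lanes
  for (; i < n; i++)
    sum += v[i];       // scalar tail
  return sum;
}

uint64_t sum_default(const uint64_t* v, size_t n)
{
  uint64_t sum = 0;
  for (size_t i = 0; i < n; i++)
    sum += v[i];
  return sum;
}

uint64_t sum(const uint64_t* v, size_t n)
{
  // Pick the best implementation for the CPU we are running on.
  if (__builtin_cpu_supports("avx512f"))
    return sum_avx512(v, n);
  else
    return sum_default(v, n);
}

int main()
{
  uint64_t v[20];
  for (int i = 0; i < 20; i++)
    v[i] = i;
  std::cout << sum(v, 20) << std::endl; // prints 190
  return 0;
}

#else
int main() { return 0; } // sketch only targets x86-64 GCC/Clang
#endif
```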