From 9f62704a448178cf9c08c51eb3fdde6dcb2be8b0 Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Sat, 22 Jun 2024 16:22:29 +0200 Subject: [PATCH] Move x86 CPUID code from cpuid.hpp to src/x86/cpuid.cpp --- CMakeLists.txt | 33 ++- ChangeLog | 8 + cmake/auto_vectorization.cmake | 4 +- cmake/libatomic.cmake | 5 +- cmake/multiarch_avx512_vbmi2.cmake | 14 +- cmake/multiarch_x86_popcnt.cmake | 53 ++++ include/primesieve/CpuInfo.hpp | 3 +- include/primesieve/Erat.hpp | 4 +- include/primesieve/PrimeGenerator.hpp | 20 +- include/primesieve/ctz.hpp | 157 ++++++++++ include/primesieve/intrinsics.hpp | 345 ---------------------- include/primesieve/popcnt.hpp | 214 ++++++++++++++ scripts/build_clang_multiarch_win_x64.bat | 2 +- src/CpuInfo.cpp | 33 +-- src/PrimeGenerator.cpp | 14 +- src/app/main.cpp | 19 +- src/popcount.cpp | 2 +- src/x86/cpuid.cpp | 125 ++++++++ test/CMakeLists.txt | 2 +- test/cpuid.cpp | 53 +--- 20 files changed, 647 insertions(+), 463 deletions(-) create mode 100644 cmake/multiarch_x86_popcnt.cmake create mode 100644 include/primesieve/ctz.hpp delete mode 100644 include/primesieve/intrinsics.hpp create mode 100644 include/primesieve/popcnt.hpp create mode 100644 src/x86/cpuid.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 84bc037c7..ab9275260 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ if(NOT isMultiConfig AND NOT CMAKE_BUILD_TYPE) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(ENABLE_ASSERT "ENABLE_ASSERT") + list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_ASSERT") endif() # primesieve binary source files ##################################### @@ -82,6 +82,17 @@ set(LIB_SRC src/api-c.cpp src/RiemannR.cpp src/SievingPrimes.cpp) +# Check if compiler supports CPU multiarch ########################### + +if(WITH_MULTIARCH) + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") + + if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2) + set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp) + endif() +endif() + # Required includes ################################################## include(GNUInstallDirs) @@ -107,12 +118,6 @@ if(WITH_AUTO_VECTORIZATION) include("${PROJECT_SOURCE_DIR}/cmake/auto_vectorization.cmake") endif() -# Check if compiler supports x64 multiarch ########################### - -if(WITH_MULTIARCH) - include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") -endif() - # libprimesieve (shared library) ##################################### find_package(Threads REQUIRED QUIET) @@ -120,13 +125,13 @@ find_package(Threads REQUIRED QUIET) if(BUILD_SHARED_LIBS) add_library(libprimesieve SHARED ${LIB_SRC}) set_target_properties(libprimesieve PROPERTIES OUTPUT_NAME primesieve) - target_link_libraries(libprimesieve PRIVATE Threads::Threads ${LIBATOMIC}) + target_link_libraries(libprimesieve PRIVATE Threads::Threads ${PRIMESIEVE_LINK_LIBRARIES}) string(REPLACE "." 
";" SOVERSION_LIST ${PRIMESIEVE_SOVERSION}) list(GET SOVERSION_LIST 0 PRIMESIEVE_SOVERSION_MAJOR) set_target_properties(libprimesieve PROPERTIES SOVERSION ${PRIMESIEVE_SOVERSION_MAJOR}) set_target_properties(libprimesieve PROPERTIES VERSION ${PRIMESIEVE_SOVERSION}) - target_compile_options(libprimesieve PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG}) - target_compile_definitions(libprimesieve PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}") + target_compile_options(libprimesieve PRIVATE ${PRIMESIEVE_COMPILE_OPTIONS}) + target_compile_definitions(libprimesieve PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) if(WIN32_MSVC_COMPATIBLE) # On Windows the shared library will be named primesieve.dll @@ -162,9 +167,9 @@ endif() if(BUILD_STATIC_LIBS) add_library(libprimesieve-static STATIC ${LIB_SRC}) set_target_properties(libprimesieve-static PROPERTIES OUTPUT_NAME primesieve) - target_link_libraries(libprimesieve-static PRIVATE Threads::Threads ${LIBATOMIC}) - target_compile_options(libprimesieve-static PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG}) - target_compile_definitions(libprimesieve-static PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}") + target_link_libraries(libprimesieve-static PRIVATE Threads::Threads ${PRIMESIEVE_LINK_LIBRARIES}) + target_compile_options(libprimesieve-static PRIVATE ${PRIMESIEVE_COMPILE_OPTIONS}) + target_compile_definitions(libprimesieve-static PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) if(WITH_MSVC_CRT_STATIC) set_target_properties(libprimesieve-static PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded") @@ -219,7 +224,7 @@ endif() if(BUILD_PRIMESIEVE) add_executable(primesieve ${BIN_SRC}) target_link_libraries(primesieve primesieve::primesieve Threads::Threads) - target_compile_definitions(primesieve PRIVATE "${ENABLE_ASSERT}") + target_compile_definitions(primesieve PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS}) target_compile_features(primesieve PRIVATE cxx_auto_type) install(TARGETS primesieve DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/ChangeLog b/ChangeLog index 4a07fa953..ded64abb2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Changes in version 12.4, 22/06/2024 +=================================== + +* Move x86 CPUID code from cpuid.hpp to src/x86/cpuid.cpp. +* multiarch_x86_popcnt.cmake: Detect x86 POPCNT support. +* CMakeLists.txt: Use CMake list for all compile time definitions. +* CMakeLists.txt: Use CMake list for all link libraries. 
+
 Changes in version 12.3, 15/04/2024
 ===================================
 
diff --git a/cmake/auto_vectorization.cmake b/cmake/auto_vectorization.cmake
index 84897cc45..4a243f70c 100644
--- a/cmake/auto_vectorization.cmake
+++ b/cmake/auto_vectorization.cmake
@@ -16,7 +16,7 @@ check_cxx_compiler_flag(-ftree-vectorize ftree_vectorize)
 cmake_pop_check_state()
 
 if(ftree_vectorize)
-  set(FTREE_VECTORIZE_FLAG "-ftree-vectorize")
+  list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-ftree-vectorize")
 
   cmake_push_check_state()
   set(CMAKE_REQUIRED_FLAGS -Werror)
@@ -24,6 +24,6 @@ if(ftree_vectorize)
   cmake_pop_check_state()
 
   if(fvect_cost_model)
-    set(FVECT_COST_MODEL_FLAG "-fvect-cost-model=dynamic")
+    list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-fvect-cost-model=dynamic")
   endif()
 endif()
diff --git a/cmake/libatomic.cmake b/cmake/libatomic.cmake
index 333926f2b..acd7ca993 100644
--- a/cmake/libatomic.cmake
+++ b/cmake/libatomic.cmake
@@ -57,7 +57,10 @@ if(NOT atomic64)
     }"
     atomic64_with_libatomic)
 
-    if (NOT atomic64_with_libatomic)
+    if(atomic64_with_libatomic)
+        list(APPEND PRIMESIEVE_LINK_LIBRARIES "${LIBATOMIC}")
+    else()
+        set(LIBATOMIC "")
         message(FATAL_ERROR "Failed to compile std::atomic, libatomic likely not found!")
     endif()
 endif()
diff --git a/cmake/multiarch_avx512_vbmi2.cmake b/cmake/multiarch_avx512_vbmi2.cmake
index f56d5e933..c0b7c0e96 100644
--- a/cmake/multiarch_avx512_vbmi2.cmake
+++ b/cmake/multiarch_avx512_vbmi2.cmake
@@ -7,7 +7,7 @@ include(CheckCXXSourceCompiles)
 include(CMakePushCheckState)
 
 cmake_push_check_state()
-set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include")
+set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")
 
 check_cxx_source_compiles("
   // GCC/Clang function multiversioning for AVX512 is not needed if
@@ -20,19 +20,19 @@ check_cxx_source_compiles("
     Error: AVX512VBMI2 multiarch not needed!
   #endif
 
-  #include <primesieve/cpu_supports_avx512_vbmi2.hpp>
+  #include <src/x86/cpuid.cpp>
   #include <immintrin.h>
   #include <stdint.h>
 
   class PrimeGenerator {
   public:
     __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
-    void fillNextPrimes_avx512(uint64_t* primes64);
+    void fillNextPrimes_avx512_vbmi2(uint64_t* primes64);
     void fillNextPrimes_default(uint64_t* primes64);
     void fillNextPrimes(uint64_t* primes64)
     {
-      if (cpu_supports_avx512_vbmi2)
-        fillNextPrimes_avx512(primes64);
+      if (primesieve::has_cpuid_avx512_vbmi2())
+        fillNextPrimes_avx512_vbmi2(primes64);
       else
         fillNextPrimes_default(primes64);
     }
@@ -44,7 +44,7 @@ check_cxx_source_compiles("
   }
 
   __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
-  void PrimeGenerator::fillNextPrimes_avx512(uint64_t* primes64)
+  void PrimeGenerator::fillNextPrimes_avx512_vbmi2(uint64_t* primes64)
   {
     __m512i bytes_0_to_7 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
     __m512i base = _mm512_set1_epi64(123);
@@ -64,7 +64,7 @@ check_cxx_source_compiles("
 " multiarch_avx512_vbmi2)
 
 if(multiarch_avx512_vbmi2)
-  set(ENABLE_MULTIARCH_AVX512 "ENABLE_MULTIARCH_AVX512")
+  list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_VBMI2")
 endif()
 
 cmake_pop_check_state()
diff --git a/cmake/multiarch_x86_popcnt.cmake b/cmake/multiarch_x86_popcnt.cmake
new file mode 100644
index 000000000..707a8e470
--- /dev/null
+++ b/cmake/multiarch_x86_popcnt.cmake
@@ -0,0 +1,53 @@
+# On x86 CPUs we need to enable the use of cpuid.cpp.
+# If cpuid.cpp compiles we assume it is an x86 CPU.
+
+include(CheckCXXSourceCompiles)
+include(CMakePushCheckState)
+
+cmake_push_check_state()
+set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")
+
+check_cxx_source_compiles("
+  // Enable CPUID for POPCNT on x86 and x86-64 CPUs.
+  // This is required because not all x86 and x86-64 CPUs
+  // support the POPCNT instruction.
+  #if !(defined(__x86_64__) || \
+        defined(__i386__) || \
+        defined(_M_X64) || \
+        defined(_M_IX86))
+    Error: x86 POPCNT multiarch not needed!
+  #endif
+
+  // Both GCC and Clang (even Clang on Windows) define the __POPCNT__
+  // macro if the user compiles with -mpopcnt. The __POPCNT__
+  // macro is even defined if the user compiles with other flags
+  // such as -mavx or -march=native.
+  #if defined(__POPCNT__)
+    Error: x86 POPCNT multiarch not needed!
+
+  // The MSVC compiler does not support a POPCNT macro, but if the user
+  // compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines
+  // the __AVX__ macro and POPCNT is also supported.
+  #elif defined(_MSC_VER) && defined(__AVX__)
+    Error: x86 POPCNT multiarch not needed!
+  #endif
+
+  #include <src/x86/cpuid.cpp>
+  #include <iostream>
+
+  int main()
+  {
+    if (primesieve::has_cpuid_popcnt())
+      std::cout << \"CPU supports POPCNT!\" << std::endl;
+    else
+      std::cout << \"CPU does not support POPCNT!\" << std::endl;
+
+    return 0;
+  }
+" multiarch_x86_popcnt)
+
+if(multiarch_x86_popcnt)
+  list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_x86_POPCNT")
+endif()
+
+cmake_pop_check_state()
diff --git a/include/primesieve/CpuInfo.hpp b/include/primesieve/CpuInfo.hpp
index 426cc6323..5ccd1835a 100644
--- a/include/primesieve/CpuInfo.hpp
+++ b/include/primesieve/CpuInfo.hpp
@@ -1,7 +1,7 @@
 ///
 /// @file CpuInfo.hpp
 ///
-/// Copyright (C) 2023 Kim Walisch,
+/// Copyright (C) 2024 Kim Walisch,
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -22,7 +22,6 @@ class CpuInfo
 public:
   CpuInfo();
   bool hasCpuName() const;
-  bool hasAVX512() const;
   bool hasLogicalCpuCores() const;
   bool hasL1Cache() const;
   bool hasL2Cache() const;
diff --git a/include/primesieve/Erat.hpp b/include/primesieve/Erat.hpp
index bfd952863..32c0f221d 100644
--- a/include/primesieve/Erat.hpp
+++ b/include/primesieve/Erat.hpp
@@ -1,7 +1,7 @@
 ///
 /// @file Erat.hpp
 ///
-/// Copyright (C) 2023 Kim Walisch,
+/// Copyright (C) 2024 Kim Walisch,
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -15,8 +15,8 @@
 #include "EratMedium.hpp"
 #include "EratBig.hpp"
 #include "macros.hpp"
-#include "intrinsics.hpp"
 #include "Vector.hpp"
+#include "ctz.hpp"
 
 #include <stdint.h>
diff --git a/include/primesieve/PrimeGenerator.hpp b/include/primesieve/PrimeGenerator.hpp
index 59f7567d9..aa0cecdc6 100644
--- a/include/primesieve/PrimeGenerator.hpp
+++ b/include/primesieve/PrimeGenerator.hpp
@@ -27,9 +27,9 @@
       defined(__AVX512VBMI__) && \
       defined(__AVX512VBMI2__) && \
       __has_include(<immintrin.h>)
-  #define ENABLE_AVX512
+  #define ENABLE_AVX512_VBMI2
 
-#elif defined(ENABLE_MULTIARCH_AVX512) && \
+#elif defined(ENABLE_MULTIARCH_AVX512_VBMI2) && \
       __has_include(<immintrin.h>)
   #include "cpu_supports_avx512_vbmi2.hpp"
   #define ENABLE_DEFAULT
@@ -50,11 +50,11 @@ class PrimeGenerator : public Erat
   ALWAYS_INLINE void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size)
   {
-    #if defined(ENABLE_AVX512)
-      fillNextPrimes_avx512(primes, size);
-    #elif defined(ENABLE_MULTIARCH_AVX512)
+    #if defined(ENABLE_AVX512_VBMI2)
+      fillNextPrimes_avx512_vbmi2(primes, size);
+    #elif defined(ENABLE_MULTIARCH_AVX512_VBMI2)
       if (cpu_supports_avx512_vbmi2)
-        fillNextPrimes_avx512(primes, size);
+        fillNextPrimes_avx512_vbmi2(primes, size);
       else
         fillNextPrimes_default(primes, size);
     #else
@@ -68,13 +68,13 @@
     void fillNextPrimes_default(Vector<uint64_t>& primes, std::size_t* size);
 #endif
 
-#if defined(ENABLE_AVX512) || \
-    defined(ENABLE_MULTIARCH_AVX512)
+#if defined(ENABLE_AVX512_VBMI2) || \
+    defined(ENABLE_MULTIARCH_AVX512_VBMI2)
 
-  #if defined(ENABLE_MULTIARCH_AVX512)
+  #if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
     __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
   #endif
-  void fillNextPrimes_avx512(Vector<uint64_t>& primes, std::size_t* size);
+  void fillNextPrimes_avx512_vbmi2(Vector<uint64_t>& primes, std::size_t* size);
 
 #endif
diff --git a/include/primesieve/ctz.hpp b/include/primesieve/ctz.hpp
new file mode 100644
index 000000000..06bf5eeee
--- /dev/null
+++ b/include/primesieve/ctz.hpp
@@ -0,0 +1,157 @@
+///
+/// @file ctz.hpp
+/// @brief Count the number of trailing zeros.
+///
+/// Copyright (C) 2024 Kim Walisch,
+///
+/// This file is distributed under the BSD License. See the COPYING
+/// file in the top level directory.
+///
+
+#ifndef CTZ_HPP
+#define CTZ_HPP
+
+#include "macros.hpp"
+#include <stdint.h>
+
+// GCC/Clang & MSVC
+#if defined(__x86_64__) || \
+    defined(_M_X64)
+  #define IS_X64
+#endif
+
+// On x64 CPUs:
+// GCC & Clang enable TZCNT with -mbmi.
+// MSVC enables TZCNT with /arch:AVX2 or later.
+#if defined(__BMI__) || \
+    (defined(_MSC_VER) && defined(__AVX2__))
+  #define HAS_TZCNT
+#endif
+
+// In 2022 std::countr_zero(x) generates good assembly for
+// most compilers & CPU architectures, except for:
+// 1) GCC & Clang on x64 without __BMI__.
+// 2) MSVC on x64 without __AVX2__.
+// Hence on x64 CPUs we only use std::countr_zero(x) if
+// the compiler generates the TZCNT instruction.
+#if defined(HAS_CPP20_BIT_HEADER) && \
+    (defined(HAS_TZCNT) || !defined(IS_X64))
+
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+namespace {
+
+inline int ctz64(uint64_t x)
+{
+  // No undefined behavior, std::countr_zero(0) = 64
+  return std::countr_zero(x);
+}
+
+} // namespace
+
+#elif (defined(__GNUC__) || \
+       defined(__clang__)) && \
+       defined(__x86_64__)
+
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+namespace {
+
+inline uint64_t ctz64(uint64_t x)
+{
+#if defined(HAS_TZCNT)
+  // No undefined behavior, TZCNT(0) = 64
+  __asm__("tzcnt %1, %0" : "=r"(x) : "r"(x));
+  return x;
+#else
+  // REP BSF uses the TZCNT instruction on x64 CPUs with the BMI1
+  // instruction set (>= 2013) and the BSF instruction on older x64
+  // CPUs. BSF(0) is undefined behavior, it leaves the destination
+  // register unmodified. Fortunately, it is possible to avoid this
+  // undefined behavior by always setting the destination register
+  // to the same value before executing BSF(0). This works on all
+  // AMD & Intel CPUs since the i586 (from 1993), the Linux kernel
+  // also relies on this behavior, see this Linux commit:
+  // https://github.com/torvalds/linux/commit/ca3d30cc02f780f68771087040ce935add6ba2b7
+  //
+  // The constraint "0" for input operand 1 says that it must occupy
+  // the same location as output operand 0. Hence the assembly below
+  // uses the same input & output register. This ensures that
+  // BSF(0) = 0, hence there is no undefined behavior. However, you
+  // cannot rely on ctz64(0) = 0 since TZCNT(0) = 64.
+  __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x));
+  ASSERT(x <= 64);
+  return x;
+#endif
+}
+
+} // namespace
+
+#elif (defined(__GNUC__) || \
+       defined(__clang__)) && \
+       defined(__aarch64__)
+
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+namespace {
+
+inline uint64_t ctz64(uint64_t x)
+{
+  // No undefined behavior, CTZ(0) = 64.
+  // ARM64 has no CTZ instruction, so we have to emulate it.
+  __asm__("rbit %0, %1 \n\t"
+          "clz %0, %0  \n\t"
+          : "=r" (x)
+          : "r" (x));
+
+  return x;
+}
+
+} // namespace
+
+#elif defined(_MSC_VER) && \
+      defined(HAS_TZCNT) && \
+      __has_include(<immintrin.h>)
+
+#include <immintrin.h>
+#define HAS_CTZ64
+#define CTZ64_SUPPORTS_ZERO
+
+// This allows us to generate the TZCNT instruction for MSVC
+// without C++20 support, hence without std::countr_zero(x).
+// No undefined behavior, _tzcnt_u64(0) = 64.
+#define ctz64(x) _tzcnt_u64(x)
+
+#elif defined(__GNUC__) || \
+      __has_builtin(__builtin_ctzl)
+
+#define HAS_CTZ64
+
+namespace {
+
+inline int ctz64(uint64_t x)
+{
+  // __builtin_ctz(0) is undefined behavior,
+  // we don't define CTZ64_SUPPORTS_ZERO.
+  ASSERT(x != 0);
+
+#if __cplusplus >= 201703L
+  if constexpr(sizeof(int) >= sizeof(uint64_t))
+    return __builtin_ctz(x);
+  else if constexpr(sizeof(long) >= sizeof(uint64_t))
+    return __builtin_ctzl(x);
+  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
+    return __builtin_ctzll(x);
+#else
+  return __builtin_ctzll(x);
+#endif
+}
+
+} // namespace
+
+#endif
+
+#endif // CTZ_HPP
diff --git a/include/primesieve/intrinsics.hpp b/include/primesieve/intrinsics.hpp
deleted file mode 100644
index 3ce0725d1..000000000
--- a/include/primesieve/intrinsics.hpp
+++ /dev/null
@@ -1,345 +0,0 @@
-///
-/// @file intrinsics.hpp
-/// @brief Wrappers for compiler intrinsics.
-///
-/// Copyright (C) 2024 Kim Walisch,
-///
-/// This file is distributed under the BSD License. See the COPYING
-/// file in the top level directory.
-///
-
-#ifndef INTRINSICS_HPP
-#define INTRINSICS_HPP
-
-#include "cpu_supports_popcnt.hpp"
-#include "macros.hpp"
-
-#include <stdint.h>
-
-namespace {
-
-/// This uses fewer arithmetic operations than any other known
-/// implementation on machines with fast multiplication.
-/// It uses 12 arithmetic operations, one of which is a multiply.
-/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
-///
-inline uint64_t popcnt64_bitwise(uint64_t x)
-{
-  uint64_t m1 = 0x5555555555555555ll;
-  uint64_t m2 = 0x3333333333333333ll;
-  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
-  uint64_t h01 = 0x0101010101010101ll;
-
-  x -= (x >> 1) & m1;
-  x = (x & m2) + ((x >> 2) & m2);
-  x = (x + (x >> 4)) & m4;
-
-  return (x * h01) >> 56;
-}
-
-} // namespace
-
-// GCC & Clang
-#if defined(__GNUC__) || \
-    __has_builtin(__builtin_popcountl)
-
-// CPUID is only enabled on x86 and x86-64 CPUs
-// if the user compiles without -mpopcnt.
-#if defined(ENABLE_CPUID_POPCNT)
-#if defined(__x86_64__)
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-  // On my AMD EPYC 7642 CPU using GCC 12 this runtime
-  // check incurs an overall overhead of about 1%.
-  if_likely(cpu_supports_popcnt)
-  {
-    __asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
-    return x;
-  }
-  else
-  {
-    // On x86 and x64 CPUs when using the GCC compiler
-    // __builtin_popcount*(x) is slow (not inlined function call)
-    // when compiling without -mpopcnt. Therefore we avoid
-    // using __builtin_popcount*(x) here.
-    return popcnt64_bitwise(x);
-  }
-}
-
-} // namespace
-
-#elif defined(__i386__)
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-  if_likely(cpu_supports_popcnt)
-  {
-    uint32_t x0 = uint32_t(x);
-    uint32_t x1 = uint32_t(x >> 32);
-    __asm__("popcnt %1, %0" : "=r"(x0) : "r"(x0));
-    __asm__("popcnt %1, %0" : "=r"(x1) : "r"(x1));
-    return x0 + x1;
-  }
-  else
-  {
-    // On x86 and x64 CPUs when using the GCC compiler
-    // __builtin_popcount*(x) is slow (not inlined function call)
-    // when compiling without -mpopcnt. Therefore we avoid
-    // using __builtin_popcount*(x) here.
-    return popcnt64_bitwise(x);
-  }
-}
-
-} // namespace
-
-#endif // i386
-
-#else // GCC & Clang (no CPUID, not x86)
-
-namespace {
-
-inline int popcnt64(uint64_t x)
-{
-#if __cplusplus >= 201703L
-  if constexpr(sizeof(int) >= sizeof(uint64_t))
-    return __builtin_popcount(x);
-  else if constexpr(sizeof(long) >= sizeof(uint64_t))
-    return __builtin_popcountl(x);
-  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
-    return __builtin_popcountll(x);
-#else
-  return __builtin_popcountll(x);
-#endif
-}
-
-} // namespace
-
-#endif // GCC & Clang
-
-#elif defined(_MSC_VER) && \
-      defined(_M_X64) && \
-      __has_include(<intrin.h>)
-
-#include <intrin.h>
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-#if defined(HAS_POPCNT)
-  return __popcnt64(x);
-#elif defined(ENABLE_CPUID_POPCNT)
-  if_likely(cpu_supports_popcnt)
-    return __popcnt64(x);
-  else
-    return popcnt64_bitwise(x);
-#else
-  return popcnt64_bitwise(x);
-#endif
-}
-
-} // namespace
-
-#elif defined(_MSC_VER) && \
-      defined(_M_IX86) && \
-      __has_include(<intrin.h>)
-
-#include <intrin.h>
-
-namespace {
-
-inline uint64_t popcnt64(uint64_t x)
-{
-#if defined(HAS_POPCNT)
-  return __popcnt(uint32_t(x)) +
-         __popcnt(uint32_t(x >> 32));
-#elif defined(ENABLE_CPUID_POPCNT)
-  if_likely(cpu_supports_popcnt)
-    return __popcnt(uint32_t(x)) +
-           __popcnt(uint32_t(x >> 32));
-  else
-    return popcnt64_bitwise(x);
-#else
-  return popcnt64_bitwise(x);
-#endif
-}
-
-} // namespace
-
-#elif __cplusplus >= 202002L && \
-      __has_include(<bit>)
-
-#include <bit>
-
-namespace {
-
-/// We only use the C++ standard library as a fallback if there
-/// are no compiler intrinsics available for POPCNT.
-/// Compiler intrinsics often generate faster assembly.
-inline int popcnt64(uint64_t x)
-{
-  return std::popcount(x);
-}
-
-} // namespace
-
-#else
-
-namespace {
-
-/// Portable (but slow) popcount algorithm
-inline uint64_t popcnt64(uint64_t x)
-{
-  return popcnt64_bitwise(x);
-}
-
-} // namespace
-
-#endif // popcnt64()
-
-// GCC/Clang & MSVC
-#if defined(__x86_64__) || \
-    defined(_M_X64)
-  #define IS_X64
-#endif
-
-// On x64 CPUs:
-// GCC & Clang enable TZCNT with -mbmi.
-// MSVC enables TZCNT with /arch:AVX2 or later.
-#if defined(__BMI__) || \
-    (defined(_MSC_VER) && defined(__AVX2__))
-  #define HAS_TZCNT
-#endif
-
-// In 2022 std::countr_zero(x) generates good assembly for
-// most compilers & CPU architectures, except for:
-// 1) GCC & Clang on x64 without __BMI__.
-// 2) MSVC on x64 without __AVX2__.
-// Hence on x64 CPUs we only use std::countr_zero(x) if
-// the compiler generates the TZCNT instruction.
-#if defined(HAS_CPP20_BIT_HEADER) && \
-    (defined(HAS_TZCNT) || !defined(IS_X64))
-
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-namespace {
-
-inline int ctz64(uint64_t x)
-{
-  // No undefined behavior, std::countr_zero(0) = 64
-  return std::countr_zero(x);
-}
-
-} // namespace
-
-#elif (defined(__GNUC__) || \
-       defined(__clang__)) && \
-       defined(__x86_64__)
-
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-namespace {
-
-inline uint64_t ctz64(uint64_t x)
-{
-#if defined(HAS_TZCNT)
-  // No undefined behavior, TZCNT(0) = 64
-  __asm__("tzcnt %1, %0" : "=r"(x) : "r"(x));
-  return x;
-#else
-  // REP BSF uses the TZCNT instruction on x64 CPUs with the BMI1
-  // instruction set (>= 2013) and the BSF instruction on older x64
-  // CPUs. BSF(0) is undefined behavior, it leaves the destination
-  // register unmodified. Fortunately, it is possible to avoid this
-  // undefined behavior by always setting the destination register
-  // to the same value before executing BSF(0). This works on all
-  // AMD & Intel CPUs since the i586 (from 1993), the Linux kernel
-  // also relies on this behavior, see this Linux commit:
-  // https://github.com/torvalds/linux/commit/ca3d30cc02f780f68771087040ce935add6ba2b7
-  //
-  // The constraint "0" for input operand 1 says that it must occupy
-  // the same location as output operand 0. Hence the assembly below
-  // uses the same input & output register. This ensures that
-  // BSF(0) = 0, hence there is no undefined behavior. However, you
-  // cannot rely on ctz64(0) = 0 since TZCNT(0) = 64.
-  __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x));
-  ASSERT(x <= 64);
-  return x;
-#endif
-}
-
-} // namespace
-
-#elif (defined(__GNUC__) || \
-       defined(__clang__)) && \
-       defined(__aarch64__)
-
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-namespace {
-
-inline uint64_t ctz64(uint64_t x)
-{
-  // No undefined behavior, CTZ(0) = 64.
-  // ARM64 has no CTZ instruction, we have to emulate it.
-  __asm__("rbit %0, %1 \n\t"
-          "clz %0, %0  \n\t"
-          : "=r" (x)
-          : "r" (x));
-
-  return x;
-}
-
-} // namespace
-
-#elif defined(_MSC_VER) && \
-      defined(HAS_TZCNT) && \
-      __has_include(<immintrin.h>)
-
-#include <immintrin.h>
-#define HAS_CTZ64
-#define CTZ64_SUPPORTS_ZERO
-
-// This allows us to generate the TZCNT instruction for MSVC
-// without C++20 support, hence without std::countr_zero(x).
-// No undefined behavior, _tzcnt_u64(0) = 64.
-#define ctz64(x) _tzcnt_u64(x)
-
-#elif defined(__GNUC__) || \
-      __has_builtin(__builtin_ctzl)
-
-#define HAS_CTZ64
-
-namespace {
-
-inline int ctz64(uint64_t x)
-{
-  // __builtin_ctz(0) is undefined behavior,
-  // we don't define CTZ64_SUPPORTS_ZERO.
-  ASSERT(x != 0);
-
-#if __cplusplus >= 201703L
-  if constexpr(sizeof(int) >= sizeof(uint64_t))
-    return __builtin_ctz(x);
-  else if constexpr(sizeof(long) >= sizeof(uint64_t))
-    return __builtin_ctzl(x);
-  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
-    return __builtin_ctzll(x);
-#else
-  return __builtin_ctzll(x);
-#endif
-}
-
-} // namespace
-
-#endif
-
-#endif // INTRINSICS_HPP
diff --git a/include/primesieve/popcnt.hpp b/include/primesieve/popcnt.hpp
new file mode 100644
index 000000000..c81b9f389
--- /dev/null
+++ b/include/primesieve/popcnt.hpp
@@ -0,0 +1,214 @@
+///
+/// @file popcnt.hpp
+/// @brief Functions to count the number of 1 bits inside
+///        a 64-bit variable.
+///
+/// Copyright (C) 2024 Kim Walisch,
+///
+/// This file is distributed under the BSD License. See the COPYING
+/// file in the top level directory.
+///
+
+#ifndef POPCNT_HPP
+#define POPCNT_HPP
+
+#include "macros.hpp"
+#include <stdint.h>
+
+#if defined(ENABLE_MULTIARCH_x86_POPCNT)
+  #include "cpu_supports_popcnt.hpp"
+#endif
+
+namespace {
+
+/// This uses fewer arithmetic operations than any other known
+/// implementation on machines with fast multiplication.
+/// It uses 12 arithmetic operations, one of which is a multiply.
+/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
+///
+inline uint64_t popcnt64_bitwise(uint64_t x)
+{
+  uint64_t m1 = 0x5555555555555555ll;
+  uint64_t m2 = 0x3333333333333333ll;
+  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
+  uint64_t h01 = 0x0101010101010101ll;
+
+  x -= (x >> 1) & m1;
+  x = (x & m2) + ((x >> 2) & m2);
+  x = (x + (x >> 4)) & m4;
+
+  return (x * h01) >> 56;
+}
+
+} // namespace
+
+// GCC & Clang
+#if defined(__GNUC__) || \
+    __has_builtin(__builtin_popcountl)
+
+// CPUID is only enabled on x86 and x86-64 CPUs
+// if the user compiles without -mpopcnt.
+#if defined(ENABLE_MULTIARCH_x86_POPCNT)
+#if defined(__x86_64__)
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+  // On my AMD EPYC 7642 CPU using GCC 12 this runtime
+  // check incurs an overall overhead of about 1%.
+  if_likely(cpu_supports_popcnt)
+  {
+    __asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
+    return x;
+  }
+  else
+  {
+    // On x86 and x64 CPUs when using the GCC compiler
+    // __builtin_popcount*(x) is slow (not inlined function call)
+    // when compiling without -mpopcnt. Therefore we avoid
+    // using __builtin_popcount*(x) here.
+    return popcnt64_bitwise(x);
+  }
+}
+
+} // namespace
+
+#elif defined(__i386__)
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+  if_likely(cpu_supports_popcnt)
+  {
+    uint32_t x0 = uint32_t(x);
+    uint32_t x1 = uint32_t(x >> 32);
+    __asm__("popcnt %1, %0" : "=r"(x0) : "r"(x0));
+    __asm__("popcnt %1, %0" : "=r"(x1) : "r"(x1));
+    return x0 + x1;
+  }
+  else
+  {
+    // On x86 and x64 CPUs when using the GCC compiler
+    // __builtin_popcount*(x) is slow (not inlined function call)
+    // when compiling without -mpopcnt. Therefore we avoid
+    // using __builtin_popcount*(x) here.
+    return popcnt64_bitwise(x);
+  }
+}
+
+} // namespace
+
+#endif // i386
+
+#else // GCC & Clang (no CPUID, not x86)
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+#if __cplusplus >= 201703L
+  if constexpr(sizeof(int) >= sizeof(uint64_t))
+    return (uint64_t) __builtin_popcount(x);
+  else if constexpr(sizeof(long) >= sizeof(uint64_t))
+    return (uint64_t) __builtin_popcountl(x);
+  else if constexpr(sizeof(long long) >= sizeof(uint64_t))
+    return (uint64_t) __builtin_popcountll(x);
+#else
+  return (uint64_t) __builtin_popcountll(x);
+#endif
+}
+
+} // namespace
+
+#endif // GCC & Clang
+
+#elif defined(_MSC_VER) && \
+      defined(_M_X64) && \
+      __has_include(<intrin.h>)
+
+#include <intrin.h>
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+#if defined(__POPCNT__) || \
+    defined(__AVX__)
+  return __popcnt64(x);
+
+#elif defined(ENABLE_MULTIARCH_x86_POPCNT)
+  if_likely(cpu_supports_popcnt)
+    return __popcnt64(x);
+  else
+    return popcnt64_bitwise(x);
+
+#else
+  return popcnt64_bitwise(x);
+#endif
+}
+
+} // namespace
+
+#elif defined(_MSC_VER) && \
+      defined(_M_IX86) && \
+      __has_include(<intrin.h>)
+
+#include <intrin.h>
+
+namespace {
+
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
+{
+#if defined(__POPCNT__) || \
+    defined(__AVX__)
+  return __popcnt(uint32_t(x)) +
+         __popcnt(uint32_t(x >> 32));
+
+#elif defined(ENABLE_MULTIARCH_x86_POPCNT)
+  if_likely(cpu_supports_popcnt)
+    return __popcnt(uint32_t(x)) +
+           __popcnt(uint32_t(x >> 32));
+  else
+    return popcnt64_bitwise(x);
+
+#else
+  return popcnt64_bitwise(x);
+#endif
}
+
+} // namespace
+
+#elif __cplusplus >= 202002L && \
+      __has_include(<bit>)
+
+#include <bit>
+
+namespace {
+
+/// We only use the C++ standard library as a fallback if there
+/// are no compiler intrinsics available for POPCNT.
+/// Compiler intrinsics often generate faster assembly.
+ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + return std::popcount(x); +} + +} // namespace + +#else + +namespace { + +/// Portable (but slow) popcount algorithm +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + return popcnt64_bitwise(x); +} + +} // namespace + +#endif + +#endif // POPCNT_HPP diff --git a/scripts/build_clang_multiarch_win_x64.bat b/scripts/build_clang_multiarch_win_x64.bat index d7b53231d..0e1204506 100644 --- a/scripts/build_clang_multiarch_win_x64.bat +++ b/scripts/build_clang_multiarch_win_x64.bat @@ -1 +1 @@ -clang++ -I../include -O3 -DNDEBUG -DENABLE_MULTIARCH_AVX512 ../src/*.cpp ../src/app/*.cpp -o primesieve.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" +clang++ -I../include -O3 -DNDEBUG -DENABLE_MULTIARCH_AVX512_VBMI2 ../src/*.cpp ../src/app/*.cpp -o primesieve.exe "C:\Program Files\LLVM\lib\clang\18\lib\windows\clang_rt.builtins-x86_64.lib" diff --git a/src/CpuInfo.cpp b/src/CpuInfo.cpp index 365a24396..15a33e9a1 100644 --- a/src/CpuInfo.cpp +++ b/src/CpuInfo.cpp @@ -39,20 +39,12 @@ #define APPLE_SYSCTL #endif -#if defined(__i386__) || \ - defined(__x86_64__) || \ - defined(_M_IX86) || \ - defined(_M_X64) - #include - #include - #define HAS_CPUID -#endif - #if defined(_WIN32) #include #include +#include #include #include @@ -62,25 +54,29 @@ std::string getCpuName() { std::string cpuName; -#if defined(HAS_CPUID) +#if defined(__i386__) || \ + defined(__x86_64__) || \ + defined(_M_IX86) || \ + defined(_M_X64) + // Get the CPU name using CPUID. // Example: Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz // https://en.wikipedia.org/wiki/CPUID int cpuInfo[4] = { 0, 0, 0, 0 }; - run_cpuid(0x80000000, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000000, 0); std::vector vect; // check if CPU name is supported if ((unsigned) cpuInfo[0] >= 0x80000004u) { - run_cpuid(0x80000002, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000002, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_cpuid(0x80000003, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000003, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_cpuid(0x80000004, 0, cpuInfo); + __cpuidex(cpuInfo, 0x80000004, 0); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); vect.push_back(0); @@ -755,15 +751,6 @@ std::string CpuInfo::cpuName() const } } -bool CpuInfo::hasAVX512() const -{ - #if defined(HAS_CPUID) - return cpu_supports_avx512_vbmi2; - #else - return false; - #endif -} - size_t CpuInfo::logicalCpuCores() const { return logicalCpuCores_; diff --git a/src/PrimeGenerator.cpp b/src/PrimeGenerator.cpp index 3be1e0a1b..5406b298b 100644 --- a/src/PrimeGenerator.cpp +++ b/src/PrimeGenerator.cpp @@ -27,16 +27,16 @@ #include #include #include +#include #include -#include #include #include #include #include -#if defined(ENABLE_AVX512) || \ - defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_AVX512_VBMI2) || \ + defined(ENABLE_MULTIARCH_AVX512_VBMI2) #include #endif @@ -458,8 +458,8 @@ void PrimeGenerator::fillNextPrimes_default(Vector& primes, std::size_ #endif -#if defined(ENABLE_AVX512) || \ - defined(ENABLE_MULTIARCH_AVX512) +#if defined(ENABLE_AVX512_VBMI2) || \ + defined(ENABLE_MULTIARCH_AVX512_VBMI2) /// This algorithm converts 1 bits from the sieve array into primes /// using AVX512. The algorithm is a modified version of the AVX512 @@ -474,10 +474,10 @@ void PrimeGenerator::fillNextPrimes_default(Vector& primes, std::size_ /// benchmarks this algorithm ran about 10% faster than the default /// fillNextPrimes() algorithm which uses __builtin_ctzll(). 
 ///
-#if defined(ENABLE_MULTIARCH_AVX512)
+#if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
   __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
 #endif
-void PrimeGenerator::fillNextPrimes_avx512(Vector<uint64_t>& primes, std::size_t* size)
+void PrimeGenerator::fillNextPrimes_avx512_vbmi2(Vector<uint64_t>& primes, std::size_t* size)
 {
   *size = 0;
diff --git a/src/app/main.cpp b/src/app/main.cpp
index 6b25675da..80f1ea5b7 100644
--- a/src/app/main.cpp
+++ b/src/app/main.cpp
@@ -35,6 +35,16 @@
 #include
 #include
 
+#if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
+
+namespace primesieve {
+
+bool has_cpuid_avx512_vbmi2();
+
+} // namespace
+
+#endif
+
 void help(int exitCode);
 void version();
 void stressTest(const CmdOptions& opts);
@@ -223,14 +233,9 @@ void cpuInfo()
   else
     std::cout << "Logical CPU cores: unknown" << std::endl;
 
-  // Enable on x86 CPUs
-  #if defined(__x86_64__) || \
-      defined(__i386__) || \
-      defined(_M_X64) || \
-      defined(_M_IX86) || \
-      defined(__AVX512F__)
+  #if defined(ENABLE_MULTIARCH_AVX512_VBMI2)
 
-  if (cpu.hasAVX512())
+  if (primesieve::has_cpuid_avx512_vbmi2())
     std::cout << "Has AVX512: yes" << std::endl;
   else
     std::cout << "Has AVX512: no" << std::endl;
diff --git a/src/popcount.cpp b/src/popcount.cpp
index 93eab08ce..adede0c35 100644
--- a/src/popcount.cpp
+++ b/src/popcount.cpp
@@ -8,7 +8,7 @@
 /// file in the top level directory.
 ///
 
-#include <primesieve/intrinsics.hpp>
+#include <primesieve/popcnt.hpp>
 
 #include
 #include
diff --git a/src/x86/cpuid.cpp b/src/x86/cpuid.cpp
new file mode 100644
index 000000000..b571057c9
--- /dev/null
+++ b/src/x86/cpuid.cpp
@@ -0,0 +1,125 @@
+///
+/// @file cpuid.cpp
+/// @brief CPUID for x86 and x86-64 CPUs.
+///
+/// Copyright (C) 2024 Kim Walisch,
+///
+/// This file is distributed under the BSD License. See the COPYING
+/// file in the top level directory.
+///
+
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+  #include <intrin.h>
+  #include <immintrin.h>
+#endif
+
+// CPUID bits documentation:
+// https://en.wikipedia.org/wiki/CPUID
+
+// %ebx bit flags
+#define bit_AVX512F (1 << 16)
+
+// %ecx bit flags
+#define bit_AVX512VBMI (1 << 1)
+#define bit_AVX512VBMI2 (1 << 6)
+#define bit_POPCNT (1 << 23)
+
+// xgetbv bit flags
+#define XSTATE_SSE (1 << 1)
+#define XSTATE_YMM (1 << 2)
+#define XSTATE_ZMM (7 << 5)
+
+namespace {
+
+void run_cpuid(int eax, int ecx, int* abcd)
+{
+#if defined(_MSC_VER)
+  __cpuidex(abcd, eax, ecx);
+#else
+  int ebx = 0;
+  int edx = 0;
+
+  #if defined(__i386__) && \
+      defined(__PIC__)
+    // In case of PIC under 32-bit EBX cannot be clobbered
+    asm volatile("movl %%ebx, %%edi;"
+                 "cpuid;"
+                 "xchgl %%ebx, %%edi;"
+                 : "+a" (eax),
+                   "=D" (ebx),
+                   "+c" (ecx),
+                   "=d" (edx));
+  #else
+    asm volatile("cpuid"
+                 : "+a" (eax),
+                   "+b" (ebx),
+                   "+c" (ecx),
+                   "=d" (edx));
+  #endif
+
+  abcd[0] = eax;
+  abcd[1] = ebx;
+  abcd[2] = ecx;
+  abcd[3] = edx;
+#endif
+}
+
+// Get Value of Extended Control Register
+uint64_t get_xcr0()
+{
+#if defined(_MSC_VER)
+  return _xgetbv(0);
+#else
+  uint32_t eax;
+  uint32_t edx;
+
+  asm volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
+  return eax | (uint64_t(edx) << 32);
+#endif
+}
+
+} // namespace
+
+namespace primesieve {
+
+bool has_cpuid_popcnt()
+{
+  int abcd[4];
+  run_cpuid(1, 0, abcd);
+  return (abcd[2] & bit_POPCNT) == bit_POPCNT;
+}
+
+bool has_cpuid_avx512_vbmi2()
+{
+  int abcd[4];
+
+  run_cpuid(1, 0, abcd);
+
+  int osxsave_mask = (1 << 27);
+
+  // Ensure OS supports extended processor state management
+  if ((abcd[2] & osxsave_mask) != osxsave_mask)
+    return false;
+
+  uint64_t ymm_mask = XSTATE_SSE | XSTATE_YMM;
+  uint64_t zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM;
+  uint64_t xcr0 = get_xcr0();
+
+  // Check AVX OS support
+  if ((xcr0 & ymm_mask) != ymm_mask)
+    return false;
+
+  // Check AVX512 OS support
+  if ((xcr0 & zmm_mask) != zmm_mask)
+    return false;
+
+  run_cpuid(7, 0, abcd);
+
+  // PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2
+  return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
+          (abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2));
+}
+
+} // namespace
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 190ed9e7e..77fb8845c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,6 +5,6 @@ foreach(file ${files})
   get_filename_component(binary_name ${file} NAME_WE)
   add_executable(${binary_name} ${file})
   target_link_libraries(${binary_name} primesieve::primesieve)
-  target_compile_definitions(${binary_name} PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}")
+  target_compile_definitions(${binary_name} PRIVATE ${PRIMESIEVE_COMPILE_DEFINITIONS})
   add_test(NAME ${binary_name} COMMAND ${binary_name})
 endforeach()
diff --git a/test/cpuid.cpp b/test/cpuid.cpp
index 64c0d9361..6ed4b85c1 100644
--- a/test/cpuid.cpp
+++ b/test/cpuid.cpp
@@ -8,7 +8,10 @@
 /// file in the top level directory.
 ///
 
-#include <primesieve/intrinsics.hpp>
+#if defined(ENABLE_MULTIARCH_x86_POPCNT)
+  #include <primesieve/cpu_supports_popcnt.hpp>
+#endif
+
 #include <iostream>
 
 int main()
 {
@@ -18,52 +21,22 @@ int main()
   #if defined(__x86_64__) || \
       defined(__i386__) || \
      defined(_M_X64) || \
      defined(_M_IX86)
 
-  #if defined(__POPCNT__)
-    #if defined(HAS_POPCNT)
-      std::cout << "OK: __POPCNT__ and HAS_POPCNT are defined!" << std::endl;
-    #else
-      std::cerr << "Error: HAS_POPCNT must be defined if __POPCNT__ is defined!" << std::endl;
-      return 1;
-    #endif
-  #endif
-
-  #if defined(__AVX__)
-    #if defined(HAS_POPCNT)
-      std::cout << "OK: __AVX__ and HAS_POPCNT are defined!" << std::endl;
-    #else
-      std::cerr << "Error: HAS_POPCNT must be defined if __AVX__ is defined!" << std::endl;
-      return 1;
-    #endif
+  #if defined(__POPCNT__) && defined(ENABLE_MULTIARCH_x86_POPCNT)
+    std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __POPCNT__ is defined!" << std::endl;
+    return 1;
   #endif
 
-  #if defined(__AVX2__)
-    #if defined(HAS_POPCNT)
-      std::cout << "OK: __AVX2__ and HAS_POPCNT are defined!" << std::endl;
-    #else
-      std::cerr << "Error: HAS_POPCNT must be defined if __AVX2__ is defined!" << std::endl;
-      return 1;
-    #endif
-  #endif
+  #if defined(_MSC_VER) && \
+      !defined(__POPCNT__)
 
-  #if defined(HAS_POPCNT)
-    #if !defined(ENABLE_CPUID_POPCNT)
-      std::cout << "OK: HAS_POPCNT is defined but ENABLE_CPUID_POPCNT is not defined!" << std::endl;
-    #else
-      std::cerr << "Error: ENABLE_CPUID_POPCNT must not be defined if HAS_POPCNT is defined!" << std::endl;
-      return 1;
+    #if defined(__AVX__) && defined(ENABLE_MULTIARCH_x86_POPCNT)
+      std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __AVX__ is defined!" << std::endl;
+      return 1;
     #endif
-  #endif
-
-  #if !defined(HAS_POPCNT)
-    #if defined(ENABLE_CPUID_POPCNT)
-      std::cout << "OK: HAS_POPCNT is not defined but ENABLE_CPUID_POPCNT is defined!" << std::endl;
-    #else
-      std::cerr << "Error: ENABLE_CPUID_POPCNT must be defined if HAS_POPCNT is not defined!" << std::endl;
-      return 1;
+    #if defined(__AVX2__) && defined(ENABLE_MULTIARCH_x86_POPCNT)
+      std::cerr << "Error: ENABLE_MULTIARCH_x86_POPCNT must not be defined if __AVX2__ is defined!" << std::endl;
+      return 1;
     #endif
   #endif
 
-  #if defined(ENABLE_CPUID_POPCNT)
+  #if defined(ENABLE_MULTIARCH_x86_POPCNT)
    std::cout << "CPU supports POPCNT: " << (cpu_supports_popcnt ? "yes" : "no") << std::endl;
  #endif
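
A note on the runtime dispatch that popcnt.hpp performs when ENABLE_MULTIARCH_x86_POPCNT is defined: the CPUID result is cached once, and the hot path only tests a bool before choosing between the POPCNT instruction and the bitwise fallback. The following is a minimal standalone sketch of that pattern, assuming GCC/Clang on x86-64; detect_popcnt() and cpu_has_popcnt are illustrative names, not primesieve API (the library uses cpu_supports_popcnt from cpu_supports_popcnt.hpp).

```cpp
// Minimal sketch of CPUID-based POPCNT dispatch (GCC/Clang, x86-64 assumed).
#include <stdint.h>
#include <iostream>

#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))

namespace {

bool detect_popcnt()
{
  // CPUID leaf 1: the POPCNT feature flag is ECX bit 23.
  uint32_t eax = 1, ebx = 0, ecx = 0, edx = 0;
  __asm__ __volatile__("cpuid"
                       : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));
  return (ecx & (1u << 23)) != 0;
}

// Evaluated once at program startup, so the hot path reads a bool.
const bool cpu_has_popcnt = detect_popcnt();

uint64_t popcnt64_bitwise(uint64_t x)
{
  // Same portable fallback as popcnt.hpp.
  uint64_t m1 = 0x5555555555555555ull;
  uint64_t m2 = 0x3333333333333333ull;
  uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
  uint64_t h01 = 0x0101010101010101ull;
  x -= (x >> 1) & m1;
  x = (x & m2) + ((x >> 2) & m2);
  x = (x + (x >> 4)) & m4;
  return (x * h01) >> 56;
}

uint64_t popcnt64(uint64_t x)
{
  if (cpu_has_popcnt)
  {
    // Emit POPCNT directly, even when built without -mpopcnt.
    __asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
    return x;
  }
  return popcnt64_bitwise(x);
}

} // namespace

int main()
{
  std::cout << popcnt64(0x00FF00FF00FF00FFull) << std::endl; // prints 32
  return 0;
}

#else
int main() { return 0; } // sketch only targets x86-64 GCC/Clang
#endif
```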
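The REP BSF zero-input trick documented in ctz.hpp can be observed in isolation. A minimal sketch, assuming GCC or Clang on x86-64; ctz64_rep_bsf() is a hypothetical name used for illustration:

```cpp
// Demo of the BSF/TZCNT zero-input semantics described in ctz.hpp.
#include <stdint.h>
#include <iostream>

#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))

static uint64_t ctz64_rep_bsf(uint64_t x)
{
  // The "0" matching constraint pins input and output to the same
  // register: on pre-BMI1 CPUs BSF(0) leaves the destination register
  // (already holding 0) unmodified, while on BMI1 CPUs the identical
  // REP BSF encoding executes as TZCNT and yields 64.
  __asm__("rep bsf %1, %0" : "=r"(x) : "0"(x));
  return x;
}

int main()
{
  std::cout << ctz64_rep_bsf(40) << std::endl; // 3 (40 = 0b101000)
  std::cout << ctz64_rep_bsf(0)  << std::endl; // 0 (BSF) or 64 (TZCNT)
  return 0;
}

#else
int main() { return 0; } // sketch only targets x86-64 GCC/Clang
#endif
```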
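Finally, the multiarch mechanism that multiarch_avx512_vbmi2.cmake probes for (the GCC/Clang target attribute plus a runtime CPU check) follows the general pattern below. This is a sketch assuming GCC or Clang on x86-64; sum_avx512()/sum_default()/sum() are illustrative names, and for brevity the dispatch trusts __builtin_cpu_supports("avx512f"), whereas primesieve's src/x86/cpuid.cpp additionally verifies OS support via XGETBV as shown in the patch.

```cpp
// Sketch of target-attribute multiversioning with runtime dispatch.
#include <stdint.h>
#include <stddef.h>
#include <iostream>

#if defined(__x86_64__) && defined(__GNUC__)
#include <immintrin.h>

__attribute__ ((target ("avx512f")))
uint64_t sum_avx512(const uint64_t* v, size_t n)
{
  // AVX512F intrinsics compile here even without -mavx512f because the
  // target attribute enables them for this one function only.
  __m512i acc = _mm512_setzero_si512();
  size_t i = 0;
  for (; i + 8 <= n; i += 8)
    acc = _mm512_add_epi64(acc, _mm512_loadu_si512(&v[i]));
  uint64_t lanes[8];
  _mm512_storeu_si512(lanes, acc);
  uint64_t sum = 0;
  for (int j = 0; j < 8; j++)
    sum += lanes[j];   // horizontal reduction of the 8 lanes
  for (; i < n; i++)
    sum += v[i];       // scalar tail
  return sum;
}

uint64_t sum_default(const uint64_t* v, size_t n)
{
  uint64_t sum = 0;
  for (size_t i = 0; i < n; i++)
    sum += v[i];
  return sum;
}

uint64_t sum(const uint64_t* v, size_t n)
{
  // Pick the best implementation for the CPU we are running on.
  if (__builtin_cpu_supports("avx512f"))
    return sum_avx512(v, n);
  else
    return sum_default(v, n);
}

int main()
{
  uint64_t v[20];
  for (int i = 0; i < 20; i++)
    v[i] = i;
  std::cout << sum(v, 20) << std::endl; // prints 190
  return 0;
}

#else
int main() { return 0; } // sketch only targets x86-64 GCC/Clang
#endif
```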