Skip to content

Commit

Permalink
Refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Jul 11, 2024
1 parent 7c009ce commit e2f5f32
Showing 1 changed file with 137 additions and 23 deletions.
160 changes: 137 additions & 23 deletions include/primesieve/popcnt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@
#include "cpu_supports_popcnt.hpp"
#endif

// GCC & Clang
#if defined(__GNUC__) || \
__has_builtin(__builtin_popcountl)

// CPUID is only enabled on x86 and x86-64 CPUs
// if the user compiles without -mpopcnt.
#if defined(ENABLE_MULTIARCH_x86_POPCNT)
#if defined(__x86_64__)

namespace {

/// This uses fewer arithmetic operations than any other known
Expand All @@ -40,19 +49,6 @@ NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
return (x * h01) >> 56;
}

} // namespace

// GCC & Clang
#if defined(__GNUC__) || \
__has_builtin(__builtin_popcountl)

// CPUID is only enabled on x86 and x86-64 CPUs
// if the user compiles without -mpopcnt.
#if defined(ENABLE_MULTIARCH_x86_POPCNT)
#if defined(__x86_64__)

namespace {

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
// On my AMD EPYC 7642 CPU using GCC 12 this runtime
Expand All @@ -72,6 +68,25 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

/// Software population count for a 64-bit word (no POPCNT
/// instruction required). Bits are summed in parallel inside
/// progressively wider fields: 2-bit, 4-bit, then 8-bit.
/// According to the article linked below, this variant needs
/// fewer arithmetic operations than other known implementations
/// on CPUs with fast multiplication: 12 operations in total,
/// exactly one of which is a multiply.
/// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
{
  const uint64_t mask_pairs   = 0x5555555555555555ull;
  const uint64_t mask_nibbles = 0x3333333333333333ull;
  const uint64_t mask_bytes   = 0x0F0F0F0F0F0F0F0Full;
  const uint64_t byte_ones    = 0x0101010101010101ull;

  // Every 2-bit field holds the number of bits set within it.
  const uint64_t pairs = x - ((x >> 1) & mask_pairs);

  // Every 4-bit field holds the sum of its two 2-bit counts.
  const uint64_t nibbles = (pairs & mask_nibbles) +
                           ((pairs >> 2) & mask_nibbles);

  // Every byte holds the sum of its two 4-bit counts.
  const uint64_t bytes = (nibbles + (nibbles >> 4)) & mask_bytes;

  // The multiply accumulates all 8 byte counts in the top byte.
  return (bytes * byte_ones) >> 56;
}

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
if_likely(cpu_supports_popcnt)
Expand Down Expand Up @@ -120,23 +135,66 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
#if defined(__POPCNT__) || \
defined(__AVX__)

/// Compiled only when __POPCNT__ or __AVX__ is defined (see the
/// enclosing #if), hence the 64-bit popcount intrinsic is
/// called unconditionally, without any runtime CPU check.
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  const uint64_t bit_count = __popcnt64(x);
  return bit_count;
}

#elif defined(ENABLE_MULTIARCH_x86_POPCNT)

/// Software population count for a 64-bit word (no POPCNT
/// instruction required). Bits are summed in parallel inside
/// progressively wider fields: 2-bit, 4-bit, then 8-bit.
/// According to the article linked below, this variant needs
/// fewer arithmetic operations than other known implementations
/// on CPUs with fast multiplication: 12 operations in total,
/// exactly one of which is a multiply.
/// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
{
  const uint64_t mask_pairs   = 0x5555555555555555ull;
  const uint64_t mask_nibbles = 0x3333333333333333ull;
  const uint64_t mask_bytes   = 0x0F0F0F0F0F0F0F0Full;
  const uint64_t byte_ones    = 0x0101010101010101ull;

  // Every 2-bit field holds the number of bits set within it.
  const uint64_t pairs = x - ((x >> 1) & mask_pairs);

  // Every 4-bit field holds the sum of its two 2-bit counts.
  const uint64_t nibbles = (pairs & mask_nibbles) +
                           ((pairs >> 2) & mask_nibbles);

  // Every byte holds the sum of its two 4-bit counts.
  const uint64_t bytes = (nibbles + (nibbles >> 4)) & mask_bytes;

  // The multiply accumulates all 8 byte counts in the top byte.
  return (bytes * byte_ones) >> 56;
}

/// Runtime dispatch used when ENABLE_MULTIARCH_x86_POPCNT is
/// defined: the intrinsic is used if cpu_supports_popcnt is set,
/// otherwise the portable popcnt64_bitwise() is called.
/// NOTE(review): cpu_supports_popcnt is declared elsewhere,
/// presumably initialized from CPUID; confirm against
/// cpu_supports_popcnt.hpp.
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  // Expected (likely) path: hardware popcount.
  if_likely(cpu_supports_popcnt)
  {
    return __popcnt64(x);
  }

  // Fallback for CPUs without POPCNT.
  return popcnt64_bitwise(x);
}

#else
return popcnt64_bitwise(x);
#endif

/// Software population count for a 64-bit word (no POPCNT
/// instruction required). Bits are summed in parallel inside
/// progressively wider fields: 2-bit, 4-bit, then 8-bit.
/// According to the article linked below, this variant needs
/// fewer arithmetic operations than other known implementations
/// on CPUs with fast multiplication: 12 operations in total,
/// exactly one of which is a multiply.
/// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  const uint64_t mask_pairs   = 0x5555555555555555ull;
  const uint64_t mask_nibbles = 0x3333333333333333ull;
  const uint64_t mask_bytes   = 0x0F0F0F0F0F0F0F0Full;
  const uint64_t byte_ones    = 0x0101010101010101ull;

  // Every 2-bit field holds the number of bits set within it.
  const uint64_t pairs = x - ((x >> 1) & mask_pairs);

  // Every 4-bit field holds the sum of its two 2-bit counts.
  const uint64_t nibbles = (pairs & mask_nibbles) +
                           ((pairs >> 2) & mask_nibbles);

  // Every byte holds the sum of its two 4-bit counts.
  const uint64_t bytes = (nibbles + (nibbles >> 4)) & mask_bytes;

  // The multiply accumulates all 8 byte counts in the top byte.
  return (bytes * byte_ones) >> 56;
}

#endif

} // namespace

#elif defined(_MSC_VER) && \
Expand All @@ -147,25 +205,68 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
#if defined(__POPCNT__) || \
defined(__AVX__)

/// Compiled only when __POPCNT__ or __AVX__ is defined (see the
/// enclosing #if). The 64-bit word is split into two 32-bit
/// halves which are counted separately using the 32-bit
/// __popcnt intrinsic, then added up.
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  const uint32_t low_half = uint32_t(x);
  const uint32_t high_half = uint32_t(x >> 32);

  return __popcnt(low_half) + __popcnt(high_half);
}

#elif defined(ENABLE_MULTIARCH_x86_POPCNT)

/// Software population count for a 64-bit word (no POPCNT
/// instruction required). Bits are summed in parallel inside
/// progressively wider fields: 2-bit, 4-bit, then 8-bit.
/// According to the article linked below, this variant needs
/// fewer arithmetic operations than other known implementations
/// on CPUs with fast multiplication: 12 operations in total,
/// exactly one of which is a multiply.
/// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
NOINLINE uint64_t popcnt64_bitwise(uint64_t x)
{
  const uint64_t mask_pairs   = 0x5555555555555555ull;
  const uint64_t mask_nibbles = 0x3333333333333333ull;
  const uint64_t mask_bytes   = 0x0F0F0F0F0F0F0F0Full;
  const uint64_t byte_ones    = 0x0101010101010101ull;

  // Every 2-bit field holds the number of bits set within it.
  const uint64_t pairs = x - ((x >> 1) & mask_pairs);

  // Every 4-bit field holds the sum of its two 2-bit counts.
  const uint64_t nibbles = (pairs & mask_nibbles) +
                           ((pairs >> 2) & mask_nibbles);

  // Every byte holds the sum of its two 4-bit counts.
  const uint64_t bytes = (nibbles + (nibbles >> 4)) & mask_bytes;

  // The multiply accumulates all 8 byte counts in the top byte.
  return (bytes * byte_ones) >> 56;
}

/// Runtime dispatch used when ENABLE_MULTIARCH_x86_POPCNT is
/// defined: if cpu_supports_popcnt is set, both 32-bit halves
/// are counted using the __popcnt intrinsic, otherwise the
/// portable popcnt64_bitwise() is called.
/// NOTE(review): cpu_supports_popcnt is declared elsewhere,
/// presumably initialized from CPUID; confirm against
/// cpu_supports_popcnt.hpp.
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  // Expected (likely) path: hardware popcount on each half.
  if_likely(cpu_supports_popcnt)
  {
    const uint32_t low_half = uint32_t(x);
    const uint32_t high_half = uint32_t(x >> 32);

    return __popcnt(low_half) + __popcnt(high_half);
  }

  // Fallback for CPUs without POPCNT.
  return popcnt64_bitwise(x);
}

#else
return popcnt64_bitwise(x);
#endif

/// Software population count for a 64-bit word (no POPCNT
/// instruction required). Bits are summed in parallel inside
/// progressively wider fields: 2-bit, 4-bit, then 8-bit.
/// According to the article linked below, this variant needs
/// fewer arithmetic operations than other known implementations
/// on CPUs with fast multiplication: 12 operations in total,
/// exactly one of which is a multiply.
/// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
  const uint64_t mask_pairs   = 0x5555555555555555ull;
  const uint64_t mask_nibbles = 0x3333333333333333ull;
  const uint64_t mask_bytes   = 0x0F0F0F0F0F0F0F0Full;
  const uint64_t byte_ones    = 0x0101010101010101ull;

  // Every 2-bit field holds the number of bits set within it.
  const uint64_t pairs = x - ((x >> 1) & mask_pairs);

  // Every 4-bit field holds the sum of its two 2-bit counts.
  const uint64_t nibbles = (pairs & mask_nibbles) +
                           ((pairs >> 2) & mask_nibbles);

  // Every byte holds the sum of its two 4-bit counts.
  const uint64_t bytes = (nibbles + (nibbles >> 4)) & mask_bytes;

  // The multiply accumulates all 8 byte counts in the top byte.
  return (bytes * byte_ones) >> 56;
}

#endif

} // namespace

#elif __cplusplus >= 202002L && \
Expand All @@ -189,10 +290,23 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x)

namespace {

/// Portable (but slow) popcount algorithm.
/// Counts the 1 bits of a 64-bit word using only shifts, masks,
/// additions and a single multiplication (no POPCNT instruction).
/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
ALWAYS_INLINE uint64_t popcnt64(uint64_t x)
{
// NOTE(review): as written, this early return makes every
// statement below unreachable. It looks like the pre-refactor
// body shown next to the new inline body; confirm which of
// the two the final header is supposed to keep.
return popcnt64_bitwise(x);
// Masks selecting every other bit, every other 2-bit field and
// every other 4-bit field; h01 has the lowest bit of each byte set.
uint64_t m1 = 0x5555555555555555ull;
uint64_t m2 = 0x3333333333333333ull;
uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
uint64_t h01 = 0x0101010101010101ull;

// Every 2-bit field holds the number of bits set within it.
x -= (x >> 1) & m1;
// Every 4-bit field holds the sum of its two 2-bit counts.
x = (x & m2) + ((x >> 2) & m2);
// Every byte holds the sum of its two 4-bit counts.
x = (x + (x >> 4)) & m4;

// The multiply accumulates all 8 byte counts in the top byte.
return (x * h01) >> 56;
}

} // namespace
Expand Down

0 comments on commit e2f5f32

Please sign in to comment.