mirror of
https://github.com/microsoft/terminal.git
synced 2025-12-12 00:07:24 -06:00
Update libpopcnt to 2.5 (#14140)
This commit is contained in:
parent
a63f060f72
commit
d7e24ad6d0
@ -6,7 +6,7 @@
|
||||
"type": "git",
|
||||
"git": {
|
||||
"repositoryUrl": "https://github.com/kimwalisch/libpopcnt",
|
||||
"commitHash": "043a99fba31121a70bcb2f589faa17f534ae6085"
|
||||
"commitHash": "c49987e90e56191c399cab881ab87b5daecc9b8e"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
* population count) in an array as quickly as possible using
|
||||
* specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
|
||||
*
|
||||
* Copyright (c) 2016 - 2019, Kim Walisch
|
||||
* Copyright (c) 2016 - 2020, Kim Walisch
|
||||
* Copyright (c) 2016 - 2018, Wojciech Muła
|
||||
*
|
||||
* All rights reserved.
|
||||
@ -33,6 +33,7 @@
|
||||
#define LIBPOPCNT_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
@ -68,14 +69,6 @@
|
||||
#define X86_OR_X64
|
||||
#endif
|
||||
|
||||
#if defined(X86_OR_X64) && \
|
||||
(defined(__cplusplus) || \
|
||||
defined(_MSC_VER) || \
|
||||
(GNUC_PREREQ(4, 2) || \
|
||||
__has_builtin(__sync_val_compare_and_swap)))
|
||||
#define HAVE_CPUID
|
||||
#endif
|
||||
|
||||
#if GNUC_PREREQ(4, 2) || \
|
||||
__has_builtin(__builtin_popcount)
|
||||
#define HAVE_BUILTIN_POPCOUNT
|
||||
@ -86,41 +79,64 @@
|
||||
#define HAVE_ASM_POPCNT
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CPUID) && \
|
||||
#if defined(X86_OR_X64) && \
|
||||
(defined(HAVE_ASM_POPCNT) || \
|
||||
defined(_MSC_VER))
|
||||
#define HAVE_POPCNT
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CPUID) && \
|
||||
#if defined(X86_OR_X64) && \
|
||||
GNUC_PREREQ(4, 9)
|
||||
#define HAVE_AVX2
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CPUID) && \
|
||||
#if defined(X86_OR_X64) && \
|
||||
GNUC_PREREQ(5, 0)
|
||||
#define HAVE_AVX512
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CPUID) && \
|
||||
defined(_MSC_VER) && \
|
||||
defined(__AVX2__)
|
||||
#if defined(X86_OR_X64)
|
||||
/* MSVC compatible compilers (Windows) */
|
||||
#if defined(_MSC_VER)
|
||||
/* clang-cl (LLVM 10 from 2020) requires /arch:AVX2 or
|
||||
* /arch:AVX512 to enable vector instructions */
|
||||
#if defined(__clang__)
|
||||
#if defined(__AVX2__)
|
||||
#define HAVE_AVX2
|
||||
#endif
|
||||
#if defined(__AVX512__)
|
||||
#define HAVE_AVX2
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CPUID) && \
|
||||
defined(_MSC_VER) && \
|
||||
defined(__AVX512__)
|
||||
#define HAVE_AVX512
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_CPUID) && \
|
||||
CLANG_PREREQ(3, 8) && \
|
||||
#endif
|
||||
/* MSVC 2017 or later does not require
|
||||
* /arch:AVX2 or /arch:AVX512 */
|
||||
#elif _MSC_VER >= 1910
|
||||
#define HAVE_AVX2
|
||||
#define HAVE_AVX512
|
||||
#endif
|
||||
/* Clang (Unix-like OSes) */
|
||||
#elif CLANG_PREREQ(3, 8) && \
|
||||
__has_attribute(target) && \
|
||||
(!defined(_MSC_VER) || defined(__AVX2__)) && \
|
||||
(!defined(__apple_build_version__) || __apple_build_version__ >= 8000000)
|
||||
#define HAVE_AVX2
|
||||
#define HAVE_AVX512
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Only enable CPUID runtime checks if this is really
|
||||
* needed. E.g. do not enable if user has compiled
|
||||
* using -march=native on a CPU that supports AVX512.
|
||||
*/
|
||||
#if defined(X86_OR_X64) && \
|
||||
(defined(__cplusplus) || \
|
||||
defined(_MSC_VER) || \
|
||||
(GNUC_PREREQ(4, 2) || \
|
||||
__has_builtin(__sync_val_compare_and_swap))) && \
|
||||
((defined(HAVE_AVX512) && !(defined(__AVX512__) || defined(__AVX512BW__))) || \
|
||||
(defined(HAVE_AVX2) && !defined(__AVX2__)) || \
|
||||
(defined(HAVE_POPCNT) && !defined(__POPCNT__)))
|
||||
#define HAVE_CPUID
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -211,26 +227,6 @@ static inline uint64_t popcnt64(uint64_t x)
|
||||
|
||||
#endif
|
||||
|
||||
static inline uint64_t popcnt64_unrolled(const uint64_t* data, uint64_t size)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
uint64_t limit = size - size % 4;
|
||||
uint64_t cnt = 0;
|
||||
|
||||
for (; i < limit; i += 4)
|
||||
{
|
||||
cnt += popcnt64(data[i+0]);
|
||||
cnt += popcnt64(data[i+1]);
|
||||
cnt += popcnt64(data[i+2]);
|
||||
cnt += popcnt64(data[i+3]);
|
||||
}
|
||||
|
||||
for (; i < size; i++)
|
||||
cnt += popcnt64(data[i]);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
#if defined(HAVE_CPUID)
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
@ -398,7 +394,7 @@ static inline __m256i popcnt256(__m256i v)
|
||||
#if !defined(_MSC_VER)
|
||||
__attribute__ ((target ("avx2")))
|
||||
#endif
|
||||
static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size)
|
||||
static inline uint64_t popcnt_avx2(const __m256i* ptr, uint64_t size)
|
||||
{
|
||||
__m256i cnt = _mm256_setzero_si256();
|
||||
__m256i ones = _mm256_setzero_si256();
|
||||
@ -414,18 +410,18 @@ static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size)
|
||||
|
||||
for(; i < limit; i += 16)
|
||||
{
|
||||
CSA256(&twosA, &ones, ones, data[i+0], data[i+1]);
|
||||
CSA256(&twosB, &ones, ones, data[i+2], data[i+3]);
|
||||
CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 0), _mm256_loadu_si256(ptr + i + 1));
|
||||
CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 2), _mm256_loadu_si256(ptr + i + 3));
|
||||
CSA256(&foursA, &twos, twos, twosA, twosB);
|
||||
CSA256(&twosA, &ones, ones, data[i+4], data[i+5]);
|
||||
CSA256(&twosB, &ones, ones, data[i+6], data[i+7]);
|
||||
CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 4), _mm256_loadu_si256(ptr + i + 5));
|
||||
CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 6), _mm256_loadu_si256(ptr + i + 7));
|
||||
CSA256(&foursB, &twos, twos, twosA, twosB);
|
||||
CSA256(&eightsA, &fours, fours, foursA, foursB);
|
||||
CSA256(&twosA, &ones, ones, data[i+8], data[i+9]);
|
||||
CSA256(&twosB, &ones, ones, data[i+10], data[i+11]);
|
||||
CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 8), _mm256_loadu_si256(ptr + i + 9));
|
||||
CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 10), _mm256_loadu_si256(ptr + i + 11));
|
||||
CSA256(&foursA, &twos, twos, twosA, twosB);
|
||||
CSA256(&twosA, &ones, ones, data[i+12], data[i+13]);
|
||||
CSA256(&twosB, &ones, ones, data[i+14], data[i+15]);
|
||||
CSA256(&twosA, &ones, ones, _mm256_loadu_si256(ptr + i + 12), _mm256_loadu_si256(ptr + i + 13));
|
||||
CSA256(&twosB, &ones, ones, _mm256_loadu_si256(ptr + i + 14), _mm256_loadu_si256(ptr + i + 15));
|
||||
CSA256(&foursB, &twos, twos, twosA, twosB);
|
||||
CSA256(&eightsB, &fours, fours, foursA, foursB);
|
||||
CSA256(&sixteens, &eights, eights, eightsA, eightsB);
|
||||
@ -440,7 +436,7 @@ static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size)
|
||||
cnt = _mm256_add_epi64(cnt, popcnt256(ones));
|
||||
|
||||
for(; i < size; i++)
|
||||
cnt = _mm256_add_epi64(cnt, popcnt256(data[i]));
|
||||
cnt = _mm256_add_epi64(cnt, popcnt256(_mm256_loadu_si256(ptr + i)));
|
||||
|
||||
cnt64 = (uint64_t*) &cnt;
|
||||
|
||||
@ -450,22 +446,6 @@ static inline uint64_t popcnt_avx2(const __m256i* data, uint64_t size)
|
||||
cnt64[3];
|
||||
}
|
||||
|
||||
/* Align memory to 32 bytes boundary */
|
||||
static inline void align_avx2(const uint8_t** p, uint64_t* size, uint64_t* cnt)
|
||||
{
|
||||
for (; (uintptr_t) *p % 8; (*p)++)
|
||||
{
|
||||
*cnt += popcnt64(**p);
|
||||
*size -= 1;
|
||||
}
|
||||
for (; (uintptr_t) *p % 32; (*p) += 8)
|
||||
{
|
||||
*cnt += popcnt64(
|
||||
*(const uint64_t*) *p);
|
||||
*size -= 8;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX512)
|
||||
@ -480,9 +460,13 @@ static inline __m512i popcnt512(__m512i v)
|
||||
__m512i m1 = _mm512_set1_epi8(0x55);
|
||||
__m512i m2 = _mm512_set1_epi8(0x33);
|
||||
__m512i m4 = _mm512_set1_epi8(0x0F);
|
||||
__m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1));
|
||||
__m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2));
|
||||
__m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4;
|
||||
__m512i vm = _mm512_and_si512(_mm512_srli_epi16(v, 1), m1);
|
||||
__m512i t1 = _mm512_sub_epi8(v, vm);
|
||||
__m512i tm = _mm512_and_si512(t1, m2);
|
||||
__m512i tm2 = _mm512_and_si512(_mm512_srli_epi16(t1, 2), m2);
|
||||
__m512i t2 = _mm512_add_epi8(tm, tm2);
|
||||
__m512i tt = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4));
|
||||
__m512i t3 = _mm512_and_si512(tt, m4);
|
||||
|
||||
return _mm512_sad_epu8(t3, _mm512_setzero_si512());
|
||||
}
|
||||
@ -506,7 +490,7 @@ static inline void CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i
|
||||
#if !defined(_MSC_VER)
|
||||
__attribute__ ((target ("avx512bw")))
|
||||
#endif
|
||||
static inline uint64_t popcnt_avx512(const __m512i* data, const uint64_t size)
|
||||
static inline uint64_t popcnt_avx512(const __m512i* ptr, const uint64_t size)
|
||||
{
|
||||
__m512i cnt = _mm512_setzero_si512();
|
||||
__m512i ones = _mm512_setzero_si512();
|
||||
@ -522,18 +506,18 @@ static inline uint64_t popcnt_avx512(const __m512i* data, const uint64_t size)
|
||||
|
||||
for(; i < limit; i += 16)
|
||||
{
|
||||
CSA512(&twosA, &ones, ones, data[i+0], data[i+1]);
|
||||
CSA512(&twosB, &ones, ones, data[i+2], data[i+3]);
|
||||
CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 0), _mm512_loadu_si512(ptr + i + 1));
|
||||
CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 2), _mm512_loadu_si512(ptr + i + 3));
|
||||
CSA512(&foursA, &twos, twos, twosA, twosB);
|
||||
CSA512(&twosA, &ones, ones, data[i+4], data[i+5]);
|
||||
CSA512(&twosB, &ones, ones, data[i+6], data[i+7]);
|
||||
CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 4), _mm512_loadu_si512(ptr + i + 5));
|
||||
CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 6), _mm512_loadu_si512(ptr + i + 7));
|
||||
CSA512(&foursB, &twos, twos, twosA, twosB);
|
||||
CSA512(&eightsA, &fours, fours, foursA, foursB);
|
||||
CSA512(&twosA, &ones, ones, data[i+8], data[i+9]);
|
||||
CSA512(&twosB, &ones, ones, data[i+10], data[i+11]);
|
||||
CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 8), _mm512_loadu_si512(ptr + i + 9));
|
||||
CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 10), _mm512_loadu_si512(ptr + i + 11));
|
||||
CSA512(&foursA, &twos, twos, twosA, twosB);
|
||||
CSA512(&twosA, &ones, ones, data[i+12], data[i+13]);
|
||||
CSA512(&twosB, &ones, ones, data[i+14], data[i+15]);
|
||||
CSA512(&twosA, &ones, ones, _mm512_loadu_si512(ptr + i + 12), _mm512_loadu_si512(ptr + i + 13));
|
||||
CSA512(&twosB, &ones, ones, _mm512_loadu_si512(ptr + i + 14), _mm512_loadu_si512(ptr + i + 15));
|
||||
CSA512(&foursB, &twos, twos, twosA, twosB);
|
||||
CSA512(&eightsB, &fours, fours, foursA, foursB);
|
||||
CSA512(&sixteens, &eights, eights, eightsA, eightsB);
|
||||
@ -548,7 +532,7 @@ static inline uint64_t popcnt_avx512(const __m512i* data, const uint64_t size)
|
||||
cnt = _mm512_add_epi64(cnt, popcnt512(ones));
|
||||
|
||||
for(; i < size; i++)
|
||||
cnt = _mm512_add_epi64(cnt, popcnt512(data[i]));
|
||||
cnt = _mm512_add_epi64(cnt, popcnt512(_mm512_loadu_si512(ptr + i)));
|
||||
|
||||
cnt64 = (uint64_t*) &cnt;
|
||||
|
||||
@ -562,57 +546,11 @@ static inline uint64_t popcnt_avx512(const __m512i* data, const uint64_t size)
|
||||
cnt64[7];
|
||||
}
|
||||
|
||||
/* Align memory to 64 bytes boundary */
|
||||
static inline void align_avx512(const uint8_t** p, uint64_t* size, uint64_t* cnt)
|
||||
{
|
||||
for (; (uintptr_t) *p % 8; (*p)++)
|
||||
{
|
||||
*cnt += popcnt64(**p);
|
||||
*size -= 1;
|
||||
}
|
||||
for (; (uintptr_t) *p % 64; (*p) += 8)
|
||||
{
|
||||
*cnt += popcnt64(
|
||||
*(const uint64_t*) *p);
|
||||
*size -= 8;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* x86 CPUs */
|
||||
#if defined(X86_OR_X64)
|
||||
|
||||
/* Align memory to 8 bytes boundary */
|
||||
static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt)
|
||||
{
|
||||
for (; *size > 0 && (uintptr_t) *p % 8; (*p)++)
|
||||
{
|
||||
*cnt += popcount64(**p);
|
||||
*size -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint64_t popcount64_unrolled(const uint64_t* data, uint64_t size)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
uint64_t limit = size - size % 4;
|
||||
uint64_t cnt = 0;
|
||||
|
||||
for (; i < limit; i += 4)
|
||||
{
|
||||
cnt += popcount64(data[i+0]);
|
||||
cnt += popcount64(data[i+1]);
|
||||
cnt += popcount64(data[i+2]);
|
||||
cnt += popcount64(data[i+3]);
|
||||
}
|
||||
|
||||
for (; i < size; i++)
|
||||
cnt += popcount64(data[i]);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Count the number of 1 bits in the data array
|
||||
* @data: An array
|
||||
@ -620,10 +558,15 @@ static inline uint64_t popcount64_unrolled(const uint64_t* data, uint64_t size)
|
||||
*/
|
||||
static inline uint64_t popcnt(const void* data, uint64_t size)
|
||||
{
|
||||
const uint8_t* ptr = (const uint8_t*) data;
|
||||
uint64_t i = 0;
|
||||
uint64_t cnt = 0;
|
||||
uint64_t i;
|
||||
const uint8_t* ptr = (const uint8_t*) data;
|
||||
|
||||
/*
|
||||
* CPUID runtime checks are only enabled if this is needed.
|
||||
* E.g. CPUID is disabled when a user compiles his
|
||||
* code using -march=native on a CPU with AVX512.
|
||||
*/
|
||||
#if defined(HAVE_CPUID)
|
||||
#if defined(__cplusplus)
|
||||
/* C++11 thread-safe singleton */
|
||||
@ -645,62 +588,75 @@ static inline uint64_t popcnt(const void* data, uint64_t size)
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX512)
|
||||
|
||||
#if defined(__AVX512__) || defined(__AVX512BW__)
|
||||
/* AVX512 requires arrays >= 1024 bytes */
|
||||
if (i + 1024 <= size)
|
||||
#else
|
||||
if ((cpuid & bit_AVX512) &&
|
||||
size >= 1024)
|
||||
i + 1024 <= size)
|
||||
#endif
|
||||
{
|
||||
align_avx512(&ptr, &size, &cnt);
|
||||
cnt += popcnt_avx512((const __m512i*) ptr, size / 64);
|
||||
ptr += size - size % 64;
|
||||
size = size % 64;
|
||||
const __m512i* ptr512 = (const __m512i*)(ptr + i);
|
||||
cnt += popcnt_avx512(ptr512, (size - i) / 64);
|
||||
i = size - size % 64;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX2)
|
||||
|
||||
#if defined(__AVX2__)
|
||||
/* AVX2 requires arrays >= 512 bytes */
|
||||
if (i + 512 <= size)
|
||||
#else
|
||||
if ((cpuid & bit_AVX2) &&
|
||||
size >= 512)
|
||||
i + 512 <= size)
|
||||
#endif
|
||||
{
|
||||
align_avx2(&ptr, &size, &cnt);
|
||||
cnt += popcnt_avx2((const __m256i*) ptr, size / 32);
|
||||
ptr += size - size % 32;
|
||||
size = size % 32;
|
||||
const __m256i* ptr256 = (const __m256i*)(ptr + i);
|
||||
cnt += popcnt_avx2(ptr256, (size - i) / 32);
|
||||
i = size - size % 32;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_POPCNT)
|
||||
|
||||
/*
|
||||
* The user has compiled without -mpopcnt.
|
||||
* Unfortunately the MSVC compiler does not have
|
||||
* a POPCNT macro so we cannot get rid of the
|
||||
* runtime check for MSVC.
|
||||
*/
|
||||
#if !defined(__POPCNT__)
|
||||
if (cpuid & bit_POPCNT)
|
||||
#endif
|
||||
{
|
||||
cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
|
||||
ptr += size - size % 8;
|
||||
size = size % 8;
|
||||
for (i = 0; i < size; i++)
|
||||
/* We use unaligned memory accesses here to improve performance */
|
||||
for (; i < size - size % 8; i += 8)
|
||||
cnt += popcnt64(*(const uint64_t*)(ptr + i));
|
||||
for (; i < size; i++)
|
||||
cnt += popcnt64(ptr[i]);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* pure integer popcount algorithm */
|
||||
if (size >= 8)
|
||||
#if !defined(HAVE_POPCNT) || \
|
||||
!defined(__POPCNT__)
|
||||
/*
|
||||
* Pure integer popcount algorithm.
|
||||
* We use unaligned memory accesses here to improve performance.
|
||||
*/
|
||||
for (; i < size - size % 8; i += 8)
|
||||
cnt += popcount64(*(const uint64_t*)(ptr + i));
|
||||
|
||||
if (i < size)
|
||||
{
|
||||
align_8(&ptr, &size, &cnt);
|
||||
cnt += popcount64_unrolled((const uint64_t*) ptr, size / 8);
|
||||
ptr += size - size % 8;
|
||||
size = size % 8;
|
||||
uint64_t val = 0;
|
||||
size_t bytes = (size_t)(size - i);
|
||||
memcpy(&val, &ptr[i], bytes);
|
||||
cnt += popcount64(val);
|
||||
}
|
||||
|
||||
/* pure integer popcount algorithm */
|
||||
for (i = 0; i < size; i++)
|
||||
cnt += popcount64(ptr[i]);
|
||||
|
||||
return cnt;
|
||||
#endif
|
||||
}
|
||||
|
||||
#elif defined(__ARM_NEON) || \
|
||||
@ -708,16 +664,6 @@ static inline uint64_t popcnt(const void* data, uint64_t size)
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* Align memory to 8 bytes boundary */
|
||||
static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt)
|
||||
{
|
||||
for (; *size > 0 && (uintptr_t) *p % 8; (*p)++)
|
||||
{
|
||||
*cnt += popcnt64(**p);
|
||||
*size -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t)
|
||||
{
|
||||
return vpadalq_u32(sum, vpaddlq_u16(vpaddlq_u8(t)));
|
||||
@ -730,13 +676,13 @@ static inline uint64x2_t vpadalq(uint64x2_t sum, uint8x16_t t)
|
||||
*/
|
||||
static inline uint64_t popcnt(const void* data, uint64_t size)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
uint64_t cnt = 0;
|
||||
uint64_t chunk_size = 64;
|
||||
const uint8_t* ptr = (const uint8_t*) data;
|
||||
|
||||
if (size >= chunk_size)
|
||||
{
|
||||
uint64_t i = 0;
|
||||
uint64_t iters = size / chunk_size;
|
||||
uint64x2_t sum = vcombine_u64(vcreate_u64(0), vcreate_u64(0));
|
||||
uint8x16_t zero = vcombine_u8(vcreate_u8(0), vcreate_u8(0));
|
||||
@ -775,25 +721,37 @@ static inline uint64_t popcnt(const void* data, uint64_t size)
|
||||
}
|
||||
while (i < iters);
|
||||
|
||||
i = 0;
|
||||
size %= chunk_size;
|
||||
|
||||
uint64_t tmp[2];
|
||||
vst1q_u64(tmp, sum);
|
||||
cnt += tmp[0];
|
||||
cnt += tmp[1];
|
||||
}
|
||||
|
||||
size %= chunk_size;
|
||||
align_8(&ptr, &size, &cnt);
|
||||
const uint64_t* ptr64 = (const uint64_t*) ptr;
|
||||
uint64_t iters = size / 8;
|
||||
|
||||
for (uint64_t i = 0; i < iters; i++)
|
||||
cnt += popcnt64(ptr64[i]);
|
||||
|
||||
ptr += size - size % 8;
|
||||
size = size % 8;
|
||||
|
||||
for (uint64_t i = 0; i < size; i++)
|
||||
#if defined(__ARM_FEATURE_UNALIGNED)
|
||||
/* We use unaligned memory accesses here to improve performance */
|
||||
for (; i < size - size % 8; i += 8)
|
||||
cnt += popcnt64(*(const uint64_t*)(ptr + i));
|
||||
#else
|
||||
if (i + 8 <= size)
|
||||
{
|
||||
/* Align memory to an 8 byte boundary */
|
||||
for (; (uintptr_t)(ptr + i) % 8; i++)
|
||||
cnt += popcnt64(ptr[i]);
|
||||
for (; i < size - size % 8; i += 8)
|
||||
cnt += popcnt64(*(const uint64_t*)(ptr + i));
|
||||
}
|
||||
#endif
|
||||
|
||||
if (i < size)
|
||||
{
|
||||
uint64_t val = 0;
|
||||
size_t bytes = (size_t)(size - i);
|
||||
memcpy(&val, &ptr[i], bytes);
|
||||
cnt += popcount64(val);
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
@ -801,16 +759,6 @@ static inline uint64_t popcnt(const void* data, uint64_t size)
|
||||
/* all other CPUs */
|
||||
#else
|
||||
|
||||
/* Align memory to 8 bytes boundary */
|
||||
static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt)
|
||||
{
|
||||
for (; *size > 0 && (uintptr_t) *p % 8; (*p)++)
|
||||
{
|
||||
*cnt += popcnt64(**p);
|
||||
*size -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Count the number of 1 bits in the data array
|
||||
* @data: An array
|
||||
@ -818,15 +766,24 @@ static inline void align_8(const uint8_t** p, uint64_t* size, uint64_t* cnt)
|
||||
*/
|
||||
static inline uint64_t popcnt(const void* data, uint64_t size)
|
||||
{
|
||||
const uint8_t* ptr = (const uint8_t*) data;
|
||||
uint64_t i = 0;
|
||||
uint64_t cnt = 0;
|
||||
uint64_t i;
|
||||
const uint8_t* ptr = (const uint8_t*) data;
|
||||
|
||||
align_8(&ptr, &size, &cnt);
|
||||
cnt += popcnt64_unrolled((const uint64_t*) ptr, size / 8);
|
||||
ptr += size - size % 8;
|
||||
size = size % 8;
|
||||
for (i = 0; i < size; i++)
|
||||
if (size >= 8)
|
||||
{
|
||||
/*
|
||||
* Since we don't know whether this CPU architecture
|
||||
* supports unaligned memory accesses we align
|
||||
* memory to an 8 byte boundary.
|
||||
*/
|
||||
for (; (uintptr_t)(ptr + i) % 8; i++)
|
||||
cnt += popcnt64(ptr[i]);
|
||||
for (; i < size - size % 8; i += 8)
|
||||
cnt += popcnt64(*(const uint64_t*)(ptr + i));
|
||||
}
|
||||
|
||||
for (; i < size; i++)
|
||||
cnt += popcnt64(ptr[i]);
|
||||
|
||||
return cnt;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user