So I happened to be implementing a sparse hashtable in C. I wanted to have lazily allocated buckets kept in a linear array and indexed using a bitmap, where the number of previous `1`

before the index in the bitmap is used as an index into the linear array of buckets. Getting the number of `1`

bits set is called a population count and is also provided by the compiler intrinsics `__builtin_popcount`

in various forms.

# Why not just use intrinsics?

At least for LLVM and GCC, the code emitted for `__builtin_popcount`

uses the "counting bits set in parallel" bit-twiddling hack. While the code translates rather nicely to assembly language for most platforms, it can never reach the speeds of real hardware-accelerated population count. For more information about how to speed up population count on Intel, refer to this excellent analysis on the topic.

Let's take a look how the `__builtin_popcountll`

translates into assembly, as we want to operate on the widest value type supported:

```
__popcountdi2:
0x0000000000400bd8 <+0>: lsr x1, x0, #1
0x0000000000400bdc <+4>: and x1, x1, #0x5555555555555555
0x0000000000400be0 <+8>: sub x1, x0, x1
0x0000000000400be4 <+12>: and x0, x1, #0x3333333333333333
0x0000000000400be8 <+16>: lsr x1, x1, #2
0x0000000000400bec <+20>: and x1, x1, #0x3333333333333333
0x0000000000400bf0 <+24>: add x0, x0, x1
0x0000000000400bf4 <+28>: add x0, x0, x0, lsr #4
0x0000000000400bf8 <+32>: and x0, x0, #0xf0f0f0f0f0f0f0f
0x0000000000400bfc <+36>: add x0, x0, x0, lsl #8
0x0000000000400c00 <+40>: add x0, x0, x0, lsl #16
0x0000000000400c04 <+44>: add x0, x0, x0, lsl #32
0x0000000000400c08 <+48>: lsr x0, x0, #56
0x0000000000400c0c <+52>: ret
```

Only 2 registers clobbered and 14 instructions in total — I can't help but think this actually looks more *readable* than the original C implementation. Is this the best we can do? Nope.

# Let's SIMD do it in 8 instructions

AArch64 has in its arsenal an instruction called `CNT`

whose sole purpose is to count bits in the target register, even better it's actually a `SIMD`

instruction, so we can swizzle it up to operate on 128bits at once and store the result within a single vector register:

```
LD1 {v0.2D, v1.2D}, [%1], #32 ; load 32 bytes from where %1 points and
; increments %1 by 32, data are loaded into
; v0 and v1, `2D` means 2*8 bytes per
; register (128 bit register width)
```

Incrementing the pointer is important because we don't want to index the incoming buffer, that would take just additional time, let's popcount:

```
CNT v0.16b, v0.16b
CNT v1.16b, v1.16b
```

Like I mentioned above, the `CNT`

instruction can count and store within a single vector register, the `16b`

swizzle means to interpret the data in the vector register as 16 consecutive bytes, and we'll see shortly how this affects the return layout of the vector register.

```
union vector_register {
struct swizzle_16b {
unsigned char b0;
unsigned char b1;
...
unsigned char b15;
} bytes;
unsigned char data[16];
} vr;
for (i = 0; i < sizeof(vr.data); i++) {
vr.data[i] = __num_set_bits(vr.data[i]);
}
```

Executing this algorithm over 2 bytes (just for illustration) we get this:

```
i = 65534 = '11111111 11111110'
i0 = '11111111 00000111' ; 7 bits set in first byte
i1 = '00001000 00000111' ; 8 bits set in second byte
```

Given this storage semantics, we need to do a sum across vector to get the total number of bits set, and luckily there's a single instruction that does that:

```
UADDLV h2, v0.16b
UADDLV h3, v1.16b
```

`h{2,3}`

are the target registers; the `h`

storage qualifier translates to 2 byte integer, that's also the narrowest possible specifier. We could actually do with just 8 bits, as the highest possible sum can be `16*8 = 128`

. But it doesn't matter as we still clobber whole vector registers `v2`

and `v3`

.

`v2`

and `v3`

now contain final sums, we need to add them together:

```
ADD d2, d3, d2
```

Again `d`

is the narrowest possible specifier for `ADD`

, the result is stored in `v2`

.

After that, we want to transfer the result to a general purpose register:

```
UMOV x0, v2.d[0]
```

Moves the lowest 8 bytes of register `v2`

to a general purpose register `x0`

.

# Example benchmark

Optimizations are worthless without proving that they actually improve the situation, so let's see the numbers for `__builtin_popcountll`

and for our `SIMD`

version, we'll slightly unroll both loops so they both process the same amount of data per each iteration:

```
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
/*
 * Count the set bits in `size` 64-bit words starting at `b`, using the
 * AArch64 NEON CNT instruction via inline assembly.  The main loop
 * consumes 4 words (32 bytes) per iteration; the tail loop handles the
 * remaining 0-3 words one 8-byte word at a time.
 * AArch64-only: this will not assemble on other targets.
 * Returns the total population count.
 */
int count_multiple_bits(unsigned long long *b, unsigned int size) {
unsigned long long *d = b; /* the asm post-increments this cursor, so work on a copy */
unsigned int masked = 0, i = 0;
int c = 0;
masked = size & ~3; /* round size down to a multiple of 4 words */
/* Per iteration: load 32 bytes (post-incrementing %1), popcount each
 * byte of v0/v1 with CNT, horizontally sum each register into h2/h3
 * with UADDLV, add the two partial sums, move the result to x0 and
 * accumulate into %0 (c).
 * NOTE(review): %0 is bound to `c`, an int, so the compiler substitutes
 * a 32-bit w-register while the template mixes it with x0 in
 * `ADD %0, x0, %0`; whether this assembles depends on the toolchain —
 * consider `%x0` or a 64-bit accumulator.  Confirm on real hardware. */
for (; i < masked; i += 4)
__asm__("LD1 {v0.2D, v1.2D}, [%1], #32 \n\t"
"CNT v0.16b, v0.16b \n\t"
"CNT v1.16b, v1.16b \n\t"
"UADDLV h2, v0.16b \n\t"
"UADDLV h3, v1.16b \n\t"
"ADD d2, d3, d2 \n\t"
"UMOV x0, v2.d[0] \n\t"
"ADD %0, x0, %0 \n\t"
: "+r"(c), "+r"(d)
:: "x0", "v0", "v1", "v2", "v3");
/* Tail: one 8-byte word per iteration via a single-lane load into v0,
 * then the same CNT + horizontal-sum + accumulate sequence. */
for (; i < size; ++i)
__asm__("LD1 {v0.D}[0], [%1], #8 \n\t"
"CNT v0.8b, v0.8b \n\t"
"UADDLV h1, v0.8b \n\t"
"UMOV x0, v1.d[0] \n\t"
"ADD %0, x0, %0 \n\t"
: "+r"(c), "+r"(d)
:: "x0", "v0", "v1");
return c;
}
/*
 * Reference population count over `size` 64-bit words starting at `p`,
 * using the compiler intrinsic.  Unrolled four words per iteration,
 * with a scalar tail for the remaining 0-3 words.
 * Returns the total number of set bits.
 */
int count_bits_naive(unsigned long long *p, unsigned int size) {
unsigned int idx = 0;
unsigned int unrolled = size - (size % 4u); /* largest multiple of 4 <= size */
int total = 0;
while (idx < unrolled) {
total += __builtin_popcountll(p[idx]);
total += __builtin_popcountll(p[idx + 1]);
total += __builtin_popcountll(p[idx + 2]);
total += __builtin_popcountll(p[idx + 3]);
idx += 4;
}
while (idx < size) {
total += __builtin_popcountll(p[idx]);
++idx;
}
return total;
}
/*
 * Benchmark driver: parses a buffer size from argv[1] (optional k/K or
 * m/M suffix), fills a buffer with random 64-bit words, then times the
 * SIMD popcount against the intrinsic version and prints both results
 * so they can be cross-checked.
 */
int main(int argc, const char *argv[]) {
char *endptr = NULL;
long int size = 0;
size_t c1 = 0, c2 = 0;
if (argc < 2) {
puts("Please provide buffer size argument");
return 1;
}
size = strtol(argv[1], &endptr, 10);
if (size < 0) {
puts("Negative buffer size? Cowardly flipping sign");
size = -size;
}
if (*endptr) {
switch (*endptr) {
case 'm':
case 'M':
size *= 1024 * 1024; break;
case 'k':
case 'K':
size *= 1024; break;
default: break;
}
}
/* random() is seeded by srandom(), not srand() — the old srand() call
 * left random() running from its default seed. */
srandom((unsigned int)clock());
unsigned long long *b = malloc(sizeof *b * (size_t)size);
if (!b) {
puts("Failed to allocate buffer");
return 1;
}
for (long int i = 0; i < size; ++i)
b[i] = (unsigned long long)random() << 32 | (unsigned long long)random();
clock_t start = clock();
c1 = count_multiple_bits(b, (unsigned int)size);
clock_t end = clock();
c2 = count_bits_naive(b, (unsigned int)size);
clock_t end2 = clock();
/* Both counts must match; printing them is the correctness check. */
printf("Test: %zu\n", c1);
printf("Test: %zu\n", c2);
/* clock_t has no portable printf specifier — cast to long for %ld
 * (the previous %d was undefined behavior on LP64). */
printf("Time taken SIMD: %ld (%f seconds) \nTime taken built-in: %ld (%f seconds)\n",
(long)(end - start), (double)(end - start) / CLOCKS_PER_SEC,
(long)(end2 - end), (double)(end2 - end) / CLOCKS_PER_SEC);
printf("Buffer size: %s\n", argv[1]);
free(b);
return 0;
}
```

You can also find this source code in the following Gist.

# What are the numbers?

If you've read the comments at the top of the linked Gist, you may be wondering if this is actually an optimization at all. Running the example under QEMU system emulation suggests that the SIMD version performs better only for buffers bigger than 100K. After running several hundred tests on real AArch64 hardware, both versions run equally for inputs up to 32 bytes, then the SIMD version starts overtaking the intrinsics version, and around 512-byte inputs the performance gain stabilizes at 3.5X.

The test also verifies that both methods compute the same number (`c1`

, `c2`

).

# Conclusion

If you need to `popcnt`

across large buffers, using the SIMD version can give you up to 3.5X speed-up. This speed-up is achieved roughly at 512 byte input buffer boundary.