So, I want to set an individual bit of a __m256i register.
Say, my __m256i contains: [ 1 0 1 0 | 1 0 1 0 | ... | 1 0 1 0 ], how do I set and unset the n-th bit?
So, I want to set an individual bit of a __m256i register.
Say, my __m256i contains: [ 1 0 1 0 | 1 0 1 0 | ... | 1 0 1 0 ], how do I set and unset the n-th bit?
Here is an implementation of a function that sets or clears an individual bit inside a vector:
#include <immintrin.h>
#include <assert.h>
// Sets (value == true) or clears (value == false) bit number `position`
// of `vector` in place. `position` must be in 0..255 (checked by assert).
void SetBit(__m256i & vector, size_t position, bool value)
{
    assert(position <= 255);

    // Build the single-bit mask in a zeroed 32-byte buffer: first pick the
    // byte that holds the bit, then the bit inside that byte.
    uint8_t bytes[32] = { 0 };
    const size_t byteIndex = position / 8;
    const size_t bitInByte = position % 8;
    bytes[byteIndex] = 1 << bitInByte;

    const __m256i bitMask = _mm256_loadu_si256((__m256i*)bytes);

    // OR switches the masked bit on, ANDNOT switches it off.
    vector = value ? _mm256_or_si256(bitMask, vector)
                   : _mm256_andnot_si256(bitMask, vector);
}
// Demo: clear bit 54 of an all-ones vector, then set bit 54 of an all-zero one.
int main(int argc, char* argv[])
{
    __m256i allOnes = _mm256_set1_epi8(-1);
    SetBit(allOnes, 54, false);

    __m256i allZeros = _mm256_setzero_si256();
    SetBit(allZeros, 54, true);

    return 0;
}
There is another implementation:
#include <immintrin.h>
#include <assert.h>
// Applies `mask` to `vector` in place. The template argument selects the
// operation at compile time: true sets the masked bits, false clears them.
template <bool value> void SetMask(const __m256i & mask, __m256i & vector);

// value == true: OR the mask in, switching the masked bits on.
template <> inline void SetMask<true>(const __m256i & mask, __m256i & vector)
{
    const __m256i withBitsSet = _mm256_or_si256(mask, vector);
    vector = withBitsSet;
}

// value == false: ANDNOT with the mask, switching the masked bits off.
template <> inline void SetMask<false>(const __m256i & mask, __m256i & vector)
{
    const __m256i withBitsCleared = _mm256_andnot_si256(mask, vector);
    vector = withBitsCleared;
}
// Sets (value == true) or clears (value == false) bit `position` of `vector`
// in place. Both the bit index and the operation are template arguments, so
// the mask is assembled from compile-time constants instead of a memory table.
// `position` must be in 0..255.
template <int position, bool value> void SetBit(__m256i & vector)
{
    // Reject out-of-range indices: the masks and shifts below would otherwise
    // silently reduce them and a different bit than requested would change.
    assert(position >= 0 && position <= 255);

    // Bit inside its byte.
    const uint8_t mask8 = 1 << (position & 7);
    // Place that byte at index 0..15 inside a zeroed 128-bit half.
    const __m128i mask128 = _mm_insert_epi8(_mm_setzero_si128(), mask8, (position >> 3) & 15);
    // Place the 128-bit half into the low (0) or high (1) half of a zeroed 256-bit mask.
    const __m256i mask256 = _mm256_inserti128_si256(_mm256_setzero_si256(), mask128, position >> 7);

    SetMask<value>(mask256, vector);
}
// Demo: clear bit 50 of an all-ones vector, then set bit 50 of an all-zero one.
int main(int argc, char* argv[])
{
    __m256i allOnes = _mm256_set1_epi8(-1);
    SetBit<50, false>(allOnes);

    __m256i allZeros = _mm256_setzero_si256();
    SetBit<50, true>(allZeros);

    return 0;
}
If you'd like to avoid a LUT and/or store-forwarding stalls, you can do this to set the k-th bit of an avx-256 register:
// Returns x with bit k (0 <= k <= 255) set, without a lookup table.
inline __m256i setbit_256(__m256i x,int k){
    // Constants; after inlining these can hopefully be hoisted out of a loop.
    const __m256i all_ones = _mm256_set1_epi32(-1);
    const __m256i ones = _mm256_srli_epi32(all_ones, 31);  // set1(0x1)
    // Bit index of the lowest bit of each 32-bit element, lowest element first.
    const __m256i first_bit = _mm256_setr_epi32(0,32,64,96,128,160,192,224);

    // Per-element shift count: k relative to that element's lowest bit.
    // If 0<=k<=255, exactly one element gets a count between 0 and 31.
    const __m256i counts = _mm256_sub_epi32(_mm256_set1_epi32(k), first_bit);

    // Counts outside 0..31 shift the bit out of the element, so the result
    // has the kth bit set and all 255 other bits zero.
    const __m256i bit_k = _mm256_sllv_epi32(ones, counts);

    return _mm256_or_si256(bit_k, x);  // use _mm256_andnot_si256 to unset the k-th bit
}
Below is my previous answer, which is less straightforward and is now obsolete.
#include <immintrin.h>
// Returns x with bit k set: builds 1<<(k%32) in every 32-bit element, then
// keeps it only in the element whose index equals k>>5.
inline __m256i setbit_256(__m256i x,int k){
    // constants that will (hopefully) be hoisted out of a loop after inlining
    const __m256i all_ones = _mm256_set1_epi32(-1);
    const __m256i low5 = _mm256_srli_epi32(all_ones, 27);  // set1(0x1f) mask for the shift within elements
    const __m256i ones = _mm256_srli_epi32(all_ones, 31);  // set1(0x1)
    const __m256i elem_index = _mm256_setr_epi32(0,1,2,3,4,5,6,7);

    const __m256i kvec = _mm256_set1_epi32(k);

    // set1( 1<<(k%32) ): the bit at the right distance within each element
    const __m256i bit_in_elem = _mm256_sllv_epi32(ones, _mm256_and_si256(low5, kvec));

    // all-ones in the selected element (the one whose index is k>>5)
    const __m256i selected = _mm256_cmpeq_epi32(_mm256_srli_epi32(kvec, 5), elem_index);

    // kth bit set, all 255 other bits zero
    const __m256i bit_k = _mm256_and_si256(bit_in_elem, selected);

    return _mm256_or_si256(bit_k, x);  /* use _mm256_andnot_si256 to unset the k-th bit */
}
I'm not sure if this will be any faster than the approaches suggested in the other answers.
This compiles to pretty good asm with clang or gcc (Godbolt compiler explorer), considering that the constants will be hoisted out of loops. As usual, clang defeats the attempt to generate constants on the fly, and broadcast-loads them from memory (which is very efficient on modern CPUs).
If you like to avoid a LUT, you can use BTS for setting a single bit (or BTR for clearing it, respectively). There seems to be no intrinsic for this instruction (at least in GCC), so inline-assembly is required (so for x86 architecture only).
0F AB /r --- BTS r/m32, r32 --- Store selected bit in CF flag and set.
They're very slow with memory operands, but these Bit-String instructions allow bit-offsets that go outside of the byte or dword referenced by the addressing mode. The manual explains:
Some assemblers support immediate bit offsets larger than 31 by using the immediate bit offset field in combination with the displacement field of the memory operand. In this case, the low-order 3 or 5 bits (3 for 16-bit operands, 5 for 32-bit operands) of the immediate bit offset are stored in the immediate bit offset field, and the high-order bits are shifted and combined with the byte displacement in the addressing mode by the assembler. The processor will ignore the high order bits if they are not zero.
When accessing a bit in memory, the processor may access 4 bytes starting from the memory address for a 32-bit operand size, using by the following relationship:
Effective Address + (4 ∗ (BitOffset DIV 32))
In pure assembler (Intel-MASM-syntax) this would look like this:
; Set one bit of a 256-bit YMM value by spilling it to memory and using BTS.
.data
.align 16
save db 32 dup(0) ; 256bit = 32 byte YMM/__m256i temp variable space
bitNumber dd 254 ; use an UINT for the bit to set (here the second to last)
.code
mov eax, bitNumber
...
lea edx, save
vmovdqu ymmword ptr [edx], ymm0 ; save all 32 bytes of the __m256i to memory (unaligned store: save is only 16-byte aligned)
bts dword ptr [edx], eax ; set bit number 254; with a register bit offset BTS can reach any dword of save
vmovdqu ymm0, ymmword ptr [edx] ; read the full __m256i back to the register
...
If the variable already is in memory, this would be even easier.
Using inline assembly, this would result in the following functions:
/*
 * Set bit number `bit` of the 256-bit value at `value`, in place.
 *
 * The bit index is forced into a register ("r"). With an immediate bit
 * offset the processor only uses the low 5 bits for a 32-bit operand, so an
 * "ri" constraint would let a compile-time constant such as 254 be emitted as
 * an immediate and modify bit 254 % 32 of the first dword instead of bit 254
 * of the vector. With a register offset the instruction addresses the dword
 * at Effective Address + 4 * (bit / 32).
 *
 * `bit` is expected to be in 0..255; per the relationship above, larger
 * offsets would address memory beyond *value.
 */
static inline
void set_m256i_bit(__m256i * value, uint32_t bit)
{
    // doesn't need to be volatile: we only want to run this for its effect on *value.
    __asm__ ("btsl %[bit], %[memval]\n\t"
             : [memval] "+m" (*value) : [bit] "r" (bit));
}
/*
 * Clear bit number `bit` of the 256-bit value at `value`, in place.
 *
 * The bit index is forced into a register ("r"). With an immediate bit
 * offset the processor only uses the low 5 bits for a 32-bit operand, so an
 * "ri" constraint would let a compile-time constant such as 254 be emitted as
 * an immediate and modify bit 254 % 32 of the first dword instead of bit 254
 * of the vector. With a register offset the instruction addresses the dword
 * at Effective Address + 4 * (bit / 32).
 *
 * `bit` is expected to be in 0..255; per the relationship above, larger
 * offsets would address memory beyond *value.
 */
static inline
void clear_m256i_bit(__m256i * value, uint32_t bit)
{
    // not volatile: the asm is only wanted for its effect on *value.
    __asm__ ( "btrl %[bit], %[memval]\n\t"
              : [memval] "+m" (*value) : [bit] "r" (bit));
}
These compile to what you'd expect on the Godbolt compiler explorer
And some test code similar to the assembler code above:
// Start from an all-zero vector, set bit 254, then clear it again.
__m256i value = _mm256_setzero_si256();
set_m256i_bit(&value, 254);
clear_m256i_bit(&value, 254);