Use the following SSSE3 intrinsics to perform byte shuffles.
extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b);
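Shuffle the bytes of a according to the control bytes in b: if the high bit (0x80) of a control byte is set, the corresponding result byte is zeroed; otherwise its low 4 bits select which byte of a is copied.
Interpreting a, b, and the result r as arrays of unsigned 8-bit integers: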
for (i = 0; i < 16; i++) {
    if (b[i] & 0x80) {
        r[i] = 0;
    }
    else {
        r[i] = a[b[i] & 0x0F];
    }
}
extern __m64 _mm_shuffle_pi8 (__m64 a, __m64 b);
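Shuffle the bytes of a according to the control bytes in b: if the high bit (0x80) of a control byte is set, the corresponding result byte is zeroed; otherwise its low 3 bits select which byte of a is copied.
Interpreting a, b, and the result r as arrays of unsigned 8-bit integers: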
for (i = 0; i < 8; i++) {
    if (b[i] & 0x80) {
        r[i] = 0;
    }
    else {
        r[i] = a[b[i] & 0x07];
    }
}