crc32x86.c @ 0:422835bc1aca
*: checkin
| | |
|---|---|
| author | Paper <paper@tflc.us> |
| date | Mon, 09 Feb 2026 01:15:00 -0500 |
| parents | |
| children | ead9f84d11db |
/* x86-specific CRC routines */

#include "crc32.h"
#include "crc32i.h"

#include <stdio.h>
#include <immintrin.h>

#define BITREVERSE64EX(THIS) \
    ( \
      (((THIS) & 0x0000000000000001) << 63) \
    | (((THIS) & 0x0000000000000002) << 61) \
    | (((THIS) & 0x0000000000000004) << 59) \
    | (((THIS) & 0x0000000000000008) << 57) \
    | (((THIS) & 0x0000000000000010) << 55) \
    | (((THIS) & 0x0000000000000020) << 53) \
    | (((THIS) & 0x0000000000000040) << 51) \
    | (((THIS) & 0x0000000000000080) << 49) \
    | (((THIS) & 0x0000000000000100) << 47) \
    | (((THIS) & 0x0000000000000200) << 45) \
    | (((THIS) & 0x0000000000000400) << 43) \
    | (((THIS) & 0x0000000000000800) << 41) \
    | (((THIS) & 0x0000000000001000) << 39) \
    | (((THIS) & 0x0000000000002000) << 37) \
    | (((THIS) & 0x0000000000004000) << 35) \
    | (((THIS) & 0x0000000000008000) << 33) \
    | (((THIS) & 0x0000000000010000) << 31) \
    | (((THIS) & 0x0000000000020000) << 29) \
    | (((THIS) & 0x0000000000040000) << 27) \
    | (((THIS) & 0x0000000000080000) << 25) \
    | (((THIS) & 0x0000000000100000) << 23) \
    | (((THIS) & 0x0000000000200000) << 21) \
    | (((THIS) & 0x0000000000400000) << 19) \
    | (((THIS) & 0x0000000000800000) << 17) \
    | (((THIS) & 0x0000000001000000) << 15) \
    | (((THIS) & 0x0000000002000000) << 13) \
    | (((THIS) & 0x0000000004000000) << 11) \
    | (((THIS) & 0x0000000008000000) << 9) \
    | (((THIS) & 0x0000000010000000) << 7) \
    | (((THIS) & 0x0000000020000000) << 5) \
    | (((THIS) & 0x0000000040000000) << 3) \
    | (((THIS) & 0x0000000080000000) << 1) \
    | (((THIS) & 0x0000000100000000) >> 1) \
    | (((THIS) & 0x0000000200000000) >> 3) \
    | (((THIS) & 0x0000000400000000) >> 5) \
    | (((THIS) & 0x0000000800000000) >> 7) \
    | (((THIS) & 0x0000001000000000) >> 9) \
    | (((THIS) & 0x0000002000000000) >> 11) \
    | (((THIS) & 0x0000004000000000) >> 13) \
    | (((THIS) & 0x0000008000000000) >> 15) \
    | (((THIS) & 0x0000010000000000) >> 17) \
    | (((THIS) & 0x0000020000000000) >> 19) \
    | (((THIS) & 0x0000040000000000) >> 21) \
    | (((THIS) & 0x0000080000000000) >> 23) \
    | (((THIS) & 0x0000100000000000) >> 25) \
    | (((THIS) & 0x0000200000000000) >> 27) \
    | (((THIS) & 0x0000400000000000) >> 29) \
    | (((THIS) & 0x0000800000000000) >> 31) \
    | (((THIS) & 0x0001000000000000) >> 33) \
    | (((THIS) & 0x0002000000000000) >> 35) \
    | (((THIS) & 0x0004000000000000) >> 37) \
    | (((THIS) & 0x0008000000000000) >> 39) \
    | (((THIS) & 0x0010000000000000) >> 41) \
    | (((THIS) & 0x0020000000000000) >> 43) \
    | (((THIS) & 0x0040000000000000) >> 45) \
    | (((THIS) & 0x0080000000000000) >> 47) \
    | (((THIS) & 0x0100000000000000) >> 49) \
    | (((THIS) & 0x0200000000000000) >> 51) \
    | (((THIS) & 0x0400000000000000) >> 53) \
    | (((THIS) & 0x0800000000000000) >> 55) \
    | (((THIS) & 0x1000000000000000) >> 57) \
    | (((THIS) & 0x2000000000000000) >> 59) \
    | (((THIS) & 0x4000000000000000) >> 61) \
    | (((THIS) & 0x8000000000000000) >> 63) \
    )
#define BITREVERSE64(THIS) \
    (BITREVERSE64EX((uint64_t)(THIS)))

#define BITREVERSE32EX(THIS) \
    ( \
      (((THIS) & 0x00000001) << 31) \
    | (((THIS) & 0x00000002) << 29) \
    | (((THIS) & 0x00000004) << 27) \
    | (((THIS) & 0x00000008) << 25) \
    | (((THIS) & 0x00000010) << 23) \
    | (((THIS) & 0x00000020) << 21) \
    | (((THIS) & 0x00000040) << 19) \
    | (((THIS) & 0x00000080) << 17) \
    | (((THIS) & 0x00000100) << 15) \
    | (((THIS) & 0x00000200) << 13) \
    | (((THIS) & 0x00000400) << 11) \
    | (((THIS) & 0x00000800) << 9) \
    | (((THIS) & 0x00001000) << 7) \
    | (((THIS) & 0x00002000) << 5) \
    | (((THIS) & 0x00004000) << 3) \
    | (((THIS) & 0x00008000) << 1) \
    | (((THIS) & 0x00010000) >> 1) \
    | (((THIS) & 0x00020000) >> 3) \
    | (((THIS) & 0x00040000) >> 5) \
    | (((THIS) & 0x00080000) >> 7) \
    | (((THIS) & 0x00100000) >> 9) \
    | (((THIS) & 0x00200000) >> 11) \
    | (((THIS) & 0x00400000) >> 13) \
    | (((THIS) & 0x00800000) >> 15) \
    | (((THIS) & 0x01000000) >> 17) \
    | (((THIS) & 0x02000000) >> 19) \
    | (((THIS) & 0x04000000) >> 21) \
    | (((THIS) & 0x08000000) >> 23) \
    | (((THIS) & 0x10000000) >> 25) \
    | (((THIS) & 0x20000000) >> 27) \
    | (((THIS) & 0x40000000) >> 29) \
    | (((THIS) & 0x80000000) >> 31) \
    )
#define BITREVERSE32(THIS) \
    (BITREVERSE32EX((uint32_t)(THIS)))

enum {
    XNDIVP_RK08F = (BITREVERSE32(CRC32_POLYNOMIAL)) | 0x100000000ull,
    XNDIVP_RK08R = (BITREVERSE64(XNDIVP_RK08F) >> 31) | 1,

    /* The beginning ... */
    XNDIVP_MOD_ITER_0 = XNDIVP_RK08F,
    XNDIVP_DIV_ITER_0 = 1,

    /* to generate table, run this:
    #include <stdio.h>
    int main(void)
    {
        unsigned i;
        for (i = 1; i <= (4*128+32); i++) {
            printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
            printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
        }
        return 0;
    }
    */
#define XNDIVP_MOD_ITER(This, last) \
    XNDIVP_MOD_ITER_##This = (uint64_t)((XNDIVP_MOD_ITER_##last << 1) ^ ((XNDIVP_MOD_ITER_##last & 0x80000000) ? (XNDIVP_RK08F) : 0)),
#define XNDIVP_DIV_ITER(This, last) \
    XNDIVP_DIV_ITER_##This = (uint64_t)(((uint64_t)XNDIVP_DIV_ITER_##last << 1) | ((XNDIVP_MOD_ITER_##last & 0x80000000ull) ? 1 : 0)),
#include "crc32x86-tab.h"
#undef XNDIVP_MOD_ITER
#undef XNDIVP_DIV_ITER

#define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
    RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
    RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
    RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
    RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
    RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
    RK08 = XNDIVP_RK08R,
#undef FIXUPCONSTANTS
};

__attribute__((__target__("vpclmulqdq")))
uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
{
    /* This also works for 16-byte buffers, but whether that is actually
     * useful or faster is another question entirely */
    if (sz >= 32) {
        static const __attribute__((__aligned__(16))) uint64_t
            rk01[2]  = {RK01, RK02},
            rk05[2]  = {RK05, RK06},
            rk07[2]  = {RK07, RK08},
            mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
        __m128i rk, msgxmm;

        msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
        rk = _mm_load_si128((__m128i *)rk01);
        for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) {
            msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10),
                                                 _mm_clmulepi64_si128(msgxmm, rk, 0x01)),
                                   _mm_load_si128((__m128i *)msg));
        }

        rk = _mm_load_si128((__m128i *)rk05);
        msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
        msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11),
                               _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));

        /* Barrett Reduction */
        rk = _mm_load_si128((__m128i *)rk07);
        msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
        crc = _mm_extract_epi32(msgxmm, 2);
    }

    if (!sz)
        return crc;

    /* We were already aligned on a 16-byte boundary going in (hopefully,
     * or else this will break), and we process 16 bytes at a time.  That
     * means `msg` is still 16-byte aligned, hence a multiple of 4 bytes,
     * so we don't need to align any further (or use crc32c_r). */
    return crc32qw_r(crc, msg, sz);
}
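A minimal harness for exercising the routine above might look like the sketch below. It is hypothetical and not part of this checkin: the __builtin_cpu_supports("vpclmulqdq") feature test (the feature name needs GCC 11+ or a recent Clang), the 0xFFFFFFFF seed plus final inversion, and the use of crc32qw_r as the portable fallback are all assumptions made for illustration. The buffer is kept 16-byte aligned because the vector path loads it with _mm_load_si128.

/* Hypothetical test harness -- not part of crc32x86.c.  Assumes the usual
 * reflected-CRC convention (seed 0xFFFFFFFF, invert on output) and that
 * crc32i.h declares both crc32x86_vpclmulqdq_r and crc32qw_r. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "crc32.h"
#include "crc32i.h"

int main(void)
{
    /* 16-byte aligned, as the vector path requires (_mm_load_si128) */
    static __attribute__((__aligned__(16))) unsigned char buf[48];
    uint32_t crc;

    memset(buf, 'a', sizeof buf);

    if (__builtin_cpu_supports("vpclmulqdq"))  /* feature name: GCC 11+ / recent Clang */
        crc = crc32x86_vpclmulqdq_r(0xFFFFFFFFu, buf, sizeof buf);
    else
        crc = crc32qw_r(0xFFFFFFFFu, buf, sizeof buf);  /* assumed portable fallback */

    printf("crc32: %08x\n", crc ^ 0xFFFFFFFFu);
    return 0;
}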

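When validating the folded output, a bit-at-a-time reference is handy. The sketch below is the textbook reflected CRC-32 update loop; the 0xEDB88320 constant is the reflected form of the standard CRC-32 polynomial, and whether that matches CRC32_POLYNOMIAL and the seeding convention of the _r entry points here is an assumption to verify, not something this file states.

/* Textbook bit-at-a-time reflected CRC-32 update, for cross-checking only.
 * No seed or final inversion is applied, mirroring what the "_r" suffix
 * appears to denote (a raw update). */
#include <stddef.h>
#include <stdint.h>

static uint32_t crc32_bitwise_ref(uint32_t crc, const unsigned char *msg, size_t sz)
{
    while (sz--) {
        unsigned bit;

        crc ^= *msg++;                 /* fold the next byte into the low bits */
        for (bit = 0; bit < 8; bit++)  /* one polynomial division step per bit */
            crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320u : 0);
    }
    return crc;
}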