diff crc32x86.c @ 3:6483683ac857 default tip
*: add profiling code too; expand x86 to use all eight XMM registers
basically ported verbatim from the assembly
| field | value |
|---|---|
| author | Paper <paper@tflc.us> |
| date | Mon, 09 Feb 2026 21:30:30 -0500 |
| parents | ead9f84d11db |
| children | |
```diff
--- a/crc32x86.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32x86.c	Mon Feb 09 21:30:30 2026 -0500
@@ -2,6 +2,16 @@
 #ifdef __x86_64__
 
+/* NOTE: None of this is really x86-specific.
+ * There are probably many other architectures with
+ * native 64x64->128.
+ *
+ * We could adapt this to use just the gcc uint128_t
+ * instead of x86 intrinsics, but it may slow things
+ * down a bit. */
+
+#define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq")))
+
 #include "crc32.h"
 #include "crc32i.h"
 #include <stdio.h>
 
@@ -132,7 +142,7 @@
 {
         unsigned i;
 
-        for (i = 1; i <= (4*128+32); i++) {
+        for (i = 1; i <= 1024; i++) {
                 printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
                 printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
         }
@@ -155,44 +165,135 @@
 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
         RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
         RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
+        RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960),
+        RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024),
         RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
         RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
         RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
         RK08 = XNDIVP_RK08R,
+        RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832),
+        RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896),
+        RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704),
+        RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768),
+        RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576),
+        RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640),
+        RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448),
+        RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512),
+        RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320),
+        RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384),
+        RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192),
+        RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256),
 #undef FIXUPCONSTANTS
 };
 
-__attribute__((__target__("vpclmulqdq")))
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+uint32_t crc32x86_barrett_reduction(__m128i msgxmm)
+{
+        static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06},
+                rk07[2] = {RK07, RK08},
+                mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
+        __m128i rk;
+
+        rk = _mm_load_si128((__m128i *)rk05);
+
+        msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+
+        msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
+
+        /* Barrett Reduction */
+        rk = _mm_load_si128((__m128i *)rk07);
+        msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
+
+        return _mm_extract_epi32(msgxmm, 2);
+}
+
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+__m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next)
+{
+        return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10)));
+}
+
+/* GCC-specific shit */
+VPCLMULQDQ_TARGET
 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
 {
+        static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02},
+                rk03[2] = {RK03, RK04},
+                rk09[2] = {RK09, RK10},
+                rk11[2] = {RK11, RK12},
+                rk13[2] = {RK13, RK14},
+                rk15[2] = {RK15, RK16},
+                rk17[2] = {RK17, RK18},
+                rk19[2] = {RK19, RK20};
+        __m128i msgxmm;
+
+        if (sz >= 256) {
+                __m128i rk, msgxmma[8], xmm8;
+
+                /* receive first 128 bytes */
+                msgxmma[0] = _mm_load_si128((__m128i *)msg + 0);
+                msgxmma[1] = _mm_load_si128((__m128i *)msg + 1);
+                msgxmma[2] = _mm_load_si128((__m128i *)msg + 2);
+                msgxmma[3] = _mm_load_si128((__m128i *)msg + 3);
+                msgxmma[4] = _mm_load_si128((__m128i *)msg + 4);
+                msgxmma[5] = _mm_load_si128((__m128i *)msg + 5);
+                msgxmma[6] = _mm_load_si128((__m128i *)msg + 6);
+                msgxmma[7] = _mm_load_si128((__m128i *)msg + 7);
+                msg += 128;
+                sz -= 128;
+
+                /* XOR the initial CRC */
+                msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc));
+
+                rk = _mm_load_si128((__m128i *)rk03);
+
+                for (; sz >= 128; msg += 128, sz -= 128) {
+                        /* loop unrolled */
+                        msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0));
+                        msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1));
+                        msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2));
+                        msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3));
+                        msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4));
+                        msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5));
+                        msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6));
+                        msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7));
+                }
+
+                /* Fold it all into one xmm register */
+                msgxmm = msgxmma[7];
+
+                msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm);
+                msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm);
+                msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm);
+                msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm);
+                msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm);
+                msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm);
+                msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm);
+
+                /* Jump across into the 16-byte code, skipping the loading.
+                 * This is much simpler than either doing two barrett reductions or
+                 * adding a whole ton of branches... */
+                goto jmpFrom128byte;
+        }
+
         /* This actually works for 16-byte buffers too, but whether it's actually
          * useful or faster is another question entirely */
         if (sz >= 32) {
-                static const __attribute__((__aligned__(16))) uint64_t rk01[2] = {RK01, RK02},
-                        rk05[2] = {RK05, RK06},
-                        rk07[2] = {RK07, RK08},
-                        mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
-                __m128i rk, msgxmm;
+                __m128i rk;
 
                 msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
+                msg += 16;
+                sz -= 16;
 
+jmpFrom128byte:
                 rk = _mm_load_si128((__m128i *)rk01);
 
-                for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) {
-                        msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10), _mm_clmulepi64_si128(msgxmm, rk, 0x01)), _mm_load_si128((__m128i *)msg));
-                }
-
-                rk = _mm_load_si128((__m128i *)rk05);
-
-                msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+                for (; sz >= 16; msg += 16, sz -= 16)
+                        msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg));
 
-                msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
-
-                /* Barrett Reduction */
-                rk = _mm_load_si128((__m128i *)rk07);
-                msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
-
-                crc = _mm_extract_epi32(msgxmm, 2);
+                crc = crc32x86_barrett_reduction(msgxmm);
         }
 
         if (!sz)
                 return crc;
```
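
The folded path is easiest to sanity-check against a plain bitwise CRC-32. The sketch below is not part of the patch: it assumes the routine targets the standard reflected CRC-32 (polynomial 0xEDB88320), which the bit-reversed `FIXUPCONSTANTS` values and the final 32-bit extract suggest but do not prove, and the name `crc32_bitwise_r` is invented here. Like `crc32x86_vpclmulqdq_r()` appears to, it works on the raw CRC value, leaving the initial 0xFFFFFFFF and the final complement to the caller.

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise reference CRC-32 (reflected, polynomial 0xEDB88320), operating on the
 * "raw" CRC value: no initial or final XOR, the caller conditions the value.
 * Hypothetical helper, not part of the patch. */
static uint32_t crc32_bitwise_r(uint32_t crc, const unsigned char *msg, size_t sz)
{
        int k;

        while (sz--) {
                crc ^= *msg++;
                for (k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
        }
        return crc;
}

int main(void)
{
        /* "123456789" is the usual CRC-32 check vector; the expected value is 0xCBF43926. */
        static const unsigned char check[] = "123456789";
        uint32_t crc = ~crc32_bitwise_r(0xFFFFFFFFu, check, sizeof(check) - 1);

        printf("crc32(\"123456789\") = 0x%08X (expect 0xCBF43926)\n", (unsigned)crc);
        return 0;
}
```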

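The heart of the new eight-register path is `crc32x86_fold()`: each 16-byte lane is carry-less-multiplied by a pair of precomputed constants and XORed into the next block of data, and since carry-less multiplication is linear over GF(2), the eight lanes can be folded independently and merged afterwards with per-lane constants (RK09..RK20, then RK01/RK02). The scalar model below only illustrates that structure; the `u128` and `clmul64` helpers are made up for the sketch and no real CRC constants are used.

```c
#include <stdint.h>
#include <stdio.h>

/* 128-bit value as two 64-bit halves (lo = bytes 0..7, hi = bytes 8..15). */
typedef struct { uint64_t lo, hi; } u128;

/* Software 64x64 -> 128 carry-less multiply (what one PCLMULQDQ does). */
static u128 clmul64(uint64_t a, uint64_t b)
{
        u128 r = {0, 0};
        int i;

        for (i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        r.lo ^= a << i;
                        if (i)
                                r.hi ^= a >> (64 - i);
                }
        }
        return r;
}

static u128 xor128(u128 a, u128 b)
{
        u128 r = {a.lo ^ b.lo, a.hi ^ b.hi};
        return r;
}

/* Scalar model of crc32x86_fold():
 *   next ^ clmul(hi(x), lo(rk)) ^ clmul(lo(x), hi(rk)),
 * the same combination as _mm_clmulepi64_si128(x, rk, 0x01) XORed with
 * _mm_clmulepi64_si128(x, rk, 0x10). */
static u128 fold(u128 x, u128 rk, u128 next)
{
        return xor128(next, xor128(clmul64(x.hi, rk.lo), clmul64(x.lo, rk.hi)));
}

int main(void)
{
        u128 a    = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
        u128 b    = {0xdeadbeefcafef00dULL, 0x0f1e2d3c4b5a6978ULL};
        u128 rk   = {0x1111111122222222ULL, 0x3333333344444444ULL};
        u128 zero = {0, 0};
        u128 lhs, rhs;

        /* Folding is GF(2)-linear in the data, which is why eight lanes can be
         * folded independently and merged with per-lane constants afterwards. */
        lhs = fold(xor128(a, b), rk, zero);
        rhs = xor128(fold(a, rk, zero), fold(b, rk, zero));
        printf("fold is linear: %s\n",
               (lhs.lo == rhs.lo && lhs.hi == rhs.hi) ? "yes" : "no");
        return 0;
}
```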