diff crc32x86.c @ 3:6483683ac857 default tip

*: add profiling code too; expand x86 to use all eight XMM registers, basically ported verbatim from the assembly
author Paper <paper@tflc.us>
date Mon, 09 Feb 2026 21:30:30 -0500
parents ead9f84d11db
children
--- a/crc32x86.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32x86.c	Mon Feb 09 21:30:30 2026 -0500
@@ -2,6 +2,16 @@
 
 #ifdef __x86_64__
 
+/* NOTE: None of this is really x86-specific.
+ * There are probably many other architectures with
+ * a native carry-less 64x64->128 multiply.
+ *
+ * We could adapt this to use GCC's unsigned __int128
+ * instead of the x86 intrinsics, but it may slow
+ * things down a bit. */
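+
+/* A rough sketch of that idea (illustration only; the name below is made up):
+ * carry-less 64x64->128 multiply with unsigned __int128 and no intrinsics.
+ *
+ *   static unsigned __int128 clmul64(uint64_t a, uint64_t b)
+ *   {
+ *       unsigned __int128 r = 0;
+ *       int i;
+ *       for (i = 0; i < 64; i++)
+ *           if ((b >> i) & 1)
+ *               r ^= (unsigned __int128)a << i;
+ *       return r;
+ *   }
+ *
+ * PCLMULQDQ does this in a single instruction; the bit loop is portable
+ * but far slower. */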
+
+#define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq")))
+
 #include "crc32.h"
 #include "crc32i.h"
 #include <stdio.h>
@@ -132,7 +142,7 @@
 {
 	unsigned i;
 
-	for (i = 1; i <= (4*128+32); i++) {
+	for (i = 1; i <= 1024; i++) {
 		printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
 		printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
 	}
@@ -155,44 +165,135 @@
 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
 	RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
+	RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960),
+	RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024),
 	RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
 	RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
 	RK08 = XNDIVP_RK08R,
+	RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832),
+	RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896),
+	RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704),
+	RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768),
+	RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576),
+	RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640),
+	RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448),
+	RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512),
+	RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320),
+	RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384),
+	RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192),
+	RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256),
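+	/* RK03/RK04 and RK09..RK20 are the remainders x^k mod P for the
+	 * fold distances used by the new eight-register path: each pair
+	 * {x^(d-64) mod P, x^d mod P} folds a 128-bit value across d bits,
+	 * from d = 1024 (the main loop's 128-byte stride) down to d = 128
+	 * when the eight accumulators are folded back together. */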
 #undef FIXUPCONSTANTS
 };
 
-__attribute__((__target__("vpclmulqdq")))
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+uint32_t crc32x86_barrett_reduction(__m128i msgxmm)
+{
+	static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06},
+			rk07[2] = {RK07, RK08},
+			mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
+	__m128i rk;
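+
+	/* Rough sketch of the math (not re-derived from the constants here):
+	 * the 128-bit remainder is first folded down with RK05/RK06
+	 * (x^64 and x^32 mod P), then Barrett reduction avoids a
+	 * polynomial division:
+	 *   T1  = floor(R / x^32) * mu     -- mu = floor(x^64 / P), RK07
+	 *   T2  = floor(T1 / x^32) * P     -- P stored as RK08
+	 *   crc = (R xor T2) mod x^32
+	 * In the bit-reflected layout the result ends up in dword 2 of the
+	 * register, hence the _mm_extract_epi32(msgxmm, 2) below. */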
+
+	rk = _mm_load_si128((__m128i *)rk05);
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
+
+	/* Barrett Reduction */
+	rk = _mm_load_si128((__m128i *)rk07);
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
+
+	return _mm_extract_epi32(msgxmm, 2);
+}
+
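+/* crc32x86_fold folds one 128-bit accumulator forward across a fixed
+ * distance: each 64-bit half of xmm is carry-less multiplied by one half
+ * of rk (one of the x^k mod P pairs above) and the results are XORed into
+ * the next 16 bytes of message. */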
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+__m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next)
+{
+	return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10)));
+}
+
+/* GCC-specific shit */
+VPCLMULQDQ_TARGET
 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
 {
+	static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02},
+			rk03[2] = {RK03, RK04},
+			rk09[2] = {RK09, RK10},
+			rk11[2] = {RK11, RK12},
+			rk13[2] = {RK13, RK14},
+			rk15[2] = {RK15, RK16},
+			rk17[2] = {RK17, RK18},
+			rk19[2] = {RK19, RK20};
+	__m128i msgxmm;
+
+	if (sz >= 256) {
+		__m128i rk, msgxmma[8];
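+		/* sz >= 256 means at least 128 bytes remain after the initial
+		 * 128-byte load, so the fold loop below runs at least once. */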
+
+		/* load the first 128 bytes */
+		msgxmma[0] = _mm_load_si128((__m128i *)msg + 0);
+		msgxmma[1] = _mm_load_si128((__m128i *)msg + 1);
+		msgxmma[2] = _mm_load_si128((__m128i *)msg + 2);
+		msgxmma[3] = _mm_load_si128((__m128i *)msg + 3);
+		msgxmma[4] = _mm_load_si128((__m128i *)msg + 4);
+		msgxmma[5] = _mm_load_si128((__m128i *)msg + 5);
+		msgxmma[6] = _mm_load_si128((__m128i *)msg + 6);
+		msgxmma[7] = _mm_load_si128((__m128i *)msg + 7);
+		msg += 128;
+		sz -= 128;
+
+		/* XOR the initial CRC */
+		msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc));
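+		/* Because CRC is linear, carrying the previous state forward
+		 * only requires XORing it into the first four message bytes,
+		 * i.e. the low dword of block 0. */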
+
+		rk = _mm_load_si128((__m128i *)rk03);
+
+		for (; sz >= 128; msg += 128, sz -= 128) {
+			/* 8-way unrolled fold: 128 bytes per iteration */
+			msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0));
+			msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1));
+			msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2));
+			msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3));
+			msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4));
+			msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5));
+			msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6));
+			msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7));
+		}
+
+		/* Fold it all into one xmm register */
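+		/* Each accumulator is folded across its remaining distance to
+		 * the end of the group and XORed into msgxmma[7]: 112 bytes for
+		 * msgxmma[0] (rk09), 96 for [1] (rk11), ... 16 bytes for [6]
+		 * (rk01). */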
+		msgxmm = msgxmma[7];
+
+		msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm);
+
+		/* Jump into the 16-bytes-at-a-time code below, skipping its
+		 * initial load. This is much simpler than either doing two
+		 * Barrett reductions or adding a whole ton of branches... */
+		goto jmpFrom128byte;
+	}
+
 	/* This also works for 16-byte buffers, but whether it's actually
 	 * useful or faster is another question entirely */
 	if (sz >= 32) {
-		static const __attribute__((__aligned__(16))) uint64_t rk01[2] = {RK01, RK02},
-				rk05[2] = {RK05, RK06},
-				rk07[2] = {RK07, RK08},
-				mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
-		__m128i rk, msgxmm;
+		__m128i rk;
 
 		msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
+		msg += 16;
+		sz -= 16;
 
+jmpFrom128byte:
 		rk = _mm_load_si128((__m128i *)rk01);
 
-		for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) {
-			msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10), _mm_clmulepi64_si128(msgxmm, rk, 0x01)), _mm_load_si128((__m128i *)msg));
-		}
-
-		rk = _mm_load_si128((__m128i *)rk05);
-
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+		for (; sz >= 16; msg += 16, sz -= 16)
+			msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg));
 
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
-
-		/* Barrett Reduction */
-		rk = _mm_load_si128((__m128i *)rk07);
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
-
-		crc = _mm_extract_epi32(msgxmm, 2);
+		crc = crc32x86_barrett_reduction(msgxmm);
 	}
 
 	if (!sz) return crc;