
/* x86-specific CRC routines */

#ifdef __x86_64__

/* NOTE: None of this is really x86-specific.
 * There are probably many other architectures with a
 * native 64x64->128 carry-less multiply.
 *
 * We could adapt this to use GCC's unsigned __int128
 * instead of the x86 intrinsics, but it may slow things
 * down a bit. */

#define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq")))

#include "crc32.h"
#include "crc32i.h"
#include <stdio.h>
#include <immintrin.h>
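
/* Illustrative sketch only, prompted by the note at the top of this file:
 * a portable carry-less 64x64->128 multiply built on GCC's unsigned
 * __int128. It is not used anywhere below and the name is made up; it just
 * shows what replacing _mm_clmulepi64_si128 with plain C would look like,
 * and why it "may slow things down a bit". */
static inline unsigned __int128 crc32x86_clmul64_sketch(uint64_t a, uint64_t b)
{
	unsigned __int128 acc = 0;
	unsigned i;

	/* XOR in a shifted copy of `a` for every set bit of `b`:
	 * polynomial multiplication over GF(2), i.e. multiplication
	 * without carries. */
	for (i = 0; i < 64; i++)
		if ((b >> i) & 1)
			acc ^= (unsigned __int128)a << i;

	return acc;
}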

#define BITREVERSE64EX(THIS) \
( \
	  (((THIS) & 0x0000000000000001) << 63) \
	| (((THIS) & 0x0000000000000002) << 61) \
	| (((THIS) & 0x0000000000000004) << 59) \
	| (((THIS) & 0x0000000000000008) << 57) \
	| (((THIS) & 0x0000000000000010) << 55) \
	| (((THIS) & 0x0000000000000020) << 53) \
	| (((THIS) & 0x0000000000000040) << 51) \
	| (((THIS) & 0x0000000000000080) << 49) \
	| (((THIS) & 0x0000000000000100) << 47) \
	| (((THIS) & 0x0000000000000200) << 45) \
	| (((THIS) & 0x0000000000000400) << 43) \
	| (((THIS) & 0x0000000000000800) << 41) \
	| (((THIS) & 0x0000000000001000) << 39) \
	| (((THIS) & 0x0000000000002000) << 37) \
	| (((THIS) & 0x0000000000004000) << 35) \
	| (((THIS) & 0x0000000000008000) << 33) \
	| (((THIS) & 0x0000000000010000) << 31) \
	| (((THIS) & 0x0000000000020000) << 29) \
	| (((THIS) & 0x0000000000040000) << 27) \
	| (((THIS) & 0x0000000000080000) << 25) \
	| (((THIS) & 0x0000000000100000) << 23) \
	| (((THIS) & 0x0000000000200000) << 21) \
	| (((THIS) & 0x0000000000400000) << 19) \
	| (((THIS) & 0x0000000000800000) << 17) \
	| (((THIS) & 0x0000000001000000) << 15) \
	| (((THIS) & 0x0000000002000000) << 13) \
	| (((THIS) & 0x0000000004000000) << 11) \
	| (((THIS) & 0x0000000008000000) <<  9) \
	| (((THIS) & 0x0000000010000000) <<  7) \
	| (((THIS) & 0x0000000020000000) <<  5) \
	| (((THIS) & 0x0000000040000000) <<  3) \
	| (((THIS) & 0x0000000080000000) <<  1) \
	| (((THIS) & 0x0000000100000000) >>  1) \
	| (((THIS) & 0x0000000200000000) >>  3) \
	| (((THIS) & 0x0000000400000000) >>  5) \
	| (((THIS) & 0x0000000800000000) >>  7) \
	| (((THIS) & 0x0000001000000000) >>  9) \
	| (((THIS) & 0x0000002000000000) >> 11) \
	| (((THIS) & 0x0000004000000000) >> 13) \
	| (((THIS) & 0x0000008000000000) >> 15) \
	| (((THIS) & 0x0000010000000000) >> 17) \
	| (((THIS) & 0x0000020000000000) >> 19) \
	| (((THIS) & 0x0000040000000000) >> 21) \
	| (((THIS) & 0x0000080000000000) >> 23) \
	| (((THIS) & 0x0000100000000000) >> 25) \
	| (((THIS) & 0x0000200000000000) >> 27) \
	| (((THIS) & 0x0000400000000000) >> 29) \
	| (((THIS) & 0x0000800000000000) >> 31) \
	| (((THIS) & 0x0001000000000000) >> 33) \
	| (((THIS) & 0x0002000000000000) >> 35) \
	| (((THIS) & 0x0004000000000000) >> 37) \
	| (((THIS) & 0x0008000000000000) >> 39) \
	| (((THIS) & 0x0010000000000000) >> 41) \
	| (((THIS) & 0x0020000000000000) >> 43) \
	| (((THIS) & 0x0040000000000000) >> 45) \
	| (((THIS) & 0x0080000000000000) >> 47) \
	| (((THIS) & 0x0100000000000000) >> 49) \
	| (((THIS) & 0x0200000000000000) >> 51) \
	| (((THIS) & 0x0400000000000000) >> 53) \
	| (((THIS) & 0x0800000000000000) >> 55) \
	| (((THIS) & 0x1000000000000000) >> 57) \
	| (((THIS) & 0x2000000000000000) >> 59) \
	| (((THIS) & 0x4000000000000000) >> 61) \
	| (((THIS) & 0x8000000000000000) >> 63) \
)

#define BITREVERSE64(THIS) \
	(BITREVERSE64EX((uint64_t)(THIS)))

#define BITREVERSE32EX(THIS) \
( \
	(((THIS) & 0x00000001) << 31) \
	| (((THIS) & 0x00000002) << 29) \
	| (((THIS) & 0x00000004) << 27) \
	| (((THIS) & 0x00000008) << 25) \
	| (((THIS) & 0x00000010) << 23) \
	| (((THIS) & 0x00000020) << 21) \
	| (((THIS) & 0x00000040) << 19) \
	| (((THIS) & 0x00000080) << 17) \
	| (((THIS) & 0x00000100) << 15) \
	| (((THIS) & 0x00000200) << 13) \
	| (((THIS) & 0x00000400) << 11) \
	| (((THIS) & 0x00000800) <<  9) \
	| (((THIS) & 0x00001000) <<  7) \
	| (((THIS) & 0x00002000) <<  5) \
	| (((THIS) & 0x00004000) <<  3) \
	| (((THIS) & 0x00008000) <<  1) \
	| (((THIS) & 0x00010000) >>  1) \
	| (((THIS) & 0x00020000) >>  3) \
	| (((THIS) & 0x00040000) >>  5) \
	| (((THIS) & 0x00080000) >>  7) \
	| (((THIS) & 0x00100000) >>  9) \
	| (((THIS) & 0x00200000) >> 11) \
	| (((THIS) & 0x00400000) >> 13) \
	| (((THIS) & 0x00800000) >> 15) \
	| (((THIS) & 0x01000000) >> 17) \
	| (((THIS) & 0x02000000) >> 19) \
	| (((THIS) & 0x04000000) >> 21) \
	| (((THIS) & 0x08000000) >> 23) \
	| (((THIS) & 0x10000000) >> 25) \
	| (((THIS) & 0x20000000) >> 27) \
	| (((THIS) & 0x40000000) >> 29) \
	| (((THIS) & 0x80000000) >> 31) \
)

#define BITREVERSE32(THIS) \
	(BITREVERSE32EX((uint32_t)(THIS)))
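
/* For reference only: the macros above are spelled out bit-by-bit because
 * the enum below needs integer constant expressions, so a loop or helper
 * function cannot be used there. A minimal runtime equivalent of
 * BITREVERSE32 (unused, and the name is made up) would look like this: */
static inline uint32_t crc32x86_bitreverse32_sketch(uint32_t x)
{
	/* swap adjacent bits, then 2-bit pairs, nibbles, bytes, and halves */
	x = ((x & 0x55555555u) << 1) | ((x >> 1) & 0x55555555u);
	x = ((x & 0x33333333u) << 2) | ((x >> 2) & 0x33333333u);
	x = ((x & 0x0F0F0F0Fu) << 4) | ((x >> 4) & 0x0F0F0F0Fu);
	x = ((x & 0x00FF00FFu) << 8) | ((x >> 8) & 0x00FF00FFu);
	return (x << 16) | (x >> 16);
}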

enum {
	XNDIVP_RK08F = (BITREVERSE32(CRC32_POLYNOMIAL)) | 0x100000000ull,
	XNDIVP_RK08R = (BITREVERSE64(XNDIVP_RK08F) >> 31) | 1,

	/* The beginning ... */
	XNDIVP_MOD_ITER_0 = XNDIVP_RK08F,
	XNDIVP_DIV_ITER_0 = 1,

/* To generate the included table (crc32x86-tab.h), run this:

#include <stdio.h>
int main(void)
{
	unsigned i;

	for (i = 1; i <= 1024; i++) {
		printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
		printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
	}

	return 0;
}
*/

#define XNDIVP_MOD_ITER(This, last) \
	XNDIVP_MOD_ITER_##This = (uint64_t)((XNDIVP_MOD_ITER_##last << 1) ^ ((XNDIVP_MOD_ITER_##last & 0x80000000) ? (XNDIVP_RK08F) : 0)),

#define XNDIVP_DIV_ITER(This, last) \
	XNDIVP_DIV_ITER_##This = (uint64_t)(((uint64_t)XNDIVP_DIV_ITER_##last << 1) | ((XNDIVP_MOD_ITER_##last & 0x80000000ull) ? 1 : 0)),

#include "crc32x86-tab.h"

#undef XNDIVP_MOD_ITER
#undef XNDIVP_DIV_ITER

#define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
	RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
	RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
	RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960),
	RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024),
	RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
	RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
	RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
	RK08 = XNDIVP_RK08R,
	RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832),
	RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896),
	RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704),
	RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768),
	RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576),
	RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640),
	RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448),
	RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512),
	RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320),
	RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384),
	RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192),
	RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256),
#undef FIXUPCONSTANTS
};
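
/* Debugging aid / sketch (unused, not part of the original code): the same
 * recurrence the XNDIVP_MOD_ITER macros expand to at compile time, written
 * as a runtime loop, handy for spot-checking the generated constants.
 * XNDIVP_DIV_ITER follows the same pattern, shifting in a 1 bit whenever
 * bit 31 of the previous MOD value was set. */
static inline uint64_t crc32x86_xndivp_mod_sketch(unsigned n)
{
	uint64_t mod = XNDIVP_MOD_ITER_0; /* == XNDIVP_RK08F */
	unsigned i;

	for (i = 0; i < n; i++)
		mod = (mod << 1) ^ ((mod & 0x80000000) ? XNDIVP_RK08F : 0);

	return mod;
}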

VPCLMULQDQ_TARGET
CRC32_FORCEINLINE
uint32_t crc32x86_barrett_reduction(__m128i msgxmm)
{
	static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06},
			rk07[2] = {RK07, RK08},
			mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
	__m128i rk;

	rk = _mm_load_si128((__m128i *)rk05);

	/* reduce the 128-bit folded value ahead of the Barrett reduction (RK05/RK06) */
	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));

	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));

	/* Barrett Reduction */
	rk = _mm_load_si128((__m128i *)rk07);
	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);

	return _mm_extract_epi32(msgxmm, 2);
}

VPCLMULQDQ_TARGET
CRC32_FORCEINLINE
__m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next)
{
	/* fold one 128-bit chunk into the running value:
	 * next ^ (xmm.hi * rk.lo) ^ (xmm.lo * rk.hi), with carry-less multiplies */
	return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10)));
}

/* GCC-specific (relies on the __target__ attribute machinery above) */
VPCLMULQDQ_TARGET
uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
{
	static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02},
			rk03[2] = {RK03, RK04},
			rk09[2] = {RK09, RK10},
			rk11[2] = {RK11, RK12},
			rk13[2] = {RK13, RK14},
			rk15[2] = {RK15, RK16},
			rk17[2] = {RK17, RK18},
			rk19[2] = {RK19, RK20};
	__m128i msgxmm;

	if (sz >= 256) {
		__m128i rk, msgxmma[8], xmm8;

		/* load the first 128 bytes */
		msgxmma[0] = _mm_load_si128((__m128i *)msg + 0);
		msgxmma[1] = _mm_load_si128((__m128i *)msg + 1);
		msgxmma[2] = _mm_load_si128((__m128i *)msg + 2);
		msgxmma[3] = _mm_load_si128((__m128i *)msg + 3);
		msgxmma[4] = _mm_load_si128((__m128i *)msg + 4);
		msgxmma[5] = _mm_load_si128((__m128i *)msg + 5);
		msgxmma[6] = _mm_load_si128((__m128i *)msg + 6);
		msgxmma[7] = _mm_load_si128((__m128i *)msg + 7);
		msg += 128;
		sz -= 128;

		/* XOR the initial CRC */
		msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc));

		rk = _mm_load_si128((__m128i *)rk03);

		for (; sz >= 128; msg += 128, sz -= 128) {
			/* loop unrolled */
			msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0));
			msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1));
			msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2));
			msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3));
			msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4));
			msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5));
			msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6));
			msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7));	
		}

		/* Fold it all into one xmm register */
		msgxmm = msgxmma[7];

		msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm);
		msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm);
		msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm);
		msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm);
		msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm);
		msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm);
		msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm);

		/* Jump across into the 16-byte loop, skipping the initial load.
		 * This is much simpler than either doing two Barrett reductions or
		 * adding a whole ton of branches... */
		goto jmpFrom128byte;
	}

/* This path works for 16-byte buffers too, but whether that's actually
 * useful or any faster is another question entirely */
	if (sz >= 32) {
		__m128i rk;

		msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
		msg += 16;
		sz -= 16;

jmpFrom128byte:
		rk = _mm_load_si128((__m128i *)rk01);

		for (; sz >= 16; msg += 16, sz -= 16)
			msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg));

		crc = crc32x86_barrett_reduction(msgxmm);
	}

	if (!sz) return crc;

	/* The input had to be 16-byte aligned going in (the aligned loads
	 * above would fault otherwise), and we consume 16 bytes at a time,
	 * so `msg` is still 16-byte aligned here, and in particular 4-byte
	 * aligned, meaning no further alignment work (or crc32c_r) is
	 * needed. (A usage sketch follows this function.) */
	return crc32qw_r(crc, msg, sz);
}
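
/* Illustrative usage sketch, not part of the original file: the buffer name
 * and size are made up, the buffer must be 16-byte aligned (see the comment
 * at the end of the function above), and whatever initial-value/final-XOR
 * convention the rest of the library uses is assumed to be handled by the
 * caller:
 *
 *	static CRC32_ALIGN(16) unsigned char buf[4096];
 *	uint32_t crc = 0; // or whatever initial value crc32.h prescribes
 *	// ... fill buf ...
 *	crc = crc32x86_vpclmulqdq_r(crc, buf, sizeof(buf));
 */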

#endif