comparison crc32x86.c @ 3:6483683ac857 default tip

*: add profiling code too; expand x86 to use all eight XMM registers, basically ported verbatim from the assembly
author Paper <paper@tflc.us>
date Mon, 09 Feb 2026 21:30:30 -0500
parents ead9f84d11db
children
--- a/crc32x86.c	2:ead9f84d11db
+++ b/crc32x86.c	3:6483683ac857
@@ -1,8 +1,18 @@
 /* x86-specific CRC routines */
 
 #ifdef __x86_64__
+
+/* NOTE: None of this is really x86-specific.
+ * There are probably many other architectures with
+ * native 64x64->128.
+ *
+ * We could adapt this to use just the gcc uint128_t
+ * instead of x86 intrinsics, but it may slow things
+ * down a bit. */
+
+#define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq")))
 
 #include "crc32.h"
 #include "crc32i.h"
 #include <stdio.h>
 #include <immintrin.h>
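
The NOTE block added above leaves the portable fallback implicit. As a rough illustration of what "just the gcc uint128_t instead of x86 intrinsics" could mean, here is a minimal sketch of a 64x64->128 carry-less multiply in plain C; clmul64_generic is a hypothetical helper, not part of this changeset, and the folding/Barrett machinery would still have to be layered on top of it.

    #include <stdint.h>

    /* Hypothetical portable stand-in for _mm_clmulepi64_si128: multiply two
     * 64-bit polynomials over GF(2) and return the 128-bit product.  Far
     * slower than PCLMULQDQ (or ARM PMULL / POWER vpmsumd), which is the
     * "may slow things down" caveat in the NOTE above. */
    static unsigned __int128 clmul64_generic(uint64_t a, uint64_t b)
    {
        unsigned __int128 acc = 0;
        int i;

        for (i = 0; i < 64; i++)
            if ((b >> i) & 1)
                acc ^= (unsigned __int128)a << i;

        return acc;
    }
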
@@ -130,11 +140,11 @@
 #include <stdio.h>
 int main(void)
 {
 	unsigned i;
 
-	for (i = 1; i <= (4*128+32); i++) {
+	for (i = 1; i <= 1024; i++) {
 		printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
 		printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
 	}
 
 	return 0;
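
The generator's upper bound grows from (4*128+32) = 544 to 1024 because the eight-register loop added later in this diff folds 128 bytes (1024 bits) per pass, so XNDIVP_MOD_ITER_*/XNDIVP_DIV_ITER_* values are now needed up to index 1024 (RK04 below uses XNDIVP_MOD_ITER_1024). These constants are presumably of the usual x^n mod P(x) family used by PCLMULQDQ CRC folding; the exact shift and reflection conventions are handled by FIXUPCONSTANTS and are not reproduced here. A minimal, hedged sketch of such a computation, assuming the non-reflected CRC-32 polynomial 0x04C11DB7:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Sketch only: compute x^n mod P(x) over GF(2) by repeated multiply-by-x
     * and conditional reduction.  P is assumed to be the standard CRC-32
     * polynomial; the real XNDIVP_* macros may differ by a shift/reflection. */
    static uint32_t xn_mod_p(unsigned n)
    {
        const uint32_t P = 0x04C11DB7u; /* x^32 term is implicit */
        uint32_t r = 1;                 /* x^0 */
        unsigned i;

        for (i = 0; i < n; i++)
            r = (r & 0x80000000u) ? (r << 1) ^ P : (r << 1);

        return r;
    }

    int main(void)
    {
        printf("x^1024 mod P = %08" PRIx32 "\n", xn_mod_p(1024));
        return 0;
    }
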
@@ -153,12 +163,26 @@
 #undef XNDIVP_DIV_ITER
 
 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
 	RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
+	RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960),
+	RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024),
 	RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
 	RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
 	RK08 = XNDIVP_RK08R,
+	RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832),
+	RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896),
+	RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704),
+	RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768),
+	RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576),
+	RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640),
+	RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448),
+	RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512),
+	RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320),
+	RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384),
+	RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192),
+	RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256),
 #undef FIXUPCONSTANTS
 };
 
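
The twelve new constants follow a regular pattern: in the combine sequence later in this diff, accumulator i (of eight, spaced 16 bytes apart) sits d = (7 - i) * 128 bits ahead of the last one, and its rk pair appears to supply the iteration indices d - 64 and d before FIXUPCONSTANTS reflects them; RK03/RK04 (960/1024) are the per-iteration pair for the full 128-byte stride. A throwaway check of that pattern, not part of the changeset:

    #include <stdio.h>

    int main(void)
    {
        unsigned i;

        /* Prints 832/896 for xmm0 down to 64/128 for xmm6, matching the
         * RK09..RK20 pairs above plus the pre-existing RK01/RK02 pair. */
        for (i = 0; i < 7; i++) {
            unsigned d = (7 - i) * 128;
            printf("xmm%u: XNDIVP_MOD_ITER_%u / XNDIVP_MOD_ITER_%u\n",
                   i, d - 64, d);
        }

        return 0;
    }
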
@@ -165,3 +189,33 @@
-__attribute__((__target__("vpclmulqdq")))
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+uint32_t crc32x86_barrett_reduction(__m128i msgxmm)
+{
+	static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06},
+					      rk07[2] = {RK07, RK08},
+					      mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
+	__m128i rk;
+
+	rk = _mm_load_si128((__m128i *)rk05);
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
+
+	/* Barrett Reduction */
+	rk = _mm_load_si128((__m128i *)rk07);
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
+
+	return _mm_extract_epi32(msgxmm, 2);
+}
+
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+__m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next)
+{
+	return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10)));
+}
+
+/* GCC-specific shit */
+VPCLMULQDQ_TARGET
 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
 {
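
The two helpers above factor out what used to be written inline: crc32x86_fold performs one folding step, X' = clmul(X_hi, K_lo) ^ clmul(X_lo, K_hi) ^ next (the two _mm_clmulepi64_si128 calls with selectors 0x01 and 0x10), and crc32x86_barrett_reduction is the old 128-bit-to-32-bit tail (fold down to 64 bits, Barrett-reduce, extract dword 2) so the new 128-byte path and the existing 16-byte path can share it. A scalar model of the fold step, for illustration only:

    #include <stdint.h>

    /* Sketch: 128-bit values as lo/hi pairs, carry-less multiply done
     * bit by bit.  fold_model mirrors crc32x86_fold; it is not the shipped
     * code and exists only to make the algebra explicit. */
    typedef struct { uint64_t lo, hi; } u128;

    static u128 clmul64(uint64_t a, uint64_t b)
    {
        u128 r = {0, 0};
        int i;

        for (i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                r.lo ^= a << i;
                if (i)
                    r.hi ^= a >> (64 - i);
            }
        }
        return r;
    }

    static u128 fold_model(u128 x, u128 k, u128 next)
    {
        u128 a = clmul64(x.hi, k.lo);   /* _mm_clmulepi64_si128(xmm, rk, 0x01) */
        u128 b = clmul64(x.lo, k.hi);   /* _mm_clmulepi64_si128(xmm, rk, 0x10) */
        u128 r;

        r.lo = a.lo ^ b.lo ^ next.lo;
        r.hi = a.hi ^ b.hi ^ next.hi;
        return r;
    }
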
@@ -168,33 +222,80 @@
+	static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02},
+					      rk03[2] = {RK03, RK04},
+					      rk09[2] = {RK09, RK10},
+					      rk11[2] = {RK11, RK12},
+					      rk13[2] = {RK13, RK14},
+					      rk15[2] = {RK15, RK16},
+					      rk17[2] = {RK17, RK18},
+					      rk19[2] = {RK19, RK20};
+	__m128i msgxmm;
+
+	if (sz >= 256) {
+		__m128i rk, msgxmma[8], xmm8;
+
+		/* receive first 128 bytes */
+		msgxmma[0] = _mm_load_si128((__m128i *)msg + 0);
+		msgxmma[1] = _mm_load_si128((__m128i *)msg + 1);
+		msgxmma[2] = _mm_load_si128((__m128i *)msg + 2);
+		msgxmma[3] = _mm_load_si128((__m128i *)msg + 3);
+		msgxmma[4] = _mm_load_si128((__m128i *)msg + 4);
+		msgxmma[5] = _mm_load_si128((__m128i *)msg + 5);
+		msgxmma[6] = _mm_load_si128((__m128i *)msg + 6);
+		msgxmma[7] = _mm_load_si128((__m128i *)msg + 7);
+		msg += 128;
+		sz -= 128;
+
+		/* XOR the initial CRC */
+		msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc));
+
+		rk = _mm_load_si128((__m128i *)rk03);
+
+		for (; sz >= 128; msg += 128, sz -= 128) {
+			/* loop unrolled */
+			msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0));
+			msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1));
+			msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2));
+			msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3));
+			msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4));
+			msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5));
+			msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6));
+			msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7));
+		}
+
+		/* Fold it all into one xmm register */
+		msgxmm = msgxmma[7];
+
+		msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm);
+
+		/* Jump across into the 16-byte code, skipping the loading.
+		 * This is much simpler than either doing two barrett reductions or
+		 * adding a whole ton of branches... */
+		goto jmpFrom128byte;
+	}
+
 	/* This actually works for 16-byte buffers too, but whether it's actually
 	 * useful or faster is another question entirely */
 	if (sz >= 32) {
-		static const __attribute__((__aligned__(16))) uint64_t rk01[2] = {RK01, RK02},
-								       rk05[2] = {RK05, RK06},
-								       rk07[2] = {RK07, RK08},
-								       mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
-		__m128i rk, msgxmm;
+		__m128i rk;
 
 		msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
-
+		msg += 16;
+		sz -= 16;
+
+jmpFrom128byte:
 		rk = _mm_load_si128((__m128i *)rk01);
 
-		for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) {
-			msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10), _mm_clmulepi64_si128(msgxmm, rk, 0x01)), _mm_load_si128((__m128i *)msg));
-		}
-
-		rk = _mm_load_si128((__m128i *)rk05);
-
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
-
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
-
-		/* Barrett Reduction */
-		rk = _mm_load_si128((__m128i *)rk07);
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
-
-		crc = _mm_extract_epi32(msgxmm, 2);
+		for (; sz >= 16; msg += 16, sz -= 16)
+			msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg));
+
+		crc = crc32x86_barrett_reduction(msgxmm);
 	}
 
 	if (!sz) return crc;
 
 	/* We were already aligned on a 16-byte boundary going in (hopefully
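
Since the eight-register path is, per the commit message, ported more or less verbatim from assembly, it is worth cross-checking against a trivial reference. The sketch below assumes the usual reflected CRC-32 with polynomial 0xEDB88320 and ignores whatever pre-/post-conditioning the callers in crc32.h apply; crc32_ref is a hypothetical name, not part of this changeset. Note also that the vector routine uses aligned loads (_mm_load_si128) and, per the trailing comment, expects the caller to have dealt with any unaligned prefix before it is entered.

    #include <stddef.h>
    #include <stdint.h>

    /* Bit-at-a-time reference CRC, assuming the standard reflected CRC-32
     * polynomial.  Useful only for verifying crc32x86_vpclmulqdq_r against
     * known-good output on the same pre-conditioned input. */
    static uint32_t crc32_ref(uint32_t crc, const unsigned char *msg, size_t sz)
    {
        size_t i;
        int b;

        for (i = 0; i < sz; i++) {
            crc ^= msg[i];
            for (b = 0; b < 8; b++)
                crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : (crc >> 1);
        }
        return crc;
    }
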