Mercurial > crc32
comparison crc32x86.c @ 3:6483683ac857 default tip
*: add profiling code too; expand x86 to use all eight XMM registers
basically ported verbatim from the assembly
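Since the folding loop was ported verbatim from the assembly (and this changeset also adds profiling code), a plain bit-at-a-time reflected CRC-32 is a convenient reference to cross-check the vectorized output against. The sketch below is purely illustrative: crc32_bitwise_ref is not a function in this repository, and the initial/final XOR convention has to match whatever crc32x86_vpclmulqdq_r's callers apply.

```c
/* Hypothetical reference implementation for cross-checking the vectorized
 * routine; not part of this changeset.  Processes one bit at a time with the
 * reflected CRC-32 polynomial 0xEDB88320 and applies no initial or final XOR. */
#include <stddef.h>
#include <stdint.h>

static uint32_t crc32_bitwise_ref(uint32_t crc, const unsigned char *msg, size_t sz)
{
    while (sz--) {
        crc ^= *msg++;                      /* fold in the next message byte */
        for (int k = 0; k < 8; k++)         /* shift out one bit at a time */
            crc = (crc & 1u) ? (crc >> 1) ^ 0xEDB88320u : crc >> 1;
    }
    return crc;
}
```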
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 09 Feb 2026 21:30:30 -0500 |
| parents | ead9f84d11db |
| children | |
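The NOTE added near the top of the comparison below (new lines 5-11) observes that none of this is really x86-specific: any architecture with a native 64x64->128 carry-less multiply could use the same structure, and GCC's 128-bit integer type could stand in for the intrinsics at some cost. A minimal sketch of such a fallback, assuming a shift-and-xor carry-less multiply (clmul64_generic is an illustrative name only):

```c
/* Sketch only: a portable 64x64 -> 128-bit carry-less multiply built on
 * GCC/Clang's unsigned __int128, as the NOTE in the comparison hints at.
 * clmul64_generic() is a hypothetical name; this loop is far slower than a
 * single PCLMULQDQ instruction. */
#include <stdint.h>

static unsigned __int128 clmul64_generic(uint64_t a, uint64_t b)
{
    unsigned __int128 r = 0;

    for (int i = 0; i < 64; i++)
        if ((b >> i) & 1u)                      /* for each set bit of b ... */
            r ^= (unsigned __int128)a << i;     /* ... XOR in a shifted copy of a */

    return r;
}
```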
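In the new revision, the constant generator loop is also widened to cover 1 <= i <= 1024, and RK03/RK04 plus RK09-RK20 are added as the extra fold constants the eight-register loop needs. Each XNDIVP_MOD_ITER_n is, in essence, x^n mod P(x) for the CRC-32 polynomial; a minimal sketch of that arithmetic (xn_mod_p is a hypothetical helper, and it ignores the bit reversal and shift that FIXUPCONSTANTS applies):

```c
/* Sketch of the arithmetic behind the XNDIVP_MOD_ITER_* constants: compute
 * x^n mod P(x) over GF(2) for the CRC-32 polynomial, one multiply-by-x step
 * at a time.  xn_mod_p() is a hypothetical helper; the repository's generator
 * additionally handles bit order and width via FIXUPCONSTANTS, which this
 * sketch does not reproduce. */
#include <stdint.h>

#define CRC32_POLY 0x04C11DB7u      /* P(x) with the implicit x^32 term dropped */

static uint32_t xn_mod_p(unsigned n)
{
    uint32_t r = 1u;                /* start from x^0 */

    while (n--) {
        uint32_t overflow = r & 0x80000000u;

        r <<= 1;                    /* multiply by x */
        if (overflow)
            r ^= CRC32_POLY;        /* degree reached 32: reduce by P(x) */
    }
    return r;
}
```

The XNDIVP_DIV_ITER_* counterparts presumably accumulate the quotient x^n / P(x) the same way; those feed the Barrett-reduction constants (RK07/RK08) used at the tail of the routine.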
| 2:ead9f84d11db | 3:6483683ac857 |
|---|---|
| 1 /* x86-specific CRC routines */ | 1 /* x86-specific CRC routines */ |
| 2 | 2 |
| 3 #ifdef __x86_64__ | 3 #ifdef __x86_64__ |
| 4 | |
| 5 /* NOTE: None of this is really x86-specific. | |
| 6 * There are probably many other architectures with | |
| 7 * native 64x64->128. | |
| 8 * | |
| 9 * We could adapt this to use just the gcc uint128_t | |
| 10 * instead of x86 intrinsics, but it may slow things | |
| 11 * down a bit. */ | |
| 12 | |
| 13 #define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq"))) | |
| 4 | 14 |
| 5 #include "crc32.h" | 15 #include "crc32.h" |
| 6 #include "crc32i.h" | 16 #include "crc32i.h" |
| 7 #include <stdio.h> | 17 #include <stdio.h> |
| 8 #include <immintrin.h> | 18 #include <immintrin.h> |
| 130 #include <stdio.h> | 140 #include <stdio.h> |
| 131 int main(void) | 141 int main(void) |
| 132 { | 142 { |
| 133 unsigned i; | 143 unsigned i; |
| 134 | 144 |
| 135 for (i = 1; i <= (4*128+32); i++) { | 145 for (i = 1; i <= 1024; i++) { |
| 136 printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1); | 146 printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1); |
| 137 printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1); | 147 printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1); |
| 138 } | 148 } |
| 139 | 149 |
| 140 return 0; | 150 return 0; |
| 153 #undef XNDIVP_DIV_ITER | 163 #undef XNDIVP_DIV_ITER |
| 154 | 164 |
| 155 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31) | 165 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31) |
| 156 RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64), | 166 RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64), |
| 157 RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128), | 167 RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128), |
| 168 RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960), | |
| 169 RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024), | |
| 158 RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64), | 170 RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64), |
| 159 RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32), | 171 RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32), |
| 160 RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32), | 172 RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32), |
| 161 RK08 = XNDIVP_RK08R, | 173 RK08 = XNDIVP_RK08R, |
| 174 RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832), | |
| 175 RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896), | |
| 176 RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704), | |
| 177 RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768), | |
| 178 RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576), | |
| 179 RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640), | |
| 180 RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448), | |
| 181 RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512), | |
| 182 RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320), | |
| 183 RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384), | |
| 184 RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192), | |
| 185 RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256), | |
| 162 #undef FIXUPCONSTANTS | 186 #undef FIXUPCONSTANTS |
| 163 }; | 187 }; |
| 164 | 188 |
| 165 __attribute__((__target__("vpclmulqdq"))) | 189 VPCLMULQDQ_TARGET |
| 190 CRC32_FORCEINLINE | |
| 191 uint32_t crc32x86_barrett_reduction(__m128i msgxmm) | |
| 192 { | |
| 193 static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06}, | |
| 194 rk07[2] = {RK07, RK08}, | |
| 195 mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF}; | |
| 196 __m128i rk; | |
| 197 | |
| 198 rk = _mm_load_si128((__m128i *)rk05); | |
| 199 | |
| 200 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8)); | |
| 201 | |
| 202 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2))); | |
| 203 | |
| 204 /* Barrett Reduction */ | |
| 205 rk = _mm_load_si128((__m128i *)rk07); | |
| 206 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm); | |
| 207 | |
| 208 return _mm_extract_epi32(msgxmm, 2); | |
| 209 } | |
| 210 | |
| 211 VPCLMULQDQ_TARGET | |
| 212 CRC32_FORCEINLINE | |
| 213 __m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next) | |
| 214 { | |
| 215 return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10))); | |
| 216 } | |
| 217 | |
| 218 /* GCC-specific target attribute */ | |
| 219 VPCLMULQDQ_TARGET | |
| 166 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz) | 220 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz) |
| 167 { | 221 { |
| 222 static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02}, | |
| 223 rk03[2] = {RK03, RK04}, | |
| 224 rk09[2] = {RK09, RK10}, | |
| 225 rk11[2] = {RK11, RK12}, | |
| 226 rk13[2] = {RK13, RK14}, | |
| 227 rk15[2] = {RK15, RK16}, | |
| 228 rk17[2] = {RK17, RK18}, | |
| 229 rk19[2] = {RK19, RK20}; | |
| 230 __m128i msgxmm; | |
| 231 | |
| 232 if (sz >= 256) { | |
| 233 __m128i rk, msgxmma[8], xmm8; | |
| 234 | |
| 235 /* receive first 128 bytes */ | |
| 236 msgxmma[0] = _mm_load_si128((__m128i *)msg + 0); | |
| 237 msgxmma[1] = _mm_load_si128((__m128i *)msg + 1); | |
| 238 msgxmma[2] = _mm_load_si128((__m128i *)msg + 2); | |
| 239 msgxmma[3] = _mm_load_si128((__m128i *)msg + 3); | |
| 240 msgxmma[4] = _mm_load_si128((__m128i *)msg + 4); | |
| 241 msgxmma[5] = _mm_load_si128((__m128i *)msg + 5); | |
| 242 msgxmma[6] = _mm_load_si128((__m128i *)msg + 6); | |
| 243 msgxmma[7] = _mm_load_si128((__m128i *)msg + 7); | |
| 244 msg += 128; | |
| 245 sz -= 128; | |
| 246 | |
| 247 /* XOR the initial CRC */ | |
| 248 msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc)); | |
| 249 | |
| 250 rk = _mm_load_si128((__m128i *)rk03); | |
| 251 | |
| 252 for (; sz >= 128; msg += 128, sz -= 128) { | |
| 253 /* loop unrolled */ | |
| 254 msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0)); | |
| 255 msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1)); | |
| 256 msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2)); | |
| 257 msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3)); | |
| 258 msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4)); | |
| 259 msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5)); | |
| 260 msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6)); | |
| 261 msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7)); | |
| 262 } | |
| 263 | |
| 264 /* Fold it all into one xmm register */ | |
| 265 msgxmm = msgxmma[7]; | |
| 266 | |
| 267 msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm); | |
| 268 msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm); | |
| 269 msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm); | |
| 270 msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm); | |
| 271 msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm); | |
| 272 msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm); | |
| 273 msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm); | |
| 274 | |
| 275 /* Jump across into the 16-byte code, skipping the loading. | |
| 276 * This is much simpler than either doing two barrett reductions or | |
| 277 * adding a whole ton of branches... */ | |
| 278 goto jmpFrom128byte; | |
| 279 } | |
| 280 | |
| 168 /* This actually works for 16-byte buffers too, but whether it's actually | 281 /* This actually works for 16-byte buffers too, but whether it's actually |
| 169 * useful or faster is another question entirely */ | 282 * useful or faster is another question entirely */ |
| 170 if (sz >= 32) { | 283 if (sz >= 32) { |
| 171 static const __attribute__((__aligned__(16))) uint64_t rk01[2] = {RK01, RK02}, | 284 __m128i rk; |
| 172 rk05[2] = {RK05, RK06}, | |
| 173 rk07[2] = {RK07, RK08}, | |
| 174 mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF}; | |
| 175 __m128i rk, msgxmm; | |
| 176 | 285 |
| 177 msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc)); | 286 msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc)); |
| 178 | 287 msg += 16; |
| 288 sz -= 16; | |
| 289 | |
| 290 jmpFrom128byte: | |
| 179 rk = _mm_load_si128((__m128i *)rk01); | 291 rk = _mm_load_si128((__m128i *)rk01); |
| 180 | 292 |
| 181 for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) { | 293 for (; sz >= 16; msg += 16, sz -= 16) |
| 182 msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10), _mm_clmulepi64_si128(msgxmm, rk, 0x01)), _mm_load_si128((__m128i *)msg)); | 294 msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg)); |
| 183 } | 295 |
| 184 | 296 crc = crc32x86_barrett_reduction(msgxmm); |
| 185 rk = _mm_load_si128((__m128i *)rk05); | |
| 186 | |
| 187 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8)); | |
| 188 | |
| 189 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2))); | |
| 190 | |
| 191 /* Barrett Reduction */ | |
| 192 rk = _mm_load_si128((__m128i *)rk07); | |
| 193 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm); | |
| 194 | |
| 195 crc = _mm_extract_epi32(msgxmm, 2); | |
| 196 } | 297 } |
| 197 | 298 |
| 198 if (!sz) return crc; | 299 if (!sz) return crc; |
| 199 | 300 |
| 200 /* We were already aligned on a 16-byte boundary going in (hopefully | 301 /* We were already aligned on a 16-byte boundary going in (hopefully |
