Mercurial > crc32
annotate crc32x86.c @ 3:6483683ac857 default tip
*: add profiling code too; expand x86 to use all eight XMM registers
basically ported verbatim from the assembly
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 09 Feb 2026 21:30:30 -0500 |
| parents | ead9f84d11db |
| children |
| rev | line source |
|---|---|
| 0 | 1 /* x86-specific CRC routines */ |
| 2 | |
| 2 | 3 #ifdef __x86_64__ |
| 4 | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
5 /* NOTE: None of this is really x86-specific. |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
6 * There are probably many other architectures with |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
7 * native 64x64->128. |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
8 * |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
9 * We could adapt this to use just the gcc uint128_t |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
10 * instead of x86 intrinsics, but it may slow things |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
11 * down a bit. */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
12 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
/* Function attribute (GCC-style __attribute__ syntax) that requests the
 * "vpclmulqdq" target feature for the function it is attached to. It is
 * placed in front of every routine in this file that uses the carry-less
 * multiply intrinsics. */
#define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq")))
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
14 |
| 0 | 15 #include "crc32.h" |
| 16 #include "crc32i.h" | |
| 17 #include <stdio.h> | |
| 18 #include <immintrin.h> | |
| 19 | |
/* One stage of the 64-bit reversal: swaps each SHIFT-bit group selected by
 * MASK with the SHIFT-bit group directly above it. */
#define BITREVERSE64_STEP(X, MASK, SHIFT) \
	((((X) >> (SHIFT)) & (MASK)) | (((X) & (MASK)) << (SHIFT)))

/* Mirrors the bit order of a 64-bit value (bit 0 <-> bit 63, bit 1 <-> bit 62,
 * and so on) by swapping neighbouring bits, then pairs, nibbles, bytes,
 * 16-bit words and finally the two 32-bit halves.
 *
 * The expansion is an integer constant expression whenever THIS is one, so it
 * can be used in enum initialisers. THIS must already have an unsigned 64-bit
 * type; use BITREVERSE64 to get the cast applied automatically. */
#define BITREVERSE64EX(THIS) \
	BITREVERSE64_STEP( \
	BITREVERSE64_STEP( \
	BITREVERSE64_STEP( \
	BITREVERSE64_STEP( \
	BITREVERSE64_STEP( \
	BITREVERSE64_STEP((THIS), \
		0x5555555555555555, 1), \
		0x3333333333333333, 2), \
		0x0F0F0F0F0F0F0F0F, 4), \
		0x00FF00FF00FF00FF, 8), \
		0x0000FFFF0000FFFF, 16), \
		0x00000000FFFFFFFF, 32)

/* Bit-reverses THIS after converting it to uint64_t. */
#define BITREVERSE64(THIS) \
	(BITREVERSE64EX((uint64_t)(THIS)))
| 90 | |
/* One stage of the 32-bit reversal: swaps each SHIFT-bit group selected by
 * MASK with the SHIFT-bit group directly above it. */
#define BITREVERSE32_STEP(X, MASK, SHIFT) \
	((((X) >> (SHIFT)) & (MASK)) | (((X) & (MASK)) << (SHIFT)))

/* Mirrors the bit order of a 32-bit value (bit 0 <-> bit 31, bit 1 <-> bit 30,
 * and so on) by swapping neighbouring bits, then pairs, nibbles, bytes and
 * finally the two 16-bit halves.
 *
 * The expansion is an integer constant expression whenever THIS is one, so it
 * can be used in enum initialisers. THIS must already have an unsigned 32-bit
 * type; use BITREVERSE32 to get the cast applied automatically. */
#define BITREVERSE32EX(THIS) \
	BITREVERSE32_STEP( \
	BITREVERSE32_STEP( \
	BITREVERSE32_STEP( \
	BITREVERSE32_STEP( \
	BITREVERSE32_STEP((THIS), \
		0x55555555, 1), \
		0x33333333, 2), \
		0x0F0F0F0F, 4), \
		0x00FF00FF, 8), \
		0x0000FFFF, 16)

/* Bit-reverses THIS after converting it to uint32_t. */
#define BITREVERSE32(THIS) \
	(BITREVERSE32EX((uint32_t)(THIS)))
| 129 | |
| 130 enum { | |
| 131 XNDIVP_RK08F = (BITREVERSE32(CRC32_POLYNOMIAL)) | 0x100000000ull, | |
| 132 XNDIVP_RK08R = (BITREVERSE64(XNDIVP_RK08F) >> 31) | 1, | |
| 133 | |
| 134 /* The beginning ... */ | |
| 135 XNDIVP_MOD_ITER_0 = XNDIVP_RK08F, | |
| 136 XNDIVP_DIV_ITER_0 = 1, | |
| 137 | |
| 138 /* to generate table, run this: | |
| 139 | |
| 140 #include <stdio.h> | |
| 141 int main(void) | |
| 142 { | |
| 143 unsigned i; | |
| 144 | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
145 for (i = 1; i <= 1024; i++) { |
| 0 | 146 printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1); |
| 147 printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1); | |
| 148 } | |
| 149 | |
| 150 return 0; | |
| 151 } | |
| 152 */ | |
| 153 | |
| 154 #define XNDIVP_MOD_ITER(This, last) \ | |
| 155 XNDIVP_MOD_ITER_##This = (uint64_t)((XNDIVP_MOD_ITER_##last << 1) ^ ((XNDIVP_MOD_ITER_##last & 0x80000000) ? (XNDIVP_RK08F) : 0)), | |
| 156 | |
| 157 #define XNDIVP_DIV_ITER(This, last) \ | |
| 158 XNDIVP_DIV_ITER_##This = (uint64_t)(((uint64_t)XNDIVP_DIV_ITER_##last << 1) | ((XNDIVP_MOD_ITER_##last & 0x80000000ull) ? 1 : 0)), | |
| 159 | |
| 160 #include "crc32x86-tab.h" | |
| 161 | |
| 162 #undef XNDIVP_MOD_ITER | |
| 163 #undef XNDIVP_DIV_ITER | |
| 164 | |
| 165 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31) | |
| 166 RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64), | |
| 167 RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128), | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
168 RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
169 RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024), |
| 0 | 170 RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64), |
| 171 RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32), | |
| 172 RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32), | |
| 173 RK08 = XNDIVP_RK08R, | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
174 RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
175 RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
176 RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
177 RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
178 RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
179 RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
180 RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
181 RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
182 RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
183 RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
184 RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192), |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
185 RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256), |
| 0 | 186 #undef FIXUPCONSTANTS |
| 187 }; | |
| 188 | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
189 VPCLMULQDQ_TARGET |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
190 CRC32_FORCEINLINE |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
191 uint32_t crc32x86_barrett_reduction(__m128i msgxmm) |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
192 { |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
193 static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
194 rk07[2] = {RK07, RK08}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
195 mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF}; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
196 __m128i rk; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
197 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
198 rk = _mm_load_si128((__m128i *)rk05); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
199 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
200 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
201 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
202 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2))); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
203 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
204 /* Barrett Reduction */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
205 rk = _mm_load_si128((__m128i *)rk07); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
206 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
207 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
208 return _mm_extract_epi32(msgxmm, 2); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
209 } |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
210 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
211 VPCLMULQDQ_TARGET |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
212 CRC32_FORCEINLINE |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
213 __m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next) |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
214 { |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
215 return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10))); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
216 } |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
217 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
218 /* GCC-specific code */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
219 VPCLMULQDQ_TARGET |
| 0 | 220 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz) |
| 221 { | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
222 static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
223 rk03[2] = {RK03, RK04}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
224 rk09[2] = {RK09, RK10}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
225 rk11[2] = {RK11, RK12}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
226 rk13[2] = {RK13, RK14}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
227 rk15[2] = {RK15, RK16}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
228 rk17[2] = {RK17, RK18}, |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
229 rk19[2] = {RK19, RK20}; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
230 __m128i msgxmm; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
231 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
232 if (sz >= 256) { |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
233 __m128i rk, msgxmma[8], xmm8; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
234 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
235 /* receive first 128 bytes */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
236 msgxmma[0] = _mm_load_si128((__m128i *)msg + 0); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
237 msgxmma[1] = _mm_load_si128((__m128i *)msg + 1); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
238 msgxmma[2] = _mm_load_si128((__m128i *)msg + 2); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
239 msgxmma[3] = _mm_load_si128((__m128i *)msg + 3); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
240 msgxmma[4] = _mm_load_si128((__m128i *)msg + 4); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
241 msgxmma[5] = _mm_load_si128((__m128i *)msg + 5); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
242 msgxmma[6] = _mm_load_si128((__m128i *)msg + 6); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
243 msgxmma[7] = _mm_load_si128((__m128i *)msg + 7); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
244 msg += 128; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
245 sz -= 128; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
246 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
247 /* XOR the initial CRC */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
248 msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
249 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
250 rk = _mm_load_si128((__m128i *)rk03); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
251 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
252 for (; sz >= 128; msg += 128, sz -= 128) { |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
253 /* loop unrolled */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
254 msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
255 msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
256 msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
257 msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
258 msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
259 msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
260 msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
261 msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7)); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
262 } |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
263 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
264 /* Fold it all into one xmm register */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
265 msgxmm = msgxmma[7]; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
266 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
267 msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
268 msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
269 msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
270 msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
271 msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
272 msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
273 msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm); |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
274 |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
275 /* Jump across into the 16-byte code, skipping the loading. |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
276 * This is much simpler than either doing two barrett reductions or |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
277 * adding a whole ton of branches... */ |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
278 goto jmpFrom128byte; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
279 } |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
280 |
| 0 | 281 /* This actually works for 16-byte buffers too, but whether it's actually |
| 282 * useful or faster is another question entirely */ | |
| 283 if (sz >= 32) { | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
284 __m128i rk; |
| 0 | 285 |
| 286 msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc)); | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
287 msg += 16; |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
288 sz -= 16; |
| 0 | 289 |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
290 jmpFrom128byte: |
| 0 | 291 rk = _mm_load_si128((__m128i *)rk01); |
| 292 | |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
293 for (; sz >= 16; msg += 16, sz -= 16) |
|
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
294 msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg)); |
| 0 | 295 |
|
3
6483683ac857
*: add profiling code too; expand x86 to use all eight XMM registers
Paper <paper@tflc.us>
parents:
2
diff
changeset
|
296 crc = crc32x86_barrett_reduction(msgxmm); |
| 0 | 297 } |
| 298 | |
| 299 if (!sz) return crc; | |
| 300 | |
| 301 /* We were already aligned on a 16-byte boundary going in (hopefully | |
| 302 * or else it will break), and we process 16-bytes at a time. This | |
| 303 * means `msg` is aligned 16-bytes, a multiple of 4-byte, so we don't | |
| 304 * need to align any more (or use crc32c_r). */ | |
| 305 return crc32qw_r(crc, msg, sz); | |
| 306 } | |
| 2 | 307 |
| 308 #endif |
