|
0
|
1 /* x86-specific CRC routines */
|
|
|
2
|
|
|
3 #include "crc32.h"
|
|
|
4 #include "crc32i.h"
|
|
|
5 #include <stdio.h>
|
|
|
6 #include <immintrin.h>
|
|
|
7
|
|
|
/*
 * One stage of a 64-bit bit reversal: exchange every pair of neighbouring
 * WIDTH-bit groups in V. LOW is the mask that selects the lower group of
 * each pair. V is evaluated twice, so pass side-effect-free expressions only.
 */
#define BITREVERSE64_SWAP(V, WIDTH, LOW) \
	((((V) & (LOW)) << (WIDTH)) | (((V) >> (WIDTH)) & (LOW)))

/*
 * Reverse the bit order of a 64-bit unsigned value (bit 0 <-> bit 63,
 * bit 1 <-> bit 62, ...). THIS must already have a 64-bit unsigned type;
 * use BITREVERSE64() to get the cast applied for you.
 *
 * Written as six swap stages (1, 2, 4, 8, 16 and 32 bit groups) so that the
 * whole thing stays an integer constant expression usable in enumerators.
 */
#define BITREVERSE64EX(THIS) \
	BITREVERSE64_SWAP( \
	BITREVERSE64_SWAP( \
	BITREVERSE64_SWAP( \
	BITREVERSE64_SWAP( \
	BITREVERSE64_SWAP( \
	BITREVERSE64_SWAP((THIS), \
		 1, 0x5555555555555555), \
		 2, 0x3333333333333333), \
		 4, 0x0F0F0F0F0F0F0F0F), \
		 8, 0x00FF00FF00FF00FF), \
		16, 0x0000FFFF0000FFFF), \
		32, 0x00000000FFFFFFFF)

/* Bit-reverse THIS after converting it to uint64_t. */
#define BITREVERSE64(THIS) \
	(BITREVERSE64EX((uint64_t)(THIS)))
|
|
|
78
|
|
|
/*
 * One stage of a 32-bit bit reversal: exchange every pair of neighbouring
 * WIDTH-bit groups in V. LOW is the mask that selects the lower group of
 * each pair. V is evaluated twice, so pass side-effect-free expressions only.
 */
#define BITREVERSE32_SWAP(V, WIDTH, LOW) \
	((((V) & (LOW)) << (WIDTH)) | (((V) >> (WIDTH)) & (LOW)))

/*
 * Reverse the bit order of a 32-bit unsigned value (bit 0 <-> bit 31,
 * bit 1 <-> bit 30, ...). THIS must already have a 32-bit unsigned type;
 * use BITREVERSE32() to get the cast applied for you.
 *
 * Written as five swap stages (1, 2, 4, 8 and 16 bit groups) so that the
 * whole thing stays an integer constant expression usable in enumerators.
 */
#define BITREVERSE32EX(THIS) \
	BITREVERSE32_SWAP( \
	BITREVERSE32_SWAP( \
	BITREVERSE32_SWAP( \
	BITREVERSE32_SWAP( \
	BITREVERSE32_SWAP((THIS), \
		 1, 0x55555555), \
		 2, 0x33333333), \
		 4, 0x0F0F0F0F), \
		 8, 0x00FF00FF), \
		16, 0x0000FFFF)

/* Bit-reverse THIS after converting it to uint32_t. */
#define BITREVERSE32(THIS) \
	(BITREVERSE32EX((uint32_t)(THIS)))
|
|
|
117
|
|
|
/*
 * Compile-time derivation of the constants used by crc32x86_vpclmulqdq_r().
 *
 * Everything here is an enumerator so that the values are computed by the
 * compiler from CRC32_POLYNOMIAL; nothing is evaluated at run time.
 * NOTE(review): several enumerators do not fit in an int (bit 32 is set
 * explicitly below), so this relies on the compiler accepting enumeration
 * constants wider than int.
 */
enum {
	/* Bit-reversed CRC32_POLYNOMIAL with bit 32 additionally set. */
	XNDIVP_RK08F = (BITREVERSE32(CRC32_POLYNOMIAL)) | 0x100000000ull,
	/* XNDIVP_RK08F mirrored across bits 0..32, with bit 0 forced on. */
	XNDIVP_RK08R = (BITREVERSE64(XNDIVP_RK08F) >> 31) | 1,

	/* The beginning ... */
	/* Seeds for the two iteration chains generated below. */
	XNDIVP_MOD_ITER_0 = XNDIVP_RK08F,
	XNDIVP_DIV_ITER_0 = 1,

	/* to generate table, run this:

#include <stdio.h>
int main(void)
{
	unsigned i;

	for (i = 1; i <= (4*128+32); i++) {
		printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
		printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
	}

	return 0;
}
	*/

/*
 * Defines XNDIVP_MOD_ITER_<This> from XNDIVP_MOD_ITER_<last>: the previous
 * value shifted left by one, XORed with XNDIVP_RK08F when bit 31 of the
 * previous value was set.
 */
#define XNDIVP_MOD_ITER(This, last) \
	XNDIVP_MOD_ITER_##This = (uint64_t)((XNDIVP_MOD_ITER_##last << 1) ^ ((XNDIVP_MOD_ITER_##last & 0x80000000) ? (XNDIVP_RK08F) : 0)),

/*
 * Defines XNDIVP_DIV_ITER_<This> from XNDIVP_DIV_ITER_<last>: the previous
 * value shifted left by one, with the new low bit set exactly when bit 31 of
 * XNDIVP_MOD_ITER_<last> is set (the same condition that triggers the XOR
 * in XNDIVP_MOD_ITER).
 */
#define XNDIVP_DIV_ITER(This, last) \
	XNDIVP_DIV_ITER_##This = (uint64_t)(((uint64_t)XNDIVP_DIV_ITER_##last << 1) | ((XNDIVP_MOD_ITER_##last & 0x80000000ull) ? 1 : 0)),

/* Presumably the output of the generator program quoted above, i.e. one
 * XNDIVP_MOD_ITER / XNDIVP_DIV_ITER invocation pair per step -- confirm
 * against the header. */
#include "crc32x86-tab.h"

#undef XNDIVP_MOD_ITER
#undef XNDIVP_DIV_ITER

/* Bit-reverse x as a 64-bit value and shift right by 31: bit i of x (for
 * i in 0..32) ends up in bit 32 - i; higher bits of x are discarded. */
#define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
	/* rk01[] pair: multipliers for the 16-bytes-per-step fold loop. */
	RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
	RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
	/* rk05[] pair: multipliers for the two narrowing steps after the loop.
	 * RK05 is derived from the same entry as RK01. */
	RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
	RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
	/* rk07[] pair: multipliers for the final (Barrett) reduction step. */
	RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
	RK08 = XNDIVP_RK08R,
#undef FIXUPCONSTANTS
};
|
|
|
162
|
|
|
163 __attribute__((__target__("vpclmulqdq")))
|
|
|
164 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
|
|
|
165 {
|
|
|
166 /* This actually works for 16-byte buffers too, but whether it's actually
|
|
|
167 * useful or faster is another question entirely */
|
|
|
168 if (sz >= 32) {
|
|
|
169 static const __attribute__((__aligned__(16))) uint64_t rk01[2] = {RK01, RK02},
|
|
|
170 rk05[2] = {RK05, RK06},
|
|
|
171 rk07[2] = {RK07, RK08},
|
|
|
172 mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
|
|
|
173 __m128i rk, msgxmm;
|
|
|
174
|
|
|
175 msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
|
|
|
176
|
|
|
177 rk = _mm_load_si128((__m128i *)rk01);
|
|
|
178
|
|
|
179 for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) {
|
|
|
180 msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10), _mm_clmulepi64_si128(msgxmm, rk, 0x01)), _mm_load_si128((__m128i *)msg));
|
|
|
181 }
|
|
|
182
|
|
|
183 rk = _mm_load_si128((__m128i *)rk05);
|
|
|
184
|
|
|
185 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
|
|
|
186
|
|
|
187 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
|
|
|
188
|
|
|
189 /* Barrett Reduction */
|
|
|
190 rk = _mm_load_si128((__m128i *)rk07);
|
|
|
191 msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
|
|
|
192
|
|
|
193 crc = _mm_extract_epi32(msgxmm, 2);
|
|
|
194 }
|
|
|
195
|
|
|
196 if (!sz) return crc;
|
|
|
197
|
|
|
198 /* We were already aligned on a 16-byte boundary going in (hopefully
|
|
|
199 * or else it will break), and we process 16-bytes at a time. This
|
|
|
200 * means `msg` is aligned 16-bytes, a multiple of 4-byte, so we don't
|
|
|
201 * need to align any more (or use crc32c_r). */
|
|
|
202 return crc32qw_r(crc, msg, sz);
|
|
|
203 }
|