Mercurial > vec
comparison src/impl/x86/avx512f.c @ 23:e26874655738
*: huge refactor, new major release (hahaha)
I keep finding things that are broken...
The problem NOW was that vec would unintentionally build some
functions with extended instruction sets, which is Bad and would
mean that for all intents and purposes the CPU detection was
completely broken.
Now vec is no longer header only either. Boohoo. However this gives
a lot more flexibility to vec since we no longer want or need to
care about C++ crap.
The NEON and Altivec implementations have not been updated which
means they won't compile hence why they're commented out in the
cmake build file.
author | Paper <paper@tflc.us> |
---|---|
date | Sun, 24 Nov 2024 02:52:40 -0500 |
parents | |
children | e49e70f7012f |
comparison
equal
deleted
inserted
replaced
22:fbcd3fa6f8fc | 23:e26874655738 |
---|---|
1 /** | |
2 * vec - a tiny SIMD vector library in C99 | |
3 * | |
4 * Copyright (c) 2024 Paper | |
5 * | |
6 * Permission is hereby granted, free of charge, to any person obtaining a copy | |
7 * of this software and associated documentation files (the "Software"), to deal | |
8 * in the Software without restriction, including without limitation the rights | |
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 * copies of the Software, and to permit persons to whom the Software is | |
11 * furnished to do so, subject to the following conditions: | |
12 * | |
13 * The above copyright notice and this permission notice shall be included in all | |
14 * copies or substantial portions of the Software. | |
15 * | |
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 * SOFTWARE. | |
23 **/ | |
24 | |
25 #include "vec/impl/x86/avx512f.h" | |
26 #include "vec/impl/generic.h" | |
27 | |
28 #include <immintrin.h> | |
29 | |
// A stupid amount of work just to emulate 8-bit lane operations on top of
// 32-bit AVX512F instructions. As the matching note in avx2.c says, sign
// bits are NOT handled properly here; that is mostly harmless for plain
// arithmetic, but matters a lot for things like arithmetic shifts.
//
// Each 32-bit lane holds four 8-bit elements. Every byte position is
// widened into its own zero-extended 32-bit vector (an slli/srli pair both
// moves and masks the byte), `op` is applied per position, and the four
// results are masked back into place and OR'd together.
#define VEC_AVX512F_OPERATION_8x64(op, sign) \
	do { \
		union v##sign##int8x64_impl_data *v1 = (union v##sign##int8x64_impl_data *)&vec1; \
		union v##sign##int8x64_impl_data *v2 = (union v##sign##int8x64_impl_data *)&vec2; \
	\
		/* widen each byte position and apply the operation */ \
		__m512i b0 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(v1->avx512f, 24), 24), _mm512_srli_epi32(_mm512_slli_epi32(v2->avx512f, 24), 24)); \
		__m512i b1 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(v1->avx512f, 16), 24), _mm512_srli_epi32(_mm512_slli_epi32(v2->avx512f, 16), 24)); \
		__m512i b2 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(v1->avx512f, 8), 24), _mm512_srli_epi32(_mm512_slli_epi32(v2->avx512f, 8), 24)); \
		__m512i b3 = _mm512_##op##_epi32(_mm512_srli_epi32(v1->avx512f, 24), _mm512_srli_epi32(v2->avx512f, 24)); \
	\
		/* mask each result back down to its byte position and recombine */ \
		v1->avx512f = _mm512_or_si512( \
			_mm512_or_si512( \
				_mm512_srli_epi32(_mm512_slli_epi32(b0, 24), 24), \
				_mm512_srli_epi32(_mm512_slli_epi32(b1, 24), 16) \
			), \
			_mm512_or_si512( \
				_mm512_srli_epi32(_mm512_slli_epi32(b2, 24), 8), \
				_mm512_slli_epi32(b3, 24) \
			) \
		); \
	\
		return v1->vec; \
	} while (0)
59 | |
// Emulates 16-bit lane operations on top of 32-bit AVX512F instructions.
// Each 32-bit lane holds two 16-bit elements: mask off the low halves,
// shift down the high halves, apply `op` to each widened vector, then
// recombine. The original code masked with an slli/srli pair and wished
// for an "_m512_andi_epi32"; that is just _mm512_and_si512 against a
// _mm512_set1_epi32 constant (both AVX512F), which is clearer and avoids
// the double shift. Same sign-bit caveat as VEC_AVX512F_OPERATION_8x64.
#define VEC_AVX512F_OPERATION_16x32(op, sign) \
	do { \
		union v##sign##int16x32_impl_data *vec1d = (union v##sign##int16x32_impl_data *)&vec1; \
		union v##sign##int16x32_impl_data *vec2d = (union v##sign##int16x32_impl_data *)&vec2; \
		const __m512i mask_lo16 = _mm512_set1_epi32(0x0000FFFF); \
	\
		/* unpack and operate */ \
		__m512i dst_1 = _mm512_##op##_epi32(_mm512_and_si512(vec1d->avx512f, mask_lo16), _mm512_and_si512(vec2d->avx512f, mask_lo16)); \
		__m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1d->avx512f, 16), _mm512_srli_epi32(vec2d->avx512f, 16)); \
	\
		/* repack */ \
		vec1d->avx512f = _mm512_or_si512( \
			_mm512_and_si512(dst_1, mask_lo16), \
			_mm512_slli_epi32(dst_2, 16) \
		); \
		return vec1d->vec; \
	} while (0)
76 | |
// Addition. 8- and 16-bit lanes go through the widening emulation above;
// 32- and 64-bit lanes map directly onto native AVX512F adds.
#define VEC_AVX512F_ADD_8x64(sign) \
	VEC_AVX512F_OPERATION_8x64(add, sign)

#define VEC_AVX512F_ADD_16x32(sign) \
	VEC_AVX512F_OPERATION_16x32(add, sign)

#define VEC_AVX512F_ADD_32x16(sign) \
	do { \
		union v##sign##int32x16_impl_data *v1 = (union v##sign##int32x16_impl_data *)&vec1; \
		union v##sign##int32x16_impl_data *v2 = (union v##sign##int32x16_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_add_epi32(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)

#define VEC_AVX512F_ADD_64x8(sign) \
	do { \
		union v##sign##int64x8_impl_data *v1 = (union v##sign##int64x8_impl_data *)&vec1; \
		union v##sign##int64x8_impl_data *v2 = (union v##sign##int64x8_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_add_epi64(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)
100 | |
// Subtraction. Same structure as the ADD family: emulate 8/16-bit lanes,
// use the native instruction for 32/64-bit lanes.
#define VEC_AVX512F_SUB_8x64(sign) \
	VEC_AVX512F_OPERATION_8x64(sub, sign)

#define VEC_AVX512F_SUB_16x32(sign) \
	VEC_AVX512F_OPERATION_16x32(sub, sign)

#define VEC_AVX512F_SUB_32x16(sign) \
	do { \
		union v##sign##int32x16_impl_data *v1 = (union v##sign##int32x16_impl_data *)&vec1; \
		union v##sign##int32x16_impl_data *v2 = (union v##sign##int32x16_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_sub_epi32(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)

#define VEC_AVX512F_SUB_64x8(sign) \
	do { \
		union v##sign##int64x8_impl_data *v1 = (union v##sign##int64x8_impl_data *)&vec1; \
		union v##sign##int64x8_impl_data *v2 = (union v##sign##int64x8_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_sub_epi64(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)
124 | |
// Multiplication. 8/16-bit lanes use the widening emulation with mullo;
// 32-bit lanes have a native low multiply. AVX512F has no 64-bit mullo,
// so the 64x8 case builds the low 64 bits of each product by schoolbook
// decomposition: with a = hi(x)<<32 | lo(x) and b = hi(y)<<32 | lo(y),
//   lo64(x*y) = lo(x)*lo(y) + ((hi(x)*lo(y) + lo(x)*hi(y)) << 32)
// where each partial product comes from the 32x32->64 _mm512_mul_epu32.
#define VEC_AVX512F_MUL_8x64(sign) \
	VEC_AVX512F_OPERATION_8x64(mullo, sign)

#define VEC_AVX512F_MUL_16x32(sign) \
	VEC_AVX512F_OPERATION_16x32(mullo, sign)

#define VEC_AVX512F_MUL_32x16(sign) \
	do { \
		union v##sign##int32x16_impl_data *v1 = (union v##sign##int32x16_impl_data *)&vec1; \
		union v##sign##int32x16_impl_data *v2 = (union v##sign##int32x16_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_mullo_epi32(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)

#define VEC_AVX512F_MUL_64x8(sign) \
	do { \
		union v##sign##int64x8_impl_data *v1 = (union v##sign##int64x8_impl_data *)&vec1; \
		union v##sign##int64x8_impl_data *v2 = (union v##sign##int64x8_impl_data *)&vec2; \
	\
		__m512i lo_lo = _mm512_mul_epu32(v1->avx512f, v2->avx512f); \
		__m512i a_hi = _mm512_srli_epi64(v1->avx512f, 32); \
		__m512i hi_lo = _mm512_mul_epu32(a_hi, v2->avx512f); \
		__m512i b_hi = _mm512_srli_epi64(v2->avx512f, 32); \
		__m512i lo_hi = _mm512_mul_epu32(v1->avx512f, b_hi); \
		__m512i cross = _mm512_add_epi64(hi_lo, lo_hi); \
		cross = _mm512_slli_epi64(cross, 32); \
	\
		v1->avx512f = _mm512_add_epi64(cross, lo_lo); \
		return v1->vec; \
	} while (0)
156 | |
// Left shifts by per-lane counts. 8/16-bit lanes piggyback on the widening
// emulation with sllv (the unpack masks the shift counts along with the
// values); 32/64-bit lanes use the native variable-shift instructions.
#define VEC_AVX512F_LSHIFT_8x64(sign) \
	VEC_AVX512F_OPERATION_8x64(sllv, sign)

#define VEC_AVX512F_LSHIFT_16x32(sign) \
	VEC_AVX512F_OPERATION_16x32(sllv, sign)

#define VEC_AVX512F_LSHIFT_32x16(sign) \
	do { \
		union v##sign##int32x16_impl_data *v1 = (union v##sign##int32x16_impl_data *)&vec1; \
		union v##sign##int32x16_impl_data *v2 = (union v##sign##int32x16_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_sllv_epi32(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)

#define VEC_AVX512F_LSHIFT_64x8(sign) \
	do { \
		union v##sign##int64x8_impl_data *v1 = (union v##sign##int64x8_impl_data *)&vec1; \
		union v##sign##int64x8_impl_data *v2 = (union v##sign##int64x8_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_sllv_epi64(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)
180 | |
// Right shifts. The lowercase 'l'/'a' prefix in these macro names is a
// token-paste hook: VEC_AVX512F_RSHIFT_NxM(sign, aORl) glues aORl onto
// "RSHIFT" (for the emulated widths) or into "sr##aORl##v" (for the
// native widths) to select the logical or arithmetic variant.

// Logical (zero-fill) right shifts for the emulated widths reuse the
// widening helpers with srlv.
#define VEC_AVX512F_lRSHIFT_8x64(sign) \
	VEC_AVX512F_OPERATION_8x64(srlv, sign)

#define VEC_AVX512F_lRSHIFT_16x32(sign) \
	VEC_AVX512F_OPERATION_16x32(srlv, sign)

// Arithmetic right shifts can't go through the widening trick (it doesn't
// preserve sign bits), so fall back to the generic implementation.
#define VEC_AVX512F_aRSHIFT_8x64(sign) \
	do { \
		return v##sign##int8x64_generic_rshift(vec1, vec2); \
	} while (0)

#define VEC_AVX512F_aRSHIFT_16x32(sign) \
	do { \
		return v##sign##int16x32_generic_rshift(vec1, vec2); \
	} while (0)

#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) VEC_AVX512F_##aORl##RSHIFT_8x64(sign)
#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) VEC_AVX512F_##aORl##RSHIFT_16x32(sign)

// Native widths: aORl pastes directly into the intrinsic name (srav/srlv).
#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \
	do { \
		union v##sign##int32x16_impl_data *v1 = (union v##sign##int32x16_impl_data *)&vec1; \
		union v##sign##int32x16_impl_data *v2 = (union v##sign##int32x16_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_sr##aORl##v_epi32(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)

#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \
	do { \
		union v##sign##int64x8_impl_data *v1 = (union v##sign##int64x8_impl_data *)&vec1; \
		union v##sign##int64x8_impl_data *v2 = (union v##sign##int64x8_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_sr##aORl##v_epi64(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} while (0)

// For unsigned types an "arithmetic" shift is just a logical shift, so
// the uRSHIFT variants ignore aORl and force 'l'.
#define VEC_AVX512F_uRSHIFT_8x64(sign, aORl) VEC_AVX512F_RSHIFT_8x64(sign, l)
#define VEC_AVX512F_uRSHIFT_16x32(sign, aORl) VEC_AVX512F_RSHIFT_16x32(sign, l)
#define VEC_AVX512F_uRSHIFT_32x16(sign, aORl) VEC_AVX512F_RSHIFT_32x16(sign, l)
#define VEC_AVX512F_uRSHIFT_64x8(sign, aORl) VEC_AVX512F_RSHIFT_64x8(sign, l)
222 | |
// Expands to the complete AVX512F backend for one (signedness, element
// width, lane count) combination: the impl_data union punning between the
// vec type and __m512i, load/store wrappers, arithmetic/bitwise/shift
// wrappers, and the populated implementation table. Anything AVX512F
// cannot accelerate (splat, div, avg, not, comparisons) is delegated to
// the generic implementations.
//
// NOTE(review): punning by casting &vec to the union pointer type is a
// strict-aliasing violation in ISO C; GCC/Clang document union-based
// punning, but confirm the build uses -fno-strict-aliasing or equivalent.
#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	union v##sign##int##bits##x##size##_impl_data { \
		v##sign##int##bits##x##size vec; \
		__m512i avx512f; \
	}; \
	\
	/* loads: in must be 64-byte aligned for the _aligned variant */ \
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \
	{ \
		union v##sign##int##bits##x##size##_impl_data u; \
		u.avx512f = _mm512_load_si512((const __m512i *)in); \
		return u.vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \
	{ \
		union v##sign##int##bits##x##size##_impl_data u; \
		u.avx512f = _mm512_loadu_si512((const __m512i *)in); \
		return u.vec; \
	} \
	\
	/* stores: out must be 64-byte aligned for the _aligned variant */ \
	static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		_mm512_store_si512((__m512i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->avx512f); \
	} \
	\
	static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		_mm512_storeu_si512((__m512i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->avx512f); \
	} \
	\
	/* arithmetic: dispatched per element width by the macro families above */ \
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		VEC_AVX512F_ADD_##bits##x##size(sign); \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		VEC_AVX512F_SUB_##bits##x##size(sign); \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		VEC_AVX512F_MUL_##bits##x##size(sign); \
	} \
	\
	/* bitwise ops are width-agnostic; use the 512-bit intrinsics directly */ \
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		union v##sign##int##bits##x##size##_impl_data *v1 = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
		union v##sign##int##bits##x##size##_impl_data *v2 = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_and_si512(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		union v##sign##int##bits##x##size##_impl_data *v1 = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
		union v##sign##int##bits##x##size##_impl_data *v2 = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_or_si512(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		union v##sign##int##bits##x##size##_impl_data *v1 = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
		union v##sign##int##bits##x##size##_impl_data *v2 = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
	\
		v1->avx512f = _mm512_xor_si512(v1->avx512f, v2->avx512f); \
		return v1->vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		VEC_AVX512F_LSHIFT_##bits##x##size(sign); \
	} \
	\
	/* rshift: pasting sign selects uRSHIFT (always logical) for unsigned \
	 * types and the plain arithmetic dispatch for signed types */ \
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		VEC_AVX512F_##sign##RSHIFT_##bits##x##size(sign, a); \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \
	} \
	\
	/* implementation table; entry order must match the _impl struct */ \
	const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \
		v##sign##int##bits##x##size##_generic_splat, \
		v##sign##int##bits##x##size##_avx512f_load_aligned, \
		v##sign##int##bits##x##size##_avx512f_load, \
		v##sign##int##bits##x##size##_avx512f_store_aligned, \
		v##sign##int##bits##x##size##_avx512f_store, \
		v##sign##int##bits##x##size##_avx512f_add, \
		v##sign##int##bits##x##size##_avx512f_sub, \
		v##sign##int##bits##x##size##_avx512f_mul, \
		v##sign##int##bits##x##size##_generic_div, \
		v##sign##int##bits##x##size##_generic_avg, \
		v##sign##int##bits##x##size##_avx512f_and, \
		v##sign##int##bits##x##size##_avx512f_or, \
		v##sign##int##bits##x##size##_avx512f_xor, \
		v##sign##int##bits##x##size##_generic_not, \
		v##sign##int##bits##x##size##_avx512f_lshift, \
		v##sign##int##bits##x##size##_avx512f_rshift, \
		v##sign##int##bits##x##size##_avx512f_lrshift, \
		v##sign##int##bits##x##size##_generic_cmplt, \
		v##sign##int##bits##x##size##_generic_cmple, \
		v##sign##int##bits##x##size##_generic_cmpeq, \
		v##sign##int##bits##x##size##_generic_cmpge, \
		v##sign##int##bits##x##size##_generic_cmpgt, \
	};
334 | |
335 #define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \ | |
336 VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size) \ | |
337 VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size) | |
338 | |
339 VEC_AVX512F_DEFINE_OPERATIONS(8, 64) | |
340 VEC_AVX512F_DEFINE_OPERATIONS(16, 32) | |
341 VEC_AVX512F_DEFINE_OPERATIONS(32, 16) | |
342 VEC_AVX512F_DEFINE_OPERATIONS(64, 8) |