comparison src/impl/arm/neon.c @ 23:e26874655738

*: huge refactor, new major release (hahaha) I keep finding things that are broken... The problem NOW was that vec would unintentionally compile some functions with extended instruction sets enabled, which is bad: the compiler was then free to emit those instructions anywhere, so for all intents and purposes the runtime CPU detection was completely broken. Now vec is no longer header-only either. Boohoo. However, this gives vec a lot more flexibility, since we no longer want or need to care about C++ crap. The NEON and AltiVec implementations have not been updated yet, which means they won't compile; that's why they're commented out in the CMake build file.
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 02:52:40 -0500
parents 22:fbcd3fa6f8fc
children d00b95f95dd1
1 /**
2 * vec - a tiny SIMD vector library in C99
3 *
4 * Copyright (c) 2024 Paper
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 **/
24
25 #include "vec/impl/arm/neon.h"
26
27 #include <arm_neon.h>
28
29 // There is LOTS of preprocessor hacking here (as if the other files
30 // weren't bad enough... lol)
31
32 #define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
33 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
34 { \
35 v##sign##int##bits##x##size vec; \
36 vec.neon = vld1_##sign##bits(in); \
37 return vec; \
38 } \
39 \
40 static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
41 { \
42 vstore_lane_##bits(sign, vec.neon, out); \
43 } \
44 \
45 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
46 { \
47 v##sign##int##bits##x##size vec; \
48 vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
49 return vec; \
50 } \
51 \
52 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
53 { \
54 v##sign##int##bits##x##size vec; \
55 vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
56 return vec; \
57 } \
58 \
59 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
60 { \
61 v##sign##int##bits##x##size vec; \
62 vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
63 return vec; \
64 } \
65 \
66 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
67 { \
68 v##sign##int##bits##x##size vec; \
69 vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
70 return vec; \
71 } \
72 \
73 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
74 { \
75 v##sign##int##bits##x##size vec; \
76 vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
77 return vec; \
78 } \
79 \
80 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
81 { \
82 v##sign##int##bits##x##size vec; \
83 vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
84 return vec; \
85 } \
86 \
87 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
88 { \
89 v##sign##int##bits##x##size vec; \
90 vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
91 return vec; \
92 } \
93 \
94 static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
95 /* .splat = */ NULL, \
96 v##sign##int##bits##x##size##_neon_load_aligned, \
97 v##sign##int##bits##x##size##_neon_load_aligned, \
98 v##sign##int##bits##x##size##_neon_store_aligned, \
99 v##sign##int##bits##x##size##_neon_store_aligned, \
100 v##sign##int##bits##x##size##_neon_add, \
101 v##sign##int##bits##x##size##_neon_sub, \
102 v##sign##int##bits##x##size##_neon_mul, \
103 /* .div = */ NULL, \
104 /* .avg = */ NULL, \
105 v##sign##int##bits##x##size##_neon_and, \
106 v##sign##int##bits##x##size##_neon_or, \
107 v##sign##int##bits##x##size##_neon_xor, \
108 /* .not = */ NULL, \
109 v##sign##int##bits##x##size##_neon_lshift, \
110 /* .rshift = */ NULL, \
111 /* .lrshift = */ NULL, \
112 };
113
114 #define VEC_DEFINE_OPERATIONS(bits, size) \
115 VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \
116 VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
117
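/* Illustrative only, not part of the build: expanding
 * VEC_DEFINE_OPERATIONS_SIGN(u, U, 8, 8) pastes together, among others,
 * a function of this shape (assuming, per the header, that vuint8x8
 * wraps a uint8x8_t member named .neon):
 *
 *     static vuint8x8 vuint8x8_neon_add(vuint8x8 vec1, vuint8x8 vec2)
 *     {
 *         vuint8x8 vec;
 *         vec.neon = vadd_u8(vec1.neon, vec2.neon);
 *         return vec;
 *     }
 *
 * plus a vuint8x8_impl table of function pointers, with NULL left in the
 * slots (splat, div, avg, not, rshift, lrshift) that this backend does
 * not implement.
 */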
118 // Ok, we'll start out with the 64-bit (d-register) types. Only the signed intrinsics need aliases here: the unsigned names the macro pastes together (vadd_u8, vld1_u8, ...) already exist, while e.g. "vadd_8" has to be mapped onto vadd_s8.
119
120 #define vadd_8 vadd_s8
121 #define vadd_16 vadd_s16
122 #define vadd_32 vadd_s32
123 #define vsub_8 vsub_s8
124 #define vsub_16 vsub_s16
125 #define vsub_32 vsub_s32
126 #define vmul_8 vmul_s8
127 #define vmul_16 vmul_s16
128 #define vmul_32 vmul_s32
129 #define vshl_8 vshl_s8
130 #define vshl_16 vshl_s16
131 #define vshl_32 vshl_s32
132 #define veor_8 veor_s8
133 #define veor_16 veor_s16
134 #define veor_32 veor_s32
135 #define vorr_8 vorr_s8
136 #define vorr_16 vorr_s16
137 #define vorr_32 vorr_s32
138 #define vand_8 vand_s8
139 #define vand_16 vand_s16
140 #define vand_32 vand_s32
141 #define vld1_8 vld1_s8
142 #define vld1_16 vld1_s16
143 #define vld1_32 vld1_s32
144 #define vget_lane_8 vget_lane_s8
145 #define vget_lane_16 vget_lane_s16
146 #define vget_lane_32 vget_lane_s32
147 #define vstore_lane_8(sign, vec, out) \
148 do { \
149 out[0] = vget_lane_##sign##8(vec, 0); \
150 out[1] = vget_lane_##sign##8(vec, 1); \
151 out[2] = vget_lane_##sign##8(vec, 2); \
152 out[3] = vget_lane_##sign##8(vec, 3); \
153 out[4] = vget_lane_##sign##8(vec, 4); \
154 out[5] = vget_lane_##sign##8(vec, 5); \
155 out[6] = vget_lane_##sign##8(vec, 6); \
156 out[7] = vget_lane_##sign##8(vec, 7); \
157 } while (0)
158 #define vstore_lane_16(sign, vec, out) \
159 do { \
160 out[0] = vget_lane_##sign##16(vec, 0); \
161 out[1] = vget_lane_##sign##16(vec, 1); \
162 out[2] = vget_lane_##sign##16(vec, 2); \
163 out[3] = vget_lane_##sign##16(vec, 3); \
164 } while (0)
165 #define vstore_lane_32(sign, vec, out) \
166 do { \
167 out[0] = vget_lane_##sign##32(vec, 0); \
168 out[1] = vget_lane_##sign##32(vec, 1); \
169 } while (0)
170 #define vreinterpret_8_u8(x) vreinterpret_s8_u8(x)
171 #define vreinterpret_16_u16(x) vreinterpret_s16_u16(x)
172 #define vreinterpret_32_u32(x) vreinterpret_s32_u32(x)
173
174 VEC_DEFINE_OPERATIONS(8, 8)
175 VEC_DEFINE_OPERATIONS(16, 4)
176 VEC_DEFINE_OPERATIONS(32, 2)
177
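/* A quick round-trip sketch through the generated 64-bit-wide functions
 * (illustrative, not compiled here; vec_int32 and vint32x2 come from the
 * header):
 *
 *     vec_int32 a[2] = {2, 3}, b[2] = {10, 20}, out[2];
 *     vint32x2 va = vint32x2_neon_load_aligned(a);
 *     vint32x2 vb = vint32x2_neon_load_aligned(b);
 *     vint32x2_neon_store_aligned(vint32x2_neon_mul(va, vb), out);
 *     // out now holds {20, 60}
 */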
178 #undef vadd_8
179 #undef vadd_16
180 #undef vadd_32
181 #undef vsub_8
182 #undef vsub_16
183 #undef vsub_32
184 #undef vmul_8
185 #undef vmul_16
186 #undef vmul_32
187 #undef vshl_8
188 #undef vshl_16
189 #undef vshl_32
190 #undef veor_8
191 #undef veor_16
192 #undef veor_32
193 #undef vorr_8
194 #undef vorr_16
195 #undef vorr_32
196 #undef vand_8
197 #undef vand_16
198 #undef vand_32
199 #undef vld1_8
200 #undef vld1_16
201 #undef vld1_32
202 #undef vget_lane_8
203 #undef vget_lane_16
204 #undef vget_lane_32
205 #undef vstore_lane_8
206 #undef vstore_lane_16
207 #undef vstore_lane_32
208 #undef vreinterpret_8_u8
209 #undef vreinterpret_16_u16
210 #undef vreinterpret_32_u32
211
212 ///////////////////////////////////////////////////////////////////////////////
213 // 128-bit
214
215 // Now we can go ahead and do the 128-bit ones.
216
217 // NEON doesn't have native 64-bit multiplication, so we have
218 // to do it ourselves
219 static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b)
220 {
221 const uint32x2_t ac = vreinterpret_u32_s32(vmovn_s64(a)); /* low 32 bits of each lane of a */
222 const uint32x2_t pr = vreinterpret_u32_s32(vmovn_s64(b)); /* low 32 bits of each lane of b */
223 
224 const uint32x4_t hi = vmulq_u32(vrev64q_u32(vreinterpretq_u32_s64(b)), vreinterpretq_u32_s64(a)); /* swap b's halves so this yields the cross products a_lo*b_hi and a_hi*b_lo */
225 
226 return vreinterpretq_s64_u64(vmlal_u32(vshlq_n_u64(vpaddlq_u32(hi), 32), ac, pr)); /* (cross sum << 32) + a_lo*b_lo */
227 }
228
229 static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b)
230 {
231 const uint32x2_t ac = vmovn_u64(a); /* low 32 bits of each lane of a */
232 const uint32x2_t pr = vmovn_u64(b); /* low 32 bits of each lane of b */
233 
234 const uint32x4_t hi = vmulq_u32(vrev64q_u32(vreinterpretq_u32_u64(b)), vreinterpretq_u32_u64(a)); /* cross products a_lo*b_hi and a_hi*b_lo */
235 
236 return vmlal_u32(vshlq_n_u64(vpaddlq_u32(hi), 32), ac, pr); /* (cross sum << 32) + a_lo*b_lo */
237 }
238
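/* Why the helpers above work (a sketch): write each 64-bit lane as
 * a = a_hi*2^32 + a_lo and b = b_hi*2^32 + b_lo. Then, modulo 2^64,
 *
 *     a*b == a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32)
 *
 * since the a_hi*b_hi term is shifted out entirely. vrev64q_u32 swaps
 * the 32-bit halves of b so a single vmulq_u32 produces both cross
 * products per lane, vpaddlq_u32 sums each pair into 64 bits, and
 * vmlal_u32 adds the full widening a_lo*b_lo product. A scalar
 * reference for checking the lane arithmetic:
 *
 *     static uint64_t mul64_ref(uint64_t a, uint64_t b)
 *     {
 *         uint64_t lo = (a & 0xFFFFFFFFu) * (b & 0xFFFFFFFFu);
 *         uint64_t cross = (a & 0xFFFFFFFFu) * (b >> 32)
 *                        + (a >> 32) * (b & 0xFFFFFFFFu);
 *         return lo + (cross << 32); // wraps mod 2^64, like the vector code
 *     }
 */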
239 #define vadd_8 vaddq_s8
240 #define vadd_16 vaddq_s16
241 #define vadd_32 vaddq_s32
242 #define vadd_64 vaddq_s64
243 #define vadd_u8 vaddq_u8
244 #define vadd_u16 vaddq_u16
245 #define vadd_u32 vaddq_u32
246 #define vadd_u64 vaddq_u64
247 #define vsub_8 vsubq_s8
248 #define vsub_16 vsubq_s16
249 #define vsub_32 vsubq_s32
250 #define vsub_64 vsubq_s64
251 #define vsub_u8 vsubq_u8
252 #define vsub_u16 vsubq_u16
253 #define vsub_u32 vsubq_u32
254 #define vsub_u64 vsubq_u64
255 #define vmul_8 vmulq_s8
256 #define vmul_16 vmulq_s16
257 #define vmul_32 vmulq_s32
258 #define vmul_64 vmulq_s64
259 #define vmul_u8 vmulq_u8
260 #define vmul_u16 vmulq_u16
261 #define vmul_u32 vmulq_u32
262 #define vmul_u64 vmulq_u64
263 #define vshl_8 vshlq_s8
264 #define vshl_16 vshlq_s16
265 #define vshl_32 vshlq_s32
266 #define vshl_64 vshlq_s64
267 #define vshl_u8 vshlq_u8
268 #define vshl_u16 vshlq_u16
269 #define vshl_u32 vshlq_u32
270 #define vshl_u64 vshlq_u64
271 #define veor_8 veorq_s8
272 #define veor_16 veorq_s16
273 #define veor_32 veorq_s32
274 #define veor_64 veorq_s64
275 #define veor_u8 veorq_u8
276 #define veor_u16 veorq_u16
277 #define veor_u32 veorq_u32
278 #define veor_u64 veorq_u64
279 #define vorr_8 vorrq_s8
280 #define vorr_16 vorrq_s16
281 #define vorr_32 vorrq_s32
282 #define vorr_64 vorrq_s64
283 #define vorr_u8 vorrq_u8
284 #define vorr_u16 vorrq_u16
285 #define vorr_u32 vorrq_u32
286 #define vorr_u64 vorrq_u64
287 #define vand_8 vandq_s8
288 #define vand_16 vandq_s16
289 #define vand_32 vandq_s32
290 #define vand_64 vandq_s64
291 #define vand_u8 vandq_u8
292 #define vand_u16 vandq_u16
293 #define vand_u32 vandq_u32
294 #define vand_u64 vandq_u64
295 #define vld1_8 vld1q_s8
296 #define vld1_16 vld1q_s16
297 #define vld1_32 vld1q_s32
298 #define vld1_64 vld1q_s64
299 #define vld1_u8 vld1q_u8
300 #define vld1_u16 vld1q_u16
301 #define vld1_u32 vld1q_u32
302 #define vld1_u64 vld1q_u64
303 #define vget_lane_8 vgetq_lane_s8
304 #define vget_lane_16 vgetq_lane_s16
305 #define vget_lane_32 vgetq_lane_s32
306 #define vget_lane_64 vgetq_lane_s64
307 #define vget_lane_u8 vgetq_lane_u8
308 #define vget_lane_u16 vgetq_lane_u16
309 #define vget_lane_u32 vgetq_lane_u32
310 #define vget_lane_u64 vgetq_lane_u64
311 #define vstore_lane_8(sign, vec, out) \
312 do { \
313 out[0] = vget_lane_##sign##8(vec, 0); \
314 out[1] = vget_lane_##sign##8(vec, 1); \
315 out[2] = vget_lane_##sign##8(vec, 2); \
316 out[3] = vget_lane_##sign##8(vec, 3); \
317 out[4] = vget_lane_##sign##8(vec, 4); \
318 out[5] = vget_lane_##sign##8(vec, 5); \
319 out[6] = vget_lane_##sign##8(vec, 6); \
320 out[7] = vget_lane_##sign##8(vec, 7); \
321 out[8] = vget_lane_##sign##8(vec, 8); \
322 out[9] = vget_lane_##sign##8(vec, 9); \
323 out[10] = vget_lane_##sign##8(vec, 10); \
324 out[11] = vget_lane_##sign##8(vec, 11); \
325 out[12] = vget_lane_##sign##8(vec, 12); \
326 out[13] = vget_lane_##sign##8(vec, 13); \
327 out[14] = vget_lane_##sign##8(vec, 14); \
328 out[15] = vget_lane_##sign##8(vec, 15); \
329 } while (0)
330 #define vstore_lane_16(sign, vec, out) \
331 do { \
332 out[0] = vget_lane_##sign##16(vec, 0); \
333 out[1] = vget_lane_##sign##16(vec, 1); \
334 out[2] = vget_lane_##sign##16(vec, 2); \
335 out[3] = vget_lane_##sign##16(vec, 3); \
336 out[4] = vget_lane_##sign##16(vec, 4); \
337 out[5] = vget_lane_##sign##16(vec, 5); \
338 out[6] = vget_lane_##sign##16(vec, 6); \
339 out[7] = vget_lane_##sign##16(vec, 7); \
340 } while (0)
341 #define vstore_lane_32(sign, vec, out) \
342 do { \
343 out[0] = vget_lane_##sign##32(vec, 0); \
344 out[1] = vget_lane_##sign##32(vec, 1); \
345 out[2] = vget_lane_##sign##32(vec, 2); \
346 out[3] = vget_lane_##sign##32(vec, 3); \
347 } while (0)
348 #define vstore_lane_64(sign, vec, out) \
349 do { \
350 out[0] = vget_lane_##sign##64(vec, 0); \
351 out[1] = vget_lane_##sign##64(vec, 1); \
352 } while (0)
353 #define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x)
354 #define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x)
355 #define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x)
356 #define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x)
357 
// Redefine the operation macros for the 128-bit types; #undef the old
// names first so the (otherwise identical) redefinition is unambiguous.
#undef VEC_DEFINE_OPERATIONS_SIGN
#undef VEC_DEFINE_OPERATIONS
358 #define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
359 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
360 { \
361 v##sign##int##bits##x##size vec; \
362 vec.neon = vld1_##sign##bits(in); \
363 return vec; \
364 } \
365 \
366 static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
367 { \
368 vstore_lane_##bits(sign, vec.neon, out); \
369 } \
370 \
371 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
372 { \
373 v##sign##int##bits##x##size vec; \
374 vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
375 return vec; \
376 } \
377 \
378 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
379 { \
380 v##sign##int##bits##x##size vec; \
381 vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
382 return vec; \
383 } \
384 \
385 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
386 { \
387 v##sign##int##bits##x##size vec; \
388 vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
389 return vec; \
390 } \
391 \
392 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
393 { \
394 v##sign##int##bits##x##size vec; \
395 vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
396 return vec; \
397 } \
398 \
399 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
400 { \
401 v##sign##int##bits##x##size vec; \
402 vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
403 return vec; \
404 } \
405 \
406 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
407 { \
408 v##sign##int##bits##x##size vec; \
409 vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
410 return vec; \
411 } \
412 \
413 static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
414 { \
415 v##sign##int##bits##x##size vec; \
416 vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
417 return vec; \
418 } \
419 \
420 static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
421 /* .splat = */ NULL, \
422 v##sign##int##bits##x##size##_neon_load_aligned, \
423 v##sign##int##bits##x##size##_neon_load_aligned, \
424 v##sign##int##bits##x##size##_neon_store_aligned, \
425 v##sign##int##bits##x##size##_neon_store_aligned, \
426 v##sign##int##bits##x##size##_neon_add, \
427 v##sign##int##bits##x##size##_neon_sub, \
428 v##sign##int##bits##x##size##_neon_mul, \
429 /* .div = */ NULL, \
430 /* .avg = */ NULL, \
431 v##sign##int##bits##x##size##_neon_and, \
432 v##sign##int##bits##x##size##_neon_or, \
433 v##sign##int##bits##x##size##_neon_xor, \
434 /* .not = */ NULL, \
435 v##sign##int##bits##x##size##_neon_lshift, \
436 /* .rshift = */ NULL, \
437 /* .lrshift = */ NULL, \
438 };
439
440 #define VEC_DEFINE_OPERATIONS(bits, size) \
441 VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \
442 VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
443
444 VEC_DEFINE_OPERATIONS(8, 16)
445 VEC_DEFINE_OPERATIONS(16, 8)
446 VEC_DEFINE_OPERATIONS(32, 4)
447 VEC_DEFINE_OPERATIONS(64, 2)
448
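/* Illustrative check of the emulated 64-bit multiply through the
 * generated functions (not compiled here; vec_uint64 and vuint64x2 come
 * from the header):
 *
 *     vec_uint64 a[2] = {0x100000001ULL, 3}, b[2] = {0x200000002ULL, 7}, out[2];
 *     vuint64x2 va = vuint64x2_neon_load_aligned(a);
 *     vuint64x2 vb = vuint64x2_neon_load_aligned(b);
 *     vuint64x2_neon_store_aligned(vuint64x2_neon_mul(va, vb), out);
 *     // out[0] == 0x400000002ULL (the product mod 2^64), out[1] == 21
 */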
449 #undef vadd_8
450 #undef vadd_16
451 #undef vadd_32
452 #undef vadd_64
453 #undef vsub_8
454 #undef vsub_16
455 #undef vsub_32
456 #undef vsub_64
457 #undef vmul_8
458 #undef vmul_16
459 #undef vmul_32
460 #undef vmul_64
461 #undef vshl_8
462 #undef vshl_16
463 #undef vshl_32
464 #undef vshl_64
465 #undef veor_8
466 #undef veor_16
467 #undef veor_32
468 #undef veor_64
469 #undef vorr_8
470 #undef vorr_16
471 #undef vorr_32
472 #undef vorr_64
473 #undef vand_8
474 #undef vand_16
475 #undef vand_32
476 #undef vand_64
477 #undef vld1_8
478 #undef vld1_16
479 #undef vld1_32
480 #undef vld1_64
481 #undef vget_lane_8
482 #undef vget_lane_16
483 #undef vget_lane_32
484 #undef vget_lane_64
485 #undef vstore_lane_8
486 #undef vstore_lane_16
487 #undef vstore_lane_32
488 #undef vstore_lane_64