vec: comparison of src/impl/arm/neon.c @ 23:e26874655738
*: huge refactor, new major release (hahaha)
I keep finding things that are broken...
The problem NOW was that vec would unintentionally build some
functions with extended instruction sets, which is Bad and would
mean that for all intents and purposes the CPU detection was
completely broken.
Now vec is no longer header-only either. Boohoo. However, this gives
a lot more flexibility to vec since we no longer want or need to
care about C++ crap.
The NEON and AltiVec implementations have not been updated, which
means they won't compile; that's why they're commented out in the
CMake build file.
author:   Paper <paper@tflc.us>
date:     Sun, 24 Nov 2024 02:52:40 -0500
parents:
children: d00b95f95dd1
comparison: 22:fbcd3fa6f8fc vs. 23:e26874655738
/**
 * vec - a tiny SIMD vector library in C99
 *
 * Copyright (c) 2024 Paper
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 **/

#include "vec/impl/arm/neon.h"

#include <arm_neon.h>

// There is LOTS of preprocessor hacking here (as if the other files
// weren't bad enough... lol)

#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vld1_##sign##bits(in); \
		return vec; \
	} \
	\
	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		vstore_lane_##bits(sign, vec.neon, out); \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
		/* .splat = */ NULL, \
		v##sign##int##bits##x##size##_neon_load_aligned, \
		v##sign##int##bits##x##size##_neon_load_aligned, \
		v##sign##int##bits##x##size##_neon_store_aligned, \
		v##sign##int##bits##x##size##_neon_store_aligned, \
		v##sign##int##bits##x##size##_neon_add, \
		v##sign##int##bits##x##size##_neon_sub, \
		v##sign##int##bits##x##size##_neon_mul, \
		/* .div = */ NULL, \
		/* .avg = */ NULL, \
		v##sign##int##bits##x##size##_neon_and, \
		v##sign##int##bits##x##size##_neon_or, \
		v##sign##int##bits##x##size##_neon_xor, \
		/* .not = */ NULL, \
		v##sign##int##bits##x##size##_neon_lshift, \
		/* .rshift = */ NULL, \
		/* .lrshift = */ NULL, \
	};
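
/* Each expansion of the macro above ends in a v*_impl structure: a table of
 * function pointers describing the operations this NEON backend provides.
 * The NULL slots (.splat, .div, .avg, .not, .rshift, .lrshift) are not
 * implemented here; presumably the dispatcher falls back to a generic
 * implementation for those (an assumption about the rest of the library,
 * not something visible in this file). */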

#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)

// Ok, we'll start out with the 64-bit types.

#define vadd_8 vadd_s8
#define vadd_16 vadd_s16
#define vadd_32 vadd_s32
#define vsub_8 vsub_s8
#define vsub_16 vsub_s16
#define vsub_32 vsub_s32
#define vmul_8 vmul_s8
#define vmul_16 vmul_s16
#define vmul_32 vmul_s32
#define vshl_8 vshl_s8
#define vshl_16 vshl_s16
#define vshl_32 vshl_s32
#define veor_8 veor_s8
#define veor_16 veor_s16
#define veor_32 veor_s32
#define vorr_8 vorr_s8
#define vorr_16 vorr_s16
#define vorr_32 vorr_s32
#define vand_8 vand_s8
#define vand_16 vand_s16
#define vand_32 vand_s32
#define vld1_8 vld1_s8
#define vld1_16 vld1_s16
#define vld1_32 vld1_s32
#define vget_lane_8 vget_lane_s8
#define vget_lane_16 vget_lane_s16
#define vget_lane_32 vget_lane_s32
#define vstore_lane_8(sign, vec, out) \
	do { \
		out[0] = vget_lane_##sign##8(vec, 0); \
		out[1] = vget_lane_##sign##8(vec, 1); \
		out[2] = vget_lane_##sign##8(vec, 2); \
		out[3] = vget_lane_##sign##8(vec, 3); \
		out[4] = vget_lane_##sign##8(vec, 4); \
		out[5] = vget_lane_##sign##8(vec, 5); \
		out[6] = vget_lane_##sign##8(vec, 6); \
		out[7] = vget_lane_##sign##8(vec, 7); \
	} while (0)
#define vstore_lane_16(sign, vec, out) \
	do { \
		out[0] = vget_lane_##sign##16(vec, 0); \
		out[1] = vget_lane_##sign##16(vec, 1); \
		out[2] = vget_lane_##sign##16(vec, 2); \
		out[3] = vget_lane_##sign##16(vec, 3); \
	} while (0)
#define vstore_lane_32(sign, vec, out) \
	do { \
		out[0] = vget_lane_##sign##32(vec, 0); \
		out[1] = vget_lane_##sign##32(vec, 1); \
	} while (0)
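
/* Note: the loads above go through a single vld1_* intrinsic, while these
 * store macros extract lane by lane with vget_lane_*. The single-instruction
 * counterpart for the store side would be a vst1_* store, e.g. (sketch only,
 * assuming the same union layout as the load path):
 *
 *     vst1_u8(out, vec.neon);  // write all eight uint8 lanes at once
 *
 * The lane-by-lane macros are what the implementations below actually use. */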
#define vreinterpret_8_u8(x) vreinterpret_s8_u8(x)
#define vreinterpret_16_u16(x) vreinterpret_s16_u16(x)
#define vreinterpret_32_u32(x) vreinterpret_s32_u32(x)

VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)
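
/* To make the token pasting a little less opaque: with the aliases above in
 * place, the unsigned 8-bit "add" generated by VEC_DEFINE_OPERATIONS(8, 8)
 * comes out roughly as follows (written out by hand as an illustration, not
 * additional code):
 *
 *     static vuint8x8 vuint8x8_neon_add(vuint8x8 vec1, vuint8x8 vec2)
 *     {
 *         vuint8x8 vec;
 *         vec.neon = vadd_u8(vec1.neon, vec2.neon);
 *         return vec;
 *     }
 *
 * The signed variant is identical except that the empty sign token selects
 * the vadd_8 alias, which maps to vadd_s8. */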

#undef vadd_8
#undef vadd_16
#undef vadd_32
#undef vsub_8
#undef vsub_16
#undef vsub_32
#undef vmul_8
#undef vmul_16
#undef vmul_32
#undef vshl_8
#undef vshl_16
#undef vshl_32
#undef veor_8
#undef veor_16
#undef veor_32
#undef vorr_8
#undef vorr_16
#undef vorr_32
#undef vand_8
#undef vand_16
#undef vand_32
#undef vld1_8
#undef vld1_16
#undef vld1_32
#undef vget_lane_8
#undef vget_lane_16
#undef vget_lane_32
#undef vstore_lane_8
#undef vstore_lane_16
#undef vstore_lane_32
#undef vreinterpret_8_u8
#undef vreinterpret_16_u16
#undef vreinterpret_32_u32

///////////////////////////////////////////////////////////////////////////////
// 128-bit

// Now we can go ahead and do the 128-bit ones.

// NEON doesn't have a native 64-bit multiply, so we have to build it out of
// 32-bit pieces: splitting each lane as a = a_hi*2^32 + a_lo, the low 64 bits
// of the product are a_lo*b_lo + ((a_lo*b_hi + a_hi*b_lo) << 32).
static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b)
{
	// low 32 bits of each lane; two's-complement multiplication has the same
	// low 64 bits as unsigned, so the unsigned algorithm is reused bit-for-bit
	const uint32x2_t lo_a = vreinterpret_u32_s32(vmovn_s64(a));
	const uint32x2_t lo_b = vreinterpret_u32_s32(vmovn_s64(b));

	// cross products a_lo*b_hi and a_hi*b_lo; vrev64q_u32 swaps the 32-bit
	// halves of each 64-bit lane of b so the right pairs line up
	const uint32x4_t cross = vmulq_u32(vreinterpretq_u32_s64(a), vrev64q_u32(vreinterpretq_u32_s64(b)));

	// sum the cross products, shift them into the high half, then accumulate
	// the full widening product of the low halves
	return vreinterpretq_s64_u64(vmlal_u32(vshlq_n_u64(vpaddlq_u32(cross), 32), lo_a, lo_b));
}

static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b)
{
	const uint32x2_t lo_a = vmovn_u64(a);
	const uint32x2_t lo_b = vmovn_u64(b);

	// [a_lo*b_hi, a_hi*b_lo] within each 64-bit lane
	const uint32x4_t cross = vmulq_u32(vreinterpretq_u32_u64(a), vrev64q_u32(vreinterpretq_u32_u64(b)));

	// (a_lo*b_hi + a_hi*b_lo) << 32, plus the widening product a_lo*b_lo
	return vmlal_u32(vshlq_n_u64(vpaddlq_u32(cross), 32), lo_a, lo_b);
}
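
/* Purely as a reference (hypothetical helper, not part of the library): the
 * same low-64-bit product can be computed per lane in scalar code, which is
 * handy for sanity-checking the lane shuffling above. */
static inline uint64_t vec_scalar_mul_u64_lo(uint64_t a, uint64_t b)
{
	const uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
	const uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;

	// (a_hi*2^32 + a_lo) * (b_hi*2^32 + b_lo) mod 2^64; the a_hi*b_hi term
	// overflows out of the low 64 bits entirely
	return a_lo * b_lo + ((a_lo * b_hi + a_hi * b_lo) << 32);
}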

#define vadd_8 vaddq_s8
#define vadd_16 vaddq_s16
#define vadd_32 vaddq_s32
#define vadd_64 vaddq_s64
#define vadd_u8 vaddq_u8
#define vadd_u16 vaddq_u16
#define vadd_u32 vaddq_u32
#define vadd_u64 vaddq_u64
#define vsub_8 vsubq_s8
#define vsub_16 vsubq_s16
#define vsub_32 vsubq_s32
#define vsub_64 vsubq_s64
#define vsub_u8 vsubq_u8
#define vsub_u16 vsubq_u16
#define vsub_u32 vsubq_u32
#define vsub_u64 vsubq_u64
#define vmul_8 vmulq_s8
#define vmul_16 vmulq_s16
#define vmul_32 vmulq_s32
#define vmul_64 vmulq_s64
#define vmul_u8 vmulq_u8
#define vmul_u16 vmulq_u16
#define vmul_u32 vmulq_u32
#define vmul_u64 vmulq_u64
#define vshl_8 vshlq_s8
#define vshl_16 vshlq_s16
#define vshl_32 vshlq_s32
#define vshl_64 vshlq_s64
#define vshl_u8 vshlq_u8
#define vshl_u16 vshlq_u16
#define vshl_u32 vshlq_u32
#define vshl_u64 vshlq_u64
#define veor_8 veorq_s8
#define veor_16 veorq_s16
#define veor_32 veorq_s32
#define veor_64 veorq_s64
#define veor_u8 veorq_u8
#define veor_u16 veorq_u16
#define veor_u32 veorq_u32
#define veor_u64 veorq_u64
#define vorr_8 vorrq_s8
#define vorr_16 vorrq_s16
#define vorr_32 vorrq_s32
#define vorr_64 vorrq_s64
#define vorr_u8 vorrq_u8
#define vorr_u16 vorrq_u16
#define vorr_u32 vorrq_u32
#define vorr_u64 vorrq_u64
#define vand_8 vandq_s8
#define vand_16 vandq_s16
#define vand_32 vandq_s32
#define vand_64 vandq_s64
#define vand_u8 vandq_u8
#define vand_u16 vandq_u16
#define vand_u32 vandq_u32
#define vand_u64 vandq_u64
#define vld1_8 vld1q_s8
#define vld1_16 vld1q_s16
#define vld1_32 vld1q_s32
#define vld1_64 vld1q_s64
#define vld1_u8 vld1q_u8
#define vld1_u16 vld1q_u16
#define vld1_u32 vld1q_u32
#define vld1_u64 vld1q_u64
#define vget_lane_8 vgetq_lane_s8
#define vget_lane_16 vgetq_lane_s16
#define vget_lane_32 vgetq_lane_s32
#define vget_lane_64 vgetq_lane_s64
#define vget_lane_u8 vgetq_lane_u8
#define vget_lane_u16 vgetq_lane_u16
#define vget_lane_u32 vgetq_lane_u32
#define vget_lane_u64 vgetq_lane_u64
#define vstore_lane_8(sign, vec, out) \
	do { \
		out[0] = vget_lane_##sign##8(vec, 0); \
		out[1] = vget_lane_##sign##8(vec, 1); \
		out[2] = vget_lane_##sign##8(vec, 2); \
		out[3] = vget_lane_##sign##8(vec, 3); \
		out[4] = vget_lane_##sign##8(vec, 4); \
		out[5] = vget_lane_##sign##8(vec, 5); \
		out[6] = vget_lane_##sign##8(vec, 6); \
		out[7] = vget_lane_##sign##8(vec, 7); \
		out[8] = vget_lane_##sign##8(vec, 8); \
		out[9] = vget_lane_##sign##8(vec, 9); \
		out[10] = vget_lane_##sign##8(vec, 10); \
		out[11] = vget_lane_##sign##8(vec, 11); \
		out[12] = vget_lane_##sign##8(vec, 12); \
		out[13] = vget_lane_##sign##8(vec, 13); \
		out[14] = vget_lane_##sign##8(vec, 14); \
		out[15] = vget_lane_##sign##8(vec, 15); \
	} while (0)
#define vstore_lane_16(sign, vec, out) \
	do { \
		out[0] = vget_lane_##sign##16(vec, 0); \
		out[1] = vget_lane_##sign##16(vec, 1); \
		out[2] = vget_lane_##sign##16(vec, 2); \
		out[3] = vget_lane_##sign##16(vec, 3); \
		out[4] = vget_lane_##sign##16(vec, 4); \
		out[5] = vget_lane_##sign##16(vec, 5); \
		out[6] = vget_lane_##sign##16(vec, 6); \
		out[7] = vget_lane_##sign##16(vec, 7); \
	} while (0)
#define vstore_lane_32(sign, vec, out) \
	do { \
		out[0] = vget_lane_##sign##32(vec, 0); \
		out[1] = vget_lane_##sign##32(vec, 1); \
		out[2] = vget_lane_##sign##32(vec, 2); \
		out[3] = vget_lane_##sign##32(vec, 3); \
	} while (0)
#define vstore_lane_64(sign, vec, out) \
	do { \
		out[0] = vget_lane_##sign##64(vec, 0); \
		out[1] = vget_lane_##sign##64(vec, 1); \
	} while (0)
#define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x)
#define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x)
#define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x)
#define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x)

#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vld1_##sign##bits(in); \
		return vec; \
	} \
	\
	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		vstore_lane_##bits(sign, vec.neon, out); \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
		return vec; \
	} \
	\
	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
		/* .splat = */ NULL, \
		v##sign##int##bits##x##size##_neon_load_aligned, \
		v##sign##int##bits##x##size##_neon_load_aligned, \
		v##sign##int##bits##x##size##_neon_store_aligned, \
		v##sign##int##bits##x##size##_neon_store_aligned, \
		v##sign##int##bits##x##size##_neon_add, \
		v##sign##int##bits##x##size##_neon_sub, \
		v##sign##int##bits##x##size##_neon_mul, \
		/* .div = */ NULL, \
		/* .avg = */ NULL, \
		v##sign##int##bits##x##size##_neon_and, \
		v##sign##int##bits##x##size##_neon_or, \
		v##sign##int##bits##x##size##_neon_xor, \
		/* .not = */ NULL, \
		v##sign##int##bits##x##size##_neon_lshift, \
		/* .rshift = */ NULL, \
		/* .lrshift = */ NULL, \
	};

#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)

VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

#undef vadd_8
#undef vadd_16
#undef vadd_32
#undef vadd_64
#undef vsub_8
#undef vsub_16
#undef vsub_32
#undef vsub_64
#undef vmul_8
#undef vmul_16
#undef vmul_32
#undef vmul_64
#undef vshl_8
#undef vshl_16
#undef vshl_32
#undef vshl_64
#undef veor_8
#undef veor_16
#undef veor_32
#undef veor_64
#undef vorr_8
#undef vorr_16
#undef vorr_32
#undef vorr_64
#undef vand_8
#undef vand_16
#undef vand_32
#undef vand_64
#undef vld1_8
#undef vld1_16
#undef vld1_32
#undef vld1_64
#undef vget_lane_8
#undef vget_lane_16
#undef vget_lane_32
#undef vget_lane_64
#undef vstore_lane_8
#undef vstore_lane_16
#undef vstore_lane_32
#undef vstore_lane_64