comparison src/vec.c @ 28:c6c99ab1088a

*: add min/max functions and a big big refactor (again) agh, this time I added a few more implementations (and generally made the code just a little faster...)
author Paper <paper@tflc.us>
date Thu, 24 Apr 2025 00:54:02 -0400
parents 92156fe32755
children e59c91d050c0
diff -r d00b95f95dd1 -r c6c99ab1088a src/vec.c
--- a/src/vec.c
+++ b/src/vec.c
@@ -30,18 +30,30 @@
 # include "vec/impl/x86/mmx.h"
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 # include "vec/impl/x86/sse2.h"
 #endif
+#ifdef VEC_COMPILER_HAS_SSE3
+# include "vec/impl/x86/sse3.h"
+#endif
 #ifdef VEC_COMPILER_HAS_SSE41
 # include "vec/impl/x86/sse41.h"
 #endif
+#ifdef VEC_COMPILER_HAS_SSE42
+# include "vec/impl/x86/sse42.h"
+#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 # include "vec/impl/x86/avx2.h"
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "vec/impl/x86/avx512f.h"
 #endif
+#ifdef VEC_COMPILER_HAS_AVX512BW
+# include "vec/impl/x86/avx512bw.h"
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512DQ
+# include "vec/impl/x86/avx512dq.h"
+#endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 # include "vec/impl/ppc/altivec.h"
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
@@ -57,170 +69,288 @@
 
 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
 
 // 16-bit
-const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
-const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
+vint8x2_impl vint8x2_impl_cpu = {0};
+vuint8x2_impl vuint8x2_impl_cpu = {0};
 
 // 32-bit
-const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
-const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
-const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
-const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
+vint8x4_impl vint8x4_impl_cpu = {0};
+vuint8x4_impl vuint8x4_impl_cpu = {0};
+vint16x2_impl vint16x2_impl_cpu = {0};
+vuint16x2_impl vuint16x2_impl_cpu = {0};
 
 // 64-bit
-const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
-const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
-const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
-const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
-const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
-const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
+vint8x8_impl vint8x8_impl_cpu = {0};
+vuint8x8_impl vuint8x8_impl_cpu = {0};
+vint16x4_impl vint16x4_impl_cpu = {0};
+vuint16x4_impl vuint16x4_impl_cpu = {0};
+vint32x2_impl vint32x2_impl_cpu = {0};
+vuint32x2_impl vuint32x2_impl_cpu = {0};
 
 // 128-bit
-const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
-const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
-const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
-const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
-const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
-const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
-const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
-const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
+vint8x16_impl vint8x16_impl_cpu = {0};
+vuint8x16_impl vuint8x16_impl_cpu = {0};
+vint16x8_impl vint16x8_impl_cpu = {0};
+vuint16x8_impl vuint16x8_impl_cpu = {0};
+vint32x4_impl vint32x4_impl_cpu = {0};
+vuint32x4_impl vuint32x4_impl_cpu = {0};
+vint64x2_impl vint64x2_impl_cpu = {0};
+vuint64x2_impl vuint64x2_impl_cpu = {0};
 
 // 256-bit
-const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
-const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
-const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
-const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
-const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
-const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
-const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
-const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
+vint8x32_impl vint8x32_impl_cpu = {0};
+vuint8x32_impl vuint8x32_impl_cpu = {0};
+vint16x16_impl vint16x16_impl_cpu = {0};
+vuint16x16_impl vuint16x16_impl_cpu = {0};
+vint32x8_impl vint32x8_impl_cpu = {0};
+vuint32x8_impl vuint32x8_impl_cpu = {0};
+vint64x4_impl vint64x4_impl_cpu = {0};
+vuint64x4_impl vuint64x4_impl_cpu = {0};
 
 // 512-bit
-const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
-const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
-const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
-const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
-const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
-const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
-const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
-const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
+vint8x64_impl vint8x64_impl_cpu = {0};
+vuint8x64_impl vuint8x64_impl_cpu = {0};
+vint16x32_impl vint16x32_impl_cpu = {0};
+vuint16x32_impl vuint16x32_impl_cpu = {0};
+vint32x16_impl vint32x16_impl_cpu = {0};
+vuint32x16_impl vuint32x16_impl_cpu = {0};
+vint64x8_impl vint64x8_impl_cpu = {0};
+vuint64x8_impl vuint64x8_impl_cpu = {0};
 
 static int vec_init_spinner = 0;
+
+#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
+	do { \
+		if (!(cpu).func && (impl).func) \
+			(cpu).func = (impl).func; \
+	} while (0)
+
+#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
+	do { \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
+	} while (0)
+
+#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
+	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
 
 // returns 0 or a negative error code on failure
 int vec_init(void)
 {
 	// This function is NOT thread safe. However, once vec
 	// is initialized, all of the vector functions are thread-safe.
-	//
-	// In fact, it's possible to use vec without calling
-	// vec_init() at all, but it would be completely useless since
-	// it would just use a generic implementation without any
-	// vectorization whatsoever (unless maybe the compiler is
-	// smart enough to optimize it into vectors)
 
 	if (vec_init_spinner)
 		return 0; // already initialized, do nothing
 
 	vec_uint32 cpu = vec_get_CPU_features();
 
+	/* Okay, this might be a little confusing:
+	 * The way we do this is because of x86. For weird reasons,
+	 * Intel decided to extend their prior CPU extensions to
+	 * where SSE4.1 has some extended features of SSE2, AVX2
+	 * has some extended features that should've been in SSE
+	 * in general, etc.
+	 *
+	 * For this, I've just decided to keep the function
+	 * definitions private, and fill in as we go, with newer
+	 * intrinsics preferred. Others are arbitrary and are
+	 * mutually exclusive (i.e. Altivec vs NEON). This is simply
+	 * the easiest way to go about it :) */
+
+	/* --- 512-bit */
+#ifdef VEC_COMPILER_HAS_AVX512DQ
+	if (cpu & VEC_CPU_HAS_AVX512DQ) {
+		/* these give us native multiply instructions */
+		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512BW
+	if (cpu & VEC_CPU_HAS_AVX512BW) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw);
+		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512F
+	if (cpu & VEC_CPU_HAS_AVX512F) {
+		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
+		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f);
+	}
+#endif
+
+	/* --- 256-bit */
+#ifdef VEC_COMPILER_HAS_AVX2
+	if (cpu & VEC_CPU_HAS_AVX2) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2);
+		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
+		FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2);
+		FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2);
+	}
+#endif
+
+	/* --- 128-bit */
+#ifdef VEC_COMPILER_HAS_SSE42
+	if (cpu & VEC_CPU_HAS_SSE42) {
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_SSE41
+	if (cpu & VEC_CPU_HAS_SSE41) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_SSE3
+	if (cpu & VEC_CPU_HAS_SSE3) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_SSE2
+	if (cpu & VEC_CPU_HAS_SSE2) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_NEON
+	if (cpu & VEC_CPU_HAS_NEON) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
+	}
+#endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		vint8x16_impl_cpu = &vint8x16_impl_altivec;
-		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
-		vint16x8_impl_cpu = &vint16x8_impl_altivec;
-		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
-		vint32x4_impl_cpu = &vint32x4_impl_altivec;
-		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
-			vint64x2_impl_cpu = &vint64x2_impl_altivec;
-			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
-		}
-#endif
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512F
-	if (cpu & VEC_CPU_HAS_AVX512F) {
-		vint8x64_impl_cpu = &vint8x64_impl_avx512f;
-		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
-		vint16x32_impl_cpu = &vint16x32_impl_avx512f;
-		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
-		vint32x16_impl_cpu = &vint32x16_impl_avx512f;
-		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
-		vint64x8_impl_cpu = &vint64x8_impl_avx512f;
-		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX2
-	if (cpu & VEC_CPU_HAS_AVX2) {
-		vint8x32_impl_cpu = &vint8x32_impl_avx2;
-		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
-		vint16x16_impl_cpu = &vint16x16_impl_avx2;
-		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
-		vint32x8_impl_cpu = &vint32x8_impl_avx2;
-		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
-		vint64x4_impl_cpu = &vint64x4_impl_avx2;
-		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (cpu & VEC_CPU_HAS_SSE2) {
-		vint8x16_impl_cpu = &vint8x16_impl_sse2;
-		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
-		vint16x8_impl_cpu = &vint16x8_impl_sse2;
-		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
-# ifdef VEC_COMPILER_HAS_SSE41
-		if (cpu & VEC_CPU_HAS_SSE41) {
-			vint32x4_impl_cpu = &vint32x4_impl_sse41;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
-		} else
-# endif
-		{
-			vint32x4_impl_cpu = &vint32x4_impl_sse2;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
-		}
-		vint64x2_impl_cpu = &vint64x2_impl_sse2;
-		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
-	}
-#endif
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
+	}
+#endif
+
+	/* --- 64-bit */
 #ifdef VEC_COMPILER_HAS_MMX
 	if (cpu & VEC_CPU_HAS_MMX) {
-		vint8x8_impl_cpu = &vint8x8_impl_mmx;
-		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
-		vint16x4_impl_cpu = &vint16x4_impl_mmx;
-		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
-		vint32x2_impl_cpu = &vint32x2_impl_mmx;
-		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
+		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
+		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
+		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
 	if (cpu & VEC_CPU_HAS_NEON) {
-		// 64-bit
-		vint8x8_impl_cpu = &vint8x8_impl_neon;
-		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
-		vint16x4_impl_cpu = &vint16x4_impl_neon;
-		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
-		vint32x2_impl_cpu = &vint32x2_impl_neon;
-		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
-
-		// 128-bit
-		vint8x16_impl_cpu = &vint8x16_impl_neon;
-		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
-		vint16x8_impl_cpu = &vint16x8_impl_neon;
-		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
-		vint32x4_impl_cpu = &vint32x4_impl_neon;
-		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
-		vint64x2_impl_cpu = &vint64x2_impl_neon;
-		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
-	}
-#endif
-	{
-		// do nothing, they're already set to generics
-	}
+		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
+		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
+		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
+	}
+#endif
+
+	/* fill any remaining function pointers with generics */
+	FILL_GIVEN_FUNC_PTRS( , 8, 64, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
+	FILL_GIVEN_FUNC_PTRS( , 64, 8, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 32, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 8, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic);
+	FILL_GIVEN_FUNC_PTRS( , 64, 4, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
+	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);
 
 	vec_init_spinner++;
 
 	return 0;
 }
@@ -239,19 +369,20 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
 
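To make the new dispatch scheme concrete: FILL_GIVEN_FUNC_PTRS pastes the type name and implementation name together, then copies, member by member, any function pointer the CPU table is still missing. Roughly, a single invocation such as FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2) expands (one member shown) to:

    /* FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2)
     * -> FILL_GIVEN_FUNC_PTRS_EX(vuint32x4_impl_cpu, vuint32x4_impl_sse2)
     * which, for the `add` member alone, becomes: */
    do {
        if (!(vuint32x4_impl_cpu).add && (vuint32x4_impl_sse2).add)
            (vuint32x4_impl_cpu).add = (vuint32x4_impl_sse2).add;
    } while (0);

Because a pointer is only written when it is still NULL, the call order in vec_init doubles as a priority order: newer extensions run first and claim the operations they implement natively, and the trailing generic pass fills in whatever is left, so every entry in every table ends up non-NULL.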
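Since vec_init() returns 0 or a negative error code and must run before the dispatch tables are useful, typical usage is a single call at startup. A minimal sketch; the header path, the vint32x4_splat/vint32x4_store wrappers, and the vec_int32 element type are assumptions based on the impl struct members and the vec_uint32 type above, not something this hunk shows:

    #include "vec/vec.h"  /* assumed public header */

    int main(void)
    {
        if (vec_init() < 0)
            return 1; /* CPU feature detection failed */

        /* lane-wise min of two 4-lane 32-bit vectors; vint32x4_min is
         * one of the wrappers declared by VEC_DEFINE_OPERATIONS_SIGN */
        vint32x4 a = vint32x4_splat(3);  /* assumed wrapper */
        vint32x4 b = vint32x4_splat(7);  /* assumed wrapper */
        vint32x4 m = vint32x4_min(a, b); /* every lane == 3 */

        vec_int32 out[4];
        vint32x4_store(m, out);          /* assumed wrapper */
        return 0;
    }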
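The comment kept in vec_init is explicit that initialization itself is not thread-safe; only the state after it is. One way a multithreaded caller might serialize the call (a sketch; vec itself provides no such wrapper):

    #include <pthread.h>

    static pthread_once_t vec_once = PTHREAD_ONCE_INIT;

    static void vec_do_init(void)
    {
        (void)vec_init(); /* pthread_once cannot surface the error code */
    }

    /* call this from any thread instead of vec_init() directly */
    static void vec_init_once(void)
    {
        pthread_once(&vec_once, vec_do_init);
    }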