comparison src/vec.c @ 28:c6c99ab1088a
*: add min/max functions and a big big refactor (again)
agh, this time I added a few more implementations (and generally
made the code just a little faster...)
author | Paper <paper@tflc.us> |
date | Thu, 24 Apr 2025 00:54:02 -0400 |
parents | 92156fe32755 |
children | e59c91d050c0 |
27:d00b95f95dd1 | 28:c6c99ab1088a |
---|---|
30 # include "vec/impl/x86/mmx.h" | 30 # include "vec/impl/x86/mmx.h" |
31 #endif | 31 #endif |
32 #ifdef VEC_COMPILER_HAS_SSE2 | 32 #ifdef VEC_COMPILER_HAS_SSE2 |
33 # include "vec/impl/x86/sse2.h" | 33 # include "vec/impl/x86/sse2.h" |
34 #endif | 34 #endif |
| 35 #ifdef VEC_COMPILER_HAS_SSE3 |
| 36 # include "vec/impl/x86/sse3.h" |
| 37 #endif |
35 #ifdef VEC_COMPILER_HAS_SSE41 | 38 #ifdef VEC_COMPILER_HAS_SSE41 |
36 # include "vec/impl/x86/sse41.h" | 39 # include "vec/impl/x86/sse41.h" |
37 #endif | 40 #endif |
| 41 #ifdef VEC_COMPILER_HAS_SSE42 |
| 42 # include "vec/impl/x86/sse42.h" |
| 43 #endif |
38 #ifdef VEC_COMPILER_HAS_AVX2 | 44 #ifdef VEC_COMPILER_HAS_AVX2 |
39 # include "vec/impl/x86/avx2.h" | 45 # include "vec/impl/x86/avx2.h" |
40 #endif | 46 #endif |
41 #ifdef VEC_COMPILER_HAS_AVX512F | 47 #ifdef VEC_COMPILER_HAS_AVX512F |
42 # include "vec/impl/x86/avx512f.h" | 48 # include "vec/impl/x86/avx512f.h" |
| 49 #endif |
| 50 #ifdef VEC_COMPILER_HAS_AVX512BW |
| 51 # include "vec/impl/x86/avx512bw.h" |
| 52 #endif |
| 53 #ifdef VEC_COMPILER_HAS_AVX512DQ |
| 54 # include "vec/impl/x86/avx512dq.h" |
43 #endif | 55 #endif |
44 #ifdef VEC_COMPILER_HAS_ALTIVEC | 56 #ifdef VEC_COMPILER_HAS_ALTIVEC |
45 # include "vec/impl/ppc/altivec.h" | 57 # include "vec/impl/ppc/altivec.h" |
46 #endif | 58 #endif |
47 #ifdef VEC_COMPILER_HAS_NEON | 59 #ifdef VEC_COMPILER_HAS_NEON |
57 | 69 |
58 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y); | 70 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y); |
59 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y); | 71 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y); |
60 | 72 |
61 // 16-bit | 73 // 16-bit |
62 const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; | 74 vint8x2_impl vint8x2_impl_cpu = {0}; |
63 const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; | 75 vuint8x2_impl vuint8x2_impl_cpu = {0}; |
64 | 76 |
65 // 32-bit | 77 // 32-bit |
66 const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; | 78 vint8x4_impl vint8x4_impl_cpu = {0}; |
67 const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; | 79 vuint8x4_impl vuint8x4_impl_cpu = {0}; |
68 const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; | 80 vint16x2_impl vint16x2_impl_cpu = {0}; |
69 const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; | 81 vuint16x2_impl vuint16x2_impl_cpu = {0}; |
70 | 82 |
71 // 64-bit | 83 // 64-bit |
72 const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; | 84 vint8x8_impl vint8x8_impl_cpu = {0}; |
73 const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; | 85 vuint8x8_impl vuint8x8_impl_cpu = {0}; |
74 const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; | 86 vint16x4_impl vint16x4_impl_cpu = {0}; |
75 const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; | 87 vuint16x4_impl vuint16x4_impl_cpu = {0}; |
76 const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; | 88 vint32x2_impl vint32x2_impl_cpu = {0}; |
77 const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; | 89 vuint32x2_impl vuint32x2_impl_cpu = {0}; |
78 | 90 |
79 // 128-bit | 91 // 128-bit |
80 const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; | 92 vint8x16_impl vint8x16_impl_cpu = {0}; |
81 const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; | 93 vuint8x16_impl vuint8x16_impl_cpu = {0}; |
82 const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; | 94 vint16x8_impl vint16x8_impl_cpu = {0}; |
83 const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; | 95 vuint16x8_impl vuint16x8_impl_cpu = {0}; |
84 const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; | 96 vint32x4_impl vint32x4_impl_cpu = {0}; |
85 const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; | 97 vuint32x4_impl vuint32x4_impl_cpu = {0}; |
86 const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; | 98 vint64x2_impl vint64x2_impl_cpu = {0}; |
87 const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; | 99 vuint64x2_impl vuint64x2_impl_cpu = {0}; |
88 | 100 |
89 // 256-bit | 101 // 256-bit |
90 const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; | 102 vint8x32_impl vint8x32_impl_cpu = {0}; |
91 const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; | 103 vuint8x32_impl vuint8x32_impl_cpu = {0}; |
92 const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; | 104 vint16x16_impl vint16x16_impl_cpu = {0}; |
93 const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; | 105 vuint16x16_impl vuint16x16_impl_cpu = {0}; |
94 const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; | 106 vint32x8_impl vint32x8_impl_cpu = {0}; |
95 const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; | 107 vuint32x8_impl vuint32x8_impl_cpu = {0}; |
96 const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; | 108 vint64x4_impl vint64x4_impl_cpu = {0}; |
97 const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; | 109 vuint64x4_impl vuint64x4_impl_cpu = {0}; |
98 | 110 |
99 // 512-bit | 111 // 512-bit |
100 const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; | 112 vint8x64_impl vint8x64_impl_cpu = {0}; |
101 const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; | 113 vuint8x64_impl vuint8x64_impl_cpu = {0}; |
102 const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; | 114 vint16x32_impl vint16x32_impl_cpu = {0}; |
103 const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; | 115 vuint16x32_impl vuint16x32_impl_cpu = {0}; |
104 const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; | 116 vint32x16_impl vint32x16_impl_cpu = {0}; |
105 const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; | 117 vuint32x16_impl vuint32x16_impl_cpu = {0}; |
106 const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; | 118 vint64x8_impl vint64x8_impl_cpu = {0}; |
107 const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; | 119 vuint64x8_impl vuint64x8_impl_cpu = {0}; |
108 | 120 |
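The right-hand side replaces `const` pointers to whole generic vtables with zero-initialized, mutable vtable structs that vec_init() fills in one function pointer at a time. A minimal sketch of the difference, using a hypothetical two-slot vtable (names here are illustrative, not taken from vec):

    typedef int (*binop_fn)(int, int);
    typedef struct { binop_fn add; binop_fn mul; } impl;

    static int add_generic(int a, int b) { return a + b; }
    static int mul_generic(int a, int b) { return a * b; }
    static int mul_fast(int a, int b)    { return a * b; } /* stand-in for a SIMD version */

    /* old scheme: a const pointer swapped wholesale per backend, e.g.
     *   static const impl *impl_cpu = &impl_generic;
     * new scheme: a zero-initialized struct, filled slot by slot */
    static impl impl_cpu = {0};

    static void init(void)
    {
        /* a faster backend may provide only some slots... */
        if (!impl_cpu.mul) impl_cpu.mul = mul_fast;
        /* ...and the generic backend backfills whatever is still NULL */
        if (!impl_cpu.add) impl_cpu.add = add_generic;
        if (!impl_cpu.mul) impl_cpu.mul = mul_generic;
    }

    int main(void)
    {
        init();
        return (impl_cpu.add(2, 3) == 5 && impl_cpu.mul(2, 3) == 6) ? 0 : 1;
    }

This lets a partial backend (say, one that only accelerates mul) contribute just that slot instead of forcing an all-or-nothing vtable swap.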
109 static int vec_init_spinner = 0; | 121 static int vec_init_spinner = 0; |
| 122 |
| 123 #define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \ |
| 124 do { \ |
| 125 if (!(cpu).func && (impl).func) \ |
| 126 (cpu).func = (impl).func; \ |
| 127 } while (0) |
| 128 |
| 129 #define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \ |
| 130 do { \ |
| 131 FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \ |
| 132 FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \ |
| 133 FILL_GIVEN_FUNC_PTR(cpu, impl, load); \ |
| 134 FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \ |
| 135 FILL_GIVEN_FUNC_PTR(cpu, impl, store); \ |
| 136 FILL_GIVEN_FUNC_PTR(cpu, impl, add); \ |
| 137 FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \ |
| 138 FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \ |
| 139 FILL_GIVEN_FUNC_PTR(cpu, impl, div); \ |
| 140 FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \ |
| 141 FILL_GIVEN_FUNC_PTR(cpu, impl, band); \ |
| 142 FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \ |
| 143 FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \ |
| 144 FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \ |
| 145 FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \ |
| 146 FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \ |
| 147 FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \ |
| 148 FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \ |
| 149 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \ |
| 150 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \ |
| 151 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \ |
| 152 FILL_GIVEN_FUNC_PTR(cpu, impl, min); \ |
| 153 FILL_GIVEN_FUNC_PTR(cpu, impl, max); \ |
| 154 } while (0) |
| 155 |
| 156 #define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \ |
| 157 FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl) |
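For reference, each FILL_GIVEN_FUNC_PTRS invocation below pastes its arguments into type-specific identifiers; FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq), for example, expands per slot to roughly the following (only the mul slot shown):

    /* expansion of FILL_GIVEN_FUNC_PTR(vint64x8_impl_cpu, vint64x8_impl_avx512dq, mul) */
    do {
        if (!(vint64x8_impl_cpu).mul && (vint64x8_impl_avx512dq).mul)
            (vint64x8_impl_cpu).mul = (vint64x8_impl_avx512dq).mul;
    } while (0);

so a slot is copied only if the CPU table hasn't already been filled by a higher-priority backend and the candidate backend actually provides that function.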
110 | 158 |
111 // returns 0 or a negative error code on failure | 159 // returns 0 or a negative error code on failure |
112 int vec_init(void) | 160 int vec_init(void) |
113 { | 161 { |
114 // This function is NOT thread safe. However, once vec | 162 // This function is NOT thread safe. However, once vec |
115 // is initialized, all of the vector functions are thread-safe. | 163 // is initialized, all of the vector functions are thread-safe. |
116 // | |
117 // In fact, it's possible to use vec without calling | |
118 // vec_init() at all, but it would be completely useless since | |
119 // it would just use a generic implementation without any | |
120 // vectorization whatsoever (unless maybe the compiler is | |
121 // smart enough to optimize it into vectors) | |
122 | 164 |
123 if (vec_init_spinner) | 165 if (vec_init_spinner) |
124 return 0; // already initialized, do nothing | 166 return 0; // already initialized, do nothing |
125 | 167 |
126 vec_uint32 cpu = vec_get_CPU_features(); | 168 vec_uint32 cpu = vec_get_CPU_features(); |
127 | 169 |
| 170 /* Okay, this might be a little confusing: |
| 171  * The way we do this is because of x86. For weird reasons, |
| 172  * Intel decided to extend their prior CPU extensions to |
| 173  * where SSE4.1 has some extended features of SSE2, AVX2 |
| 174  * has some extended features that should've been in SSE |
| 175  * in general, etc. |
| 176  * |
| 177  * For this, I've just decided to keep the function |
| 178  * definitions private, and fill in as we go, with newer |
| 179  * intrinsics preferred. Others are arbitrary and are |
| 180  * mutually exclusive (i.e. Altivec vs NEON). This is simply |
| 181  * the easiest way to go about it :) */ |
| 182 |
| 183 /* --- 512-bit */ |
| 184 #ifdef VEC_COMPILER_HAS_AVX512DQ |
| 185 if (cpu & VEC_CPU_HAS_AVX512DQ) { |
| 186 /* these give us native multiply instructions */ |
| 187 FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq); |
| 188 FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq); |
| 189 } |
| 190 #endif |
| 191 #ifdef VEC_COMPILER_HAS_AVX512BW |
| 192 if (cpu & VEC_CPU_HAS_AVX512BW) { |
| 193 FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw); |
| 194 FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw); |
| 195 FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw); |
| 196 FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw); |
| 197 } |
| 198 #endif |
| 199 #ifdef VEC_COMPILER_HAS_AVX512F |
| 200 if (cpu & VEC_CPU_HAS_AVX512F) { |
| 201 FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f); |
| 202 FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f); |
| 203 FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f); |
| 204 FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f); |
| 205 } |
| 206 #endif |
| 207 |
| 208 /* --- 256-bit */ |
| 209 #ifdef VEC_COMPILER_HAS_AVX2 |
| 210 if (cpu & VEC_CPU_HAS_AVX2) { |
| 211 FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2); |
| 212 FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2); |
| 213 FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2); |
| 214 FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2); |
| 215 FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2); |
| 216 FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2); |
| 217 FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2); |
| 218 FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2); |
| 219 } |
| 220 #endif |
| 221 |
| 222 /* --- 128-bit */ |
| 223 #ifdef VEC_COMPILER_HAS_SSE42 |
| 224 if (cpu & VEC_CPU_HAS_SSE42) { |
| 225 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42); |
| 226 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42); |
| 227 } |
| 228 #endif |
| 229 #ifdef VEC_COMPILER_HAS_SSE41 |
| 230 if (cpu & VEC_CPU_HAS_SSE41) { |
| 231 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41); |
| 232 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41); |
| 233 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41); |
| 234 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41); |
| 235 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41); |
| 236 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41); |
| 237 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41); |
| 238 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41); |
| 239 } |
| 240 #endif |
| 241 #ifdef VEC_COMPILER_HAS_SSE3 |
| 242 if (cpu & VEC_CPU_HAS_SSE3) { |
| 243 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3); |
| 244 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3); |
| 245 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3); |
| 246 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3); |
| 247 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3); |
| 248 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3); |
| 249 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3); |
| 250 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3); |
| 251 } |
| 252 #endif |
| 253 #ifdef VEC_COMPILER_HAS_SSE2 |
| 254 if (cpu & VEC_CPU_HAS_SSE2) { |
| 255 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2); |
| 256 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2); |
| 257 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2); |
| 258 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2); |
| 259 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2); |
| 260 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2); |
| 261 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2); |
| 262 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2); |
| 263 } |
| 264 #endif |
| 265 #ifdef VEC_COMPILER_HAS_NEON |
| 266 if (cpu & VEC_CPU_HAS_NEON) { |
| 267 FILL_GIVEN_FUNC_PTRS( , 8, 16, neon); |
| 268 FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon); |
| 269 FILL_GIVEN_FUNC_PTRS( , 16, 8, neon); |
| 270 FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon); |
| 271 FILL_GIVEN_FUNC_PTRS( , 32, 4, neon); |
| 272 FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon); |
| 273 FILL_GIVEN_FUNC_PTRS( , 64, 2, neon); |
| 274 FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon); |
| 275 } |
| 276 #endif |
128 #ifdef VEC_COMPILER_HAS_ALTIVEC | 277 #ifdef VEC_COMPILER_HAS_ALTIVEC |
129 if (cpu & VEC_CPU_HAS_ALTIVEC) { | 278 if (cpu & VEC_CPU_HAS_ALTIVEC) { |
130 vint8x16_impl_cpu = &vint8x16_impl_altivec; | 279 FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec); |
131 vuint8x16_impl_cpu = &vuint8x16_impl_altivec; | 280 FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec); |
132 vint16x8_impl_cpu = &vint16x8_impl_altivec; | 281 FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec); |
133 vuint16x8_impl_cpu = &vuint16x8_impl_altivec; | 282 FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec); |
134 vint32x4_impl_cpu = &vint32x4_impl_altivec; | 283 FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec); |
135 vuint32x4_impl_cpu = &vuint32x4_impl_altivec; | 284 FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec); |
136 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX | 285 } |
137 if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) { | 286 #endif |
138 vint64x2_impl_cpu = &vint64x2_impl_altivec; | 287 |
139 vuint64x2_impl_cpu = &vuint64x2_impl_altivec; | 288 /* --- 64-bit */ |
140 } | |
141 #endif | |
142 } | |
143 #endif | |
144 #ifdef VEC_COMPILER_HAS_AVX512F | |
145 if (cpu & VEC_CPU_HAS_AVX512F) { | |
146 vint8x64_impl_cpu = &vint8x64_impl_avx512f; | |
147 vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; | |
148 vint16x32_impl_cpu = &vint16x32_impl_avx512f; | |
149 vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; | |
150 vint32x16_impl_cpu = &vint32x16_impl_avx512f; | |
151 vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; | |
152 vint64x8_impl_cpu = &vint64x8_impl_avx512f; | |
153 vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; | |
154 } | |
155 #endif | |
156 #ifdef VEC_COMPILER_HAS_AVX2 | |
157 if (cpu & VEC_CPU_HAS_AVX2) { | |
158 vint8x32_impl_cpu = &vint8x32_impl_avx2; | |
159 vuint8x32_impl_cpu = &vuint8x32_impl_avx2; | |
160 vint16x16_impl_cpu = &vint16x16_impl_avx2; | |
161 vuint16x16_impl_cpu = &vuint16x16_impl_avx2; | |
162 vint32x8_impl_cpu = &vint32x8_impl_avx2; | |
163 vuint32x8_impl_cpu = &vuint32x8_impl_avx2; | |
164 vint64x4_impl_cpu = &vint64x4_impl_avx2; | |
165 vuint64x4_impl_cpu = &vuint64x4_impl_avx2; | |
166 } | |
167 #endif | |
168 #ifdef VEC_COMPILER_HAS_SSE2 | |
169 if (cpu & VEC_CPU_HAS_SSE2) { | |
170 vint8x16_impl_cpu = &vint8x16_impl_sse2; | |
171 vuint8x16_impl_cpu = &vuint8x16_impl_sse2; | |
172 vint16x8_impl_cpu = &vint16x8_impl_sse2; | |
173 vuint16x8_impl_cpu = &vuint16x8_impl_sse2; | |
174 # ifdef VEC_COMPILER_HAS_SSE41 | |
175 if (cpu & VEC_CPU_HAS_SSE41) { | |
176 vint32x4_impl_cpu = &vint32x4_impl_sse41; | |
177 vuint32x4_impl_cpu = &vuint32x4_impl_sse41; | |
178 } else | |
179 # endif | |
180 { | |
181 vint32x4_impl_cpu = &vint32x4_impl_sse2; | |
182 vuint32x4_impl_cpu = &vuint32x4_impl_sse2; | |
183 } | |
184 vint64x2_impl_cpu = &vint64x2_impl_sse2; | |
185 vuint64x2_impl_cpu = &vuint64x2_impl_sse2; | |
186 } | |
187 #endif | |
188 #ifdef VEC_COMPILER_HAS_MMX | 289 #ifdef VEC_COMPILER_HAS_MMX |
189 if (cpu & VEC_CPU_HAS_MMX) { | 290 if (cpu & VEC_CPU_HAS_MMX) { |
190 vint8x8_impl_cpu = &vint8x8_impl_mmx; | 291 FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx); |
191 vuint8x8_impl_cpu = &vuint8x8_impl_mmx; | 292 FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx); |
192 vint16x4_impl_cpu = &vint16x4_impl_mmx; | 293 FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx); |
193 vuint16x4_impl_cpu = &vuint16x4_impl_mmx; | 294 FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx); |
194 vint32x2_impl_cpu = &vint32x2_impl_mmx; | 295 FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx); |
195 vuint32x2_impl_cpu = &vuint32x2_impl_mmx; | 296 FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx); |
196 } | 297 } |
197 #endif | 298 #endif |
198 #ifdef VEC_COMPILER_HAS_NEON | 299 #ifdef VEC_COMPILER_HAS_NEON |
199 if (cpu & VEC_CPU_HAS_NEON) { | 300 if (cpu & VEC_CPU_HAS_NEON) { |
200 // 64-bit | 301 FILL_GIVEN_FUNC_PTRS( , 8, 8, neon); |
201 vint8x8_impl_cpu = &vint8x8_impl_neon; | 302 FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon); |
202 vuint8x8_impl_cpu = &vuint8x8_impl_neon; | 303 FILL_GIVEN_FUNC_PTRS( , 16, 4, neon); |
203 vint16x4_impl_cpu = &vint16x4_impl_neon; | 304 FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon); |
204 vuint16x4_impl_cpu = &vuint16x4_impl_neon; | 305 FILL_GIVEN_FUNC_PTRS( , 32, 2, neon); |
205 vint32x2_impl_cpu = &vint32x2_impl_neon; | 306 FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon); |
206 vuint32x2_impl_cpu = &vuint32x2_impl_neon; | 307 } |
207 | 308 #endif |
208 // 128-bit | 309 |
209 vint8x16_impl_cpu = &vint8x16_impl_neon; | 310 /* fill any remaining function pointers with generics */ |
210 vuint8x16_impl_cpu = &vuint8x16_impl_neon; | 311 FILL_GIVEN_FUNC_PTRS( , 8, 64, generic); |
211 vint16x8_impl_cpu = &vint16x8_impl_neon; | 312 FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic); |
212 vuint16x8_impl_cpu = &vuint16x8_impl_neon; | 313 FILL_GIVEN_FUNC_PTRS( , 16, 32, generic); |
213 vint32x4_impl_cpu = &vint32x4_impl_neon; | 314 FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic); |
214 vuint32x4_impl_cpu = &vuint32x4_impl_neon; | 315 FILL_GIVEN_FUNC_PTRS( , 32, 16, generic); |
215 vint64x2_impl_cpu = &vint64x2_impl_neon; | 316 FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic); |
216 vuint64x2_impl_cpu = &vuint64x2_impl_neon; | 317 FILL_GIVEN_FUNC_PTRS( , 64, 8, generic); |
217 } | 318 FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic); |
218 #endif | 319 |
219 { | 320 FILL_GIVEN_FUNC_PTRS( , 8, 32, generic); |
220 // do nothing, they're already set to generics | 321 FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic); |
221 } | 322 FILL_GIVEN_FUNC_PTRS( , 16, 16, generic); |
| 323 FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic); |
| 324 FILL_GIVEN_FUNC_PTRS( , 32, 8, generic); |
| 325 FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic); |
| 326 FILL_GIVEN_FUNC_PTRS( , 64, 4, generic); |
| 327 FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic); |
| 328 |
| 329 FILL_GIVEN_FUNC_PTRS( , 8, 16, generic); |
| 330 FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic); |
| 331 FILL_GIVEN_FUNC_PTRS( , 16, 8, generic); |
| 332 FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic); |
| 333 FILL_GIVEN_FUNC_PTRS( , 32, 4, generic); |
| 334 FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic); |
| 335 FILL_GIVEN_FUNC_PTRS( , 64, 2, generic); |
| 336 FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic); |
| 337 |
| 338 FILL_GIVEN_FUNC_PTRS( , 8, 8, generic); |
| 339 FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic); |
| 340 FILL_GIVEN_FUNC_PTRS( , 16, 4, generic); |
| 341 FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic); |
| 342 FILL_GIVEN_FUNC_PTRS( , 32, 2, generic); |
| 343 FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic); |
| 344 |
| 345 FILL_GIVEN_FUNC_PTRS( , 8, 4, generic); |
| 346 FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic); |
| 347 FILL_GIVEN_FUNC_PTRS( , 16, 2, generic); |
| 348 FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic); |
| 349 |
| 350 FILL_GIVEN_FUNC_PTRS( , 8, 2, generic); |
| 351 FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic); |
222 | 352 |
223 vec_init_spinner++; | 353 vec_init_spinner++; |
224 | 354 |
225 return 0; | 355 return 0; |
226 } | 356 } |
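One consequence of the zero-initialized vtables: the deleted comment above noted that vec used to work without vec_init() by falling back to generics, but after this change the function pointers stay NULL until vec_init() fills them, so calling it first is now mandatory. A hypothetical usage sketch; the header path and any names or signatures not shown in this diff (splat, store, vec_int32) are assumptions inferred from the function-pointer slot names:

    #include "vec/vec.h" /* assumed public header */

    int main(void)
    {
        /* not thread-safe; call once before any vector ops */
        if (vec_init() < 0)
            return 1;

        vint32x4 a = vint32x4_splat(2);  /* assumed: broadcast a scalar */
        vint32x4 b = vint32x4_splat(3);
        vint32x4 c = vint32x4_add(a, b); /* declared via VEC_DEFINE_OPERATIONS below */

        vec_int32 out[4];
        vint32x4_store(c, out);          /* assumed: write lanes to an array */
        return out[0] == 5 ? 0 : 1;
    }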
239 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 369 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
240 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 370 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
241 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 371 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
242 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 372 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
243 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 373 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
244 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \ | |
245 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 374 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
246 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 375 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
247 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 376 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
248 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 377 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
249 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 378 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
250 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | 379 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
251 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | 380 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
252 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); | 381 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
| 382 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 383 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); |
253 | 384 |
254 #define VEC_DEFINE_OPERATIONS(bits, size) \ | 385 #define VEC_DEFINE_OPERATIONS(bits, size) \ |
255 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ | 386 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ |
256 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) | 387 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) |
257 | 388 |
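The extern inline declarations emitted by VEC_DEFINE_OPERATIONS follow the C99 inline model: the header can define each operation as plain inline, and exactly one translation unit (here vec.c) re-declares it extern inline to force an out-of-line external definition for call sites where the compiler chooses not to inline. A minimal standalone illustration of the pattern (names hypothetical):

    /* in a header, e.g. vec_twice.h: inline definition, no external symbol emitted */
    inline int vec_twice(int x)
    {
        return x * 2;
    }

    /* in exactly one .c file: this declaration forces the external definition */
    extern inline int vec_twice(int x);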