comparison src/vec.c @ 31:bf6ad516f1e6

Backed out changeset c6c99ab1088a
author Paper <paper@tflc.us>
date Fri, 25 Apr 2025 17:40:33 -0400
parents 641d8c79b1da
children 8b5e0974fd41
--- src/vec.c@30:641d8c79b1da
+++ src/vec.c@31:bf6ad516f1e6
@@ -30,30 +30,18 @@
 # include "vec/impl/x86/mmx.h"
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 # include "vec/impl/x86/sse2.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE3
-# include "vec/impl/x86/sse3.h"
-#endif
 #ifdef VEC_COMPILER_HAS_SSE41
 # include "vec/impl/x86/sse41.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE42
-# include "vec/impl/x86/sse42.h"
-#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 # include "vec/impl/x86/avx2.h"
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "vec/impl/x86/avx512f.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-# include "vec/impl/x86/avx512bw.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-# include "vec/impl/x86/avx512dq.h"
 #endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 # include "vec/impl/ppc/altivec.h"
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
@@ -69,288 +57,170 @@
 
 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
 
 // 16-bit
-vint8x2_impl vint8x2_impl_cpu = {0};
-vuint8x2_impl vuint8x2_impl_cpu = {0};
+const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
+const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
 
 // 32-bit
-vint8x4_impl vint8x4_impl_cpu = {0};
-vuint8x4_impl vuint8x4_impl_cpu = {0};
-vint16x2_impl vint16x2_impl_cpu = {0};
-vuint16x2_impl vuint16x2_impl_cpu = {0};
+const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
+const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
+const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
+const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
 
 // 64-bit
-vint8x8_impl vint8x8_impl_cpu = {0};
-vuint8x8_impl vuint8x8_impl_cpu = {0};
-vint16x4_impl vint16x4_impl_cpu = {0};
-vuint16x4_impl vuint16x4_impl_cpu = {0};
-vint32x2_impl vint32x2_impl_cpu = {0};
-vuint32x2_impl vuint32x2_impl_cpu = {0};
+const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
+const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
+const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
+const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
+const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
+const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
 
 // 128-bit
-vint8x16_impl vint8x16_impl_cpu = {0};
-vuint8x16_impl vuint8x16_impl_cpu = {0};
-vint16x8_impl vint16x8_impl_cpu = {0};
-vuint16x8_impl vuint16x8_impl_cpu = {0};
-vint32x4_impl vint32x4_impl_cpu = {0};
-vuint32x4_impl vuint32x4_impl_cpu = {0};
-vint64x2_impl vint64x2_impl_cpu = {0};
-vuint64x2_impl vuint64x2_impl_cpu = {0};
+const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
+const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
+const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
+const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
+const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
+const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
+const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
+const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
 
 // 256-bit
-vint8x32_impl vint8x32_impl_cpu = {0};
-vuint8x32_impl vuint8x32_impl_cpu = {0};
-vint16x16_impl vint16x16_impl_cpu = {0};
-vuint16x16_impl vuint16x16_impl_cpu = {0};
-vint32x8_impl vint32x8_impl_cpu = {0};
-vuint32x8_impl vuint32x8_impl_cpu = {0};
-vint64x4_impl vint64x4_impl_cpu = {0};
-vuint64x4_impl vuint64x4_impl_cpu = {0};
+const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
+const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
+const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
+const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
+const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
+const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
+const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
+const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
 
 // 512-bit
-vint8x64_impl vint8x64_impl_cpu = {0};
-vuint8x64_impl vuint8x64_impl_cpu = {0};
-vint16x32_impl vint16x32_impl_cpu = {0};
-vuint16x32_impl vuint16x32_impl_cpu = {0};
-vint32x16_impl vint32x16_impl_cpu = {0};
-vuint32x16_impl vuint32x16_impl_cpu = {0};
-vint64x8_impl vint64x8_impl_cpu = {0};
-vuint64x8_impl vuint64x8_impl_cpu = {0};
+const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
+const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
+const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
+const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
+const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
+const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
+const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
+const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
 
 static int vec_init_spinner = 0;
-
-#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
-	do { \
-		if (!(cpu).func && (impl).func) \
-			(cpu).func = (impl).func; \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
-	do { \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
-	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
 
 // returns 0 or a negative error code on failure
 int vec_init(void)
 {
 	// This function is NOT thread safe. However, once vec
 	// is initialized, all of the vector functions are thread-safe.
+	//
+	// In fact, it's possible to use vec without calling
+	// vec_init() at all, but it would be completely useless since
+	// it would just use a generic implementation without any
+	// vectorization whatsoever (unless maybe the compiler is
+	// smart enough to optimize it into vectors)
 
 	if (vec_init_spinner)
 		return 0; // already initialized, do nothing
 
 	vec_uint32 cpu = vec_get_CPU_features();
 
-	/* Okay, this might be a little confusing:
-	 * The way we do this is because of x86. For weird reasons,
-	 * Intel decided to extend their prior CPU extensions to
-	 * where SSE4.1 has some extended features of SSE2, AVX2
-	 * has some extended features that should've been in SSE
-	 * in general, etc.
-	 *
-	 * For this, I've just decided to keep the function
-	 * definitions private, and fill in as we go, with newer
-	 * intrinsics preferred. Others are arbitrary and are
-	 * mutually exclusive (i.e. Altivec vs NEON). This is simply
-	 * the easiest way to go about it :) */
-
-	/* --- 512-bit */
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-	if (cpu & VEC_CPU_HAS_AVX512DQ) {
-		/* these give us native multiply instructions */
-		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-	if (cpu & VEC_CPU_HAS_AVX512BW) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+	if (cpu & VEC_CPU_HAS_ALTIVEC) {
+		vint8x16_impl_cpu = &vint8x16_impl_altivec;
+		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
+		vint16x8_impl_cpu = &vint16x8_impl_altivec;
+		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
+		vint32x4_impl_cpu = &vint32x4_impl_altivec;
+		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
+#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
+		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
+			vint64x2_impl_cpu = &vint64x2_impl_altivec;
+			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
+		}
+#endif
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 	if (cpu & VEC_CPU_HAS_AVX512F) {
-		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f);
-	}
-#endif
-
-	/* --- 256-bit */
+		vint8x64_impl_cpu = &vint8x64_impl_avx512f;
+		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
+		vint16x32_impl_cpu = &vint16x32_impl_avx512f;
+		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
+		vint32x16_impl_cpu = &vint32x16_impl_avx512f;
+		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
+		vint64x8_impl_cpu = &vint64x8_impl_avx512f;
+		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
+	}
+#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 	if (cpu & VEC_CPU_HAS_AVX2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2);
-	}
-#endif
-
-	/* --- 128-bit */
-#ifdef VEC_COMPILER_HAS_SSE42
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE41
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE3
-	if (cpu & VEC_CPU_HAS_SSE3) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
+		vint8x32_impl_cpu = &vint8x32_impl_avx2;
+		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
+		vint16x16_impl_cpu = &vint16x16_impl_avx2;
+		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
+		vint32x8_impl_cpu = &vint32x8_impl_avx2;
+		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
+		vint64x4_impl_cpu = &vint64x4_impl_avx2;
+		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 	if (cpu & VEC_CPU_HAS_SSE2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
+		vint8x16_impl_cpu = &vint8x16_impl_sse2;
+		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
+		vint16x8_impl_cpu = &vint16x8_impl_sse2;
+		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
+# ifdef VEC_COMPILER_HAS_SSE41
+		if (cpu & VEC_CPU_HAS_SSE41) {
+			vint32x4_impl_cpu = &vint32x4_impl_sse41;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
+		} else
+# endif
+		{
+			vint32x4_impl_cpu = &vint32x4_impl_sse2;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
+		}
+		vint64x2_impl_cpu = &vint64x2_impl_sse2;
+		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_MMX
+	if (cpu & VEC_CPU_HAS_MMX) {
+		vint8x8_impl_cpu = &vint8x8_impl_mmx;
+		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
+		vint16x4_impl_cpu = &vint16x4_impl_mmx;
+		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
+		vint32x2_impl_cpu = &vint32x2_impl_mmx;
+		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
 	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
+		// 64-bit
+		vint8x8_impl_cpu = &vint8x8_impl_neon;
+		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
+		vint16x4_impl_cpu = &vint16x4_impl_neon;
+		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
+		vint32x2_impl_cpu = &vint32x2_impl_neon;
+		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
+
+		// 128-bit
+		vint8x16_impl_cpu = &vint8x16_impl_neon;
+		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
+		vint16x8_impl_cpu = &vint16x8_impl_neon;
+		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
+		vint32x4_impl_cpu = &vint32x4_impl_neon;
+		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
+		vint64x2_impl_cpu = &vint64x2_impl_neon;
+		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
 	}
 #endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
-	}
-#endif
-
-	/* --- 64-bit */
-#ifdef VEC_COMPILER_HAS_MMX
-	if (cpu & VEC_CPU_HAS_MMX) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
-	}
-#endif
-
-	/* fill any remaining function pointers with generics */
-	FILL_GIVEN_FUNC_PTRS( , 8, 64, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 32, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);
+	{
+		// do nothing, they're already set to generics
+	}
 
 	vec_init_spinner++;
 
 	return 0;
 }
@@ -369,20 +239,19 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
 
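What the backout changes, in short: instead of building each v*_impl_cpu struct field-by-field out of whichever ISA backends are present, vec keeps one complete implementation table per backend and points a single const pointer at the best one. A minimal sketch of the resulting dispatch model, with stand-in types so it compiles on its own; the real vint8x16 and vint8x16_impl definitions live in vec's headers, which this diff does not show:

/* Stand-in vector type; vec's real vint8x16 is defined in its headers. */
typedef struct { signed char e[16]; } vint8x16;

/* Field list inferred from the removed FILL_GIVEN_FUNC_PTRS_EX macro;
 * the actual struct has one pointer per operation (splat, loads/stores,
 * mul, div, avg, shifts, comparisons, min/max, ...). */
typedef struct {
	vint8x16 (*add)(vint8x16 a, vint8x16 b);
	vint8x16 (*sub)(vint8x16 a, vint8x16 b);
} vint8x16_impl;

static vint8x16 add_generic(vint8x16 a, vint8x16 b)
{
	vint8x16 r;
	for (int i = 0; i < 16; i++)
		r.e[i] = (signed char)(a.e[i] + b.e[i]);
	return r;
}

static vint8x16 sub_generic(vint8x16 a, vint8x16 b)
{
	vint8x16 r;
	for (int i = 0; i < 16; i++)
		r.e[i] = (signed char)(a.e[i] - b.e[i]);
	return r;
}

static const vint8x16_impl vint8x16_impl_generic = { add_generic, sub_generic };

/* Pre-pointing at the generic table is what makes vec usable even when
 * vec_init() is never called: */
static const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;

/* A public operation dispatches through whichever table vec_init() picked: */
vint8x16 vint8x16_add(vint8x16 a, vint8x16 b)
{
	return vint8x16_impl_cpu->add(a, b);
}

The trade-off against the removed scheme: each backend must now supply a full table (falling back internally where its ISA lacks an instruction), but selection becomes a single pointer assignment, the tables can live in const storage, and calling a vec function before vec_init() degrades to the generic path instead of hitting a NULL pointer.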
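For contrast, the removed FILL_GIVEN_FUNC_PTR cascade filled empty slots newest-ISA-first, so each operation kept the newest backend that provided it; per the old comment, for example, only the 64x8 multiply came from AVX-512DQ ("these give us native multiply instructions") while everything else fell through to AVX-512F or the generics. A self-contained demonstration of that precedence rule, using the macro verbatim from the removed lines but stand-in backends rather than vec's real ones:

#include <stdio.h>
#include <stddef.h>

typedef struct {
	void (*add)(void);
	void (*min)(void);
} vimpl;

/* Verbatim from the removed code: copy impl's pointer only into slots
 * that are still empty. */
#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
	do { \
		if (!(cpu).func && (impl).func) \
			(cpu).func = (impl).func; \
	} while (0)

static void add_avx512bw(void) { puts("add: avx512bw"); }
static void add_generic(void)  { puts("add: generic"); }
static void min_generic(void)  { puts("min: generic"); }

int main(void)
{
	vimpl cpu = {0};                          /* all slots empty, like the old *_impl_cpu */
	vimpl avx512bw = { add_avx512bw, NULL };  /* a backend without a native min */
	vimpl generic  = { add_generic, min_generic };

	/* Fill newest-first: each slot keeps the first provider it sees. */
	FILL_GIVEN_FUNC_PTR(cpu, avx512bw, add);
	FILL_GIVEN_FUNC_PTR(cpu, avx512bw, min);
	FILL_GIVEN_FUNC_PTR(cpu, generic, add);
	FILL_GIVEN_FUNC_PTR(cpu, generic, min);

	cpu.add(); /* prints "add: avx512bw" */
	cpu.min(); /* prints "min: generic" */
	return 0;
}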
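Caller-side view of the vec_init() contract described in its comments (not thread-safe, optional, a no-op once vec_init_spinner is set, 0 or a negative error code returned). A hypothetical usage sketch; the header path, the element-type name, and the splat/store signatures are assumptions, since only src/vec.c is visible in this diff:

#include "vec/vec.h" /* assumed public header path */

int main(void)
{
	/* Not thread-safe: call once, before any worker threads touch vec.
	 * Skipping it is legal but leaves the generic scalar paths selected. */
	if (vec_init() < 0)
		return 1;

	vint32x4 a = vint32x4_splat(40); /* assumed signature */
	vint32x4 b = vint32x4_splat(2);
	vint32x4 c = vint32x4_add(a, b); /* dispatches through vint32x4_impl_cpu */

	vec_int32 out[4];       /* assumed element-type name */
	vint32x4_store(c, out); /* assumed argument order */
	return (out[0] == 42) ? 0 : 1;
}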