comparison src/vec.c @ 31:bf6ad516f1e6
Backed out changeset c6c99ab1088a
author      Paper <paper@tflc.us>
date        Fri, 25 Apr 2025 17:40:33 -0400
parents     641d8c79b1da
children    8b5e0974fd41
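In short: changeset c6c99ab1088a had switched src/vec.c to a mutable per-function dispatch table, zero-initialized and filled in one function pointer at a time (FILL_GIVEN_FUNC_PTR below), so newer x86 extensions could override individual operations. This backout restores the earlier scheme: one const pointer per vector type, initialized to the generic implementation table and swapped wholesale in vec_init() when a better ISA is detected. The following is a compilable toy version of the restored scheme; the names vec4 and vec4_impl are hypothetical stand-ins, not the library's real types:

#include <stdio.h>

/* Hypothetical stand-ins: "vec4" plays the role of vint32x4, and the
 * impl struct holds a single op instead of the library's full table. */
typedef struct { int e[4]; } vec4;
typedef struct { vec4 (*add)(vec4, vec4); } vec4_impl;

static vec4 add_generic(vec4 a, vec4 b)
{
	for (int i = 0; i < 4; i++)
		a.e[i] += b.e[i];
	return a;
}

static const vec4_impl vec4_impl_generic = { add_generic };

/* Restored scheme: a const pointer that starts at the generic table and
 * would be repointed wholesale in vec_init() when a better ISA exists. */
static const vec4_impl *vec4_impl_cpu = &vec4_impl_generic;

int main(void)
{
	vec4 a = {{1, 2, 3, 4}}, b = {{10, 20, 30, 40}};
	vec4 c = vec4_impl_cpu->add(a, b); /* dispatch through the table */
	printf("%d %d %d %d\n", c.e[0], c.e[1], c.e[2], c.e[3]);
	return 0;
}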
--- src/vec.c  30:641d8c79b1da
+++ src/vec.c  31:bf6ad516f1e6
@@ -30,30 +30,18 @@
 # include "vec/impl/x86/mmx.h"
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 # include "vec/impl/x86/sse2.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE3
-# include "vec/impl/x86/sse3.h"
-#endif
 #ifdef VEC_COMPILER_HAS_SSE41
 # include "vec/impl/x86/sse41.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE42
-# include "vec/impl/x86/sse42.h"
-#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 # include "vec/impl/x86/avx2.h"
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "vec/impl/x86/avx512f.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-# include "vec/impl/x86/avx512bw.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-# include "vec/impl/x86/avx512dq.h"
 #endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 # include "vec/impl/ppc/altivec.h"
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
@@ -69,288 +57,170 @@
 
 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
 
 // 16-bit
-vint8x2_impl vint8x2_impl_cpu = {0};
-vuint8x2_impl vuint8x2_impl_cpu = {0};
+const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
+const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
 
 // 32-bit
-vint8x4_impl vint8x4_impl_cpu = {0};
-vuint8x4_impl vuint8x4_impl_cpu = {0};
-vint16x2_impl vint16x2_impl_cpu = {0};
-vuint16x2_impl vuint16x2_impl_cpu = {0};
+const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
+const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
+const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
+const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
 
 // 64-bit
-vint8x8_impl vint8x8_impl_cpu = {0};
-vuint8x8_impl vuint8x8_impl_cpu = {0};
-vint16x4_impl vint16x4_impl_cpu = {0};
-vuint16x4_impl vuint16x4_impl_cpu = {0};
-vint32x2_impl vint32x2_impl_cpu = {0};
-vuint32x2_impl vuint32x2_impl_cpu = {0};
+const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
+const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
+const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
+const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
+const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
+const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
 
 // 128-bit
-vint8x16_impl vint8x16_impl_cpu = {0};
-vuint8x16_impl vuint8x16_impl_cpu = {0};
-vint16x8_impl vint16x8_impl_cpu = {0};
-vuint16x8_impl vuint16x8_impl_cpu = {0};
-vint32x4_impl vint32x4_impl_cpu = {0};
-vuint32x4_impl vuint32x4_impl_cpu = {0};
-vint64x2_impl vint64x2_impl_cpu = {0};
-vuint64x2_impl vuint64x2_impl_cpu = {0};
+const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
+const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
+const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
+const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
+const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
+const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
+const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
+const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
 
 // 256-bit
-vint8x32_impl vint8x32_impl_cpu = {0};
-vuint8x32_impl vuint8x32_impl_cpu = {0};
-vint16x16_impl vint16x16_impl_cpu = {0};
-vuint16x16_impl vuint16x16_impl_cpu = {0};
-vint32x8_impl vint32x8_impl_cpu = {0};
-vuint32x8_impl vuint32x8_impl_cpu = {0};
-vint64x4_impl vint64x4_impl_cpu = {0};
-vuint64x4_impl vuint64x4_impl_cpu = {0};
+const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
+const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
+const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
+const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
+const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
+const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
+const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
+const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
 
 // 512-bit
-vint8x64_impl vint8x64_impl_cpu = {0};
-vuint8x64_impl vuint8x64_impl_cpu = {0};
-vint16x32_impl vint16x32_impl_cpu = {0};
-vuint16x32_impl vuint16x32_impl_cpu = {0};
-vint32x16_impl vint32x16_impl_cpu = {0};
-vuint32x16_impl vuint32x16_impl_cpu = {0};
-vint64x8_impl vint64x8_impl_cpu = {0};
-vuint64x8_impl vuint64x8_impl_cpu = {0};
+const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
+const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
+const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
+const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
+const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
+const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
+const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
+const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
 
 static int vec_init_spinner = 0;
-
-#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
-	do { \
-		if (!(cpu).func && (impl).func) \
-			(cpu).func = (impl).func; \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
-	do { \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
-	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
 
 // returns 0 or a negative error code on failure
 int vec_init(void)
 {
 	// This function is NOT thread safe. However, once vec
 	// is initialized, all of the vector functions are thread-safe.
+	//
+	// In fact, it's possible to use vec without calling
+	// vec_init() at all, but it would be completely useless since
+	// it would just use a generic implementation without any
+	// vectorization whatsoever (unless maybe the compiler is
+	// smart enough to optimize it into vectors)
 
 	if (vec_init_spinner)
 		return 0; // already initialized, do nothing
 
 	vec_uint32 cpu = vec_get_CPU_features();
 
-	/* Okay, this might be a little confusing:
-	 * The way we do this is because of x86. For weird reasons,
-	 * Intel decided to extend their prior CPU extensions to
-	 * where SSE4.1 has some extended features of SSE2, AVX2
-	 * has some extended features that should've been in SSE
-	 * in general, etc.
-	 *
-	 * For this, I've just decided to keep the function
-	 * definitions private, and fill in as we go, with newer
-	 * intrinsics preferred. Others are arbitrary and are
-	 * mutually exclusive (i.e. Altivec vs NEON). This is simply
-	 * the easiest way to go about it :) */
-
-	/* --- 512-bit */
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-	if (cpu & VEC_CPU_HAS_AVX512DQ) {
-		/* these give us native multiply instructions */
-		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-	if (cpu & VEC_CPU_HAS_AVX512BW) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
-	}
-#endif
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+	if (cpu & VEC_CPU_HAS_ALTIVEC) {
+		vint8x16_impl_cpu = &vint8x16_impl_altivec;
+		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
+		vint16x8_impl_cpu = &vint16x8_impl_altivec;
+		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
+		vint32x4_impl_cpu = &vint32x4_impl_altivec;
+		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
+#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
+		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
+			vint64x2_impl_cpu = &vint64x2_impl_altivec;
+			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
+		}
+#endif
+	}
+#endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 	if (cpu & VEC_CPU_HAS_AVX512F) {
-		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f);
-	}
-#endif
-
-	/* --- 256-bit */
+		vint8x64_impl_cpu = &vint8x64_impl_avx512f;
+		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
+		vint16x32_impl_cpu = &vint16x32_impl_avx512f;
+		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
+		vint32x16_impl_cpu = &vint32x16_impl_avx512f;
+		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
+		vint64x8_impl_cpu = &vint64x8_impl_avx512f;
+		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
+	}
+#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 	if (cpu & VEC_CPU_HAS_AVX2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2);
-	}
-#endif
-
-	/* --- 128-bit */
-#ifdef VEC_COMPILER_HAS_SSE42
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE41
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE3
-	if (cpu & VEC_CPU_HAS_SSE3) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
+		vint8x32_impl_cpu = &vint8x32_impl_avx2;
+		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
+		vint16x16_impl_cpu = &vint16x16_impl_avx2;
+		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
+		vint32x8_impl_cpu = &vint32x8_impl_avx2;
+		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
+		vint64x4_impl_cpu = &vint64x4_impl_avx2;
+		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 	if (cpu & VEC_CPU_HAS_SSE2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
+		vint8x16_impl_cpu = &vint8x16_impl_sse2;
+		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
+		vint16x8_impl_cpu = &vint16x8_impl_sse2;
+		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
+# ifdef VEC_COMPILER_HAS_SSE41
+		if (cpu & VEC_CPU_HAS_SSE41) {
+			vint32x4_impl_cpu = &vint32x4_impl_sse41;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
+		} else
+# endif
+		{
+			vint32x4_impl_cpu = &vint32x4_impl_sse2;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
+		}
+		vint64x2_impl_cpu = &vint64x2_impl_sse2;
+		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_MMX
+	if (cpu & VEC_CPU_HAS_MMX) {
+		vint8x8_impl_cpu = &vint8x8_impl_mmx;
+		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
+		vint16x4_impl_cpu = &vint16x4_impl_mmx;
+		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
+		vint32x2_impl_cpu = &vint32x2_impl_mmx;
+		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
 	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
-	}
-#endif
-
-	/* --- 64-bit */
-#ifdef VEC_COMPILER_HAS_MMX
-	if (cpu & VEC_CPU_HAS_MMX) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
-	}
-#endif
-
-	/* fill any remaining function pointers with generics */
-	FILL_GIVEN_FUNC_PTRS( , 8, 64, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 32, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);
+		// 64-bit
+		vint8x8_impl_cpu = &vint8x8_impl_neon;
+		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
+		vint16x4_impl_cpu = &vint16x4_impl_neon;
+		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
+		vint32x2_impl_cpu = &vint32x2_impl_neon;
+		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
+
+		// 128-bit
+		vint8x16_impl_cpu = &vint8x16_impl_neon;
+		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
+		vint16x8_impl_cpu = &vint16x8_impl_neon;
+		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
+		vint32x4_impl_cpu = &vint32x4_impl_neon;
+		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
+		vint64x2_impl_cpu = &vint64x2_impl_neon;
+		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
+	}
+#endif
+	{
+		// do nothing, they're already set to generics
+	}
 
 	vec_init_spinner++;
 
 	return 0;
 }
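Note how the restored SSE2 branch above keeps a nested SSE4.1 path: the `# ifdef VEC_COMPILER_HAS_SSE41` guard decides at compile time whether the SSE4.1 tables exist in the binary at all, while the run-time `cpu & VEC_CPU_HAS_SSE41` check selects them only on CPUs that actually support the instructions. The following is a self-contained toy of that two-level selection; all names are illustrative, not vec's real API:

#include <stdio.h>

#define COMPILER_HAS_SSE41 1   /* pretend the compiler can emit SSE4.1 */
#define CPU_HAS_SSE41 (1u << 0)

static const char *impl_name = "generic";

static void pick_impl(unsigned cpu_flags)
{
#if COMPILER_HAS_SSE41
	if (cpu_flags & CPU_HAS_SSE41) {
		impl_name = "sse41"; /* run time: the CPU supports it */
	} else
#endif
	{
		impl_name = "sse2";  /* fallback within the SSE2 branch */
	}
}

int main(void)
{
	pick_impl(CPU_HAS_SSE41);
	printf("selected: %s\n", impl_name);
	return 0;
}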
@@ -369,20 +239,19 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
 
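The VEC_DEFINE_OPERATIONS_SIGN block above builds every declaration by token pasting: with sign empty, bits=32, and size=4, `v##sign##int##bits##x##size##_avg` becomes `vint32x4_avg`. The following is a self-contained demo of the same pasting trick, reduced to a single hypothetical `_avg` operation with a simplified struct standing in for the library's vector type:

#include <stdio.h>

/* Simplified stand-in for the library's vint32x4. */
typedef struct { int e[4]; } vint32x4;

/* Paste sign/bits/size into a function name, as the vec macros do. */
#define DEMO_DEFINE_AVG(sign, bits, size) \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size a, v##sign##int##bits##x##size b) \
	{ \
		for (int i = 0; i < size; i++) \
			a.e[i] = (a.e[i] + b.e[i]) / 2; /* truncating per-lane average */ \
		return a; \
	}

DEMO_DEFINE_AVG( , 32, 4) /* expands to: vint32x4 vint32x4_avg(vint32x4 a, vint32x4 b) { ... } */

int main(void)
{
	vint32x4 a = {{2, 4, 6, 8}}, b = {{4, 6, 8, 10}};
	vint32x4 c = vint32x4_avg(a, b);
	printf("%d %d %d %d\n", c.e[0], c.e[1], c.e[2], c.e[3]);
	return 0;
}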