Mercurial > vec
comparison src/vec.c @ 23:e26874655738
*: huge refactor, new major release (hahaha)
I keep finding things that are broken...
The problem NOW was that vec would unintentionally build some
functions with extended instruction sets, which is Bad and would
mean that for all intents and purposes the CPU detection was
completely broken.
Now vec is no longer header only either. Boohoo. However this gives
a lot more flexibility to vec since we no longer want or need to
care about C++ crap.
The NEON and Altivec implementations have not been updated, which
means they won't compile; that is why they're commented out in the
cmake build file.
author | Paper <paper@tflc.us> |
---|---|
date | Sun, 24 Nov 2024 02:52:40 -0500 |
parents | e05c257c6a23 |
children | 92156fe32755 |
comparison
equal
deleted
inserted
replaced
22:fbcd3fa6f8fc | 23:e26874655738 |
---|---|
1 #define VEC_IMPLEMENTATION | 1 /** |
2 * vec - a tiny SIMD vector library in C99 | |
3 * | |
4 * Copyright (c) 2024 Paper | |
5 * | |
6 * Permission is hereby granted, free of charge, to any person obtaining a copy | |
7 * of this software and associated documentation files (the "Software"), to deal | |
8 * in the Software without restriction, including without limitation the rights | |
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
10 * copies of the Software, and to permit persons to whom the Software is | |
11 * furnished to do so, subject to the following conditions: | |
12 * | |
13 * The above copyright notice and this permission notice shall be included in all | |
14 * copies or substantial portions of the Software. | |
15 * | |
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 * SOFTWARE. | |
23 **/ | |
24 | |
2 #include "vec/vec.h" | 25 #include "vec/vec.h" |
26 #include "vec/cpu.h" | |
27 #include "vec/impl/generic.h" | |
28 #include "vec/impl/fallback.h" | |
29 #ifdef VEC_COMPILER_HAS_MMX | |
30 # include "vec/impl/x86/mmx.h" | |
31 #endif | |
32 #ifdef VEC_COMPILER_HAS_SSE2 | |
33 # include "vec/impl/x86/sse2.h" | |
34 #endif | |
35 #ifdef VEC_COMPILER_HAS_SSE41 | |
36 # include "vec/impl/x86/sse41.h" | |
37 #endif | |
38 #ifdef VEC_COMPILER_HAS_AVX2 | |
39 # include "vec/impl/x86/avx2.h" | |
40 #endif | |
41 #ifdef VEC_COMPILER_HAS_AVX512F | |
42 # include "vec/impl/x86/avx512f.h" | |
43 #endif | |
44 #ifdef VEC_COMPILER_HAS_ALTIVEC | |
45 # include "vec/impl/ppc/altivec.h" | |
46 #endif | |
47 #ifdef VEC_COMPILER_HAS_NEON | |
48 # include "vec/impl/arm/neon.h" | |
49 #endif | |
50 | |
51 extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y); | |
52 extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y); | |
53 extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y); | |
54 extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y); | |
55 extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y); | |
56 extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y); | |
57 | |
58 // 16-bit | |
59 const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; | |
60 const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; | |
61 | |
62 // 32-bit | |
63 const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; | |
64 const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; | |
65 const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; | |
66 const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; | |
67 | |
68 // 64-bit | |
69 const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; | |
70 const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; | |
71 const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; | |
72 const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; | |
73 const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; | |
74 const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; | |
75 | |
76 // 128-bit | |
77 const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; | |
78 const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; | |
79 const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; | |
80 const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; | |
81 const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; | |
82 const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; | |
83 const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; | |
84 const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; | |
85 | |
86 // 256-bit | |
87 const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; | |
88 const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; | |
89 const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; | |
90 const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; | |
91 const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; | |
92 const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; | |
93 const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; | |
94 const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; | |
95 | |
96 // 512-bit | |
97 const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; | |
98 const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; | |
99 const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; | |
100 const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; | |
101 const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; | |
102 const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; | |
103 const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; | |
104 const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; | |
105 | |
106 static int vec_init_spinner = 0; | |
107 | |
108 // returns 0 on success, or a negative error code on failure (currently always returns 0) | |
109 int vec_init(void) | |
110 { | |
111 // This function is NOT thread safe. However, once vec | |
112 // is initialized, all of the vector functions are thread-safe. | |
113 // | |
114 // In fact, it's possible to use vec without calling | |
115 // vec_init() at all, but it would be completely useless since | |
116 // it would just use a generic implementation without any | |
117 // vectorization whatsoever (unless maybe the compiler is | |
118 // smart enough to optimize it into vectors) | |
119 | |
120 if (vec_init_spinner) | |
121 return 0; // already initialized, do nothing | |
122 | |
123 vec_uint32 cpu = vec_get_CPU_features(); | |
124 | |
125 #ifdef VEC_COMPILER_HAS_ALTIVEC | |
126 if (cpu & VEC_CPU_HAS_ALTIVEC) { | |
127 vint8x16_impl_cpu = &vint8x16_impl_altivec; | |
128 vuint8x16_impl_cpu = &vuint8x16_impl_altivec; | |
129 vint16x8_impl_cpu = &vint16x8_impl_altivec; | |
130 vuint16x8_impl_cpu = &vuint16x8_impl_altivec; | |
131 vint32x4_impl_cpu = &vint32x4_impl_altivec; | |
132 vuint32x4_impl_cpu = &vuint32x4_impl_altivec; | |
133 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX | |
134 if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) { | |
135 vint64x2_impl_cpu = &vint64x2_impl_altivec; | |
136 vuint64x2_impl_cpu = &vuint64x2_impl_altivec; | |
137 } | |
138 #endif | |
139 } | |
140 #endif | |
141 #ifdef VEC_COMPILER_HAS_AVX512F | |
142 if (cpu & VEC_CPU_HAS_AVX512F) { | |
143 vint8x64_impl_cpu = &vint8x64_impl_avx512f; | |
144 vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; | |
145 vint16x32_impl_cpu = &vint16x32_impl_avx512f; | |
146 vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; | |
147 vint32x16_impl_cpu = &vint32x16_impl_avx512f; | |
148 vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; | |
149 vint64x8_impl_cpu = &vint64x8_impl_avx512f; | |
150 vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; | |
151 } | |
152 #endif | |
153 #ifdef VEC_COMPILER_HAS_AVX2 | |
154 if (cpu & VEC_CPU_HAS_AVX2) { | |
155 vint8x32_impl_cpu = &vint8x32_impl_avx2; | |
156 vuint8x32_impl_cpu = &vuint8x32_impl_avx2; | |
157 vint16x16_impl_cpu = &vint16x16_impl_avx2; | |
158 vuint16x16_impl_cpu = &vuint16x16_impl_avx2; | |
159 vint32x8_impl_cpu = &vint32x8_impl_avx2; | |
160 vuint32x8_impl_cpu = &vuint32x8_impl_avx2; | |
161 vint64x4_impl_cpu = &vint64x4_impl_avx2; | |
162 vuint64x4_impl_cpu = &vuint64x4_impl_avx2; | |
163 } | |
164 #endif | |
165 #ifdef VEC_COMPILER_HAS_SSE2 | |
166 if (cpu & VEC_CPU_HAS_SSE2) { | |
167 vint8x16_impl_cpu = &vint8x16_impl_sse2; | |
168 vuint8x16_impl_cpu = &vuint8x16_impl_sse2; | |
169 vint16x8_impl_cpu = &vint16x8_impl_sse2; | |
170 vuint16x8_impl_cpu = &vuint16x8_impl_sse2; | |
171 # ifdef VEC_COMPILER_HAS_SSE41 | |
172 if (cpu & VEC_CPU_HAS_SSE41) { | |
173 vint32x4_impl_cpu = &vint32x4_impl_sse41; | |
174 vuint32x4_impl_cpu = &vuint32x4_impl_sse41; | |
175 } else | |
176 # endif | |
177 { | |
178 vint32x4_impl_cpu = &vint32x4_impl_sse2; | |
179 vuint32x4_impl_cpu = &vuint32x4_impl_sse2; | |
180 } | |
181 vint64x2_impl_cpu = &vint64x2_impl_sse2; | |
182 vuint64x2_impl_cpu = &vuint64x2_impl_sse2; | |
183 } | |
184 #endif | |
185 #ifdef VEC_COMPILER_HAS_MMX | |
186 if (cpu & VEC_CPU_HAS_MMX) { | |
187 vint8x8_impl_cpu = &vint8x8_impl_mmx; | |
188 vuint8x8_impl_cpu = &vuint8x8_impl_mmx; | |
189 vint16x4_impl_cpu = &vint16x4_impl_mmx; | |
190 vuint16x4_impl_cpu = &vuint16x4_impl_mmx; | |
191 vint32x2_impl_cpu = &vint32x2_impl_mmx; | |
192 vuint32x2_impl_cpu = &vuint32x2_impl_mmx; | |
193 } | |
194 #endif | |
195 #ifdef VEC_COMPILER_HAS_NEON | |
196 if (cpu & VEC_CPU_HAS_NEON) { | |
197 // 64-bit | |
198 vint8x8_impl_cpu = &vint8x8_impl_neon; | |
199 vuint8x8_impl_cpu = &vuint8x8_impl_neon; | |
200 vint16x4_impl_cpu = &vint16x4_impl_neon; | |
201 vuint16x4_impl_cpu = &vuint16x4_impl_neon; | |
202 vint32x2_impl_cpu = &vint32x2_impl_neon; | |
203 vuint32x2_impl_cpu = &vuint32x2_impl_neon; | |
204 | |
205 // 128-bit | |
206 vint8x16_impl_cpu = &vint8x16_impl_neon; | |
207 vuint8x16_impl_cpu = &vuint8x16_impl_neon; | |
208 vint16x8_impl_cpu = &vint16x8_impl_neon; | |
209 vuint16x8_impl_cpu = &vuint16x8_impl_neon; | |
210 vint32x4_impl_cpu = &vint32x4_impl_neon; | |
211 vuint32x4_impl_cpu = &vuint32x4_impl_neon; | |
212 vint64x2_impl_cpu = &vint64x2_impl_neon; | |
213 vuint64x2_impl_cpu = &vuint64x2_impl_neon; | |
214 } | |
215 #endif | |
216 { | |
217 // nothing to do here: any implementation not overridden above is still set to its generic default | |
218 } | |
219 | |
220 vec_init_spinner++; | |
221 | |
222 return 0; | |
223 } | |
224 | |
225 /* ---------------------------------------------------------------- */ | |
226 | |
227 #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ | |
228 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \ | |
229 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \ | |
230 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \ | |
231 extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ | |
232 extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ | |
233 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
234 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
235 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
236 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
237 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
238 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
239 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
240 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
241 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \ | |
242 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
243 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
244 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
245 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
246 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
247 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | |
248 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | |
249 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); | |
250 | |
251 #define VEC_DEFINE_OPERATIONS(bits, size) \ | |
252 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ | |
253 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) | |
254 | |
255 // 16-bit | |
256 VEC_DEFINE_OPERATIONS(8, 2) | |
257 | |
258 // 32-bit | |
259 VEC_DEFINE_OPERATIONS(8, 4) | |
260 VEC_DEFINE_OPERATIONS(16, 2) | |
261 | |
262 // 64-bit | |
263 VEC_DEFINE_OPERATIONS(8, 8) | |
264 VEC_DEFINE_OPERATIONS(16, 4) | |
265 VEC_DEFINE_OPERATIONS(32, 2) | |
266 | |
267 // 128-bit | |
268 VEC_DEFINE_OPERATIONS(8, 16) | |
269 VEC_DEFINE_OPERATIONS(16, 8) | |
270 VEC_DEFINE_OPERATIONS(32, 4) | |
271 VEC_DEFINE_OPERATIONS(64, 2) | |
272 | |
273 // 256-bit | |
274 VEC_DEFINE_OPERATIONS(8, 32) | |
275 VEC_DEFINE_OPERATIONS(16, 16) | |
276 VEC_DEFINE_OPERATIONS(32, 8) | |
277 VEC_DEFINE_OPERATIONS(64, 4) | |
278 | |
279 // 512-bit | |
280 VEC_DEFINE_OPERATIONS(8, 64) | |
281 VEC_DEFINE_OPERATIONS(16, 32) | |
282 VEC_DEFINE_OPERATIONS(32, 16) | |
283 VEC_DEFINE_OPERATIONS(64, 8) | |
284 | |
285 #undef VEC_DEFINE_OPERATIONS | |
286 #undef VEC_DEFINE_OPERATIONS_SIGN |