comparison src/vec.c @ 23:e26874655738

*: huge refactor, new major release (hahaha). I keep finding things that are broken... The problem NOW was that vec would unintentionally build some functions with extended instruction sets, which is Bad and meant that, for all intents and purposes, the CPU detection was completely broken. Now vec is no longer header-only either. Boohoo. However, this gives vec a lot more flexibility, since we no longer want or need to care about C++ crap. The NEON and AltiVec implementations have not been updated, which means they won't compile; that is why they're commented out in the CMake build file.
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 02:52:40 -0500
parents e05c257c6a23
children 92156fe32755
comparison
equal deleted inserted replaced
22:fbcd3fa6f8fc 23:e26874655738
1 #define VEC_IMPLEMENTATION 1 /**
2 * vec - a tiny SIMD vector library in C99
3 *
4 * Copyright (c) 2024 Paper
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 **/
24
2 #include "vec/vec.h" 25 #include "vec/vec.h"
26 #include "vec/cpu.h"
27 #include "vec/impl/generic.h"
28 #include "vec/impl/fallback.h"
29 #ifdef VEC_COMPILER_HAS_MMX
30 # include "vec/impl/x86/mmx.h"
31 #endif
32 #ifdef VEC_COMPILER_HAS_SSE2
33 # include "vec/impl/x86/sse2.h"
34 #endif
35 #ifdef VEC_COMPILER_HAS_SSE41
36 # include "vec/impl/x86/sse41.h"
37 #endif
38 #ifdef VEC_COMPILER_HAS_AVX2
39 # include "vec/impl/x86/avx2.h"
40 #endif
41 #ifdef VEC_COMPILER_HAS_AVX512F
42 # include "vec/impl/x86/avx512f.h"
43 #endif
44 #ifdef VEC_COMPILER_HAS_ALTIVEC
45 # include "vec/impl/ppc/altivec.h"
46 #endif
47 #ifdef VEC_COMPILER_HAS_NEON
48 # include "vec/impl/arm/neon.h"
49 #endif
50
// File-scope extern inline declarations of the scalar shift helpers.
// NOTE(review): presumably these are defined inline in vec/vec.h and are
// redeclared here so that this translation unit provides their external
// definitions (C99 inline rules); confirm against the header.
51 extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
52 extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
53 extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
54 extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
55 extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
56 extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
57
// Currently selected implementation for each vector type, grouped by total
// vector width. Every pointer starts at the generic implementation; vec_init()
// may repoint some of them after querying the CPU features.
58 // 16-bit
59 const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
60 const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
61
62 // 32-bit
63 const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
64 const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
65 const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
66 const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
67
68 // 64-bit
69 const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
70 const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
71 const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
72 const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
73 const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
74 const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
75
76 // 128-bit
77 const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
78 const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
79 const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
80 const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
81 const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
82 const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
83 const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
84 const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
85
86 // 256-bit
87 const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
88 const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
89 const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
90 const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
91 const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
92 const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
93 const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
94 const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
95
96 // 512-bit
97 const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
98 const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
99 const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
100 const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
101 const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
102 const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
103 const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
104 const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
105
// Nonzero once vec_init() has run to completion; later calls return early.
106 static int vec_init_spinner = 0;
107
// Repoints the *_impl_cpu pointers at an instruction-set specific
// implementation wherever the matching VEC_COMPILER_HAS_* macro is defined
// and the matching VEC_CPU_HAS_* bit is set in the value returned by
// vec_get_CPU_features(). Pointers not touched here keep their generic value.
108 // returns 0 or a negative error code on failure
// NOTE(review): every return statement visible in this function returns 0.
109 int vec_init(void)
110 {
111 // This function is NOT thread safe. However, once vec
112 // is initialized, all of the vector functions are thread-safe.
113 //
114 // In fact, it's possible to use vec without calling
115 // vec_init() at all, but it would be completely useless since
116 // it would just use a generic implementation without any
117 // vectorization whatsoever (unless maybe the compiler is
118 // smart enough to optimize it into vectors)
119
120 if (vec_init_spinner)
121 return 0; // already initialized, do nothing
122
// Feature bitmask, tested against one VEC_CPU_HAS_* flag per section below.
123 vec_uint32 cpu = vec_get_CPU_features();
124
125 #ifdef VEC_COMPILER_HAS_ALTIVEC
126 if (cpu & VEC_CPU_HAS_ALTIVEC) {
127 vint8x16_impl_cpu = &vint8x16_impl_altivec;
128 vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
129 vint16x8_impl_cpu = &vint16x8_impl_altivec;
130 vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
131 vint32x4_impl_cpu = &vint32x4_impl_altivec;
132 vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
// The 64-bit lane pointers are switched only when VSX is both compiled in
// and reported by the CPU; otherwise they keep their previous value.
133 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
134 if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
135 vint64x2_impl_cpu = &vint64x2_impl_altivec;
136 vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
137 }
138 #endif
139 }
140 #endif
// AVX-512F section: sets all eight 512-bit pointers.
141 #ifdef VEC_COMPILER_HAS_AVX512F
142 if (cpu & VEC_CPU_HAS_AVX512F) {
143 vint8x64_impl_cpu = &vint8x64_impl_avx512f;
144 vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
145 vint16x32_impl_cpu = &vint16x32_impl_avx512f;
146 vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
147 vint32x16_impl_cpu = &vint32x16_impl_avx512f;
148 vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
149 vint64x8_impl_cpu = &vint64x8_impl_avx512f;
150 vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
151 }
152 #endif
// AVX2 section: sets all eight 256-bit pointers.
153 #ifdef VEC_COMPILER_HAS_AVX2
154 if (cpu & VEC_CPU_HAS_AVX2) {
155 vint8x32_impl_cpu = &vint8x32_impl_avx2;
156 vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
157 vint16x16_impl_cpu = &vint16x16_impl_avx2;
158 vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
159 vint32x8_impl_cpu = &vint32x8_impl_avx2;
160 vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
161 vint64x4_impl_cpu = &vint64x4_impl_avx2;
162 vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
163 }
164 #endif
// SSE2 section: sets the 128-bit pointers.
165 #ifdef VEC_COMPILER_HAS_SSE2
166 if (cpu & VEC_CPU_HAS_SSE2) {
167 vint8x16_impl_cpu = &vint8x16_impl_sse2;
168 vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
169 vint16x8_impl_cpu = &vint16x8_impl_sse2;
170 vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
// 32-bit lanes use the SSE4.1 implementation when it is compiled in and
// detected, and the SSE2 one otherwise. Without VEC_COMPILER_HAS_SSE41 the
// if/else below is preprocessed away and the braces that follow form a
// plain block that always runs.
171 # ifdef VEC_COMPILER_HAS_SSE41
172 if (cpu & VEC_CPU_HAS_SSE41) {
173 vint32x4_impl_cpu = &vint32x4_impl_sse41;
174 vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
175 } else
176 # endif
177 {
178 vint32x4_impl_cpu = &vint32x4_impl_sse2;
179 vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
180 }
181 vint64x2_impl_cpu = &vint64x2_impl_sse2;
182 vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
183 }
184 #endif
// MMX section: sets the 64-bit pointers with 8-, 16- and 32-bit lanes.
185 #ifdef VEC_COMPILER_HAS_MMX
186 if (cpu & VEC_CPU_HAS_MMX) {
187 vint8x8_impl_cpu = &vint8x8_impl_mmx;
188 vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
189 vint16x4_impl_cpu = &vint16x4_impl_mmx;
190 vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
191 vint32x2_impl_cpu = &vint32x2_impl_mmx;
192 vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
193 }
194 #endif
// NEON section: sets both the 64-bit and the 128-bit pointers.
195 #ifdef VEC_COMPILER_HAS_NEON
196 if (cpu & VEC_CPU_HAS_NEON) {
197 // 64-bit
198 vint8x8_impl_cpu = &vint8x8_impl_neon;
199 vuint8x8_impl_cpu = &vuint8x8_impl_neon;
200 vint16x4_impl_cpu = &vint16x4_impl_neon;
201 vuint16x4_impl_cpu = &vuint16x4_impl_neon;
202 vint32x2_impl_cpu = &vint32x2_impl_neon;
203 vuint32x2_impl_cpu = &vuint32x2_impl_neon;
204
205 // 128-bit
206 vint8x16_impl_cpu = &vint8x16_impl_neon;
207 vuint8x16_impl_cpu = &vuint8x16_impl_neon;
208 vint16x8_impl_cpu = &vint16x8_impl_neon;
209 vuint16x8_impl_cpu = &vuint16x8_impl_neon;
210 vint32x4_impl_cpu = &vint32x4_impl_neon;
211 vuint32x4_impl_cpu = &vuint32x4_impl_neon;
212 vint64x2_impl_cpu = &vint64x2_impl_neon;
213 vuint64x2_impl_cpu = &vuint64x2_impl_neon;
214 }
215 #endif
// Bare block with no statements; it has no effect.
216 {
217 // do nothing, they're already set to generics
218 }
219
// Mark initialization as done so the guard at the top returns early next time.
220 vec_init_spinner++;
221
222 return 0;
223 }
224
225 /* ---------------------------------------------------------------- */
226
// Expands to one file-scope extern inline declaration per operation (splat,
// aligned and unaligned load/store, arithmetic, bitwise, comparison, shift)
// of the vector type v<sign>int<bits>x<size>. The shift-count operand is
// always the unsigned vector type vuint<bits>x<size>, whatever the sign.
// Each declaration carries its own semicolon, so invocations need none.
227 #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
228 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
229 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
230 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
231 extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
232 extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
233 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
234 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
235 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
236 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
237 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
238 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
239 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
240 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
241 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
242 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
243 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
244 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
245 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
246 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
247 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
248 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
249 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
250
// Emits the declarations for both variants of one (bits, size) pair: the
// signed one (empty sign argument) and the unsigned one (sign argument u).
251 #define VEC_DEFINE_OPERATIONS(bits, size) \
252 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
253 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
254
// Emit the extern inline declarations for each (bits, size) pair listed
// below, grouped by total vector width in bits.
255 // 16-bit
256 VEC_DEFINE_OPERATIONS(8, 2)
257
258 // 32-bit
259 VEC_DEFINE_OPERATIONS(8, 4)
260 VEC_DEFINE_OPERATIONS(16, 2)
261
262 // 64-bit
263 VEC_DEFINE_OPERATIONS(8, 8)
264 VEC_DEFINE_OPERATIONS(16, 4)
265 VEC_DEFINE_OPERATIONS(32, 2)
266
267 // 128-bit
268 VEC_DEFINE_OPERATIONS(8, 16)
269 VEC_DEFINE_OPERATIONS(16, 8)
270 VEC_DEFINE_OPERATIONS(32, 4)
271 VEC_DEFINE_OPERATIONS(64, 2)
272
273 // 256-bit
274 VEC_DEFINE_OPERATIONS(8, 32)
275 VEC_DEFINE_OPERATIONS(16, 16)
276 VEC_DEFINE_OPERATIONS(32, 8)
277 VEC_DEFINE_OPERATIONS(64, 4)
278
279 // 512-bit
280 VEC_DEFINE_OPERATIONS(8, 64)
281 VEC_DEFINE_OPERATIONS(16, 32)
282 VEC_DEFINE_OPERATIONS(32, 16)
283 VEC_DEFINE_OPERATIONS(64, 8)
284
// The helper macros are only needed for the invocations above; undefine them.
285 #undef VEC_DEFINE_OPERATIONS
286 #undef VEC_DEFINE_OPERATIONS_SIGN