comparison src/vec.c @ 36:677c03c382b8

Backed out changeset e26874655738
author Paper <paper@tflc.us>
date Fri, 25 Apr 2025 17:40:55 -0400
parents 8b5e0974fd41
children
comparison
equal deleted inserted replaced
35:99e4539f922f 36:677c03c382b8
1 /** 1 #define VEC_IMPLEMENTATION
2 * vec - a tiny SIMD vector library in C99
3 *
4 * Copyright (c) 2024 Paper
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 **/
24
25 #include "vec/vec.h" 2 #include "vec/vec.h"
26 #include "vec/cpu.h"
27 #include "vec/impl/generic.h"
28 #include "vec/impl/fallback.h"
29 #ifdef VEC_COMPILER_HAS_MMX
30 # include "vec/impl/x86/mmx.h"
31 #endif
32 #ifdef VEC_COMPILER_HAS_SSE2
33 # include "vec/impl/x86/sse2.h"
34 #endif
35 #ifdef VEC_COMPILER_HAS_SSE41
36 # include "vec/impl/x86/sse41.h"
37 #endif
38 #ifdef VEC_COMPILER_HAS_AVX2
39 # include "vec/impl/x86/avx2.h"
40 #endif
41 #ifdef VEC_COMPILER_HAS_AVX512F
42 # include "vec/impl/x86/avx512f.h"
43 #endif
44 #ifdef VEC_COMPILER_HAS_ALTIVEC
45 # include "vec/impl/ppc/altivec.h"
46 #endif
47 #ifdef VEC_COMPILER_HAS_NEON
48 # include "vec/impl/arm/neon.h"
49 #endif
50
/* Out-of-line instantiations of the scalar shift helpers declared `inline`
 * in vec/vec.h. Under the C99 inline model, `extern inline` in exactly one
 * translation unit emits the external definition used when a call is not
 * inlined. (l/u/plain prefixes presumably select logical vs. unsigned vs.
 * signed shift semantics — defined in the header, not visible here.) */
51 extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
52 extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
53 extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
54 extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
55 extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
56 extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
57
/* Runtime dispatch table: one pointer per vector type (v{sign}int{bits}x{lanes}),
 * grouped by total vector width in bits. Each pointer starts at the portable
 * generic implementation and is retargeted to a CPU-specific backend by
 * vec_init(). Note the `const` qualifies the pointed-to impl struct, not the
 * pointer itself — the pointers must stay mutable so vec_init() can swap them. */
58 // 16-bit
59 const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
60 const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
61
62 // 32-bit
63 const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
64 const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
65 const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
66 const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
67
68 // 64-bit
69 const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
70 const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
71 const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
72 const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
73 const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
74 const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
75
76 // 128-bit
77 const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
78 const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
79 const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
80 const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
81 const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
82 const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
83 const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
84 const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
85
86 // 256-bit
87 const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
88 const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
89 const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
90 const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
91 const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
92 const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
93 const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
94 const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
95
96 // 512-bit
97 const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
98 const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
99 const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
100 const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
101 const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
102 const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
103 const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
104 const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
105
/* Run-once guard: nonzero after the first vec_init() call. A plain int
 * (not atomic) — vec_init() itself is documented below as not thread-safe. */
106 static int vec_init_spinner = 0;
107
/* Selects CPU-specific implementations for the *_impl_cpu dispatch pointers
 * above, based on runtime feature detection via vec_get_CPU_features().
 * Each backend is gated twice: compile-time support (#ifdef VEC_COMPILER_HAS_*)
 * and runtime support (cpu & VEC_CPU_HAS_*). Types with no matching backend
 * keep their generic defaults.
 * NOTE(review): despite the comment below, every path currently returns 0 —
 * no negative error code is ever produced. */
108 // returns 0 or a negative error code on failure
109 int vec_init(void)
110 {
111 // This function is NOT thread safe. However, once vec
112 // is initialized, all of the vector functions are thread-safe.
113 //
114 // In fact, it's possible to use vec without calling
115 // vec_init() at all, but it would be completely useless since
116 // it would just use a generic implementation without any
117 // vectorization whatsoever (unless maybe the compiler is
118 // smart enough to optimize it into vectors)
119
120 if (vec_init_spinner)
121 return 0; // already initialized, do nothing
122
123 vec_uint32 cpu = vec_get_CPU_features();
124
125 #ifdef VEC_COMPILER_HAS_ALTIVEC
126 if (cpu & VEC_CPU_HAS_ALTIVEC) {
127 vint8x16_impl_cpu = &vint8x16_impl_altivec;
128 vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
129 vint16x8_impl_cpu = &vint16x8_impl_altivec;
130 vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
131 vint32x4_impl_cpu = &vint32x4_impl_altivec;
132 vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
/* 64-bit lanes need VSX on top of base AltiVec. */
133 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
134 if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
135 vint64x2_impl_cpu = &vint64x2_impl_altivec;
136 vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
137 }
138 #endif
139 }
140 #endif
141 #ifdef VEC_COMPILER_HAS_AVX512F
142 if (cpu & VEC_CPU_HAS_AVX512F) {
143 vint8x64_impl_cpu = &vint8x64_impl_avx512f;
144 vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
145 vint16x32_impl_cpu = &vint16x32_impl_avx512f;
146 vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
147 vint32x16_impl_cpu = &vint32x16_impl_avx512f;
148 vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
149 vint64x8_impl_cpu = &vint64x8_impl_avx512f;
150 vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
151 }
152 #endif
153 #ifdef VEC_COMPILER_HAS_AVX2
154 if (cpu & VEC_CPU_HAS_AVX2) {
155 vint8x32_impl_cpu = &vint8x32_impl_avx2;
156 vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
157 vint16x16_impl_cpu = &vint16x16_impl_avx2;
158 vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
159 vint32x8_impl_cpu = &vint32x8_impl_avx2;
160 vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
161 vint64x4_impl_cpu = &vint64x4_impl_avx2;
162 vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
163 }
164 #endif
/* Each x86 tier below touches a disjoint set of vector widths
 * (AVX512F: 512-bit, AVX2: 256-bit, SSE2/4.1: 128-bit, MMX: 64-bit),
 * so the blocks don't override one another and their order here is
 * not a priority order. */
165 #ifdef VEC_COMPILER_HAS_SSE2
166 if (cpu & VEC_CPU_HAS_SSE2) {
167 vint8x16_impl_cpu = &vint8x16_impl_sse2;
168 vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
169 vint16x8_impl_cpu = &vint16x8_impl_sse2;
170 vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
/* Prefer SSE4.1 for 32x4 when available; the dangling `} else` pairs with
 * the brace-block after #endif so the SSE2 fallback compiles either way. */
171 # ifdef VEC_COMPILER_HAS_SSE41
172 if (cpu & VEC_CPU_HAS_SSE41) {
173 vint32x4_impl_cpu = &vint32x4_impl_sse41;
174 vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
175 } else
176 # endif
177 {
178 vint32x4_impl_cpu = &vint32x4_impl_sse2;
179 vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
180 }
181 vint64x2_impl_cpu = &vint64x2_impl_sse2;
182 vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
183 }
184 #endif
185 #ifdef VEC_COMPILER_HAS_MMX
186 if (cpu & VEC_CPU_HAS_MMX) {
187 vint8x8_impl_cpu = &vint8x8_impl_mmx;
188 vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
189 vint16x4_impl_cpu = &vint16x4_impl_mmx;
190 vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
191 vint32x2_impl_cpu = &vint32x2_impl_mmx;
192 vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
193 }
194 #endif
195 #ifdef VEC_COMPILER_HAS_NEON
196 if (cpu & VEC_CPU_HAS_NEON) {
197 // 64-bit
198 vint8x8_impl_cpu = &vint8x8_impl_neon;
199 vuint8x8_impl_cpu = &vuint8x8_impl_neon;
200 vint16x4_impl_cpu = &vint16x4_impl_neon;
201 vuint16x4_impl_cpu = &vuint16x4_impl_neon;
202 vint32x2_impl_cpu = &vint32x2_impl_neon;
203 vuint32x2_impl_cpu = &vuint32x2_impl_neon;
204
205 // 128-bit
206 vint8x16_impl_cpu = &vint8x16_impl_neon;
207 vuint8x16_impl_cpu = &vuint8x16_impl_neon;
208 vint16x8_impl_cpu = &vint16x8_impl_neon;
209 vuint16x8_impl_cpu = &vuint16x8_impl_neon;
210 vint32x4_impl_cpu = &vint32x4_impl_neon;
211 vuint32x4_impl_cpu = &vuint32x4_impl_neon;
212 vint64x2_impl_cpu = &vint64x2_impl_neon;
213 vuint64x2_impl_cpu = &vuint64x2_impl_neon;
214 }
215 #endif
/* NOTE(review): stray no-op compound statement — presumably a leftover
 * `else`-style tail from an earlier #if chain; harmless but dead code. */
216 {
217 // do nothing, they're already set to generics
218 }
219
220 vec_init_spinner++;
221
222 return 0;
223 }
224
225 /* ---------------------------------------------------------------- */
226
/* Token-pastes the out-of-line `extern inline` instantiations for every
 * operation on one vector type v{sign}int{bits}x{size} (splat, load/store,
 * arithmetic, bitwise, comparisons, shifts). `sign` is `u` or empty; the
 * inline definitions themselves live in the headers included above. Note
 * the shift operations always take an unsigned shift-count vector. */
227 #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
228 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
229 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
230 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
231 extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
232 extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
233 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
234 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
235 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
236 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
237 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
238 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
239 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
240 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
241 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
242 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
243 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
244 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
245 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
246 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
247 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
248 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
249 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
250
/* Convenience wrapper: instantiate both the signed and unsigned variants
 * of one lane-width/lane-count combination. */
251 #define VEC_DEFINE_OPERATIONS(bits, size) \
252 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
253 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
254
/* Instantiate the full operation set for every supported vector type,
 * grouped by total vector width (bits-per-lane, lane-count). This list must
 * stay in sync with the *_impl_cpu dispatch pointers defined earlier in
 * this file. The helper macros are #undef'd afterwards since they are
 * implementation detail of this translation unit only. */
255 // 16-bit
256 VEC_DEFINE_OPERATIONS(8, 2)
257
258 // 32-bit
259 VEC_DEFINE_OPERATIONS(8, 4)
260 VEC_DEFINE_OPERATIONS(16, 2)
261
262 // 64-bit
263 VEC_DEFINE_OPERATIONS(8, 8)
264 VEC_DEFINE_OPERATIONS(16, 4)
265 VEC_DEFINE_OPERATIONS(32, 2)
266
267 // 128-bit
268 VEC_DEFINE_OPERATIONS(8, 16)
269 VEC_DEFINE_OPERATIONS(16, 8)
270 VEC_DEFINE_OPERATIONS(32, 4)
271 VEC_DEFINE_OPERATIONS(64, 2)
272
273 // 256-bit
274 VEC_DEFINE_OPERATIONS(8, 32)
275 VEC_DEFINE_OPERATIONS(16, 16)
276 VEC_DEFINE_OPERATIONS(32, 8)
277 VEC_DEFINE_OPERATIONS(64, 4)
278
279 // 512-bit
280 VEC_DEFINE_OPERATIONS(8, 64)
281 VEC_DEFINE_OPERATIONS(16, 32)
282 VEC_DEFINE_OPERATIONS(32, 16)
283 VEC_DEFINE_OPERATIONS(64, 8)
284
285 #undef VEC_DEFINE_OPERATIONS
286 #undef VEC_DEFINE_OPERATIONS_SIGN