Mercurial > vec
comparison src/vec.c @ 36:677c03c382b8
Backed out changeset e26874655738
| author | Paper <paper@tflc.us> |
|---|---|
| date | Fri, 25 Apr 2025 17:40:55 -0400 |
| parents | 8b5e0974fd41 |
| children | |
comparison
equal
deleted
inserted
replaced
| 35:99e4539f922f | 36:677c03c382b8 |
|---|---|
| 1 /** | 1 #define VEC_IMPLEMENTATION |
| 2 * vec - a tiny SIMD vector library in C99 | |
| 3 * | |
| 4 * Copyright (c) 2024 Paper | |
| 5 * | |
| 6 * Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 7 * of this software and associated documentation files (the "Software"), to deal | |
| 8 * in the Software without restriction, including without limitation the rights | |
| 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 10 * copies of the Software, and to permit persons to whom the Software is | |
| 11 * furnished to do so, subject to the following conditions: | |
| 12 * | |
| 13 * The above copyright notice and this permission notice shall be included in all | |
| 14 * copies or substantial portions of the Software. | |
| 15 * | |
| 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 22 * SOFTWARE. | |
| 23 **/ | |
| 24 | |
| 25 #include "vec/vec.h" | 2 #include "vec/vec.h" |
| 26 #include "vec/cpu.h" | |
| 27 #include "vec/impl/generic.h" | |
| 28 #include "vec/impl/fallback.h" | |
| 29 #ifdef VEC_COMPILER_HAS_MMX | |
| 30 # include "vec/impl/x86/mmx.h" | |
| 31 #endif | |
| 32 #ifdef VEC_COMPILER_HAS_SSE2 | |
| 33 # include "vec/impl/x86/sse2.h" | |
| 34 #endif | |
| 35 #ifdef VEC_COMPILER_HAS_SSE41 | |
| 36 # include "vec/impl/x86/sse41.h" | |
| 37 #endif | |
| 38 #ifdef VEC_COMPILER_HAS_AVX2 | |
| 39 # include "vec/impl/x86/avx2.h" | |
| 40 #endif | |
| 41 #ifdef VEC_COMPILER_HAS_AVX512F | |
| 42 # include "vec/impl/x86/avx512f.h" | |
| 43 #endif | |
| 44 #ifdef VEC_COMPILER_HAS_ALTIVEC | |
| 45 # include "vec/impl/ppc/altivec.h" | |
| 46 #endif | |
| 47 #ifdef VEC_COMPILER_HAS_NEON | |
| 48 # include "vec/impl/arm/neon.h" | |
| 49 #endif | |
| 50 | |
| 51 extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y); | |
| 52 extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y); | |
| 53 extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y); | |
| 54 extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y); | |
| 55 extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y); | |
| 56 extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y); | |
| 57 | |
| 58 // 16-bit | |
| 59 const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; | |
| 60 const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; | |
| 61 | |
| 62 // 32-bit | |
| 63 const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; | |
| 64 const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; | |
| 65 const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; | |
| 66 const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; | |
| 67 | |
| 68 // 64-bit | |
| 69 const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; | |
| 70 const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; | |
| 71 const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; | |
| 72 const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; | |
| 73 const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; | |
| 74 const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; | |
| 75 | |
| 76 // 128-bit | |
| 77 const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; | |
| 78 const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; | |
| 79 const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; | |
| 80 const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; | |
| 81 const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; | |
| 82 const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; | |
| 83 const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; | |
| 84 const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; | |
| 85 | |
| 86 // 256-bit | |
| 87 const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; | |
| 88 const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; | |
| 89 const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; | |
| 90 const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; | |
| 91 const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; | |
| 92 const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; | |
| 93 const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; | |
| 94 const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; | |
| 95 | |
| 96 // 512-bit | |
| 97 const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; | |
| 98 const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; | |
| 99 const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; | |
| 100 const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; | |
| 101 const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; | |
| 102 const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; | |
| 103 const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; | |
| 104 const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; | |
| 105 | |
| 106 static int vec_init_spinner = 0; | |
| 107 | |
| 108 // returns 0 on success, or a negative error code on failure | |
| 109 int vec_init(void) | |
| 110 { | |
| 111 // This function is NOT thread-safe. However, once vec | |
| 112 // is initialized, all of the vector functions are thread-safe. | |
| 113 // | |
| 114 // It is also possible to use vec without calling vec_init() | |
| 115 // at all, but there is little point in doing so: every operation | |
| 116 // would go through the generic implementation, with no | |
| 117 // vectorization whatsoever (unless the compiler happens to be | |
| 118 // smart enough to auto-vectorize it) | |
| 119 | |
| 120 if (vec_init_spinner) | |
| 121 return 0; // already initialized, do nothing | |
| 122 | |
| 123 vec_uint32 cpu = vec_get_CPU_features(); | |
| 124 | |
| 125 #ifdef VEC_COMPILER_HAS_ALTIVEC | |
| 126 if (cpu & VEC_CPU_HAS_ALTIVEC) { | |
| 127 vint8x16_impl_cpu = &vint8x16_impl_altivec; | |
| 128 vuint8x16_impl_cpu = &vuint8x16_impl_altivec; | |
| 129 vint16x8_impl_cpu = &vint16x8_impl_altivec; | |
| 130 vuint16x8_impl_cpu = &vuint16x8_impl_altivec; | |
| 131 vint32x4_impl_cpu = &vint32x4_impl_altivec; | |
| 132 vuint32x4_impl_cpu = &vuint32x4_impl_altivec; | |
| 133 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX | |
| 134 if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) { | |
| 135 vint64x2_impl_cpu = &vint64x2_impl_altivec; | |
| 136 vuint64x2_impl_cpu = &vuint64x2_impl_altivec; | |
| 137 } | |
| 138 #endif | |
| 139 } | |
| 140 #endif | |
| 141 #ifdef VEC_COMPILER_HAS_AVX512F | |
| 142 if (cpu & VEC_CPU_HAS_AVX512F) { | |
| 143 vint8x64_impl_cpu = &vint8x64_impl_avx512f; | |
| 144 vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; | |
| 145 vint16x32_impl_cpu = &vint16x32_impl_avx512f; | |
| 146 vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; | |
| 147 vint32x16_impl_cpu = &vint32x16_impl_avx512f; | |
| 148 vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; | |
| 149 vint64x8_impl_cpu = &vint64x8_impl_avx512f; | |
| 150 vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; | |
| 151 } | |
| 152 #endif | |
| 153 #ifdef VEC_COMPILER_HAS_AVX2 | |
| 154 if (cpu & VEC_CPU_HAS_AVX2) { | |
| 155 vint8x32_impl_cpu = &vint8x32_impl_avx2; | |
| 156 vuint8x32_impl_cpu = &vuint8x32_impl_avx2; | |
| 157 vint16x16_impl_cpu = &vint16x16_impl_avx2; | |
| 158 vuint16x16_impl_cpu = &vuint16x16_impl_avx2; | |
| 159 vint32x8_impl_cpu = &vint32x8_impl_avx2; | |
| 160 vuint32x8_impl_cpu = &vuint32x8_impl_avx2; | |
| 161 vint64x4_impl_cpu = &vint64x4_impl_avx2; | |
| 162 vuint64x4_impl_cpu = &vuint64x4_impl_avx2; | |
| 163 } | |
| 164 #endif | |
| 165 #ifdef VEC_COMPILER_HAS_SSE2 | |
| 166 if (cpu & VEC_CPU_HAS_SSE2) { | |
| 167 vint8x16_impl_cpu = &vint8x16_impl_sse2; | |
| 168 vuint8x16_impl_cpu = &vuint8x16_impl_sse2; | |
| 169 vint16x8_impl_cpu = &vint16x8_impl_sse2; | |
| 170 vuint16x8_impl_cpu = &vuint16x8_impl_sse2; | |
| 171 # ifdef VEC_COMPILER_HAS_SSE41 | |
| 172 if (cpu & VEC_CPU_HAS_SSE41) { | |
| 173 vint32x4_impl_cpu = &vint32x4_impl_sse41; | |
| 174 vuint32x4_impl_cpu = &vuint32x4_impl_sse41; | |
| 175 } else | |
| 176 # endif | |
| 177 { | |
| 178 vint32x4_impl_cpu = &vint32x4_impl_sse2; | |
| 179 vuint32x4_impl_cpu = &vuint32x4_impl_sse2; | |
| 180 } | |
| 181 vint64x2_impl_cpu = &vint64x2_impl_sse2; | |
| 182 vuint64x2_impl_cpu = &vuint64x2_impl_sse2; | |
| 183 } | |
| 184 #endif | |
| 185 #ifdef VEC_COMPILER_HAS_MMX | |
| 186 if (cpu & VEC_CPU_HAS_MMX) { | |
| 187 vint8x8_impl_cpu = &vint8x8_impl_mmx; | |
| 188 vuint8x8_impl_cpu = &vuint8x8_impl_mmx; | |
| 189 vint16x4_impl_cpu = &vint16x4_impl_mmx; | |
| 190 vuint16x4_impl_cpu = &vuint16x4_impl_mmx; | |
| 191 vint32x2_impl_cpu = &vint32x2_impl_mmx; | |
| 192 vuint32x2_impl_cpu = &vuint32x2_impl_mmx; | |
| 193 } | |
| 194 #endif | |
| 195 #ifdef VEC_COMPILER_HAS_NEON | |
| 196 if (cpu & VEC_CPU_HAS_NEON) { | |
| 197 // 64-bit | |
| 198 vint8x8_impl_cpu = &vint8x8_impl_neon; | |
| 199 vuint8x8_impl_cpu = &vuint8x8_impl_neon; | |
| 200 vint16x4_impl_cpu = &vint16x4_impl_neon; | |
| 201 vuint16x4_impl_cpu = &vuint16x4_impl_neon; | |
| 202 vint32x2_impl_cpu = &vint32x2_impl_neon; | |
| 203 vuint32x2_impl_cpu = &vuint32x2_impl_neon; | |
| 204 | |
| 205 // 128-bit | |
| 206 vint8x16_impl_cpu = &vint8x16_impl_neon; | |
| 207 vuint8x16_impl_cpu = &vuint8x16_impl_neon; | |
| 208 vint16x8_impl_cpu = &vint16x8_impl_neon; | |
| 209 vuint16x8_impl_cpu = &vuint16x8_impl_neon; | |
| 210 vint32x4_impl_cpu = &vint32x4_impl_neon; | |
| 211 vuint32x4_impl_cpu = &vuint32x4_impl_neon; | |
| 212 vint64x2_impl_cpu = &vint64x2_impl_neon; | |
| 213 vuint64x2_impl_cpu = &vuint64x2_impl_neon; | |
| 214 } | |
| 215 #endif | |
| 216 { | |
| 217 // do nothing, they're already set to generics | |
| 218 } | |
| 219 | |
| 220 vec_init_spinner++; | |
| 221 | |
| 222 return 0; | |
| 223 } | |
| 224 | |
| 225 /* ---------------------------------------------------------------- */ | |
| 226 | |
| 227 #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ | |
| 228 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \ | |
| 229 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \ | |
| 230 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \ | |
| 231 extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ | |
| 232 extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ | |
| 233 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 234 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 235 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 236 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 237 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 238 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 239 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 240 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 241 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \ | |
| 242 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 243 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 244 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 245 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 246 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 247 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | |
| 248 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | |
| 249 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); | |
| 250 | |
| 251 #define VEC_DEFINE_OPERATIONS(bits, size) \ | |
| 252 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ | |
| 253 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) | |
| 254 | |
| 255 // 16-bit | |
| 256 VEC_DEFINE_OPERATIONS(8, 2) | |
| 257 | |
| 258 // 32-bit | |
| 259 VEC_DEFINE_OPERATIONS(8, 4) | |
| 260 VEC_DEFINE_OPERATIONS(16, 2) | |
| 261 | |
| 262 // 64-bit | |
| 263 VEC_DEFINE_OPERATIONS(8, 8) | |
| 264 VEC_DEFINE_OPERATIONS(16, 4) | |
| 265 VEC_DEFINE_OPERATIONS(32, 2) | |
| 266 | |
| 267 // 128-bit | |
| 268 VEC_DEFINE_OPERATIONS(8, 16) | |
| 269 VEC_DEFINE_OPERATIONS(16, 8) | |
| 270 VEC_DEFINE_OPERATIONS(32, 4) | |
| 271 VEC_DEFINE_OPERATIONS(64, 2) | |
| 272 | |
| 273 // 256-bit | |
| 274 VEC_DEFINE_OPERATIONS(8, 32) | |
| 275 VEC_DEFINE_OPERATIONS(16, 16) | |
| 276 VEC_DEFINE_OPERATIONS(32, 8) | |
| 277 VEC_DEFINE_OPERATIONS(64, 4) | |
| 278 | |
| 279 // 512-bit | |
| 280 VEC_DEFINE_OPERATIONS(8, 64) | |
| 281 VEC_DEFINE_OPERATIONS(16, 32) | |
| 282 VEC_DEFINE_OPERATIONS(32, 16) | |
| 283 VEC_DEFINE_OPERATIONS(64, 8) | |
| 284 | |
| 285 #undef VEC_DEFINE_OPERATIONS | |
| 286 #undef VEC_DEFINE_OPERATIONS_SIGN |
