comparison src/vec.c @ 28:c6c99ab1088a
*: add min/max functions and a big big refactor (again)
agh, this time I added a few more implementations (and generally
made the code just a little faster...)
| author | Paper <paper@tflc.us> |
|---|---|
| date | Thu, 24 Apr 2025 00:54:02 -0400 |
| parents | 92156fe32755 |
| children | e59c91d050c0 |
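The crux of the refactor is visible in the diff below: each per-type `*_impl_cpu` used to be a `const` pointer that vec_init() swapped wholesale to one implementation's table, while the new code makes it a mutable struct of function pointers, zero-initialized and filled slot by slot, newest extension first and generic last, so a partial implementation can contribute only the operations it actually accelerates. Here is a minimal, self-contained sketch of that pattern; every name in it is an illustrative stand-in, not vec's real API:

```c
#include <stddef.h>
#include <stdio.h>

/* A toy "implementation table": one slot per operation, like the
 * real vint32x4_impl with its splat/load/add/min/... members. */
typedef struct {
	int (*add)(int, int);
	int (*min)(int, int);
} impl;

static int generic_add(int a, int b) { return a + b; }
static int generic_min(int a, int b) { return a < b ? a : b; }
static int fast_min(int a, int b)    { return a < b ? a : b; /* pretend this is SIMD */ }

static const impl impl_generic = { generic_add, generic_min };
static const impl impl_fast    = { NULL, fast_min }; /* partial: only accelerates min */

static impl impl_cpu = {0}; /* starts empty, like the new *_impl_cpu globals */

/* Same idea as FILL_GIVEN_FUNC_PTR: only fill a slot that is still empty. */
#define FILL(dst, src, func) \
	do { if (!(dst).func && (src).func) (dst).func = (src).func; } while (0)

static void init(int cpu_has_fast)
{
	if (cpu_has_fast) { /* preferred implementation claims its slots first */
		FILL(impl_cpu, impl_fast, add);
		FILL(impl_cpu, impl_fast, min);
	}
	FILL(impl_cpu, impl_generic, add); /* generics fill whatever remains */
	FILL(impl_cpu, impl_generic, min);
}

int main(void)
{
	init(1);
	printf("%d %d\n", impl_cpu.add(2, 3), impl_cpu.min(2, 3)); /* prints: 5 2 */
	return 0;
}
```

This is why the fill order in vec_init() below runs from AVX512DQ down to SSE2 before the generic pass: an extension that only improves a few operations (say, AVX512DQ's native multiply instructions) leaves every other slot empty for an older extension or the generic code to claim.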
| 27:d00b95f95dd1 | 28:c6c99ab1088a |
|---|---|
| 30 # include "vec/impl/x86/mmx.h" | 30 # include "vec/impl/x86/mmx.h" |
| 31 #endif | 31 #endif |
| 32 #ifdef VEC_COMPILER_HAS_SSE2 | 32 #ifdef VEC_COMPILER_HAS_SSE2 |
| 33 # include "vec/impl/x86/sse2.h" | 33 # include "vec/impl/x86/sse2.h" |
| 34 #endif | 34 #endif |
| | 35 #ifdef VEC_COMPILER_HAS_SSE3 |
| | 36 # include "vec/impl/x86/sse3.h" |
| | 37 #endif |
| 35 #ifdef VEC_COMPILER_HAS_SSE41 | 38 #ifdef VEC_COMPILER_HAS_SSE41 |
| 36 # include "vec/impl/x86/sse41.h" | 39 # include "vec/impl/x86/sse41.h" |
| 37 #endif | 40 #endif |
| | 41 #ifdef VEC_COMPILER_HAS_SSE42 |
| | 42 # include "vec/impl/x86/sse42.h" |
| | 43 #endif |
| 38 #ifdef VEC_COMPILER_HAS_AVX2 | 44 #ifdef VEC_COMPILER_HAS_AVX2 |
| 39 # include "vec/impl/x86/avx2.h" | 45 # include "vec/impl/x86/avx2.h" |
| 40 #endif | 46 #endif |
| 41 #ifdef VEC_COMPILER_HAS_AVX512F | 47 #ifdef VEC_COMPILER_HAS_AVX512F |
| 42 # include "vec/impl/x86/avx512f.h" | 48 # include "vec/impl/x86/avx512f.h" |
| | 49 #endif |
| | 50 #ifdef VEC_COMPILER_HAS_AVX512BW |
| | 51 # include "vec/impl/x86/avx512bw.h" |
| | 52 #endif |
| | 53 #ifdef VEC_COMPILER_HAS_AVX512DQ |
| | 54 # include "vec/impl/x86/avx512dq.h" |
| 43 #endif | 55 #endif |
| 44 #ifdef VEC_COMPILER_HAS_ALTIVEC | 56 #ifdef VEC_COMPILER_HAS_ALTIVEC |
| 45 # include "vec/impl/ppc/altivec.h" | 57 # include "vec/impl/ppc/altivec.h" |
| 46 #endif | 58 #endif |
| 47 #ifdef VEC_COMPILER_HAS_NEON | 59 #ifdef VEC_COMPILER_HAS_NEON |
| 57 | 69 |
| 58 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y); | 70 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y); |
| 59 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y); | 71 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y); |
| 60 | 72 |
| 61 // 16-bit | 73 // 16-bit |
| 62 const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; | 74 vint8x2_impl vint8x2_impl_cpu = {0}; |
| 63 const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; | 75 vuint8x2_impl vuint8x2_impl_cpu = {0}; |
| 64 | 76 |
| 65 // 32-bit | 77 // 32-bit |
| 66 const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; | 78 vint8x4_impl vint8x4_impl_cpu = {0}; |
| 67 const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; | 79 vuint8x4_impl vuint8x4_impl_cpu = {0}; |
| 68 const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; | 80 vint16x2_impl vint16x2_impl_cpu = {0}; |
| 69 const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; | 81 vuint16x2_impl vuint16x2_impl_cpu = {0}; |
| 70 | 82 |
| 71 // 64-bit | 83 // 64-bit |
| 72 const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; | 84 vint8x8_impl vint8x8_impl_cpu = {0}; |
| 73 const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; | 85 vuint8x8_impl vuint8x8_impl_cpu = {0}; |
| 74 const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; | 86 vint16x4_impl vint16x4_impl_cpu = {0}; |
| 75 const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; | 87 vuint16x4_impl vuint16x4_impl_cpu = {0}; |
| 76 const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; | 88 vint32x2_impl vint32x2_impl_cpu = {0}; |
| 77 const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; | 89 vuint32x2_impl vuint32x2_impl_cpu = {0}; |
| 78 | 90 |
| 79 // 128-bit | 91 // 128-bit |
| 80 const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; | 92 vint8x16_impl vint8x16_impl_cpu = {0}; |
| 81 const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; | 93 vuint8x16_impl vuint8x16_impl_cpu = {0}; |
| 82 const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; | 94 vint16x8_impl vint16x8_impl_cpu = {0}; |
| 83 const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; | 95 vuint16x8_impl vuint16x8_impl_cpu = {0}; |
| 84 const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; | 96 vint32x4_impl vint32x4_impl_cpu = {0}; |
| 85 const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; | 97 vuint32x4_impl vuint32x4_impl_cpu = {0}; |
| 86 const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; | 98 vint64x2_impl vint64x2_impl_cpu = {0}; |
| 87 const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; | 99 vuint64x2_impl vuint64x2_impl_cpu = {0}; |
| 88 | 100 |
| 89 // 256-bit | 101 // 256-bit |
| 90 const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; | 102 vint8x32_impl vint8x32_impl_cpu = {0}; |
| 91 const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; | 103 vuint8x32_impl vuint8x32_impl_cpu = {0}; |
| 92 const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; | 104 vint16x16_impl vint16x16_impl_cpu = {0}; |
| 93 const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; | 105 vuint16x16_impl vuint16x16_impl_cpu = {0}; |
| 94 const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; | 106 vint32x8_impl vint32x8_impl_cpu = {0}; |
| 95 const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; | 107 vuint32x8_impl vuint32x8_impl_cpu = {0}; |
| 96 const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; | 108 vint64x4_impl vint64x4_impl_cpu = {0}; |
| 97 const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; | 109 vuint64x4_impl vuint64x4_impl_cpu = {0}; |
| 98 | 110 |
| 99 // 512-bit | 111 // 512-bit |
| 100 const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; | 112 vint8x64_impl vint8x64_impl_cpu = {0}; |
| 101 const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; | 113 vuint8x64_impl vuint8x64_impl_cpu = {0}; |
| 102 const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; | 114 vint16x32_impl vint16x32_impl_cpu = {0}; |
| 103 const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; | 115 vuint16x32_impl vuint16x32_impl_cpu = {0}; |
| 104 const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; | 116 vint32x16_impl vint32x16_impl_cpu = {0}; |
| 105 const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; | 117 vuint32x16_impl vuint32x16_impl_cpu = {0}; |
| 106 const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; | 118 vint64x8_impl vint64x8_impl_cpu = {0}; |
| 107 const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; | 119 vuint64x8_impl vuint64x8_impl_cpu = {0}; |
| 108 | 120 |
| 109 static int vec_init_spinner = 0; | 121 static int vec_init_spinner = 0; |
| | 122 |
| | 123 #define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \ |
| | 124 do { \ |
| | 125 if (!(cpu).func && (impl).func) \ |
| | 126 (cpu).func = (impl).func; \ |
| | 127 } while (0) |
| | 128 |
| | 129 #define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \ |
| | 130 do { \ |
| | 131 FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \ |
| | 132 FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \ |
| | 133 FILL_GIVEN_FUNC_PTR(cpu, impl, load); \ |
| | 134 FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \ |
| | 135 FILL_GIVEN_FUNC_PTR(cpu, impl, store); \ |
| | 136 FILL_GIVEN_FUNC_PTR(cpu, impl, add); \ |
| | 137 FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \ |
| | 138 FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \ |
| | 139 FILL_GIVEN_FUNC_PTR(cpu, impl, div); \ |
| | 140 FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \ |
| | 141 FILL_GIVEN_FUNC_PTR(cpu, impl, band); \ |
| | 142 FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \ |
| | 143 FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \ |
| | 144 FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \ |
| | 145 FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \ |
| | 146 FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \ |
| | 147 FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \ |
| | 148 FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \ |
| | 149 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \ |
| | 150 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \ |
| | 151 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \ |
| | 152 FILL_GIVEN_FUNC_PTR(cpu, impl, min); \ |
| | 153 FILL_GIVEN_FUNC_PTR(cpu, impl, max); \ |
| | 154 } while (0) |
| | 155 |
| | 156 #define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \ |
| | 157 FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl) |
| 110 | 158 |
| 111 // returns 0 or a negative error code on failure | 159 // returns 0 or a negative error code on failure |
| 112 int vec_init(void) | 160 int vec_init(void) |
| 113 { | 161 { |
| 114 // This function is NOT thread safe. However, once vec | 162 // This function is NOT thread safe. However, once vec |
| 115 // is initialized, all of the vector functions are thread-safe. | 163 // is initialized, all of the vector functions are thread-safe. |
| 116 // | |
| 117 // In fact, it's possible to use vec without calling | |
| 118 // vec_init() at all, but it would be completely useless since | |
| 119 // it would just use a generic implementation without any | |
| 120 // vectorization whatsoever (unless maybe the compiler is | |
| 121 // smart enough to optimize it into vectors) | |
| 122 | 164 |
| 123 if (vec_init_spinner) | 165 if (vec_init_spinner) |
| 124 return 0; // already initialized, do nothing | 166 return 0; // already initialized, do nothing |
| 125 | 167 |
| 126 vec_uint32 cpu = vec_get_CPU_features(); | 168 vec_uint32 cpu = vec_get_CPU_features(); |
| 127 | 169 |
| | 170 /* Okay, this might be a little confusing: |
| | 171 * The way we do this is because of x86. For weird reasons, |
| | 172 * Intel decided to extend their prior CPU extensions to |
| | 173 * where SSE4.1 has some extended features of SSE2, AVX2 |
| | 174 * has some extended features that should've been in SSE |
| | 175 * in general, etc. |
| | 176 * |
| | 177 * For this, I've just decided to keep the function |
| | 178 * definitions private, and fill in as we go, with newer |
| | 179 * intrinsics preferred. Others are arbitrary and are |
| | 180 * mutually exclusive (i.e. Altivec vs NEON). This is simply |
| | 181 * the easiest way to go about it :) */ |
| | 182 |
| | 183 /* --- 512-bit */ |
| | 184 #ifdef VEC_COMPILER_HAS_AVX512DQ |
| | 185 if (cpu & VEC_CPU_HAS_AVX512DQ) { |
| | 186 /* these give us native multiply instructions */ |
| | 187 FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq); |
| | 188 FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq); |
| | 189 } |
| | 190 #endif |
| | 191 #ifdef VEC_COMPILER_HAS_AVX512BW |
| | 192 if (cpu & VEC_CPU_HAS_AVX512BW) { |
| | 193 FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw); |
| | 194 FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw); |
| | 195 FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw); |
| | 196 FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw); |
| | 197 } |
| | 198 #endif |
| | 199 #ifdef VEC_COMPILER_HAS_AVX512F |
| | 200 if (cpu & VEC_CPU_HAS_AVX512F) { |
| | 201 FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f); |
| | 202 FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f); |
| | 203 FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f); |
| | 204 FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f); |
| | 205 } |
| | 206 #endif |
| | 207 |
| | 208 /* --- 256-bit */ |
| | 209 #ifdef VEC_COMPILER_HAS_AVX2 |
| | 210 if (cpu & VEC_CPU_HAS_AVX2) { |
| | 211 FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2); |
| | 212 FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2); |
| | 213 FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2); |
| | 214 FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2); |
| | 215 FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2); |
| | 216 FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2); |
| | 217 FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2); |
| | 218 FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2); |
| | 219 } |
| | 220 #endif |
| | 221 |
| | 222 /* --- 128-bit */ |
| | 223 #ifdef VEC_COMPILER_HAS_SSE42 |
| | 224 if (cpu & VEC_CPU_HAS_SSE41) { |
| | 225 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42); |
| | 226 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42); |
| | 227 } |
| | 228 #endif |
| | 229 #ifdef VEC_COMPILER_HAS_SSE41 |
| | 230 if (cpu & VEC_CPU_HAS_SSE41) { |
| | 231 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41); |
| | 232 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41); |
| | 233 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41); |
| | 234 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41); |
| | 235 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41); |
| | 236 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41); |
| | 237 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41); |
| | 238 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41); |
| | 239 } |
| | 240 #endif |
| | 241 #ifdef VEC_COMPILER_HAS_SSE3 |
| | 242 if (cpu & VEC_CPU_HAS_SSE3) { |
| | 243 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3); |
| | 244 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3); |
| | 245 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3); |
| | 246 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3); |
| | 247 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3); |
| | 248 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3); |
| | 249 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3); |
| | 250 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3); |
| | 251 } |
| | 252 #endif |
| | 253 #ifdef VEC_COMPILER_HAS_SSE2 |
| | 254 if (cpu & VEC_CPU_HAS_SSE2) { |
| | 255 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2); |
| | 256 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2); |
| | 257 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2); |
| | 258 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2); |
| | 259 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2); |
| | 260 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2); |
| | 261 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2); |
| | 262 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2); |
| | 263 } |
| | 264 #endif |
| | 265 #ifdef VEC_COMPILER_HAS_NEON |
| | 266 if (cpu & VEC_CPU_HAS_NEON) { |
| | 267 FILL_GIVEN_FUNC_PTRS( , 8, 16, neon); |
| | 268 FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon); |
| | 269 FILL_GIVEN_FUNC_PTRS( , 16, 8, neon); |
| | 270 FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon); |
| | 271 FILL_GIVEN_FUNC_PTRS( , 32, 4, neon); |
| | 272 FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon); |
| | 273 FILL_GIVEN_FUNC_PTRS( , 64, 2, neon); |
| | 274 FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon); |
| | 275 } |
| | 276 #endif |
| 128 #ifdef VEC_COMPILER_HAS_ALTIVEC | 277 #ifdef VEC_COMPILER_HAS_ALTIVEC |
| 129 if (cpu & VEC_CPU_HAS_ALTIVEC) { | 278 if (cpu & VEC_CPU_HAS_ALTIVEC) { |
| 130 vint8x16_impl_cpu = &vint8x16_impl_altivec; | 279 FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec); |
| 131 vuint8x16_impl_cpu = &vuint8x16_impl_altivec; | 280 FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec); |
| 132 vint16x8_impl_cpu = &vint16x8_impl_altivec; | 281 FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec); |
| 133 vuint16x8_impl_cpu = &vuint16x8_impl_altivec; | 282 FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec); |
| 134 vint32x4_impl_cpu = &vint32x4_impl_altivec; | 283 FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec); |
| 135 vuint32x4_impl_cpu = &vuint32x4_impl_altivec; | 284 FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec); |
| 136 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX | 285 } |
| 137 if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) { | 286 #endif |
| 138 vint64x2_impl_cpu = &vint64x2_impl_altivec; | 287 |
| 139 vuint64x2_impl_cpu = &vuint64x2_impl_altivec; | 288 /* --- 64-bit */ |
| 140 } | |
| 141 #endif | |
| 142 } | |
| 143 #endif | |
| 144 #ifdef VEC_COMPILER_HAS_AVX512F | |
| 145 if (cpu & VEC_CPU_HAS_AVX512F) { | |
| 146 vint8x64_impl_cpu = &vint8x64_impl_avx512f; | |
| 147 vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; | |
| 148 vint16x32_impl_cpu = &vint16x32_impl_avx512f; | |
| 149 vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; | |
| 150 vint32x16_impl_cpu = &vint32x16_impl_avx512f; | |
| 151 vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; | |
| 152 vint64x8_impl_cpu = &vint64x8_impl_avx512f; | |
| 153 vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; | |
| 154 } | |
| 155 #endif | |
| 156 #ifdef VEC_COMPILER_HAS_AVX2 | |
| 157 if (cpu & VEC_CPU_HAS_AVX2) { | |
| 158 vint8x32_impl_cpu = &vint8x32_impl_avx2; | |
| 159 vuint8x32_impl_cpu = &vuint8x32_impl_avx2; | |
| 160 vint16x16_impl_cpu = &vint16x16_impl_avx2; | |
| 161 vuint16x16_impl_cpu = &vuint16x16_impl_avx2; | |
| 162 vint32x8_impl_cpu = &vint32x8_impl_avx2; | |
| 163 vuint32x8_impl_cpu = &vuint32x8_impl_avx2; | |
| 164 vint64x4_impl_cpu = &vint64x4_impl_avx2; | |
| 165 vuint64x4_impl_cpu = &vuint64x4_impl_avx2; | |
| 166 } | |
| 167 #endif | |
| 168 #ifdef VEC_COMPILER_HAS_SSE2 | |
| 169 if (cpu & VEC_CPU_HAS_SSE2) { | |
| 170 vint8x16_impl_cpu = &vint8x16_impl_sse2; | |
| 171 vuint8x16_impl_cpu = &vuint8x16_impl_sse2; | |
| 172 vint16x8_impl_cpu = &vint16x8_impl_sse2; | |
| 173 vuint16x8_impl_cpu = &vuint16x8_impl_sse2; | |
| 174 # ifdef VEC_COMPILER_HAS_SSE41 | |
| 175 if (cpu & VEC_CPU_HAS_SSE41) { | |
| 176 vint32x4_impl_cpu = &vint32x4_impl_sse41; | |
| 177 vuint32x4_impl_cpu = &vuint32x4_impl_sse41; | |
| 178 } else | |
| 179 # endif | |
| 180 { | |
| 181 vint32x4_impl_cpu = &vint32x4_impl_sse2; | |
| 182 vuint32x4_impl_cpu = &vuint32x4_impl_sse2; | |
| 183 } | |
| 184 vint64x2_impl_cpu = &vint64x2_impl_sse2; | |
| 185 vuint64x2_impl_cpu = &vuint64x2_impl_sse2; | |
| 186 } | |
| 187 #endif | |
| 188 #ifdef VEC_COMPILER_HAS_MMX | 289 #ifdef VEC_COMPILER_HAS_MMX |
| 189 if (cpu & VEC_CPU_HAS_MMX) { | 290 if (cpu & VEC_CPU_HAS_MMX) { |
| 190 vint8x8_impl_cpu = &vint8x8_impl_mmx; | 291 FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx); |
| 191 vuint8x8_impl_cpu = &vuint8x8_impl_mmx; | 292 FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx); |
| 192 vint16x4_impl_cpu = &vint16x4_impl_mmx; | 293 FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx); |
| 193 vuint16x4_impl_cpu = &vuint16x4_impl_mmx; | 294 FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx); |
| 194 vint32x2_impl_cpu = &vint32x2_impl_mmx; | 295 FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx); |
| 195 vuint32x2_impl_cpu = &vuint32x2_impl_mmx; | 296 FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx); |
| 196 } | 297 } |
| 197 #endif | 298 #endif |
| 198 #ifdef VEC_COMPILER_HAS_NEON | 299 #ifdef VEC_COMPILER_HAS_NEON |
| 199 if (cpu & VEC_CPU_HAS_NEON) { | 300 if (cpu & VEC_CPU_HAS_NEON) { |
| 200 // 64-bit | 301 FILL_GIVEN_FUNC_PTRS( , 8, 8, neon); |
| 201 vint8x8_impl_cpu = &vint8x8_impl_neon; | 302 FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon); |
| 202 vuint8x8_impl_cpu = &vuint8x8_impl_neon; | 303 FILL_GIVEN_FUNC_PTRS( , 16, 4, neon); |
| 203 vint16x4_impl_cpu = &vint16x4_impl_neon; | 304 FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon); |
| 204 vuint16x4_impl_cpu = &vuint16x4_impl_neon; | 305 FILL_GIVEN_FUNC_PTRS( , 32, 2, neon); |
| 205 vint32x2_impl_cpu = &vint32x2_impl_neon; | 306 FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon); |
| 206 vuint32x2_impl_cpu = &vuint32x2_impl_neon; | 307 } |
| 207 | 308 #endif |
| 208 // 128-bit | 309 |
| 209 vint8x16_impl_cpu = &vint8x16_impl_neon; | 310 /* fill any remaining function pointers with generics */ |
| 210 vuint8x16_impl_cpu = &vuint8x16_impl_neon; | 311 FILL_GIVEN_FUNC_PTRS( , 8, 64, generic); |
| 211 vint16x8_impl_cpu = &vint16x8_impl_neon; | 312 FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic); |
| 212 vuint16x8_impl_cpu = &vuint16x8_impl_neon; | 313 FILL_GIVEN_FUNC_PTRS( , 16, 32, generic); |
| 213 vint32x4_impl_cpu = &vint32x4_impl_neon; | 314 FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic); |
| 214 vuint32x4_impl_cpu = &vuint32x4_impl_neon; | 315 FILL_GIVEN_FUNC_PTRS( , 32, 16, generic); |
| 215 vint64x2_impl_cpu = &vint64x2_impl_neon; | 316 FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic); |
| 216 vuint64x2_impl_cpu = &vuint64x2_impl_neon; | 317 FILL_GIVEN_FUNC_PTRS( , 64, 8, generic); |
| 217 } | 318 FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic); |
| 218 #endif | 319 |
| 219 { | 320 FILL_GIVEN_FUNC_PTRS( , 8, 32, generic); |
| 220 // do nothing, they're already set to generics | 321 FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic); |
| 221 } | 322 FILL_GIVEN_FUNC_PTRS( , 16, 16, generic); |
| | 323 FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic); |
| | 324 FILL_GIVEN_FUNC_PTRS( , 32, 8, generic); |
| | 325 FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic); |
| | 326 FILL_GIVEN_FUNC_PTRS( , 64, 4, generic); |
| | 327 FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic); |
| | 328 |
| | 329 FILL_GIVEN_FUNC_PTRS( , 8, 16, generic); |
| | 330 FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic); |
| | 331 FILL_GIVEN_FUNC_PTRS( , 16, 8, generic); |
| | 332 FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic); |
| | 333 FILL_GIVEN_FUNC_PTRS( , 32, 4, generic); |
| | 334 FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic); |
| | 335 FILL_GIVEN_FUNC_PTRS( , 64, 2, generic); |
| | 336 FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic); |
| | 337 |
| | 338 FILL_GIVEN_FUNC_PTRS( , 8, 8, generic); |
| | 339 FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic); |
| | 340 FILL_GIVEN_FUNC_PTRS( , 16, 4, generic); |
| | 341 FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic); |
| | 342 FILL_GIVEN_FUNC_PTRS( , 32, 2, generic); |
| | 343 FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic); |
| | 344 |
| | 345 FILL_GIVEN_FUNC_PTRS( , 8, 4, generic); |
| | 346 FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic); |
| | 347 FILL_GIVEN_FUNC_PTRS( , 16, 2, generic); |
| | 348 FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic); |
| | 349 |
| | 350 FILL_GIVEN_FUNC_PTRS( , 8, 2, generic); |
| | 351 FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic); |
| 222 | 352 |
| 223 vec_init_spinner++; | 353 vec_init_spinner++; |
| 224 | 354 |
| 225 return 0; | 355 return 0; |
| 226 } | 356 } |
| 239 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 369 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 240 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 370 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 241 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 371 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 242 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 372 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 243 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 373 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 244 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \ | |
| 245 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 374 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 246 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 375 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 247 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 376 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 248 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 377 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 249 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 378 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 250 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | 379 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
| 251 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | 380 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
| 252 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); | 381 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
| | 382 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| | 383 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); |
| 253 | 384 |
| 254 #define VEC_DEFINE_OPERATIONS(bits, size) \ | 385 #define VEC_DEFINE_OPERATIONS(bits, size) \ |
| 255 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ | 386 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ |
| 256 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) | 387 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) |
| 257 | 388 |
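For completeness, a hedged usage sketch of the new entry points: `vec_init()` and the `vint32x4_min`/`_splat`/`_store` declarations all appear in this changeset, but the public header name, the `vec_int32` element type, and the exact parameter conventions of splat and store are assumptions made for illustration and may not match vec's actual headers:

```c
#include <stdio.h>

#include "vec/vec.h" /* assumed public header path */

int main(void)
{
	/* Per the comment in vec.c: returns 0, or a negative error
	 * code on failure; not thread-safe, so call it once up front. */
	if (vec_init() < 0)
		return 1;

	vint32x4 a = vint32x4_splat(3);  /* assumed: splat takes one scalar */
	vint32x4 b = vint32x4_splat(7);
	vint32x4 m = vint32x4_min(a, b); /* min/max are new in this changeset */

	vec_int32 out[4];                /* assumed element typedef */
	vint32x4_store(m, out);          /* assumed argument order */

	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 3 3 3 */
	return 0;
}
```

Note that calling vec_init() is now effectively mandatory rather than optional: the deleted comment said the library still worked (generically) without it, but since the new dispatch tables start zeroed, skipping initialization would leave every function pointer NULL.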
