comparison src/vec.c @ 31:bf6ad516f1e6
Backed out changeset c6c99ab1088a
| field | value |
|---|---|
| author | Paper <paper@tflc.us> |
| date | Fri, 25 Apr 2025 17:40:33 -0400 |
| parents | 641d8c79b1da |
| children | 8b5e0974fd41 |
| 30:641d8c79b1da | 31:bf6ad516f1e6 |
|---|---|
| 30 # include "vec/impl/x86/mmx.h" | 30 # include "vec/impl/x86/mmx.h" |
| 31 #endif | 31 #endif |
| 32 #ifdef VEC_COMPILER_HAS_SSE2 | 32 #ifdef VEC_COMPILER_HAS_SSE2 |
| 33 # include "vec/impl/x86/sse2.h" | 33 # include "vec/impl/x86/sse2.h" |
| 34 #endif | 34 #endif |
| 35 #ifdef VEC_COMPILER_HAS_SSE3 | |
| 36 # include "vec/impl/x86/sse3.h" | |
| 37 #endif | |
| 38 #ifdef VEC_COMPILER_HAS_SSE41 | 35 #ifdef VEC_COMPILER_HAS_SSE41 |
| 39 # include "vec/impl/x86/sse41.h" | 36 # include "vec/impl/x86/sse41.h" |
| 40 #endif | 37 #endif |
| 41 #ifdef VEC_COMPILER_HAS_SSE42 | |
| 42 # include "vec/impl/x86/sse42.h" | |
| 43 #endif | |
| 44 #ifdef VEC_COMPILER_HAS_AVX2 | 38 #ifdef VEC_COMPILER_HAS_AVX2 |
| 45 # include "vec/impl/x86/avx2.h" | 39 # include "vec/impl/x86/avx2.h" |
| 46 #endif | 40 #endif |
| 47 #ifdef VEC_COMPILER_HAS_AVX512F | 41 #ifdef VEC_COMPILER_HAS_AVX512F |
| 48 # include "vec/impl/x86/avx512f.h" | 42 # include "vec/impl/x86/avx512f.h" |
| 49 #endif | |
| 50 #ifdef VEC_COMPILER_HAS_AVX512BW | |
| 51 # include "vec/impl/x86/avx512bw.h" | |
| 52 #endif | |
| 53 #ifdef VEC_COMPILER_HAS_AVX512DQ | |
| 54 # include "vec/impl/x86/avx512dq.h" | |
| 55 #endif | 43 #endif |
| 56 #ifdef VEC_COMPILER_HAS_ALTIVEC | 44 #ifdef VEC_COMPILER_HAS_ALTIVEC |
| 57 # include "vec/impl/ppc/altivec.h" | 45 # include "vec/impl/ppc/altivec.h" |
| 58 #endif | 46 #endif |
| 59 #ifdef VEC_COMPILER_HAS_NEON | 47 #ifdef VEC_COMPILER_HAS_NEON |
| 69 | 57 |
| 70 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y); | 58 extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y); |
| 71 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y); | 59 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y); |
| 72 | 60 |
| 73 // 16-bit | 61 // 16-bit |
| 74 vint8x2_impl vint8x2_impl_cpu = {0}; | 62 const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; |
| 75 vuint8x2_impl vuint8x2_impl_cpu = {0}; | 63 const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; |
| 76 | 64 |
| 77 // 32-bit | 65 // 32-bit |
| 78 vint8x4_impl vint8x4_impl_cpu = {0}; | 66 const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; |
| 79 vuint8x4_impl vuint8x4_impl_cpu = {0}; | 67 const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; |
| 80 vint16x2_impl vint16x2_impl_cpu = {0}; | 68 const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; |
| 81 vuint16x2_impl vuint16x2_impl_cpu = {0}; | 69 const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; |
| 82 | 70 |
| 83 // 64-bit | 71 // 64-bit |
| 84 vint8x8_impl vint8x8_impl_cpu = {0}; | 72 const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; |
| 85 vuint8x8_impl vuint8x8_impl_cpu = {0}; | 73 const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; |
| 86 vint16x4_impl vint16x4_impl_cpu = {0}; | 74 const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; |
| 87 vuint16x4_impl vuint16x4_impl_cpu = {0}; | 75 const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; |
| 88 vint32x2_impl vint32x2_impl_cpu = {0}; | 76 const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; |
| 89 vuint32x2_impl vuint32x2_impl_cpu = {0}; | 77 const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; |
| 90 | 78 |
| 91 // 128-bit | 79 // 128-bit |
| 92 vint8x16_impl vint8x16_impl_cpu = {0}; | 80 const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; |
| 93 vuint8x16_impl vuint8x16_impl_cpu = {0}; | 81 const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; |
| 94 vint16x8_impl vint16x8_impl_cpu = {0}; | 82 const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; |
| 95 vuint16x8_impl vuint16x8_impl_cpu = {0}; | 83 const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; |
| 96 vint32x4_impl vint32x4_impl_cpu = {0}; | 84 const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; |
| 97 vuint32x4_impl vuint32x4_impl_cpu = {0}; | 85 const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; |
| 98 vint64x2_impl vint64x2_impl_cpu = {0}; | 86 const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; |
| 99 vuint64x2_impl vuint64x2_impl_cpu = {0}; | 87 const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; |
| 100 | 88 |
| 101 // 256-bit | 89 // 256-bit |
| 102 vint8x32_impl vint8x32_impl_cpu = {0}; | 90 const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; |
| 103 vuint8x32_impl vuint8x32_impl_cpu = {0}; | 91 const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; |
| 104 vint16x16_impl vint16x16_impl_cpu = {0}; | 92 const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; |
| 105 vuint16x16_impl vuint16x16_impl_cpu = {0}; | 93 const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; |
| 106 vint32x8_impl vint32x8_impl_cpu = {0}; | 94 const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; |
| 107 vuint32x8_impl vuint32x8_impl_cpu = {0}; | 95 const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; |
| 108 vint64x4_impl vint64x4_impl_cpu = {0}; | 96 const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; |
| 109 vuint64x4_impl vuint64x4_impl_cpu = {0}; | 97 const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; |
| 110 | 98 |
| 111 // 512-bit | 99 // 512-bit |
| 112 vint8x64_impl vint8x64_impl_cpu = {0}; | 100 const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; |
| 113 vuint8x64_impl vuint8x64_impl_cpu = {0}; | 101 const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; |
| 114 vint16x32_impl vint16x32_impl_cpu = {0}; | 102 const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; |
| 115 vuint16x32_impl vuint16x32_impl_cpu = {0}; | 103 const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; |
| 116 vint32x16_impl vint32x16_impl_cpu = {0}; | 104 const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; |
| 117 vuint32x16_impl vuint32x16_impl_cpu = {0}; | 105 const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; |
| 118 vint64x8_impl vint64x8_impl_cpu = {0}; | 106 const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; |
| 119 vuint64x8_impl vuint64x8_impl_cpu = {0}; | 107 const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; |
| 120 | 108 |
| 121 static int vec_init_spinner = 0; | 109 static int vec_init_spinner = 0; |
| 122 | |
| 123 #define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \ | |
| 124 do { \ | |
| 125 if (!(cpu).func && (impl).func) \ | |
| 126 (cpu).func = (impl).func; \ | |
| 127 } while (0) | |
| 128 | |
| 129 #define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \ | |
| 130 do { \ | |
| 131 FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \ | |
| 132 FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \ | |
| 133 FILL_GIVEN_FUNC_PTR(cpu, impl, load); \ | |
| 134 FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \ | |
| 135 FILL_GIVEN_FUNC_PTR(cpu, impl, store); \ | |
| 136 FILL_GIVEN_FUNC_PTR(cpu, impl, add); \ | |
| 137 FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \ | |
| 138 FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \ | |
| 139 FILL_GIVEN_FUNC_PTR(cpu, impl, div); \ | |
| 140 FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \ | |
| 141 FILL_GIVEN_FUNC_PTR(cpu, impl, band); \ | |
| 142 FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \ | |
| 143 FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \ | |
| 144 FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \ | |
| 145 FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \ | |
| 146 FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \ | |
| 147 FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \ | |
| 148 FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \ | |
| 149 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \ | |
| 150 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \ | |
| 151 FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \ | |
| 152 FILL_GIVEN_FUNC_PTR(cpu, impl, min); \ | |
| 153 FILL_GIVEN_FUNC_PTR(cpu, impl, max); \ | |
| 154 } while (0) | |
| 155 | |
| 156 #define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \ | |
| 157 FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl) | |
| 158 | 110 |
| 159 // returns 0 or a negative error code on failure | 111 // returns 0 or a negative error code on failure |
| 160 int vec_init(void) | 112 int vec_init(void) |
| 161 { | 113 { |
| 162 // This function is NOT thread safe. However, once vec | 114 // This function is NOT thread safe. However, once vec |
| 163 // is initialized, all of the vector functions are thread-safe. | 115 // is initialized, all of the vector functions are thread-safe. |
| 116 // | |
| 117 // In fact, it's possible to use vec without calling | |
| 118 // vec_init() at all, but it would be completely useless since | |
| 119 // it would just use a generic implementation without any | |
| 120 // vectorization whatsoever (unless maybe the compiler is | |
| 121 // smart enough to optimize it into vectors) | |
| 164 | 122 |
| 165 if (vec_init_spinner) | 123 if (vec_init_spinner) |
| 166 return 0; // already initialized, do nothing | 124 return 0; // already initialized, do nothing |
| 167 | 125 |
| 168 vec_uint32 cpu = vec_get_CPU_features(); | 126 vec_uint32 cpu = vec_get_CPU_features(); |
| 169 | 127 |
| 170 /* Okay, this might be a little confusing: | 128 #ifdef VEC_COMPILER_HAS_ALTIVEC |
| 171 * The way we do this is because of x86. For weird reasons, | 129 if (cpu & VEC_CPU_HAS_ALTIVEC) { |
| 172 * Intel decided to extend their prior CPU extensions to | 130 vint8x16_impl_cpu = &vint8x16_impl_altivec; |
| 173 * where SSE4.1 has some extended features of SSE2, AVX2 | 131 vuint8x16_impl_cpu = &vuint8x16_impl_altivec; |
| 174 * has some extended features that should've been in SSE | 132 vint16x8_impl_cpu = &vint16x8_impl_altivec; |
| 175 * in general, etc. | 133 vuint16x8_impl_cpu = &vuint16x8_impl_altivec; |
| 176 * | 134 vint32x4_impl_cpu = &vint32x4_impl_altivec; |
| 177 * For this, I've just decided to keep the function | 135 vuint32x4_impl_cpu = &vuint32x4_impl_altivec; |
| 178 * definitions private, and fill in as we go, with newer | 136 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX |
| 179 * intrinsics preferred. Others are arbitrary and are | 137 if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) { |
| 180 * mutually exclusive (i.e. Altivec vs NEON). This is simply | 138 vint64x2_impl_cpu = &vint64x2_impl_altivec; |
| 181 * the easiest way to go about it :) */ | 139 vuint64x2_impl_cpu = &vuint64x2_impl_altivec; |
| 182 | 140 } |
| 183 /* --- 512-bit */ | 141 #endif |
| 184 #ifdef VEC_COMPILER_HAS_AVX512DQ | |
| 185 if (cpu & VEC_CPU_HAS_AVX512DQ) { | |
| 186 /* these give us native multiply instructions */ | |
| 187 FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq); | |
| 188 FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq); | |
| 189 } | |
| 190 #endif | |
| 191 #ifdef VEC_COMPILER_HAS_AVX512BW | |
| 192 if (cpu & VEC_CPU_HAS_AVX512BW) { | |
| 193 FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw); | |
| 194 FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw); | |
| 195 FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw); | |
| 196 FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw); | |
| 197 } | 142 } |
| 198 #endif | 143 #endif |
| 199 #ifdef VEC_COMPILER_HAS_AVX512F | 144 #ifdef VEC_COMPILER_HAS_AVX512F |
| 200 if (cpu & VEC_CPU_HAS_AVX512F) { | 145 if (cpu & VEC_CPU_HAS_AVX512F) { |
| 201 FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f); | 146 vint8x64_impl_cpu = &vint8x64_impl_avx512f; |
| 202 FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f); | 147 vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; |
| 203 FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f); | 148 vint16x32_impl_cpu = &vint16x32_impl_avx512f; |
| 204 FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f); | 149 vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; |
| 205 } | 150 vint32x16_impl_cpu = &vint32x16_impl_avx512f; |
| 206 #endif | 151 vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; |
| 207 | 152 vint64x8_impl_cpu = &vint64x8_impl_avx512f; |
| 208 /* --- 256-bit */ | 153 vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; |
| 154 } | |
| 155 #endif | |
| 209 #ifdef VEC_COMPILER_HAS_AVX2 | 156 #ifdef VEC_COMPILER_HAS_AVX2 |
| 210 if (cpu & VEC_CPU_HAS_AVX2) { | 157 if (cpu & VEC_CPU_HAS_AVX2) { |
| 211 FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2); | 158 vint8x32_impl_cpu = &vint8x32_impl_avx2; |
| 212 FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2); | 159 vuint8x32_impl_cpu = &vuint8x32_impl_avx2; |
| 213 FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2); | 160 vint16x16_impl_cpu = &vint16x16_impl_avx2; |
| 214 FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2); | 161 vuint16x16_impl_cpu = &vuint16x16_impl_avx2; |
| 215 FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2); | 162 vint32x8_impl_cpu = &vint32x8_impl_avx2; |
| 216 FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2); | 163 vuint32x8_impl_cpu = &vuint32x8_impl_avx2; |
| 217 FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2); | 164 vint64x4_impl_cpu = &vint64x4_impl_avx2; |
| 218 FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2); | 165 vuint64x4_impl_cpu = &vuint64x4_impl_avx2; |
| 219 } | |
| 220 #endif | |
| 221 | |
| 222 /* --- 128-bit */ | |
| 223 #ifdef VEC_COMPILER_HAS_SSE42 | |
| 224 if (cpu & VEC_CPU_HAS_SSE41) { | |
| 225 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42); | |
| 226 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42); | |
| 227 } | |
| 228 #endif | |
| 229 #ifdef VEC_COMPILER_HAS_SSE41 | |
| 230 if (cpu & VEC_CPU_HAS_SSE41) { | |
| 231 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41); | |
| 232 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41); | |
| 233 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41); | |
| 234 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41); | |
| 235 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41); | |
| 236 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41); | |
| 237 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41); | |
| 238 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41); | |
| 239 } | |
| 240 #endif | |
| 241 #ifdef VEC_COMPILER_HAS_SSE3 | |
| 242 if (cpu & VEC_CPU_HAS_SSE3) { | |
| 243 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3); | |
| 244 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3); | |
| 245 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3); | |
| 246 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3); | |
| 247 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3); | |
| 248 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3); | |
| 249 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3); | |
| 250 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3); | |
| 251 } | 166 } |
| 252 #endif | 167 #endif |
| 253 #ifdef VEC_COMPILER_HAS_SSE2 | 168 #ifdef VEC_COMPILER_HAS_SSE2 |
| 254 if (cpu & VEC_CPU_HAS_SSE2) { | 169 if (cpu & VEC_CPU_HAS_SSE2) { |
| 255 FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2); | 170 vint8x16_impl_cpu = &vint8x16_impl_sse2; |
| 256 FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2); | 171 vuint8x16_impl_cpu = &vuint8x16_impl_sse2; |
| 257 FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2); | 172 vint16x8_impl_cpu = &vint16x8_impl_sse2; |
| 258 FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2); | 173 vuint16x8_impl_cpu = &vuint16x8_impl_sse2; |
| 259 FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2); | 174 # ifdef VEC_COMPILER_HAS_SSE41 |
| 260 FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2); | 175 if (cpu & VEC_CPU_HAS_SSE41) { |
| 261 FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2); | 176 vint32x4_impl_cpu = &vint32x4_impl_sse41; |
| 262 FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2); | 177 vuint32x4_impl_cpu = &vuint32x4_impl_sse41; |
| 178 } else | |
| 179 # endif | |
| 180 { | |
| 181 vint32x4_impl_cpu = &vint32x4_impl_sse2; | |
| 182 vuint32x4_impl_cpu = &vuint32x4_impl_sse2; | |
| 183 } | |
| 184 vint64x2_impl_cpu = &vint64x2_impl_sse2; | |
| 185 vuint64x2_impl_cpu = &vuint64x2_impl_sse2; | |
| 186 } | |
| 187 #endif | |
| 188 #ifdef VEC_COMPILER_HAS_MMX | |
| 189 if (cpu & VEC_CPU_HAS_MMX) { | |
| 190 vint8x8_impl_cpu = &vint8x8_impl_mmx; | |
| 191 vuint8x8_impl_cpu = &vuint8x8_impl_mmx; | |
| 192 vint16x4_impl_cpu = &vint16x4_impl_mmx; | |
| 193 vuint16x4_impl_cpu = &vuint16x4_impl_mmx; | |
| 194 vint32x2_impl_cpu = &vint32x2_impl_mmx; | |
| 195 vuint32x2_impl_cpu = &vuint32x2_impl_mmx; | |
| 263 } | 196 } |
| 264 #endif | 197 #endif |
| 265 #ifdef VEC_COMPILER_HAS_NEON | 198 #ifdef VEC_COMPILER_HAS_NEON |
| 266 if (cpu & VEC_CPU_HAS_NEON) { | 199 if (cpu & VEC_CPU_HAS_NEON) { |
| 267 FILL_GIVEN_FUNC_PTRS( , 8, 16, neon); | 200 // 64-bit |
| 268 FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon); | 201 vint8x8_impl_cpu = &vint8x8_impl_neon; |
| 269 FILL_GIVEN_FUNC_PTRS( , 16, 8, neon); | 202 vuint8x8_impl_cpu = &vuint8x8_impl_neon; |
| 270 FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon); | 203 vint16x4_impl_cpu = &vint16x4_impl_neon; |
| 271 FILL_GIVEN_FUNC_PTRS( , 32, 4, neon); | 204 vuint16x4_impl_cpu = &vuint16x4_impl_neon; |
| 272 FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon); | 205 vint32x2_impl_cpu = &vint32x2_impl_neon; |
| 273 FILL_GIVEN_FUNC_PTRS( , 64, 2, neon); | 206 vuint32x2_impl_cpu = &vuint32x2_impl_neon; |
| 274 FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon); | 207 |
| 275 } | 208 // 128-bit |
| 276 #endif | 209 vint8x16_impl_cpu = &vint8x16_impl_neon; |
| 277 #ifdef VEC_COMPILER_HAS_ALTIVEC | 210 vuint8x16_impl_cpu = &vuint8x16_impl_neon; |
| 278 if (cpu & VEC_CPU_HAS_ALTIVEC) { | 211 vint16x8_impl_cpu = &vint16x8_impl_neon; |
| 279 FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec); | 212 vuint16x8_impl_cpu = &vuint16x8_impl_neon; |
| 280 FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec); | 213 vint32x4_impl_cpu = &vint32x4_impl_neon; |
| 281 FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec); | 214 vuint32x4_impl_cpu = &vuint32x4_impl_neon; |
| 282 FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec); | 215 vint64x2_impl_cpu = &vint64x2_impl_neon; |
| 283 FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec); | 216 vuint64x2_impl_cpu = &vuint64x2_impl_neon; |
| 284 FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec); | 217 } |
| 285 } | 218 #endif |
| 286 #endif | 219 { |
| 287 | 220 // do nothing, they're already set to generics |
| 288 /* --- 64-bit */ | 221 } |
| 289 #ifdef VEC_COMPILER_HAS_MMX | |
| 290 if (cpu & VEC_CPU_HAS_MMX) { | |
| 291 FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx); | |
| 292 FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx); | |
| 293 FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx); | |
| 294 FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx); | |
| 295 FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx); | |
| 296 FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx); | |
| 297 } | |
| 298 #endif | |
| 299 #ifdef VEC_COMPILER_HAS_NEON | |
| 300 if (cpu & VEC_CPU_HAS_NEON) { | |
| 301 FILL_GIVEN_FUNC_PTRS( , 8, 8, neon); | |
| 302 FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon); | |
| 303 FILL_GIVEN_FUNC_PTRS( , 16, 4, neon); | |
| 304 FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon); | |
| 305 FILL_GIVEN_FUNC_PTRS( , 32, 2, neon); | |
| 306 FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon); | |
| 307 } | |
| 308 #endif | |
| 309 | |
| 310 /* fill any remaining function pointers with generics */ | |
| 311 FILL_GIVEN_FUNC_PTRS( , 8, 64, generic); | |
| 312 FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic); | |
| 313 FILL_GIVEN_FUNC_PTRS( , 16, 32, generic); | |
| 314 FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic); | |
| 315 FILL_GIVEN_FUNC_PTRS( , 32, 16, generic); | |
| 316 FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic); | |
| 317 FILL_GIVEN_FUNC_PTRS( , 64, 8, generic); | |
| 318 FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic); | |
| 319 | |
| 320 FILL_GIVEN_FUNC_PTRS( , 8, 32, generic); | |
| 321 FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic); | |
| 322 FILL_GIVEN_FUNC_PTRS( , 16, 16, generic); | |
| 323 FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic); | |
| 324 FILL_GIVEN_FUNC_PTRS( , 32, 8, generic); | |
| 325 FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic); | |
| 326 FILL_GIVEN_FUNC_PTRS( , 64, 4, generic); | |
| 327 FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic); | |
| 328 | |
| 329 FILL_GIVEN_FUNC_PTRS( , 8, 16, generic); | |
| 330 FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic); | |
| 331 FILL_GIVEN_FUNC_PTRS( , 16, 8, generic); | |
| 332 FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic); | |
| 333 FILL_GIVEN_FUNC_PTRS( , 32, 4, generic); | |
| 334 FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic); | |
| 335 FILL_GIVEN_FUNC_PTRS( , 64, 2, generic); | |
| 336 FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic); | |
| 337 | |
| 338 FILL_GIVEN_FUNC_PTRS( , 8, 8, generic); | |
| 339 FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic); | |
| 340 FILL_GIVEN_FUNC_PTRS( , 16, 4, generic); | |
| 341 FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic); | |
| 342 FILL_GIVEN_FUNC_PTRS( , 32, 2, generic); | |
| 343 FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic); | |
| 344 | |
| 345 FILL_GIVEN_FUNC_PTRS( , 8, 4, generic); | |
| 346 FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic); | |
| 347 FILL_GIVEN_FUNC_PTRS( , 16, 2, generic); | |
| 348 FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic); | |
| 349 | |
| 350 FILL_GIVEN_FUNC_PTRS( , 8, 2, generic); | |
| 351 FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic); | |
| 352 | 222 |
| 353 vec_init_spinner++; | 223 vec_init_spinner++; |
| 354 | 224 |
| 355 return 0; | 225 return 0; |
| 356 } | 226 } |
| 369 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 239 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 370 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 240 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 371 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 241 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 372 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 242 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 373 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 243 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 244 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \ | |
| 374 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 245 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 375 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 246 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 376 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 247 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 377 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 248 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 378 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | 249 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ |
| 379 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | 250 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
| 380 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | 251 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ |
| 381 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ | 252 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); |
| 382 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ | |
| 383 extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); | |
| 384 | 253 |
| 385 #define VEC_DEFINE_OPERATIONS(bits, size) \ | 254 #define VEC_DEFINE_OPERATIONS(bits, size) \ |
| 386 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ | 255 VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ |
| 387 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) | 256 VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) |
| 388 | 257 |
