diff src/vec.c @ 31:bf6ad516f1e6
Backed out changeset c6c99ab1088a
| author | Paper <paper@tflc.us> |
| --- | --- |
| date | Fri, 25 Apr 2025 17:40:33 -0400 |
| parents | 641d8c79b1da |
| children | 8b5e0974fd41 |
--- a/src/vec.c	Fri Apr 25 17:40:30 2025 -0400
+++ b/src/vec.c	Fri Apr 25 17:40:33 2025 -0400
@@ -32,27 +32,15 @@
 #ifdef VEC_COMPILER_HAS_SSE2
 # include "vec/impl/x86/sse2.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE3
-# include "vec/impl/x86/sse3.h"
-#endif
 #ifdef VEC_COMPILER_HAS_SSE41
 # include "vec/impl/x86/sse41.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE42
-# include "vec/impl/x86/sse42.h"
-#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 # include "vec/impl/x86/avx2.h"
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "vec/impl/x86/avx512f.h"
 #endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-# include "vec/impl/x86/avx512bw.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-# include "vec/impl/x86/avx512dq.h"
-#endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 # include "vec/impl/ppc/altivec.h"
 #endif
@@ -71,284 +59,166 @@
 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
 
 // 16-bit
-vint8x2_impl vint8x2_impl_cpu = {0};
-vuint8x2_impl vuint8x2_impl_cpu = {0};
+const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
+const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
 
 // 32-bit
-vint8x4_impl vint8x4_impl_cpu = {0};
-vuint8x4_impl vuint8x4_impl_cpu = {0};
-vint16x2_impl vint16x2_impl_cpu = {0};
-vuint16x2_impl vuint16x2_impl_cpu = {0};
+const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
+const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
+const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
+const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
 
 // 64-bit
-vint8x8_impl vint8x8_impl_cpu = {0};
-vuint8x8_impl vuint8x8_impl_cpu = {0};
-vint16x4_impl vint16x4_impl_cpu = {0};
-vuint16x4_impl vuint16x4_impl_cpu = {0};
-vint32x2_impl vint32x2_impl_cpu = {0};
-vuint32x2_impl vuint32x2_impl_cpu = {0};
+const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
+const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
+const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
+const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
+const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
+const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
 
 // 128-bit
-vint8x16_impl vint8x16_impl_cpu = {0};
-vuint8x16_impl vuint8x16_impl_cpu = {0};
-vint16x8_impl vint16x8_impl_cpu = {0};
-vuint16x8_impl vuint16x8_impl_cpu = {0};
-vint32x4_impl vint32x4_impl_cpu = {0};
-vuint32x4_impl vuint32x4_impl_cpu = {0};
-vint64x2_impl vint64x2_impl_cpu = {0};
-vuint64x2_impl vuint64x2_impl_cpu = {0};
+const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
+const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
+const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
+const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
+const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
+const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
+const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
+const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
 
 // 256-bit
-vint8x32_impl vint8x32_impl_cpu = {0};
-vuint8x32_impl vuint8x32_impl_cpu = {0};
-vint16x16_impl vint16x16_impl_cpu = {0};
-vuint16x16_impl vuint16x16_impl_cpu = {0};
-vint32x8_impl vint32x8_impl_cpu = {0};
-vuint32x8_impl vuint32x8_impl_cpu = {0};
-vint64x4_impl vint64x4_impl_cpu = {0};
-vuint64x4_impl vuint64x4_impl_cpu = {0};
+const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
+const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
+const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
+const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
+const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
+const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
+const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
+const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
 
 // 512-bit
-vint8x64_impl vint8x64_impl_cpu = {0};
-vuint8x64_impl vuint8x64_impl_cpu = {0};
-vint16x32_impl vint16x32_impl_cpu = {0};
-vuint16x32_impl vuint16x32_impl_cpu = {0};
-vint32x16_impl vint32x16_impl_cpu = {0};
-vuint32x16_impl vuint32x16_impl_cpu = {0};
-vint64x8_impl vint64x8_impl_cpu = {0};
-vuint64x8_impl vuint64x8_impl_cpu = {0};
+const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
+const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
+const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
+const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
+const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
+const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
+const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
+const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
 
 static int vec_init_spinner = 0;
 
-#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
-	do { \
-		if (!(cpu).func && (impl).func) \
-			(cpu).func = (impl).func; \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
-	do { \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
-	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
-
 // returns 0 or a negative error code on failure
 int vec_init(void)
 {
 	// This function is NOT thread safe. However, once vec
 	// is initialized, all of the vector functions are thread-safe.
+	//
+	// In fact, it's possible to use vec without calling
+	// vec_init() at all, but it would be completely useless since
+	// it would just use a generic implementation without any
+	// vectorization whatsoever (unless maybe the compiler is
+	// smart enough to optimize it into vectors)
 	if (vec_init_spinner)
 		return 0; // already initialized, do nothing
 
 	vec_uint32 cpu = vec_get_CPU_features();
 
-	/* Okay, this might be a little confusing:
-	 * The way we do this is because of x86. For weird reasons,
-	 * Intel decided to extend their prior CPU extensions to
-	 * where SSE4.1 has some extended features of SSE2, AVX2
-	 * has some extended features that should've been in SSE
-	 * in general, etc.
-	 *
-	 * For this, I've just decided to keep the function
-	 * definitions private, and fill in as we go, with newer
-	 * intrinsics preferred. Others are arbitrary and are
-	 * mutually exclusive (i.e. Altivec vs NEON). This is simply
-	 * the easiest way to go about it :) */
-
-	/* --- 512-bit */
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-	if (cpu & VEC_CPU_HAS_AVX512DQ) {
-		/* these give us native multiply instructions */
-		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
-	}
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+	if (cpu & VEC_CPU_HAS_ALTIVEC) {
+		vint8x16_impl_cpu = &vint8x16_impl_altivec;
+		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
+		vint16x8_impl_cpu = &vint16x8_impl_altivec;
+		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
+		vint32x4_impl_cpu = &vint32x4_impl_altivec;
+		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
+#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
+		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
+			vint64x2_impl_cpu = &vint64x2_impl_altivec;
+			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
+		}
 #endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-	if (cpu & VEC_CPU_HAS_AVX512BW) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 	if (cpu & VEC_CPU_HAS_AVX512F) {
-		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f);
-	}
-#endif
-
-	/* --- 256-bit */
-#ifdef VEC_COMPILER_HAS_AVX2
-	if (cpu & VEC_CPU_HAS_AVX2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2);
+		vint8x64_impl_cpu = &vint8x64_impl_avx512f;
+		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
+		vint16x32_impl_cpu = &vint16x32_impl_avx512f;
+		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
+		vint32x16_impl_cpu = &vint32x16_impl_avx512f;
+		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
+		vint64x8_impl_cpu = &vint64x8_impl_avx512f;
+		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
 	}
 #endif
-
-	/* --- 128-bit */
-#ifdef VEC_COMPILER_HAS_SSE42
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE41
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE3
-	if (cpu & VEC_CPU_HAS_SSE3) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
+#ifdef VEC_COMPILER_HAS_AVX2
+	if (cpu & VEC_CPU_HAS_AVX2) {
+		vint8x32_impl_cpu = &vint8x32_impl_avx2;
+		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
+		vint16x16_impl_cpu = &vint16x16_impl_avx2;
+		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
+		vint32x8_impl_cpu = &vint32x8_impl_avx2;
+		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
+		vint64x4_impl_cpu = &vint64x4_impl_avx2;
+		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 	if (cpu & VEC_CPU_HAS_SSE2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
+		vint8x16_impl_cpu = &vint8x16_impl_sse2;
+		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
+		vint16x8_impl_cpu = &vint16x8_impl_sse2;
+		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
+# ifdef VEC_COMPILER_HAS_SSE41
+		if (cpu & VEC_CPU_HAS_SSE41) {
+			vint32x4_impl_cpu = &vint32x4_impl_sse41;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
+		} else
+# endif
+		{
			vint32x4_impl_cpu = &vint32x4_impl_sse2;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
+		}
+		vint64x2_impl_cpu = &vint64x2_impl_sse2;
+		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
 	}
 #endif
-#ifdef VEC_COMPILER_HAS_NEON
-	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
-	}
-#endif
-
-	/* --- 64-bit */
 #ifdef VEC_COMPILER_HAS_MMX
 	if (cpu & VEC_CPU_HAS_MMX) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
+		vint8x8_impl_cpu = &vint8x8_impl_mmx;
+		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
+		vint16x4_impl_cpu = &vint16x4_impl_mmx;
+		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
+		vint32x2_impl_cpu = &vint32x2_impl_mmx;
+		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
 	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
+		// 64-bit
+		vint8x8_impl_cpu = &vint8x8_impl_neon;
+		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
+		vint16x4_impl_cpu = &vint16x4_impl_neon;
+		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
+		vint32x2_impl_cpu = &vint32x2_impl_neon;
+		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
+
+		// 128-bit
+		vint8x16_impl_cpu = &vint8x16_impl_neon;
+		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
+		vint16x8_impl_cpu = &vint16x8_impl_neon;
+		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
+		vint32x4_impl_cpu = &vint32x4_impl_neon;
+		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
+		vint64x2_impl_cpu = &vint64x2_impl_neon;
+		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
 	}
 #endif
-
-	/* fill any remaining function pointers with generics */
-	FILL_GIVEN_FUNC_PTRS( , 8, 64, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 32, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);
+	{
+		// do nothing, they're already set to generics
+	}
 
 	vec_init_spinner++;
 
@@ -371,6 +241,7 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
@@ -378,9 +249,7 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
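For readers following the backout: the `+` side restores vec's earlier dispatch scheme, where each `v*_impl_cpu` symbol is a pointer to a const impl struct that starts out pointing at the generic implementation and is only repointed in vec_init() when the corresponding CPU feature is detected. The sketch below is a minimal, self-contained illustration of that pattern under those assumptions; the names (v4add_impl, vec_impl_generic, vec_impl_sse2, cpu_has_sse2, demo_init) are made up for the example and are not part of vec's API.

/* Illustrative sketch only: v4add_impl, vec_impl_generic, vec_impl_sse2,
 * cpu_has_sse2() and demo_init() are hypothetical names, not vec's API. */
#include <stdint.h>

typedef struct {
	/* vec's real impl structs carry one pointer per operation
	 * (splat, load, store, add, ...); one entry is enough here */
	void (*add)(int32_t out[4], const int32_t a[4], const int32_t b[4]);
} v4add_impl;

static void add_generic(int32_t out[4], const int32_t a[4], const int32_t b[4])
{
	for (int i = 0; i < 4; i++)
		out[i] = a[i] + b[i];
}

static const v4add_impl vec_impl_generic = { add_generic };

#ifdef __SSE2__
# include <emmintrin.h>
static void add_sse2(int32_t out[4], const int32_t a[4], const int32_t b[4])
{
	_mm_storeu_si128((__m128i *)out,
		_mm_add_epi32(_mm_loadu_si128((const __m128i *)a),
		              _mm_loadu_si128((const __m128i *)b)));
}
static const v4add_impl vec_impl_sse2 = { add_sse2 };
#endif

/* like vint32x4_impl_cpu: defaults to the generic backend, so the code
 * still works even if the init function is never called */
static const v4add_impl *v4add_impl_cpu = &vec_impl_generic;

static int cpu_has_sse2(void)
{
	return 1; /* stand-in; a real check would query CPUID, as vec_get_CPU_features() does */
}

int demo_init(void)
{
#ifdef __SSE2__
	if (cpu_has_sse2())
		v4add_impl_cpu = &vec_impl_sse2; /* swap in the SIMD backend */
#endif
	return 0; /* mirrors vec_init(): 0 on success */
}

A caller would run demo_init() once at startup and afterwards always go through v4add_impl_cpu->add(out, a, b). Once initialization is done the pointer never changes, which lines up with the comment restored in vec_init() that the vector functions are thread-safe after the (non-thread-safe) init step.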