diff src/vec.c @ 28:c6c99ab1088a
*: add min/max functions and a big big refactor (again)
agh, this time I added a few more implementations (and generally
made the code just a little faster...)
| author | Paper <paper@tflc.us> |
|---|---|
| date | Thu, 24 Apr 2025 00:54:02 -0400 |
| parents | 92156fe32755 |
| children | e59c91d050c0 |
line diff
--- a/src/vec.c	Mon Nov 25 00:33:02 2024 -0500
+++ b/src/vec.c	Thu Apr 24 00:54:02 2025 -0400
@@ -32,15 +32,27 @@
 #ifdef VEC_COMPILER_HAS_SSE2
 # include "vec/impl/x86/sse2.h"
 #endif
+#ifdef VEC_COMPILER_HAS_SSE3
+# include "vec/impl/x86/sse3.h"
+#endif
 #ifdef VEC_COMPILER_HAS_SSE41
 # include "vec/impl/x86/sse41.h"
 #endif
+#ifdef VEC_COMPILER_HAS_SSE42
+# include "vec/impl/x86/sse42.h"
+#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 # include "vec/impl/x86/avx2.h"
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "vec/impl/x86/avx512f.h"
 #endif
+#ifdef VEC_COMPILER_HAS_AVX512BW
+# include "vec/impl/x86/avx512bw.h"
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512DQ
+# include "vec/impl/x86/avx512dq.h"
+#endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 # include "vec/impl/ppc/altivec.h"
 #endif
@@ -59,166 +71,284 @@
 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
 
 // 16-bit
-const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
-const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
+vint8x2_impl vint8x2_impl_cpu = {0};
+vuint8x2_impl vuint8x2_impl_cpu = {0};
 
 // 32-bit
-const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
-const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
-const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
-const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
+vint8x4_impl vint8x4_impl_cpu = {0};
+vuint8x4_impl vuint8x4_impl_cpu = {0};
+vint16x2_impl vint16x2_impl_cpu = {0};
+vuint16x2_impl vuint16x2_impl_cpu = {0};
 
 // 64-bit
-const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
-const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
-const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
-const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
-const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
-const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
+vint8x8_impl vint8x8_impl_cpu = {0};
+vuint8x8_impl vuint8x8_impl_cpu = {0};
+vint16x4_impl vint16x4_impl_cpu = {0};
+vuint16x4_impl vuint16x4_impl_cpu = {0};
+vint32x2_impl vint32x2_impl_cpu = {0};
+vuint32x2_impl vuint32x2_impl_cpu = {0};
 
 // 128-bit
-const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
-const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
-const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
-const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
-const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
-const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
-const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
-const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
+vint8x16_impl vint8x16_impl_cpu = {0};
+vuint8x16_impl vuint8x16_impl_cpu = {0};
+vint16x8_impl vint16x8_impl_cpu = {0};
+vuint16x8_impl vuint16x8_impl_cpu = {0};
+vint32x4_impl vint32x4_impl_cpu = {0};
+vuint32x4_impl vuint32x4_impl_cpu = {0};
+vint64x2_impl vint64x2_impl_cpu = {0};
+vuint64x2_impl vuint64x2_impl_cpu = {0};
 
 // 256-bit
-const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
-const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
-const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
-const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
-const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
-const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
-const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
-const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
+vint8x32_impl vint8x32_impl_cpu = {0};
+vuint8x32_impl vuint8x32_impl_cpu = {0};
+vint16x16_impl vint16x16_impl_cpu = {0};
+vuint16x16_impl vuint16x16_impl_cpu = {0};
+vint32x8_impl vint32x8_impl_cpu = {0};
+vuint32x8_impl vuint32x8_impl_cpu = {0};
+vint64x4_impl vint64x4_impl_cpu = {0};
+vuint64x4_impl vuint64x4_impl_cpu = {0};
 
 // 512-bit
-const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
-const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
-const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
-const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
-const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
-const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
-const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
-const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
+vint8x64_impl vint8x64_impl_cpu = {0};
+vuint8x64_impl vuint8x64_impl_cpu = {0};
+vint16x32_impl vint16x32_impl_cpu = {0};
+vuint16x32_impl vuint16x32_impl_cpu = {0};
+vint32x16_impl vint32x16_impl_cpu = {0};
+vuint32x16_impl vuint32x16_impl_cpu = {0};
+vint64x8_impl vint64x8_impl_cpu = {0};
+vuint64x8_impl vuint64x8_impl_cpu = {0};
 
 static int vec_init_spinner = 0;
 
+#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
+    do { \
+        if (!(cpu).func && (impl).func) \
+            (cpu).func = (impl).func; \
+    } while (0)
+
+#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
+    do { \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
+        FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
+    } while (0)
+
+#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
+    FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
+
 // returns 0 or a negative error code on failure
 int vec_init(void)
 {
     // This function is NOT thread safe. However, once vec
     // is initialized, all of the vector functions are thread-safe.
-    //
-    // In fact, it's possible to use vec without calling
-    // vec_init() at all, but it would be completely useless since
-    // it would just use a generic implementation without any
-    // vectorization whatsoever (unless maybe the compiler is
-    // smart enough to optimize it into vectors)
     if (vec_init_spinner)
         return 0; // already initialized, do nothing
 
     vec_uint32 cpu = vec_get_CPU_features();
 
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-    if (cpu & VEC_CPU_HAS_ALTIVEC) {
-        vint8x16_impl_cpu = &vint8x16_impl_altivec;
-        vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
-        vint16x8_impl_cpu = &vint16x8_impl_altivec;
-        vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
-        vint32x4_impl_cpu = &vint32x4_impl_altivec;
-        vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-        if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
-            vint64x2_impl_cpu = &vint64x2_impl_altivec;
-            vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
-        }
+    /* Okay, this might be a little confusing:
+     * The way we do this is because of x86. For weird reasons,
+     * Intel decided to extend their prior CPU extensions to
+     * where SSE4.1 has some extended features of SSE2, AVX2
+     * has some extended features that should've been in SSE
+     * in general, etc.
+     *
+     * For this, I've just decided to keep the function
+     * definitions private, and fill in as we go, with newer
+     * intrinsics preferred. Others are arbitrary and are
+     * mutually exclusive (i.e. Altivec vs NEON). This is simply
+     * the easiest way to go about it :) */
+
+    /* --- 512-bit */
+#ifdef VEC_COMPILER_HAS_AVX512DQ
+    if (cpu & VEC_CPU_HAS_AVX512DQ) {
+        /* these give us native multiply instructions */
+        FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
+    }
 #endif
+#ifdef VEC_COMPILER_HAS_AVX512BW
+    if (cpu & VEC_CPU_HAS_AVX512BW) {
+        FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw);
+        FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
     }
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
     if (cpu & VEC_CPU_HAS_AVX512F) {
-        vint8x64_impl_cpu = &vint8x64_impl_avx512f;
-        vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
-        vint16x32_impl_cpu = &vint16x32_impl_avx512f;
-        vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
-        vint32x16_impl_cpu = &vint32x16_impl_avx512f;
-        vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
-        vint64x8_impl_cpu = &vint64x8_impl_avx512f;
-        vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
+        FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
+        FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f);
+    }
+#endif
+
+    /* --- 256-bit */
+#ifdef VEC_COMPILER_HAS_AVX2
+    if (cpu & VEC_CPU_HAS_AVX2) {
+        FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2);
+        FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
+        FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2);
+        FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2);
+    }
 #endif
-#ifdef VEC_COMPILER_HAS_AVX2
-    if (cpu & VEC_CPU_HAS_AVX2) {
-        vint8x32_impl_cpu = &vint8x32_impl_avx2;
-        vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
-        vint16x16_impl_cpu = &vint16x16_impl_avx2;
-        vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
-        vint32x8_impl_cpu = &vint32x8_impl_avx2;
-        vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
-        vint64x4_impl_cpu = &vint64x4_impl_avx2;
-        vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
+
+    /* --- 128-bit */
+#ifdef VEC_COMPILER_HAS_SSE42
+    if (cpu & VEC_CPU_HAS_SSE41) {
+        FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
+    }
+#endif
+#ifdef VEC_COMPILER_HAS_SSE41
+    if (cpu & VEC_CPU_HAS_SSE41) {
+        FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
+        FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
+        FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
+        FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
+    }
+#endif
+#ifdef VEC_COMPILER_HAS_SSE3
+    if (cpu & VEC_CPU_HAS_SSE3) {
+        FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
+        FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
+        FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
+        FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
     }
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
     if (cpu & VEC_CPU_HAS_SSE2) {
-        vint8x16_impl_cpu = &vint8x16_impl_sse2;
-        vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
-        vint16x8_impl_cpu = &vint16x8_impl_sse2;
-        vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
-# ifdef VEC_COMPILER_HAS_SSE41
-        if (cpu & VEC_CPU_HAS_SSE41) {
-            vint32x4_impl_cpu = &vint32x4_impl_sse41;
-            vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
-        } else
-# endif
-        {
-            vint32x4_impl_cpu = &vint32x4_impl_sse2;
-            vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
-        }
-        vint64x2_impl_cpu = &vint64x2_impl_sse2;
-        vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
+        FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
+        FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
+        FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
+        FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
     }
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+    if (cpu & VEC_CPU_HAS_NEON) {
+        FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
+        FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
+        FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
+        FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
+        FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
+    }
+#endif
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+    if (cpu & VEC_CPU_HAS_ALTIVEC) {
+        FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
+        FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
+        FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
+    }
+#endif
+
+    /* --- 64-bit */
 #ifdef VEC_COMPILER_HAS_MMX
     if (cpu & VEC_CPU_HAS_MMX) {
-        vint8x8_impl_cpu = &vint8x8_impl_mmx;
-        vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
-        vint16x4_impl_cpu = &vint16x4_impl_mmx;
-        vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
-        vint32x2_impl_cpu = &vint32x2_impl_mmx;
-        vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
+        FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
+        FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
+        FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
     }
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
     if (cpu & VEC_CPU_HAS_NEON) {
-        // 64-bit
-        vint8x8_impl_cpu = &vint8x8_impl_neon;
-        vuint8x8_impl_cpu = &vuint8x8_impl_neon;
-        vint16x4_impl_cpu = &vint16x4_impl_neon;
-        vuint16x4_impl_cpu = &vuint16x4_impl_neon;
-        vint32x2_impl_cpu = &vint32x2_impl_neon;
-        vuint32x2_impl_cpu = &vuint32x2_impl_neon;
-
-        // 128-bit
-        vint8x16_impl_cpu = &vint8x16_impl_neon;
-        vuint8x16_impl_cpu = &vuint8x16_impl_neon;
-        vint16x8_impl_cpu = &vint16x8_impl_neon;
-        vuint16x8_impl_cpu = &vuint16x8_impl_neon;
-        vint32x4_impl_cpu = &vint32x4_impl_neon;
-        vuint32x4_impl_cpu = &vuint32x4_impl_neon;
-        vint64x2_impl_cpu = &vint64x2_impl_neon;
-        vuint64x2_impl_cpu = &vuint64x2_impl_neon;
+        FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
+        FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
+        FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
+        FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
+        FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
+        FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
     }
 #endif
-    {
-        // do nothing, they're already set to generics
-    }
+
+    /* fill any remaining function pointers with generics */
+    FILL_GIVEN_FUNC_PTRS( , 8, 64, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic);
+    FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
+    FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
+    FILL_GIVEN_FUNC_PTRS( , 64, 8, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic);
+
+    FILL_GIVEN_FUNC_PTRS( , 8, 32, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic);
+    FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
+    FILL_GIVEN_FUNC_PTRS( , 32, 8, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic);
+    FILL_GIVEN_FUNC_PTRS( , 64, 4, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic);
+
+    FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
+    FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
+    FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
+    FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
+
+    FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
+    FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
+    FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
+
+    FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
+    FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
+
+    FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
+    FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);
 
     vec_init_spinner++;
 
@@ -241,7 +371,6 @@
     extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
     extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
     extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
@@ -249,7 +378,9 @@
    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
+    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+    extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
     VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
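The heart of the refactor is the FILL_GIVEN_FUNC_PTR machinery above: each vector type now has a mutable table of per-operation function pointers that starts out all-NULL, and vec_init() walks the ISA extensions from most specific (AVX-512DQ, SSE4.2, ...) down to the generic fallback, filling only the slots that are still empty. FILL_GIVEN_FUNC_PTRS just token-pastes the sign/bits/size arguments to pick the right table, so FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2) fills vuint8x16_impl_cpu from vuint8x16_impl_sse2. The sketch below is a minimal standalone illustration of that fill-in pattern; toy_impl, TOY_FILL, fancy_min and so on are made-up stand-ins for illustration, not vec's real types or macros.

```c
#include <stdio.h>

/* Toy stand-in for one of the per-width vtables (e.g. vint32x4_impl). */
typedef struct {
    int (*add)(int, int);
    int (*min)(int, int);
    int (*max)(int, int);
} toy_impl;

static int generic_add(int a, int b) { return a + b; }
static int generic_min(int a, int b) { return a < b ? a : b; }
static int generic_max(int a, int b) { return a > b ? a : b; }
static int fancy_min(int a, int b)   { return a < b ? a : b; } /* pretend this is the SSE4.1 version */

/* Same idea as FILL_GIVEN_FUNC_PTR: only take the candidate if the slot
 * is still empty and the candidate actually provides that operation. */
#define TOY_FILL(cpu, impl, func) \
    do { \
        if (!(cpu).func && (impl).func) \
            (cpu).func = (impl).func; \
    } while (0)

int main(void)
{
    toy_impl cpu_impl = {0};                  /* starts all-NULL, like vint32x4_impl_cpu */
    toy_impl fancy    = { .min = fancy_min }; /* partial table from a newer extension */
    toy_impl generic  = { generic_add, generic_min, generic_max };

    /* newest first... */
    TOY_FILL(cpu_impl, fancy, add);
    TOY_FILL(cpu_impl, fancy, min);
    TOY_FILL(cpu_impl, fancy, max);
    /* ...generic last, so it only supplies what is still missing */
    TOY_FILL(cpu_impl, generic, add);
    TOY_FILL(cpu_impl, generic, min);
    TOY_FILL(cpu_impl, generic, max);

    printf("min(3, 7) = %d, add(3, 7) = %d\n",
           cpu_impl.min(3, 7), cpu_impl.add(3, 7));
    return 0;
}
```

Because the generic table is applied last, it only ever supplies operations no detected extension claimed, which mirrors how the SSE4.1-over-SSE2 style layering in the diff resolves.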
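The new _min/_max operations appear in the VEC_DEFINE_OPERATIONS_SIGN block with the same two-vector signature as the other element-wise operations. A hedged usage sketch follows, assuming the public header is "vec/vec.h" and that splat/store wrappers exist with the names and shapes shown; only vec_init(), vint32x4_min() and vint32x4_max() are taken directly from this changeset, so check the real headers for exact names and signatures.

```c
#include <stdio.h>

#include "vec/vec.h" /* assumed header path */

int main(void)
{
    if (vec_init() < 0) /* "returns 0 or a negative error code on failure" */
        return 1;

    vint32x4 a = vint32x4_splat(3); /* assumed wrapper: broadcast 3 to all 4 lanes */
    vint32x4 b = vint32x4_splat(7);

    vint32x4 lo = vint32x4_min(a, b); /* per-lane minimum, added in this changeset */
    vint32x4 hi = vint32x4_max(a, b); /* per-lane maximum, added in this changeset */

    int out[4];              /* the real element type may be vec_int32 rather than int */
    vint32x4_store(lo, out); /* assumed wrapper: write the 4 lanes out */
    printf("min lane 0 = %d\n", out[0]);

    vint32x4_store(hi, out);
    printf("max lane 0 = %d\n", out[0]);

    return 0;
}
```

Note that with this changeset the dispatch tables start out zeroed, so vec_init() appears to be required before any vector operation is used; the removed comment about skipping it no longer applies.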