diff src/vec.c @ 28:c6c99ab1088a

*: add min/max functions and another big refactor (again); this time I added a few more implementations (and generally made the code a little faster...)
author Paper <paper@tflc.us>
date Thu, 24 Apr 2025 00:54:02 -0400
parents 92156fe32755
children e59c91d050c0
line wrap: on
line diff
--- a/src/vec.c	Mon Nov 25 00:33:02 2024 -0500
+++ b/src/vec.c	Thu Apr 24 00:54:02 2025 -0400
@@ -32,15 +32,27 @@
 #ifdef VEC_COMPILER_HAS_SSE2
 # include "vec/impl/x86/sse2.h"
 #endif
+#ifdef VEC_COMPILER_HAS_SSE3
+# include "vec/impl/x86/sse3.h"
+#endif
 #ifdef VEC_COMPILER_HAS_SSE41
 # include "vec/impl/x86/sse41.h"
 #endif
+#ifdef VEC_COMPILER_HAS_SSE42
+# include "vec/impl/x86/sse42.h"
+#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 # include "vec/impl/x86/avx2.h"
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "vec/impl/x86/avx512f.h"
 #endif
+#ifdef VEC_COMPILER_HAS_AVX512BW
+# include "vec/impl/x86/avx512bw.h"
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512DQ
+# include "vec/impl/x86/avx512dq.h"
+#endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 # include "vec/impl/ppc/altivec.h"
 #endif
@@ -59,166 +71,284 @@
 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
 
 // 16-bit
-const vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
-const vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;
+vint8x2_impl   vint8x2_impl_cpu   = {0};
+vuint8x2_impl  vuint8x2_impl_cpu  = {0};
 
 // 32-bit
-const vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
-const vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
-const vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
-const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
+vint8x4_impl   vint8x4_impl_cpu   = {0};
+vuint8x4_impl  vuint8x4_impl_cpu  = {0};
+vint16x2_impl  vint16x2_impl_cpu  = {0};
+vuint16x2_impl vuint16x2_impl_cpu = {0};
 
 // 64-bit
-const vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
-const vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
-const vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
-const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
-const vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
-const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
+vint8x8_impl   vint8x8_impl_cpu   = {0};
+vuint8x8_impl  vuint8x8_impl_cpu  = {0};
+vint16x4_impl  vint16x4_impl_cpu  = {0};
+vuint16x4_impl vuint16x4_impl_cpu = {0};
+vint32x2_impl  vint32x2_impl_cpu  = {0};
+vuint32x2_impl vuint32x2_impl_cpu = {0};
 
 // 128-bit
-const vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
-const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
-const vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
-const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
-const vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
-const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
-const vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
-const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
+vint8x16_impl  vint8x16_impl_cpu  = {0};
+vuint8x16_impl vuint8x16_impl_cpu = {0};
+vint16x8_impl  vint16x8_impl_cpu  = {0};
+vuint16x8_impl vuint16x8_impl_cpu = {0};
+vint32x4_impl  vint32x4_impl_cpu  = {0};
+vuint32x4_impl vuint32x4_impl_cpu = {0};
+vint64x2_impl  vint64x2_impl_cpu  = {0};
+vuint64x2_impl vuint64x2_impl_cpu = {0};
 
 // 256-bit
-const vint8x32_impl   *vint8x32_impl_cpu   = &vint8x32_impl_generic;
-const vuint8x32_impl  *vuint8x32_impl_cpu  = &vuint8x32_impl_generic;
-const vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
-const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
-const vint32x8_impl   *vint32x8_impl_cpu   = &vint32x8_impl_generic;
-const vuint32x8_impl  *vuint32x8_impl_cpu  = &vuint32x8_impl_generic;
-const vint64x4_impl   *vint64x4_impl_cpu   = &vint64x4_impl_generic;
-const vuint64x4_impl  *vuint64x4_impl_cpu  = &vuint64x4_impl_generic;
+vint8x32_impl   vint8x32_impl_cpu   = {0};
+vuint8x32_impl  vuint8x32_impl_cpu  = {0};
+vint16x16_impl  vint16x16_impl_cpu  = {0};
+vuint16x16_impl vuint16x16_impl_cpu = {0};
+vint32x8_impl   vint32x8_impl_cpu   = {0};
+vuint32x8_impl  vuint32x8_impl_cpu  = {0};
+vint64x4_impl   vint64x4_impl_cpu   = {0};
+vuint64x4_impl  vuint64x4_impl_cpu  = {0};
 
 // 512-bit
-const vint8x64_impl   *vint8x64_impl_cpu   = &vint8x64_impl_generic;
-const vuint8x64_impl  *vuint8x64_impl_cpu  = &vuint8x64_impl_generic;
-const vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
-const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
-const vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
-const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
-const vint64x8_impl   *vint64x8_impl_cpu   = &vint64x8_impl_generic;
-const vuint64x8_impl  *vuint64x8_impl_cpu  = &vuint64x8_impl_generic;
+vint8x64_impl   vint8x64_impl_cpu   = {0};
+vuint8x64_impl  vuint8x64_impl_cpu  = {0};
+vint16x32_impl  vint16x32_impl_cpu  = {0};
+vuint16x32_impl vuint16x32_impl_cpu = {0};
+vint32x16_impl  vint32x16_impl_cpu  = {0};
+vuint32x16_impl vuint32x16_impl_cpu = {0};
+vint64x8_impl   vint64x8_impl_cpu   = {0};
+vuint64x8_impl  vuint64x8_impl_cpu  = {0};
 
 static int vec_init_spinner = 0;
 
+#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
+	do { \
+		if (!(cpu).func && (impl).func) \
+			(cpu).func = (impl).func; \
+	} while (0)
+
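+/* FILL_GIVEN_FUNC_PTR copies a single function pointer from `impl' into
+ * the live table `cpu', but only when no earlier (preferred) pass has
+ * filled that slot yet and the implementation actually provides it;
+ * FILL_GIVEN_FUNC_PTRS_EX does this for every operation of one type. */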
+#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
+	do { \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
+		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
+	} while (0)
+
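+/* e.g. FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2) expands to
+ * FILL_GIVEN_FUNC_PTRS_EX(vuint8x16_impl_cpu, vuint8x16_impl_sse2) */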
+#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
+	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
+
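+/* A rough usage sketch (assuming the usual public wrappers declared by
+ * VEC_DEFINE_OPERATIONS at the bottom of this file, e.g. vint32x4_splat
+ * and vint32x4_min):
+ *
+ *     if (vec_init() < 0)
+ *         return -1;
+ *     vint32x4 v = vint32x4_min(vint32x4_splat(3), vint32x4_splat(5));
+ */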
 // returns 0 or a negative error code on failure
 int vec_init(void)
 {
 	// This function is NOT thread safe. However, once vec
 	// is initialized, all of the vector functions are thread-safe.
-	//
-	// In fact, it's possible to use vec without calling
-	// vec_init() at all, but it would be completely useless since
-	// it would just use a generic implementation without any
-	// vectorization whatsoever (unless maybe the compiler is
-	// smart enough to optimize it into vectors)
 
 	if (vec_init_spinner)
 		return 0; // already initialized, do nothing
 
 	vec_uint32 cpu = vec_get_CPU_features();
 
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
-		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
-		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
-		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
-		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
-		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
-			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
-			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
-		}
+	/* A note on how the tables get filled in, since it might be
+	 * a little confusing: this scheme exists mostly because of
+	 * x86, where each SIMD extension piggybacks on the previous
+	 * one -- SSE4.1 adds operations SSE2 arguably should have
+	 * had, AVX2 adds things that belonged in SSE in the first
+	 * place, and so on.
+	 *
+	 * So the per-type function tables are kept private, start
+	 * out empty, and each pass below fills in only the slots it
+	 * can, with newer intrinsics preferred (they run first).
+	 * The remaining backends (e.g. Altivec vs. NEON) are
+	 * mutually exclusive, so their order is arbitrary. This is
+	 * simply the easiest way to go about it :) */
+
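+	/* For example, on an SSE4.2-capable CPU the vint64x2 table will
+	 * likely take cmpgt from the sse42 backend (PCMPGTQ), most other
+	 * operations from sse41/sse2, and the generic pass at the end
+	 * picks up anything still left unset. */
+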
+	/* --- 512-bit */
+#ifdef VEC_COMPILER_HAS_AVX512DQ
+	if (cpu & VEC_CPU_HAS_AVX512DQ) {
+		/* these give us native multiply instructions */
+		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
+	}
 #endif
+#ifdef VEC_COMPILER_HAS_AVX512BW
+	if (cpu & VEC_CPU_HAS_AVX512BW) {
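+		/* AVX512BW adds the 8- and 16-bit element operations that
+		 * base AVX512F doesn't have */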
+		FILL_GIVEN_FUNC_PTRS( , 8,  64, avx512bw);
+		FILL_GIVEN_FUNC_PTRS(u, 8,  64, avx512bw);
+		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 	if (cpu & VEC_CPU_HAS_AVX512F) {
-		vint8x64_impl_cpu  = &vint8x64_impl_avx512f;
-		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
-		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
-		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
-		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
-		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
-		vint64x8_impl_cpu  = &vint64x8_impl_avx512f;
-		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
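+		/* base AVX512F only does 32- and 64-bit element sizes */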
+		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
+		FILL_GIVEN_FUNC_PTRS( , 64, 8,  avx512f);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 8,  avx512f);
+	}
+#endif
+
+	/* --- 256-bit */
+#ifdef VEC_COMPILER_HAS_AVX2
+	if (cpu & VEC_CPU_HAS_AVX2) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 32,  avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 32,  avx2);
+		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
+		FILL_GIVEN_FUNC_PTRS( , 32, 8,  avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 8,  avx2);
+		FILL_GIVEN_FUNC_PTRS( , 64, 4,  avx2);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 4,  avx2);
 	}
 #endif
-#ifdef VEC_COMPILER_HAS_AVX2
-	if (cpu & VEC_CPU_HAS_AVX2) {
-		vint8x32_impl_cpu  = &vint8x32_impl_avx2;
-		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
-		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
-		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
-		vint32x8_impl_cpu  = &vint32x8_impl_avx2;
-		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
-		vint64x4_impl_cpu  = &vint64x4_impl_avx2;
-		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
+
+	/* --- 128-bit */
+#ifdef VEC_COMPILER_HAS_SSE42
+	if (cpu & VEC_CPU_HAS_SSE42) {
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_SSE41
+	if (cpu & VEC_CPU_HAS_SSE41) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_SSE3
+	if (cpu & VEC_CPU_HAS_SSE3) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 	if (cpu & VEC_CPU_HAS_SSE2) {
-		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
-		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
-		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
-		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
-# ifdef VEC_COMPILER_HAS_SSE41
-		if (cpu & VEC_CPU_HAS_SSE41) {
-			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
-		} else
-# endif
-		{
-			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
-		}
-		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
-		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
 	}
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	if (cpu & VEC_CPU_HAS_NEON) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
+		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+	if (cpu & VEC_CPU_HAS_ALTIVEC) {
+		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
+		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
+		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
+	}
+#endif
+
+	/* --- 64-bit */
 #ifdef VEC_COMPILER_HAS_MMX
 	if (cpu & VEC_CPU_HAS_MMX) {
-		vint8x8_impl_cpu  = &vint8x8_impl_mmx;
-		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
-		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
-		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
-		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
-		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
+		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
+		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
+		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
 	if (cpu & VEC_CPU_HAS_NEON) {
-		// 64-bit
-		vint8x8_impl_cpu  = &vint8x8_impl_neon;
-		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
-		vint16x4_impl_cpu  = &vint16x4_impl_neon;
-		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
-		vint32x2_impl_cpu  = &vint32x2_impl_neon;
-		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
-
-		// 128-bit
-		vint8x16_impl_cpu  = &vint8x16_impl_neon;
-		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
-		vint16x8_impl_cpu  = &vint16x8_impl_neon;
-		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
-		vint32x4_impl_cpu  = &vint32x4_impl_neon;
-		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
-		vint64x2_impl_cpu  = &vint64x2_impl_neon;
-		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
+		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
+		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
+		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
+		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
 	}
 #endif
-	{
-		// do nothing, they're already set to generics
-	}
+
+	/* fill any remaining function pointers with generics */
+	FILL_GIVEN_FUNC_PTRS( , 8, 64,  generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 64,  generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
+	FILL_GIVEN_FUNC_PTRS( , 64, 8,  generic);
+	FILL_GIVEN_FUNC_PTRS(u, 64, 8,  generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 32,  generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 32,  generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 8,  generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 8,  generic);
+	FILL_GIVEN_FUNC_PTRS( , 64, 4,  generic);
+	FILL_GIVEN_FUNC_PTRS(u, 64, 4,  generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
+	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
+	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
+	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
+
+	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
+	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);
 
 	vec_init_spinner++;
 
@@ -241,7 +371,6 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
@@ -249,7 +378,9 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \