diff src/vec.c @ 31:bf6ad516f1e6

Backed out changeset c6c99ab1088a
author Paper <paper@tflc.us>
date Fri, 25 Apr 2025 17:40:33 -0400
parents 641d8c79b1da
children 8b5e0974fd41
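This backout restores the whole-table dispatch scheme: each vector type owns a single const pointer that selects an entire implementation table at once, statically initialized to the generic table so it is valid even before vec_init() runs. It replaces the removed per-function fill-in (the FILL_GIVEN_FUNC_PTR/FILL_GIVEN_FUNC_PTRS macros below), which layered newer x86 extensions' functions over an SSE2 base. A minimal sketch of the restored pattern, using hypothetical names (my_impl, my_init) rather than vec's actual tables:

	/* hypothetical operations table, mirroring vec's v*_impl structs */
	typedef struct {
		int (*add)(int, int);
	} my_impl;

	static int add_generic(int a, int b) { return a + b; } /* portable fallback */
	static int add_fast(int a, int b)    { return a + b; } /* stand-in for a SIMD version */

	static const my_impl my_impl_generic = { add_generic };
	static const my_impl my_impl_fast    = { add_fast };

	/* like v*_impl_cpu below: usable even before init, via the generic table */
	static const my_impl *my_impl_cpu = &my_impl_generic;

	static void my_init(int cpu_has_fast)
	{
		if (cpu_has_fast)
			my_impl_cpu = &my_impl_fast; /* swap the whole table at once */
	}

One consequence, visible in the new vec_init() below: because a table must carry every operation, an override has to be chosen up front (see the vint32x4 SSE4.1-vs-SSE2 branch) instead of patching individual SSE3/SSE4.2/AVX-512BW/AVX-512DQ functions over an SSE2 base, which is why those includes go away.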
--- a/src/vec.c	Fri Apr 25 17:40:30 2025 -0400
+++ b/src/vec.c	Fri Apr 25 17:40:33 2025 -0400
@@ -32,27 +32,15 @@
 #ifdef VEC_COMPILER_HAS_SSE2
 # include "vec/impl/x86/sse2.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE3
-# include "vec/impl/x86/sse3.h"
-#endif
 #ifdef VEC_COMPILER_HAS_SSE41
 # include "vec/impl/x86/sse41.h"
 #endif
-#ifdef VEC_COMPILER_HAS_SSE42
-# include "vec/impl/x86/sse42.h"
-#endif
 #ifdef VEC_COMPILER_HAS_AVX2
 # include "vec/impl/x86/avx2.h"
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "vec/impl/x86/avx512f.h"
 #endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-# include "vec/impl/x86/avx512bw.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-# include "vec/impl/x86/avx512dq.h"
-#endif
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 # include "vec/impl/ppc/altivec.h"
 #endif
@@ -71,284 +59,166 @@
 extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
 
 // 16-bit
-vint8x2_impl   vint8x2_impl_cpu   = {0};
-vuint8x2_impl  vuint8x2_impl_cpu  = {0};
+const vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
+const vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;
 
 // 32-bit
-vint8x4_impl   vint8x4_impl_cpu   = {0};
-vuint8x4_impl  vuint8x4_impl_cpu  = {0};
-vint16x2_impl  vint16x2_impl_cpu  = {0};
-vuint16x2_impl vuint16x2_impl_cpu = {0};
+const vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
+const vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
+const vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
+const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
 
 // 64-bit
-vint8x8_impl   vint8x8_impl_cpu   = {0};
-vuint8x8_impl  vuint8x8_impl_cpu  = {0};
-vint16x4_impl  vint16x4_impl_cpu  = {0};
-vuint16x4_impl vuint16x4_impl_cpu = {0};
-vint32x2_impl  vint32x2_impl_cpu  = {0};
-vuint32x2_impl vuint32x2_impl_cpu = {0};
+const vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
+const vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
+const vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
+const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
+const vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
+const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
 
 // 128-bit
-vint8x16_impl  vint8x16_impl_cpu  = {0};
-vuint8x16_impl vuint8x16_impl_cpu = {0};
-vint16x8_impl  vint16x8_impl_cpu  = {0};
-vuint16x8_impl vuint16x8_impl_cpu = {0};
-vint32x4_impl  vint32x4_impl_cpu  = {0};
-vuint32x4_impl vuint32x4_impl_cpu = {0};
-vint64x2_impl  vint64x2_impl_cpu  = {0};
-vuint64x2_impl vuint64x2_impl_cpu = {0};
+const vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
+const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
+const vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
+const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
+const vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
+const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
+const vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
+const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
 
 // 256-bit
-vint8x32_impl   vint8x32_impl_cpu   = {0};
-vuint8x32_impl  vuint8x32_impl_cpu  = {0};
-vint16x16_impl  vint16x16_impl_cpu  = {0};
-vuint16x16_impl vuint16x16_impl_cpu = {0};
-vint32x8_impl   vint32x8_impl_cpu   = {0};
-vuint32x8_impl  vuint32x8_impl_cpu  = {0};
-vint64x4_impl   vint64x4_impl_cpu   = {0};
-vuint64x4_impl  vuint64x4_impl_cpu  = {0};
+const vint8x32_impl   *vint8x32_impl_cpu   = &vint8x32_impl_generic;
+const vuint8x32_impl  *vuint8x32_impl_cpu  = &vuint8x32_impl_generic;
+const vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
+const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
+const vint32x8_impl   *vint32x8_impl_cpu   = &vint32x8_impl_generic;
+const vuint32x8_impl  *vuint32x8_impl_cpu  = &vuint32x8_impl_generic;
+const vint64x4_impl   *vint64x4_impl_cpu   = &vint64x4_impl_generic;
+const vuint64x4_impl  *vuint64x4_impl_cpu  = &vuint64x4_impl_generic;
 
 // 512-bit
-vint8x64_impl   vint8x64_impl_cpu   = {0};
-vuint8x64_impl  vuint8x64_impl_cpu  = {0};
-vint16x32_impl  vint16x32_impl_cpu  = {0};
-vuint16x32_impl vuint16x32_impl_cpu = {0};
-vint32x16_impl  vint32x16_impl_cpu  = {0};
-vuint32x16_impl vuint32x16_impl_cpu = {0};
-vint64x8_impl   vint64x8_impl_cpu   = {0};
-vuint64x8_impl  vuint64x8_impl_cpu  = {0};
+const vint8x64_impl   *vint8x64_impl_cpu   = &vint8x64_impl_generic;
+const vuint8x64_impl  *vuint8x64_impl_cpu  = &vuint8x64_impl_generic;
+const vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
+const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
+const vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
+const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
+const vint64x8_impl   *vint64x8_impl_cpu   = &vint64x8_impl_generic;
+const vuint64x8_impl  *vuint64x8_impl_cpu  = &vuint64x8_impl_generic;
 
 static int vec_init_spinner = 0;
 
-#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
-	do { \
-		if (!(cpu).func && (impl).func) \
-			(cpu).func = (impl).func; \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
-	do { \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
-		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
-	} while (0)
-
-#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
-	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
-
 // returns 0 or a negative error code on failure
 int vec_init(void)
 {
 	// This function is NOT thread safe. However, once vec
 	// is initialized, all of the vector functions are thread-safe.
+	//
+	// In fact, it's possible to use vec without calling
+	// vec_init() at all; everything will simply fall back to
+	// the generic implementations, with no runtime-selected
+	// vectorization (unless the compiler happens to be smart
+	// enough to auto-vectorize them)
 
 	if (vec_init_spinner)
 		return 0; // already initialized, do nothing
 
 	vec_uint32 cpu = vec_get_CPU_features();
 
-	/* Okay, this might be a little confusing:
-	 * The way we do this is because of x86. For weird reasons,
-	 * Intel decided to extend their prior CPU extensions to
-	 * where SSE4.1 has some extended features of SSE2, AVX2
-	 * has some extended features that should've been in SSE
-	 * in general, etc.
-	 *
-	 * For this, I've just decided to keep the function
-	 * definitions private, and fill in as we go, with newer
-	 * intrinsics preferred. Others are arbitrary and are
-	 * mutually exclusive (i.e. Altivec vs NEON). This is simply
-	 * the easiest way to go about it :) */
-
-	/* --- 512-bit */
-#ifdef VEC_COMPILER_HAS_AVX512DQ
-	if (cpu & VEC_CPU_HAS_AVX512DQ) {
-		/* these give us native multiply instructions */
-		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
-	}
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+	if (cpu & VEC_CPU_HAS_ALTIVEC) {
+		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
+		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
+		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
+		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
+		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
+		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
+#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
+		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
+			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
+			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
+		}
 #endif
-#ifdef VEC_COMPILER_HAS_AVX512BW
-	if (cpu & VEC_CPU_HAS_AVX512BW) {
-		FILL_GIVEN_FUNC_PTRS( , 8,  64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 8,  64, avx512bw);
-		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_AVX512F
 	if (cpu & VEC_CPU_HAS_AVX512F) {
-		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
-		FILL_GIVEN_FUNC_PTRS( , 64, 8,  avx512f);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 8,  avx512f);
-	}
-#endif
-
-	/* --- 256-bit */
-#ifdef VEC_COMPILER_HAS_AVX2
-	if (cpu & VEC_CPU_HAS_AVX2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 32,  avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 32,  avx2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 8,  avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 8,  avx2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 4,  avx2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 4,  avx2);
+		vint8x64_impl_cpu   = &vint8x64_impl_avx512f;
+		vuint8x64_impl_cpu  = &vuint8x64_impl_avx512f;
+		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
+		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
+		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
+		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
+		vint64x8_impl_cpu   = &vint64x8_impl_avx512f;
+		vuint64x8_impl_cpu  = &vuint64x8_impl_avx512f;
 	}
 #endif
-
-	/* --- 128-bit */
-#ifdef VEC_COMPILER_HAS_SSE42
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE41
-	if (cpu & VEC_CPU_HAS_SSE41) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE3
-	if (cpu & VEC_CPU_HAS_SSE3) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
+#ifdef VEC_COMPILER_HAS_AVX2
+	if (cpu & VEC_CPU_HAS_AVX2) {
+		vint8x32_impl_cpu   = &vint8x32_impl_avx2;
+		vuint8x32_impl_cpu  = &vuint8x32_impl_avx2;
+		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
+		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
+		vint32x8_impl_cpu   = &vint32x8_impl_avx2;
+		vuint32x8_impl_cpu  = &vuint32x8_impl_avx2;
+		vint64x4_impl_cpu   = &vint64x4_impl_avx2;
+		vuint64x4_impl_cpu  = &vuint64x4_impl_avx2;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_SSE2
 	if (cpu & VEC_CPU_HAS_SSE2) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
+		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
+		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
+		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
+		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
+# ifdef VEC_COMPILER_HAS_SSE41
+		if (cpu & VEC_CPU_HAS_SSE41) {
+			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
+		} else
+# endif
+		{
+			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
+		}
+		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
+		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
 	}
 #endif
-#ifdef VEC_COMPILER_HAS_NEON
-	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
-		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
-	}
-#endif
-
-	/* --- 64-bit */
 #ifdef VEC_COMPILER_HAS_MMX
 	if (cpu & VEC_CPU_HAS_MMX) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
+		vint8x8_impl_cpu   = &vint8x8_impl_mmx;
+		vuint8x8_impl_cpu  = &vuint8x8_impl_mmx;
+		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
+		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
+		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
+		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
 	}
 #endif
 #ifdef VEC_COMPILER_HAS_NEON
 	if (cpu & VEC_CPU_HAS_NEON) {
-		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
-		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
-		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
-		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
+		// 64-bit
+		vint8x8_impl_cpu   = &vint8x8_impl_neon;
+		vuint8x8_impl_cpu  = &vuint8x8_impl_neon;
+		vint16x4_impl_cpu  = &vint16x4_impl_neon;
+		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
+		vint32x2_impl_cpu  = &vint32x2_impl_neon;
+		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
+
+		// 128-bit
+		vint8x16_impl_cpu  = &vint8x16_impl_neon;
+		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
+		vint16x8_impl_cpu  = &vint16x8_impl_neon;
+		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
+		vint32x4_impl_cpu  = &vint32x4_impl_neon;
+		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
+		vint64x2_impl_cpu  = &vint64x2_impl_neon;
+		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
 	}
 #endif
-
-	/* fill any remaining function pointers with generics */
-	FILL_GIVEN_FUNC_PTRS( , 8, 64,  generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 64,  generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 8,  generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 8,  generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 32,  generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 32,  generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 8,  generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 8,  generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 4,  generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 4,  generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
-	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
-
-	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
-	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);
+	{
+		// nothing to do; the rest already point at the generics
+	}
 
 	vec_init_spinner++;
 
@@ -371,6 +241,7 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
@@ -378,9 +249,7 @@
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
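
For completeness, a sketch of how the selected tables get used. Per the comment in the new vec_init(), calling it is optional (the generics work without it). The arithmetic call matches the extern inline declarations above (two vectors in, one vector out); the splat/store signatures, the vec_int32 element type, and the "vec/vec.h" header path are assumptions for illustration:

	#include "vec/vec.h" /* assumed public header path */

	int main(void)
	{
		vec_init(); /* selects SSE2/AVX2/NEON/AltiVec tables when available */

		/* assumed signature: splat builds a vector from one scalar */
		vint32x4 a = vint32x4_splat(2);
		vint32x4 b = vint32x4_splat(3);
		vint32x4 c = vint32x4_add(a, b); /* matches the declared two-in, one-out shape */

		vec_int32 out[4];        /* hypothetical element type name */
		vint32x4_store(c, out);  /* assumed store-to-array signature */
		return 0;
	}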