changeset 15:e05c257c6a23

*: huge refactor; add many new x86 intrinsics and the like. ALSO!! intrinsics are now enabled at runtime, depending on what is detected. altivec *should* still work, but I only tested compiling it. the major version has been bumped to 2.0 for this...
author Paper <paper@tflc.us>
date Wed, 20 Nov 2024 04:10:37 -0500
parents 981cf0bc7f3a
children 9da2aba90c87
files CMakeLists.txt README include/vec/impl/altivec.h include/vec/impl/cpu.h include/vec/impl/fallback.h include/vec/impl/gcc.h include/vec/impl/generic.h include/vec/impl/ppc/altivec.h include/vec/impl/sse2.h include/vec/impl/x86/avx2.h include/vec/impl/x86/avx512f.h include/vec/impl/x86/mmx.h include/vec/impl/x86/sse2.h include/vec/impl/x86/sse41.h include/vec/vec.h src/vec.c test/Makefile test/Makefile.ppc test/Makefile.template test/Makefile.x86 test/main.c test/test.c test/test_align.h test/test_arith.h test/test_compare.h
diffstat 25 files changed, 3008 insertions(+), 2360 deletions(-)
--- a/CMakeLists.txt	Tue Nov 19 15:55:01 2024 -0500
+++ b/CMakeLists.txt	Wed Nov 20 04:10:37 2024 -0500
@@ -1,10 +1,42 @@
 cmake_minimum_required(VERSION 3.5)
 
-project(vec VERSION 1.0.0 DESCRIPTION "a tiny C99 SIMD vector library")
+project(vec VERSION 2.0.0 DESCRIPTION "a tiny C99 SIMD vector library")
 
 add_library(vec SHARED src/vec.c)
 
-set_target_properties(vec PROPERTIES PUBLIC_HEADER include/vec/vec.h)
+include(CheckCCompilerFlag)
+
+if(MSVC)
+	# TODO ?
+else()
+	check_c_compiler_flag("-maltivec" COMPILER_HAS_ALTIVEC)
+	if(COMPILER_HAS_ALTIVEC)
+		target_compile_options(vec PRIVATE "-maltivec")
+	endif()
+	check_c_compiler_flag("-mmmx" COMPILER_HAS_MMX)
+	if(COMPILER_HAS_MMX)
+		target_compile_options(vec PRIVATE "-mmmx")
+	endif()
+	check_c_compiler_flag("-msse2" COMPILER_HAS_SSE2)
+	if(COMPILER_HAS_SSE2)
+		target_compile_options(vec PRIVATE "-msse2")
+	endif()
+	check_c_compiler_flag("-msse4.1" COMPILER_HAS_SSE41)
+	if(COMPILER_HAS_SSE41)
+		target_compile_options(vec PRIVATE "-msse4.1")
+	endif()
+	check_c_compiler_flag("-mavx2" COMPILER_HAS_AVX2)
+	if(COMPILER_HAS_AVX2)
+		target_compile_options(vec PRIVATE "-mavx2")
+	endif()
+	check_c_compiler_flag("-mavx512f" COMPILER_HAS_AVX512F)
+	if(COMPILER_HAS_AVX512F)
+		target_compile_options(vec PRIVATE "-mavx512f")
+	endif()
+endif()
+
+
+set_target_properties(vec PROPERTIES PUBLIC_HEADER include/vec/vec.h C_STANDARD 99)
 
 target_include_directories(vec PRIVATE include)
 
@@ -13,9 +45,9 @@
 include(GNUInstallDirs)
 
 install(TARGETS vec
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+	LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+	PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 # pkg-config
 configure_file(vec.pc.in vec.pc @ONLY)
-install(FILES ${CMAKE_BINARY_DIR}/vec.pc DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig)
\ No newline at end of file
+install(FILES ${CMAKE_BINARY_DIR}/vec.pc DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig)
--- a/README	Tue Nov 19 15:55:01 2024 -0500
+++ b/README	Wed Nov 20 04:10:37 2024 -0500
@@ -1,19 +1,19 @@
 vec - a tiny SIMD vector header-only library written in C99
 
-it comes with an extremely basic (and somewhat lacking) API,
-where there are eight supported vector types, all 128-bit:
+it comes with an extremely basic API that is similar to other intrinsics
+libraries; every vector type follows the same naming format:
 
-	vint8x16  - 16 signed 8-bit integers
-	vint16x8  - 8 signed 16-bit integers
-	vint32x4  - 4 signed 32-bit integers
-	vint64x2  - 2 signed 64-bit integers
-	vuint8x16 - 16 unsigned 8-bit integers
-	vuint16x8 - 8 unsigned 16-bit integers
-	vuint32x4 - 4 unsigned 32-bit integers
-	vuint32x4 - 2 unsigned 64-bit integers
+	v[sign][bits]x[size]
+		where `sign' is either nothing (for signed) or `u' (for unsigned),
+		`bits' is the bit size of the integer format,
+		and `size' is how many integers are in the vector
 
-all of these have many operations that are prefixed with the
-name of the type and an underscore, for example:
+vec provides types for 64-bit, 128-bit, 256-bit, and 512-bit SIMD intrinsics
+on processors where vec has an implementation, and falls back to array-based
+implementations where it does not.
+
+all of these have many operations that are prefixed with the name of the
+type and an underscore, for example:
 
 	vint8x16 vint8x16_splat(uint8_t x)
 	- creates a vint8x16 where all of the values are filled
@@ -106,3 +106,10 @@
 		the result vector if the value in `vec1' is greater
 		than or equal to the corresponding value in `vec2',
 		else all of the bits are turned off.
+
+to initialize vec, you MUST call `vec_init()' when your program starts up.
+
+note that `vec_init()' is NOT thread-safe; things can and will blow up
+if you call it simultaneously from different threads (for example, if you
+only initialize it lazily, right when you need it). please just initialize
+it on startup so you don't have to worry about that!
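
a minimal usage sketch of the API the README above describes, assuming the
header is included as <vec/vec.h> and that the unaligned store keeps the
`vint32x4_store(vec, out)' shape used by the fallback code later in this
changeset:

	#include <stdint.h>
	#include <vec/vec.h>

	int main(void)
	{
		vec_init(); /* must run once, at startup, before any other vec call */

		vint32x4 a = vint32x4_splat(2);
		vint32x4 b = vint32x4_splat(3);
		vint32x4 c = vint32x4_add(a, b);   /* element-wise add: {5, 5, 5, 5} */

		int32_t out[4];
		vint32x4_store(c, out);            /* write the lanes back to memory */
		return 0;
	}
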
--- a/include/vec/impl/altivec.h	Tue Nov 19 15:55:01 2024 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,219 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in plain C99
- * 
- * Copyright (c) 2024 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-/* Altivec vector support. */
-
-#include <stdint.h>
-#include <string.h>
-
-#include <altivec.h>
-
-#define VEC_ALTIVEC_ALIGNMENT 16
-
-/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */
-#ifdef vec_mul
-# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \
-	VEC_DECL_MUL(sign, csign, bits, size) \
-	{ \
-		return vec_mul(vec1, vec2); \
-	}
-#else
-# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \
-	VEC_GENERIC_MULTIPLY(sign, csign, bits, size)
-#endif
-
-#ifdef vec_splats
-# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \
-	VEC_DECL_SPLAT(sign, bits, size) \
-	{ \
-		return vec_splats(x); \
-	}
-#else
-# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \
-	VEC_GENERIC_SPLAT(sign, csign, bits, size)
-#endif
-
-#define VEC_ALTIVEC_uRSHIFT vec_sr
-#define VEC_ALTIVEC_RSHIFT vec_sra
-
-#define VEC_ALTIVEC_uLRSHIFT(sign, csign, bits, size) \
-	VEC_DECL_SHIFT(sign, bits, size, l, r) \
-	{ \
-		return vec_sr(vec1, vec2); \
-	}
-#define VEC_ALTIVEC_LRSHIFT(sign, csign, bits, size) \
-	VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r)
-
-/* Since altivec conveniently made their API super user friendly, we can just use
- * one giant macro to define literally everything */
-#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
-	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
-	{ \
-		return vec_ld(0, in); \
-	} \
-	\
-	VEC_DECL_LOAD(sign, bits, size) \
-	{ \
-		return vec_perm(vec_ld(0, in), vec_ld(VEC_ALTIVEC_ALIGNMENT, in), vec_lvsl(0, in)); \
-	} \
-	\
-	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
-	{ \
-		vec_st(vec, 0, out); \
-	} \
-	\
-	VEC_DECL_STORE(sign, bits, size) \
-	{ \
-		VEC_ALIGNED_ARRAY(sign##int##bits##_t, aligned_out, size, VEC_ALTIVEC_ALIGNMENT); \
-		vec_st(vec, 0, aligned_out); \
-		memcpy(out, aligned_out, size * sizeof(*aligned_out)); \
-	} \
-	\
-	VEC_DECL_ADD(sign, bits, size) \
-	{ \
-		return vec_add(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_SUB(sign, bits, size) \
-	{ \
-		return vec_sub(vec1, vec2); \
-	} \
-	\
-	VEC_ALTIVEC_MUL(sign, csign, bits, size) \
-	\
-	VEC_DECL_SHIFT(sign, bits, size, , l) \
-	{ \
-		return vec_sl(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_SHIFT(sign, bits, size, , r) \
-	{ \
-		return VEC_ALTIVEC_##sign##RSHIFT(vec1, vec2); \
-	} \
-	\
-	VEC_ALTIVEC_##sign##LRSHIFT(sign, csign, bits, size) \
-	\
-	VEC_DECL_AVG(sign, bits, size) \
-	{ \
-		return vec_avg(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_AND(sign, bits, size) \
-	{ \
-		return vec_and(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_OR(sign, bits, size) \
-	{ \
-		return vec_or(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_XOR(sign, bits, size) \
-	{ \
-		return vec_xor(vec1, vec2); \
-	} \
-	\
-	VEC_GENERIC_COMPARISONS(sign, csign, bits, size) \
-	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
-	VEC_ALTIVEC_SPLAT(sign, csign, bits, size)
-
-#ifndef VEC_VUINT8X16
-# define VEC_VUINT8X16
-typedef vector unsigned char vuint8x16;
-# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-# define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 8, 16)
-#endif /* VEC_VUINT8X16 */
-
-#ifndef VEC_VINT8X16
-# define VEC_VINT8X16
-typedef vector signed char vint8x16;
-# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-# define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 8, 16)
-#endif /* VEC_VINT8X16 */
-
-#ifndef VEC_VUINT16X8
-# define VEC_VUINT16X8
-typedef vector unsigned short vuint16x8;
-# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(vuint16x8){ a, b, c, d, e, f, g, h }
-# define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 16, 8)
-#endif /* VEC_VUINT16X8 */
-
-#ifndef VEC_VINT16X8
-# define VEC_VINT16X8
-typedef vector signed short vint16x8;
-# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(vint16x8){ a, b, c, d, e, f, g, h }
-# define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 16, 8)
-#endif /* VEC_VINT16X8 */
-
-#ifndef VEC_VUINT32X4
-# define VEC_VUINT32X4
-typedef vector unsigned int vuint32x4;
-# define VUINT32x4_CONSTANT(a, b, c, d) \
-	(vuint32x4){ a, b, c, d }
-# define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 32, 4)
-#endif /* VEC_VUINT32X4 */
-
-#ifndef VEC_VINT32X4
-# define VEC_VINT32X4
-typedef vector signed int vint32x4;
-# define VINT32x4_CONSTANT(a, b, c, d) \
-	(vint32x4){ a, b, c, d }
-# define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 32, 4)
-#endif /* VEC_VINT32X4 */
-
-#if defined(__POWER8__) && defined(__VSX__)
-
-# ifndef VEC_VUINT64X2
-#  define VEC_VUINT64X2
-typedef vector unsigned long long vuint64x2;
-#  define VUINT64x2_CONSTANT(a, b) \
-	(vuint64x2){ a, b }
-#  define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 64, 2)
-# endif /* VEC_VUINT64X2 */
-
-# ifndef VEC_VINT64X2
-#  define VEC_VINT64X2
-typedef vector signed long long vint64x2;
-#  define VINT64x2_CONSTANT(a, b) \
-	(vint64x2){ a, b }
-#  define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 64, 2)
-# endif /* VEC_VINT64X2 */
-
-#endif /* defined(__POWER8__) && defined(__VSX__) */
-
-#undef VEC_DEFINE_OPERATIONS
-#undef VEC_ALTIVEC_MUL
-#undef VEC_ALTIVEC_SPLAT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/cpu.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,397 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_CPU_H_
+#define VEC_IMPL_CPU_H_
+
+/* Detect CPU SIMD support. Much of this code was stolen from SDL.
+ *
+ * Simple DirectMedia Layer
+ * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+*/
+
+# if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
+#  include <sys/sysctl.h> // For AltiVec check
+# elif defined(__OpenBSD__) && defined(__powerpc__)
+#  include <sys/types.h>
+#  include <sys/sysctl.h> // For AltiVec check
+#  include <machine/cpu.h>
+# elif defined(__FreeBSD__) && defined(__powerpc__)
+#  include <machine/cpu.h>
+#  include <sys/auxv.h>
+# elif defined(__ALTIVEC__)
+#  include <signal.h>
+#  include <setjmp.h>
+# endif
+
+# ifdef __FreeBSD__
+#  include <sys/param.h>
+# endif
+
+static inline int vec_CPU_have_CPUID(void)
+{
+	int has_CPUID = 0;
+
+#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
+	__asm__ (
+"        pushfl                      # Get original EFLAGS             \n"
+"        popl    %%eax                                                 \n"
+"        movl    %%eax,%%ecx                                           \n"
+"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
+"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
+"        popfl                       # Replace current EFLAGS value    \n"
+"        pushfl                      # Get new EFLAGS                  \n"
+"        popl    %%eax               # Store new EFLAGS in EAX         \n"
+"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
+"        jz      1f                  # Processor=80486                 \n"
+"        movl    $1,%0               # We have CPUID support           \n"
+"1:                                                                    \n"
+	: "=m" (has_CPUID)
+	:
+	: "%eax", "%ecx"
+	);
+#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
+/* Technically, if this is being compiled under __x86_64__ then it has
+   CPUid by definition.  But it's nice to be able to prove it.  :)      */
+	__asm__ (
+"        pushfq                      # Get original EFLAGS             \n"
+"        popq    %%rax                                                 \n"
+"        movq    %%rax,%%rcx                                           \n"
+"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
+"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
+"        popfq                       # Replace current EFLAGS value    \n"
+"        pushfq                      # Get new EFLAGS                  \n"
+"        popq    %%rax               # Store new EFLAGS in EAX         \n"
+"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
+"        jz      1f                  # Processor=80486                 \n"
+"        movl    $1,%0               # We have CPUID support           \n"
+"1:                                                                    \n"
+	: "=m" (has_CPUID)
+	:
+	: "%rax", "%rcx"
+	);
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+	__asm {
+		pushfd                      ; Get original EFLAGS
+		pop     eax
+		mov     ecx, eax
+		xor     eax, 200000h        ; Flip ID bit in EFLAGS
+		push    eax                 ; Save new EFLAGS value on stack
+		popfd                       ; Replace current EFLAGS value
+		pushfd                      ; Get new EFLAGS
+		pop     eax                 ; Store new EFLAGS in EAX
+		xor     eax, ecx            ; Can not toggle ID bit,
+		jz      done                ; Processor=80486
+		mov     has_CPUID,1         ; We have CPUID support
+done:
+	}
+#elif defined(_MSC_VER) && defined(_M_X64)
+	has_CPUID = 1;
+#elif defined(__sun) && defined(__i386)
+	__asm (
+"       pushfl                 \n"
+"       popl    %eax           \n"
+"       movl    %eax,%ecx      \n"
+"       xorl    $0x200000,%eax \n"
+"       pushl   %eax           \n"
+"       popfl                  \n"
+"       pushfl                 \n"
+"       popl    %eax           \n"
+"       xorl    %ecx,%eax      \n"
+"       jz      1f             \n"
+"       movl    $1,-8(%ebp)    \n"
+"1:                            \n"
+	);
+#elif defined(__sun) && defined(__amd64)
+	__asm (
+"       pushfq                 \n"
+"       popq    %rax           \n"
+"       movq    %rax,%rcx      \n"
+"       xorl    $0x200000,%eax \n"
+"       pushq   %rax           \n"
+"       popfq                  \n"
+"       pushfq                 \n"
+"       popq    %rax           \n"
+"       xorl    %ecx,%eax      \n"
+"       jz      1f             \n"
+"       movl    $1,-8(%rbp)    \n"
+"1:                            \n"
+	);
+#endif
+
+	return has_CPUID;
+}
+
+#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm__ __volatile__( \
+		"        pushl %%ebx        \n" \
+		"        xorl %%ecx,%%ecx   \n" \
+		"        cpuid              \n" \
+		"        movl %%ebx, %%esi  \n" \
+		"        popl %%ebx         \n" \
+		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
+		: "a"(func))
+#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm__ __volatile__( \
+		"        pushq %%rbx        \n" \
+		"        xorq %%rcx,%%rcx   \n" \
+		"        cpuid              \n" \
+		"        movq %%rbx, %%rsi  \n" \
+		"        popq %%rbx         \n" \
+		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
+		: "a"(func))
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm { \
+		__asm mov eax, func \
+		__asm xor ecx, ecx \
+		__asm cpuid \
+		__asm mov a, eax \
+		__asm mov b, ebx \
+		__asm mov c, ecx \
+		__asm mov d, edx \
+	}
+#elif (defined(_MSC_VER) && defined(_M_X64))
+// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	do { \
+		int CPUInfo[4]; \
+		__cpuidex(CPUInfo, func, 0); \
+		a = CPUInfo[0]; \
+		b = CPUInfo[1]; \
+		c = CPUInfo[2]; \
+		d = CPUInfo[3]; \
+	} while (0)
+#else
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	do { \
+		a = b = c = d = 0; \
+		(void)a; \
+		(void)b; \
+		(void)c; \
+		(void)d; \
+	} while (0)
+#endif
+
+// ---------------------------------------------------------------
+
+static int vec_CPU_CPUIDFeatures[4];
+static int vec_CPU_CPUIDMaxFunction = 0;
+static int vec_CPU_OSSavesYMM = 0;
+static int vec_CPU_OSSavesZMM = 0;
+
+static inline void vec_CPU_get_CPUID_features(void)
+{
+	static int checked = 0;
+	if (!checked) {
+		checked = 1;
+		if (vec_CPU_have_CPUID()) {
+			int a, b, c, d;
+			VEC_CPU_CPUID(0, a, b, c, d);
+			vec_CPU_CPUIDMaxFunction = a;
+			if (vec_CPU_CPUIDMaxFunction >= 1) {
+				VEC_CPU_CPUID(1, a, b, c, d);
+				vec_CPU_CPUIDFeatures[0] = a;
+				vec_CPU_CPUIDFeatures[1] = b;
+				vec_CPU_CPUIDFeatures[2] = c;
+				vec_CPU_CPUIDFeatures[3] = d;
+
+				// Check to make sure we can call xgetbv
+				if (c & 0x08000000) {
+					// Call xgetbv to see if YMM (etc) register state is saved
+#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
+					__asm__(".byte 0x0f, 0x01, 0xd0"
+							: "=a"(a)
+							: "c"(0)
+							: "%edx");
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
+					a = (int)_xgetbv(0);
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+					__asm {
+						xor ecx, ecx
+						_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
+						mov a, eax
+					}
+#endif
+					vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
+					vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
+				}
+			}
+		}
+	}
+}
+
+#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
+static jmp_buf vec_jmpbuf;
+static void vec_CPU_illegal_instruction(int sig)
+{
+	longjmp(vec_jmpbuf, 1);
+}
+#endif
+
+static int vec_CPU_have_ALTIVEC(void)
+{
+	volatile int altivec = 0;
+#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
+	int selectors[2] = {
+# ifdef __OpenBSD__
+		CTL_MACHDEP, CPU_ALTIVEC
+# else
+		CTL_HW, HW_VECTORUNIT
+# endif
+	};
+	int hasVectorUnit = 0;
+	size_t length = sizeof(hasVectorUnit);
+	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+	if (!error)
+		altivec = (hasVectorUnit != 0);
+#elif defined(__FreeBSD__) && defined(__powerpc__)
+	unsigned long cpufeatures = 0;
+	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
+#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
+	void (*handler)(int sig);
+	handler = signal(SIGILL, vec_CPU_illegal_instruction);
+	if (!setjmp(vec_jmpbuf)) {
+		asm volatile("mtspr 256, %0\n\t"
+					 "vand %%v0, %%v0, %%v0" ::"r"(-1));
+		altivec = 1;
+	}
+	signal(SIGILL, handler);
+#endif
+	return altivec;
+}
+
+static int vec_CPU_have_ALTIVEC_VSX(void)
+{
+	volatile int vsx = 0;
+#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
+	void (*handler)(int sig);
+	handler = signal(SIGILL, vec_CPU_illegal_instruction);
+	if (!setjmp(vec_jmpbuf)) {
+		// this is completely untested
+		asm volatile("mtspr 256, %0\n\t"
+					 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
+		vsx = 1;
+	}
+	signal(SIGILL, handler);
+#endif
+	return vsx;
+}
+
+#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000)
+#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000)
+#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000)
+#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001)
+#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000)
+#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000)
+#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))
+
+static inline int vec_CPU_have_AVX2(void)
+{
+	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
+		int a, b, c, d;
+		VEC_CPU_CPUID(7, a, b, c, d);
+		(void)a, (void)c, (void)d;
+		return b & 0x00000020;
+	}
+	return 0;
+}
+
+static inline int vec_CPU_have_AVX512F(void)
+{
+	if (vec_CPU_OSSavesZMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
+		int a, b, c, d;
+		VEC_CPU_CPUID(7, a, b, c, d);
+		(void)a, (void)c, (void)d;
+		return b & 0x00010000;
+	}
+	return 0;
+}
+
+enum {
+	VEC_CPU_HAS_ALTIVEC = (1 << 0),
+	VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1),
+	VEC_CPU_HAS_MMX = (1 << 2),
+	VEC_CPU_HAS_SSE = (1 << 3),
+	VEC_CPU_HAS_SSE2 = (1 << 4),
+	VEC_CPU_HAS_SSE3 = (1 << 5),
+	VEC_CPU_HAS_SSE41 = (1 << 6),
+	VEC_CPU_HAS_SSE42 = (1 << 7),
+	VEC_CPU_HAS_AVX = (1 << 8),
+	VEC_CPU_HAS_AVX2 = (1 << 9),
+	VEC_CPU_HAS_AVX512F = (1 << 10),
+};
+
+#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF)
+
+static uint32_t vec_CPU_features = VEC_CPU_FEATURES_RESET;
+
+static void vec_get_CPU_features(void)
+{
+	vec_CPU_get_CPUID_features();
+	vec_CPU_features = 0;
+	if (vec_CPU_have_ALTIVEC())
+		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
+	if (vec_CPU_have_ALTIVEC_VSX())
+		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
+	if (vec_CPU_have_MMX())
+		vec_CPU_features |= VEC_CPU_HAS_MMX;
+	if (vec_CPU_have_SSE())
+		vec_CPU_features |= VEC_CPU_HAS_SSE;
+	if (vec_CPU_have_SSE2())
+		vec_CPU_features |= VEC_CPU_HAS_SSE2;
+	if (vec_CPU_have_SSE3())
+		vec_CPU_features |= VEC_CPU_HAS_SSE3;
+	if (vec_CPU_have_SSE41())
+		vec_CPU_features |= VEC_CPU_HAS_SSE41;
+	if (vec_CPU_have_SSE42())
+		vec_CPU_features |= VEC_CPU_HAS_SSE42;
+	if (vec_CPU_have_AVX())
+		vec_CPU_features |= VEC_CPU_HAS_AVX;
+	if (vec_CPU_have_AVX2())
+		vec_CPU_features |= VEC_CPU_HAS_AVX2;
+	if (vec_CPU_have_AVX512F())
+		vec_CPU_features |= VEC_CPU_HAS_AVX512F;
+}
+
+#endif /* VEC_IMPL_CPU_H_ */
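
the commit message says intrinsics are now enabled at runtime depending on
what is detected; presumably src/vec.c consumes the feature bits above along
these lines (vec_init_sketch and the comments are a hypothetical sketch, not
the actual vec.c wiring):

	/* hypothetical sketch of how the detection above could drive dispatch */
	static void vec_init_sketch(void)
	{
		/* vec_CPU_features starts out as VEC_CPU_FEATURES_RESET (all bits
		 * set), so only run the (non-thread-safe) detection once */
		if (vec_CPU_features == VEC_CPU_FEATURES_RESET)
			vec_get_CPU_features();

		if (vec_CPU_features & VEC_CPU_HAS_SSE2) {
			/* point e.g. the vint32x4 implementation struct at the SSE2 backend */
		} else {
			/* keep the generic/fallback array-based backend */
		}
	}
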
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/fallback.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,202 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_FALLBACK_H_
+#define VEC_IMPL_FALLBACK_H_
+
+// Fallback implementations - this is what an implementation should use if it
+// doesn't support a specific function. Note that the load_aligned and
+// store_aligned functions are not implemented here - this is on purpose;
+// every single implementation *needs* to have one of these.
+
+#define VEC_FALLBACK_OPERATION(op, sign, csign, bits, size) \
+	do { \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \
+	\
+		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \
+	\
+		for (int i = 0; i < size; i++) varr1[i] = (op); \
+	\
+		return v##sign##int##bits##x##size##_load_aligned(varr1); \
+	} while (0)
+
+#define VEC_FALLBACK_CMP(op, sign, csign, bits, size) \
+	VEC_FALLBACK_OPERATION((varr1[i] op varr2[i]) ? UINT##bits##_MAX : 0, sign, csign, bits, size)
+
+#define VEC_FALLBACK_SHIFT(op, sign, csign, bits, size) \
+	do { \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
+		VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \
+	\
+		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
+		vuint##bits##x##size##_store_aligned(vec2, varr2); \
+	\
+		for (int i = 0; i < size; i++) varr1[i] = (op); \
+	\
+		return v##sign##int##bits##x##size##_load_aligned(varr1); \
+	} while (0)
+
+#define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(sign##int##bits##_t x) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
+		for (int i = 0; i < size; i++) arr[i] = x; \
+		return v##sign##int##bits##x##size##_load_aligned(arr); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const sign##int##bits##_t in[size]) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
+		memcpy(arr, in, sizeof(sign##int##bits##_t) * size); \
+		return v##sign##int##bits##x##size##_load_aligned(arr); \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
+		v##sign##int##bits##x##size##_store_aligned(vec, arr); \
+		memcpy(out, arr, sizeof(sign##int##bits##_t) * size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \
+	{ \
+		return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((sign##int##bits##_t)UINT##bits##_MAX)); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(<, sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(<=, sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(==, sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(>=, sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(>, sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size); \
+	}
+
+#define VEC_DEFINE_FALLBACK_OPERATIONS(bits, size) \
+	VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(u, U, bits, size)
+
+// 64-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 8)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 4)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 2)
+
+// 128-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 16)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 8)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 4)
+VEC_DEFINE_FALLBACK_OPERATIONS(64, 2)
+
+// 256-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 32)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 16)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 8)
+VEC_DEFINE_FALLBACK_OPERATIONS(64, 4)
+
+// 512-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 64)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 32)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 16)
+VEC_DEFINE_FALLBACK_OPERATIONS(64, 8)
+
+#undef VEC_FALLBACK_OPERATION
+#undef VEC_FALLBACK_CMP
+#undef VEC_FALLBACK_SHIFT
+#undef VEC_DEFINE_FALLBACK_OPERATIONS
+#undef VEC_DEFINE_FALLBACK_OPERATIONS_SIGN
+
+#endif /* VEC_IMPL_FALLBACK_H_ */
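
for reference, this is roughly what VEC_FALLBACK_OPERATION expands to for the
signed 32x4 add case; VINT32x4_ALIGNED_ARRAY and the load/store_aligned
helpers are assumed to be provided by vec.h and the active implementation:

	static vint32x4 vint32x4_fallback_add(vint32x4 vec1, vint32x4 vec2)
	{
		VINT32x4_ALIGNED_ARRAY(varr1);
		VINT32x4_ALIGNED_ARRAY(varr2);

		/* spill both vectors into aligned scratch arrays */
		vint32x4_store_aligned(vec1, varr1);
		vint32x4_store_aligned(vec2, varr2);

		/* do the operation one element at a time */
		for (int i = 0; i < 4; i++) varr1[i] = varr1[i] + varr2[i];

		return vint32x4_load_aligned(varr1);
	}
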
--- a/include/vec/impl/gcc.h	Tue Nov 19 15:55:01 2024 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,221 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in plain C99
- * 
- * Copyright (c) 2024 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-/* GCC built in vectors */
-
-#include <stdint.h>
-#include <string.h>
-
-#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
-	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		memcpy(&vec, in, sizeof(*in) * size); \
-		return vec; \
-	} \
-	\
-	VEC_DECL_LOAD(sign, bits, size) \
-	{ \
-		return v##sign##int##bits##x##size##_load_aligned(in); \
-	} \
-	\
-	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
-	{ \
-		memcpy(out, &vec, sizeof(vec)); \
-	} \
-	\
-	VEC_DECL_STORE(sign, bits, size) \
-	{ \
-		return v##sign##int##bits##x##size##_store_aligned(vec, out); \
-	} \
-	\
-	VEC_DECL_ADD(sign, bits, size) \
-	{ \
-		return vec1 + vec2; \
-	} \
-	\
-	VEC_DECL_SUB(sign, bits, size) \
-	{ \
-		return vec1 - vec2; \
-	} \
-	\
-	VEC_DECL_MUL(sign, bits, size) \
-	{ \
-		return vec1 * vec2; \
-	} \
-	\
-	VEC_DECL_AND(sign, bits, size) \
-	{ \
-		return vec1 & vec2; \
-	} \
-	\
-	VEC_DECL_OR(sign, bits, size) \
-	{ \
-		return vec1 | vec2; \
-	} \
-	\
-	VEC_DECL_XOR(sign, bits, size) \
-	{ \
-		return vec1 ^ vec2; \
-	} \
-	VEC_DECL_CMPLT(sign, bits, size) \
-	{ \
-		return vec1 < vec2; \
-	} \
-	VEC_DECL_CMPGT(sign, bits, size) \
-	{ \
-		return vec1 > vec2; \
-	} \
-	VEC_DECL_CMPEQ(sign, bits, size) \
-	{ \
-		return vec1 == vec2; \
-	} \
-	VEC_DECL_CMPLE(sign, bits, size) \
-	{ \
-		return vec1 <= vec2; \
-	} \
-	VEC_DECL_CMPGE(sign, bits, size) \
-	{ \
-		return vec1 >= vec2; \
-	} \
-	\
-	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
-	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
-	VEC_GENERIC_AVG(sign, bits, size)
-
-// -----------------------------------------------------------------------------------
-// 128-bit vector types
-
-#ifndef VEC_VUINT8X16
-# define VEC_VUINT8X16
-typedef uint8_t vuint8x16 __attribute__((__vector_size__(16)));
-# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-# define VUINT8x16_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 8, 16)
-#endif
-
-#ifndef VEC_VUINT16X8
-# define VEC_VUINT16X8
-typedef uint16_t vuint16x8 __attribute__((__vector_size__(16)));
-# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(vuint16x8){ a, b, c, d, e, f, g, h }
-# define VUINT16x8_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 16, 8)
-#endif
-
-#ifndef VEC_VUINT32X4
-# define VEC_VUINT32X4
-typedef uint32_t vuint32x4 __attribute__((__vector_size__(16)));
-# define VUINT32x4_CONSTANT(a, b, c, d) \
-	(vuint32x4){ a, b, c, d }
-# define VUINT32x4_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 32, 4)
-#endif
-
-#ifndef VEC_VUINT64X2
-# define VEC_VUINT64X2
-typedef uint64_t vuint64x2 __attribute__((__vector_size__(16)));
-# define VUINT64x2_CONSTANT(a, b) \
-	(vuint64x2){ a, b }
-# define VUINT64x2_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 64, 2)
-#endif
-
-#ifndef VEC_VINT8X16
-# define VEC_VINT8X16
-typedef int8_t vint8x16 __attribute__((__vector_size__(16)));
-# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-# define VINT8x16_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(, , 8, 16)
-#endif
-
-#ifndef VEC_VINT16X8
-# define VEC_VINT16X8
-typedef int16_t vint16x8 __attribute__((__vector_size__(16)));
-# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(vint16x8){ a, b, c, d, e, f, g, h }
-# define VINT16x8_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(, , 16, 8)
-#endif
-
-#ifndef VEC_VINT32X4
-# define VEC_VINT32X4
-typedef int32_t vint32x4 __attribute__((__vector_size__(16)));
-# define VINT32x4_CONSTANT(a, b, c, d) \
-	(vint32x4){ a, b, c, d }
-# define VINT32x4_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(, , 32, 4)
-#endif
-
-#ifndef VEC_VINT64X2
-# define VEC_VINT64X2
-typedef int64_t vint64x2 __attribute__((__vector_size__(16)));
-# define VINT64x2_CONSTANT(a, b) \
-	(vint64x2){ a, b }
-# define VINT64x2_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(, , 64, 2)
-#endif
-
-#ifndef VEC_VUINT8X16
-# define VEC_VUINT8X16
-typedef uint8_t vuint8x16 __attribute__((__vector_size__(16)));
-# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-# define VUINT8x16_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 8, 16)
-#endif
-
-#ifndef VEC_VUINT16X8
-# define VEC_VUINT16X8
-typedef uint16_t vuint16x8 __attribute__((__vector_size__(16)));
-# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(vuint16x8){ a, b, c, d, e, f, g, h }
-# define VUINT16x8_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 16, 8)
-#endif
-
-#ifndef VEC_VUINT32X4
-# define VEC_VUINT32X4
-typedef uint32_t vuint32x4 __attribute__((__vector_size__(16)));
-# define VUINT32x4_CONSTANT(a, b, c, d) \
-	(vuint32x4){ a, b, c, d }
-# define VUINT32x4_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 32, 4)
-#endif
-
-#ifndef VEC_VUINT64X2
-# define VEC_VUINT64X2
-typedef uint64_t vuint64x2 __attribute__((__vector_size__(16)));
-# define VUINT64x2_CONSTANT(a, b) \
-	(vuint64x2){ a, b }
-# define VUINT64x2_ALIGNMENT 16
-VEC_DEFINE_OPERATIONS(u, U, 64, 2)
-#endif
-
-// ----------------------------------------------------------
-
-#undef VEC_DEFINE_OPERATIONS
--- a/include/vec/impl/generic.h	Tue Nov 19 15:55:01 2024 -0500
+++ b/include/vec/impl/generic.h	Wed Nov 20 04:10:37 2024 -0500
@@ -24,362 +24,91 @@
 
 /* Generic array-based implementation. */
 
+#ifndef VEC_IMPL_GENERIC_H_
+#define VEC_IMPL_GENERIC_H_
+
 #include <stdint.h>
 #include <string.h>
 
-#define VEC_DEFINE_STRUCT(sign, bits, size) \
-	typedef struct { \
-		sign##int##bits##_t arr[size]; \
-	} v##sign##int##bits##x##size;
+// -----------------------------------------------------------------
 
-#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
-	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
+// TODO implement these so we don't waste stack space by doing the
+// fallbacks
+#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \
 	{ \
 		v##sign##int##bits##x##size vec; \
-		memcpy(vec.arr, in, sizeof(vec.arr)); \
-		return vec; \
-	} \
-	\
-	VEC_DECL_LOAD(sign, bits, size) \
-	{ \
-		return v##sign##int##bits##x##size##_load_aligned(in); \
-	} \
-	\
-	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
-	{ \
-		memcpy(out, vec.arr, sizeof(vec.arr)); \
-	} \
-	\
-	VEC_DECL_STORE(sign, bits, size) \
-	{ \
-		return v##sign##int##bits##x##size##_store_aligned(vec, out); \
-	} \
-	\
-	VEC_DECL_ADD(sign, bits, size) \
-	{ \
-		for (int i = 0; i < size; i++) vec1.arr[i] += vec2.arr[i]; \
-		return vec1; \
-	} \
-	\
-	VEC_DECL_SUB(sign, bits, size) \
-	{ \
-		for (int i = 0; i < size; i++) vec1.arr[i] -= vec2.arr[i]; \
-		return vec1; \
-	} \
-	\
-	VEC_DECL_MUL(sign, bits, size) \
-	{ \
-		for (int i = 0; i < size; i++) vec1.arr[i] *= vec2.arr[i]; \
-		return vec1; \
-	} \
-	\
-	VEC_DECL_AND(sign, bits, size) \
-	{ \
-		for (int i = 0; i < size; i++) vec1.arr[i] &= vec2.arr[i]; \
-		return vec1; \
-	} \
-	\
-	VEC_DECL_OR(sign, bits, size) \
-	{ \
-		for (int i = 0; i < size; i++) vec1.arr[i] |= vec2.arr[i]; \
-		return vec1; \
-	} \
-	\
-	VEC_DECL_XOR(sign, bits, size) \
-	{ \
-		for (int i = 0; i < size; i++) vec1.arr[i] ^= vec2.arr[i]; \
-		return vec1; \
-	} \
-	\
-	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
-	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
-	VEC_GENERIC_AVG(sign, bits, size) \
-	VEC_GENERIC_COMPARISONS(sign, csign, bits, size)
-
-#ifndef VEC_VUINT8X16
-# define VEC_VUINT8X16
-VEC_DEFINE_STRUCT(u, 8, 16)
-# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
-# define VUINT8x16_ALIGNMENT 1
-VEC_DEFINE_OPERATIONS(u, U, 8, 16)
-#endif
-
-#ifndef VEC_VUINT16X8
-# define VEC_VUINT16X8
-VEC_DEFINE_STRUCT(u, 16, 8)
-# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } })
-# define VUINT16x8_ALIGNMENT 2
-VEC_DEFINE_OPERATIONS(u, U, 16, 8)
-#endif
-
-#ifndef VEC_VUINT32X4
-# define VEC_VUINT32X4
-VEC_DEFINE_STRUCT(u, 32, 4)
-# define VUINT32x4_CONSTANT(a, b, c, d) \
-	((vuint32x4){ .arr = { a, b, c, d } })
-# define VUINT32x4_ALIGNMENT 4
-VEC_DEFINE_OPERATIONS(u, U, 32, 4)
-#endif
-
-#ifndef VEC_VUINT64X2
-# define VEC_VUINT64X2
-VEC_DEFINE_STRUCT(u, 64, 2)
-# define VUINT64x2_CONSTANT(a, b) \
-	((vuint64x2){ .arr = { a, b } })
-# define VUINT64x2_ALIGNMENT 8
-VEC_DEFINE_OPERATIONS(u, U, 64, 2)
-#endif
-
-#ifndef VEC_VINT16X8
-# define VEC_VINT16X8
-VEC_DEFINE_STRUCT(, 16, 8)
-# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	((vint16x8){ .arr = { a, b, c, d, e, f, g, h } })
-# define VINT16x8_ALIGNMENT 2
-VEC_DEFINE_OPERATIONS(, , 16, 8)
-#endif
-
-#ifndef VEC_VINT32X4
-# define VEC_VINT32X4
-VEC_DEFINE_STRUCT(, 32, 4)
-# define VINT32x4_CONSTANT(a, b, c, d) \
-	((vint32x4){ .arr = { a, b, c, d } })
-# define VINT32x4_ALIGNMENT 4
-VEC_DEFINE_OPERATIONS(, , 32, 4)
-#endif
-
-#ifndef VEC_VINT64X2
-# define VEC_VINT64X2
-VEC_DEFINE_STRUCT(, 64, 2)
-# define VINT64x2_CONSTANT(a, b) \
-	((vint64x2){ .arr = { a, b } })
-# define VINT64x2_ALIGNMENT 8
-VEC_DEFINE_OPERATIONS(, , 64, 2)
-#endif
-
-#undef VEC_DEFINE_STRUCT
-#undef VEC_DEFINE_OPERATIONS
-
-// -----------------------------------------------------------------
-// Okay, now we can implement our "double" structures.
-// These use existing structures that are 128 bits in
-// size to provide 256-bit or even 512-bit data types.
-
-#define VEC_DEFINE_STRUCT(sign, bits, size, halfsize) \
-	typedef struct { \
-		v##sign##int##bits##x##halfsize vecs[2]; \
-	} v##sign##int##bits##x##size;
-
-#define VEC_DEFINE_OP(opcap, op, sign, bits, size, halfsize) \
-	VEC_DECL_##opcap(sign, bits, size) \
-	{ \
-		vec1.vecs[0] = v##sign##int##bits##x##halfsize##_##op(vec1.vecs[0], vec2.vecs[0]); \
-		vec1.vecs[1] = v##sign##int##bits##x##halfsize##_##op(vec1.vecs[1], vec2.vecs[1]); \
-		return vec1; \
-	}
-
-// This could be in way fewer lines, but whatever
-#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size, halfsize) \
-	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.vecs[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \
-		vec.vecs[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \
-		return vec; \
-	} \
-	\
-	VEC_DECL_LOAD(sign, bits, size) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.vecs[0] = v##sign##int##bits##x##halfsize##_load(in); \
-		vec.vecs[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \
+		memcpy(vec.generic, in, sizeof(sign##int##bits##_t) * size); \
 		return vec; \
 	} \
 	\
-	VEC_DECL_SPLAT(sign, bits, size) \
+	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		memcpy(out, vec.generic, sizeof(sign##int##bits##_t) * size); \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
+		.load_aligned  = v##sign##int##bits##x##size##_generic_load_aligned, \
+		.store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \
+	};
+
+#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
+
+VEC_GENERIC_DEFINE_OPERATIONS(8, 8)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 2)
+VEC_GENERIC_DEFINE_OPERATIONS(64, 2)
+
+#undef VEC_GENERIC_DEFINE_OPERATIONS
+#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN
+
+// -----------------------------------------------------------------
+// now we can just keep doubling the same implementation
+
+#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \
 	{ \
 		v##sign##int##bits##x##size vec; \
-		vec.vecs[0] = v##sign##int##bits##x##halfsize##_splat(x); \
-		vec.vecs[1] = v##sign##int##bits##x##halfsize##_splat(x); \
+		vec.generic[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \
+		vec.generic[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \
 		return vec; \
 	} \
 	\
-	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
+	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
 	{ \
-		v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[0], out); \
-		v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[1], out + halfsize); \
-	} \
-	\
-	VEC_DECL_STORE(sign, bits, size) \
-	{ \
-		v##sign##int##bits##x##halfsize##_store(vec.vecs[0], out); \
-		v##sign##int##bits##x##halfsize##_store(vec.vecs[1], out + halfsize); \
+		v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[0], out); \
+		v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[1], out + halfsize); \
 	} \
 	\
-	VEC_DEFINE_OP(ADD, add, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(SUB, sub, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(MUL, mul, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(AND, and, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(OR, or, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(XOR, xor, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(LSHIFT, lshift, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(RSHIFT, rshift, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(LRSHIFT, lrshift, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(DIV, div, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(AVG, avg, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(CMPLT, cmplt, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(CMPGT, cmpgt, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(CMPEQ, cmpeq, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(CMPGE, cmpge, sign, bits, size, halfsize) \
-	VEC_DEFINE_OP(CMPLE, cmple, sign, bits, size, halfsize)
-
-#ifndef VEC_VUINT8X32
-# define VEC_VUINT8X32
-VEC_DEFINE_STRUCT(u, 8, 32, 16)
-# define VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
-	((vuint8x32){ .vecs = { VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
-# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 8, 32, 16)
-#endif
-
-#ifndef VEC_VUINT16X16
-# define VEC_VUINT16X16
-VEC_DEFINE_STRUCT(u, 16, 16, 8)
-# define VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	((vuint16x16){ .vecs = { VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } })
-# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 16, 16, 8)
-#endif
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
+		.load_aligned  = v##sign##int##bits##x##size##_generic_load_aligned, \
+		.store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \
+	};
 
-#ifndef VEC_VUINT32X8
-# define VEC_VUINT32X8
-VEC_DEFINE_STRUCT(u, 32, 8, 4)
-# define VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	((vuint32x8){ .vecs = { VUINT32x4_CONSTANT(a, b, c, d), VUINT32x4_CONSTANT(e, f, g, h) } })
-# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 32, 8, 4)
-#endif
-
-#ifndef VEC_VUINT64X4
-# define VEC_VUINT64X4
-VEC_DEFINE_STRUCT(u, 64, 4, 2)
-# define VUINT64x4_CONSTANT(a, b, c, d) \
-	((vuint64x4){ .vecs = { VUINT64x2_CONSTANT(a, b), VUINT64x2_CONSTANT(c, d) } })
-# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 64, 4, 2)
-#endif
+#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size, halfsize) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize)
 
-#ifndef VEC_VINT8X32
-# define VEC_VINT8X32
-VEC_DEFINE_STRUCT(, 8, 32, 16)
-# define VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
-	((vint8x32){ .vecs = { VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
-# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 8, 32, 16)
-#endif
-
-#ifndef VEC_VINT16X16
-# define VEC_VINT16X16
-VEC_DEFINE_STRUCT(, 16, 16, 8)
-# define VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	((vint16x16){ .vecs = { VINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } })
-# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 16, 16, 8)
-#endif
-
-#ifndef VEC_VINT32X8
-# define VEC_VINT32X8
-VEC_DEFINE_STRUCT(, 32, 8, 4)
-# define VINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	((vuint32x8){ .vecs = { VINT32x4_CONSTANT(a, b, c, d), VINT32x4_CONSTANT(e, f, g, h) } })
-# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 32, 8, 4)
-#endif
+// 128-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 4, 2)
 
-#ifndef VEC_VINT64X4
-# define VEC_VINT64X4
-VEC_DEFINE_STRUCT(, 64, 4, 2)
-# define VINT64x4_CONSTANT(a, b, c, d) \
-	((vint64x4){ .vecs = { VINT64x2_CONSTANT(a, b), VINT64x2_CONSTANT(c, d) } })
-# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 64, 4, 2)
-#endif
-
-#ifndef VEC_VUINT8X64
-# define VEC_VUINT8X64
-VEC_DEFINE_STRUCT(u, 8, 64, 32)
-# define VUINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
-	((vuint8x64){ .vecs = { VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VUINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } })
-# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 8, 64, 32)
-#endif
-
-#ifndef VEC_VUINT16X32
-# define VEC_VUINT16X32
-VEC_DEFINE_STRUCT(u, 16, 32, 16)
-# define VUINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
-	((vuint16x32){ .vecs = { VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
-# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 16, 32, 16)
-#endif
-
-#ifndef VEC_VUINT32X16
-# define VEC_VUINT32X16
-VEC_DEFINE_STRUCT(u, 32, 16, 8)
-# define VUINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	((vuint32x16){ .vecs = { VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } })
-# define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 32, 16, 8)
-#endif
+// 256-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 32, 16)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 16, 8)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 8, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(64, 4, 2)
 
-#ifndef VEC_VUINT64X8
-# define VEC_VUINT64X8
-VEC_DEFINE_STRUCT(u, 64, 8, 4)
-# define VUINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	((vuint64x8){ .vecs = { VUINT64x4_CONSTANT(a, b, c, d), VUINT64x4_CONSTANT(e, f, g, h) } })
-# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 64, 8, 4)
-#endif
-
-#ifndef VEC_VINT8X64
-# define VEC_VINT8X64
-VEC_DEFINE_STRUCT(, 8, 64, 32)
-# define VINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
-	((vint8x64){ .vecs = { VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } })
-# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 8, 64, 32)
-#endif
+// 512-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 64, 32)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 32, 16)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 16, 8)
+VEC_GENERIC_DEFINE_OPERATIONS(64, 8, 4)
 
-#ifndef VEC_VINT16X32
-# define VEC_VINT16X32
-VEC_DEFINE_STRUCT(, 16, 32, 16)
-# define VINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
-	((vint16x32){ .vecs = { VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
-# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 16, 32, 16)
-#endif
+#undef VEC_GENERIC_DEFINE_OPERATIONS
+#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN
 
-#ifndef VEC_VINT32X16
-# define VEC_VINT32X16
-VEC_DEFINE_STRUCT(, 32, 16, 8)
-# define VINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	((vint32x16){ .vecs = { VINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } })
-# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 32, 16, 8)
-#endif
-
-#ifndef VEC_VINT64X8
-# define VEC_VINT64X8
-VEC_DEFINE_STRUCT(, 64, 8, 4)
-# define VINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	((vint64x8){ .vecs = { VINT64x4_CONSTANT(a, b, c, d), VINT64x4_CONSTANT(e, f, g, h) } })
-# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 64, 8, 4)
-#endif
-
-#undef VEC_DEFINE_STRUCT
-#undef VEC_DEFINE_OPERATIONS
-#undef VEC_DEFINE_OP
+#endif /* VEC_IMPL_GENERIC_H_ */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/ppc/altivec.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,183 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+/* Altivec vector support. */
+
+#ifndef VEC_IMPL_PPC_ALTIVEC_H_
+#define VEC_IMPL_PPC_ALTIVEC_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include <altivec.h>
+
+#define VEC_ALTIVEC_ALIGNMENT 16
+
+/* GCC 4.2.1 on Mac OS X doesn't provide vec_mul or vec_splats, so these
+ * operations are only wired up when the intrinsics are available. */
+#ifdef vec_mul
+# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_mul(vec1.altivec, vec2.altivec) }; \
+	}
+# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \
+	.mul = v##sign##int##bits##x##size##_altivec_mul,
+#else
+# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size)
+# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size)
+#endif
+
+#ifdef vec_splats
+# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(sign##int##bits##_t x) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_splats(x) }; \
+	}
+# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \
+	.splat = v##sign##int##bits##x##size##_altivec_splat,
+#else
+# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size)
+# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size)
+#endif
+
+#define VEC_ALTIVEC_uRSHIFT vec_sr
+#define VEC_ALTIVEC_RSHIFT vec_sra
+
+#define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_sr(vec1.altivec, vec2.altivec) }; \
+	}
+#define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \
+	.lrshift = v##sign##int##bits##x##size##_altivec_lrshift,
+
+#define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size)
+#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size)
+
+/* Since the AltiVec API is uniform across element types, one giant macro can
+ * define every operation for every vector type. */
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const sign##int##bits##_t in[size]) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_ld(0, in) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const sign##int##bits##_t in[size]) \
+	{ \
+		/* unaligned load: merge the two aligned loads around `in' with a permute mask from vec_lvsl */ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)) }; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		vec_st(vec.altivec, 0, out); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_add(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_sub(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_sl(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_avg(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_and(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_or(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .altivec = vec_xor(vec1.altivec, vec2.altivec) }; \
+	} \
+	\
+	VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \
+		.load_aligned  = v##sign##int##bits##x##size##_altivec_load_aligned, \
+		.load          = v##sign##int##bits##x##size##_altivec_load, \
+		.store_aligned = v##sign##int##bits##x##size##_altivec_store_aligned, \
+		.add           = v##sign##int##bits##x##size##_altivec_add, \
+		.sub           = v##sign##int##bits##x##size##_altivec_sub, \
+		VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \
+		.lshift        = v##sign##int##bits##x##size##_altivec_lshift, \
+		.rshift        = v##sign##int##bits##x##size##_altivec_rshift, \
+		VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size) \
+		.avg           = v##sign##int##bits##x##size##_altivec_avg, \
+		.and           = v##sign##int##bits##x##size##_altivec_and, \
+		.or            = v##sign##int##bits##x##size##_altivec_or, \
+		.xor           = v##sign##int##bits##x##size##_altivec_xor, \
+		VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \
+	};
+
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
+
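+/* For illustration only: VEC_DEFINE_OPERATIONS(8, 16) below expands to functions
+ * such as
+ *
+ *     static vuint8x16 vuint8x16_altivec_add(vuint8x16 vec1, vuint8x16 vec2)
+ *     {
+ *         return (vuint8x16) { .altivec = vec_add(vec1.altivec, vec2.altivec) };
+ *     }
+ *
+ * plus a vuint8x16_impl_altivec struct pointing at them (and the same for the
+ * signed type). */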
+VEC_DEFINE_OPERATIONS(8, 16)
+VEC_DEFINE_OPERATIONS(16, 8)
+VEC_DEFINE_OPERATIONS(32, 4)
+#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
+VEC_DEFINE_OPERATIONS(64, 2)
+#endif
+
+#undef VEC_DEFINE_OPERATIONS
+#undef VEC_DEFINE_OPERATIONS_SIGN
+#undef VEC_ALTIVEC_DEFINE_MUL
+#undef VEC_ALTIVEC_STRUCT_MUL
+#undef VEC_ALTIVEC_DEFINE_LRSHIFT
+#undef VEC_ALTIVEC_STRUCT_LRSHIFT
+#undef VEC_ALTIVEC_DEFINE_uLRSHIFT
+#undef VEC_ALTIVEC_STRUCT_uLRSHIFT
+#undef VEC_ALTIVEC_DEFINE_SPLAT
+#undef VEC_ALTIVEC_STRUCT_SPLAT
+#undef VEC_ALTIVEC_uRSHIFT
+#undef VEC_ALTIVEC_RSHIFT
+
+#endif /* VEC_IMPL_PPC_ALTIVEC_H_ */
--- a/include/vec/impl/sse2.h	Tue Nov 19 15:55:01 2024 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,225 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in plain C99
- * 
- * Copyright (c) 2024 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-#include <emmintrin.h>
-
-#define VEC_SSE2_ALIGNMENT 16
-
-#define VEC_SSE2_MUL_8x16(sign) \
-	VEC_DECL_MUL(sign, 8, 16) \
-	{ \
-		/* unpack and multiply */ \
-		__m128i dst_even = _mm_mullo_epi16(vec1, vec2); \
-		__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8)); \
-	\
-		/* repack */ \
-		return _mm_or_si128( \
-			_mm_slli_epi16(dst_odd, 8), \
-			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
-		); \
-	}
-
-#define VEC_SSE2_MUL_16x8(sign) \
-	VEC_DECL_MUL(sign, 16, 8) \
-	{ \
-		/* we have a real instruction for this */ \
-		return _mm_mullo_epi16(vec1, vec2); \
-	}
-
-#define VEC_SSE2_MUL_32x4(sign) \
-	VEC_DECL_MUL(sign, 32, 4) \
-	{ \
-		/* this was stolen from... somewhere :) */ \
-		__m128i a13    = _mm_shuffle_epi32(vec1, 0xF5);     /* (-,a3,-,a1) */ \
-		__m128i b13    = _mm_shuffle_epi32(vec2, 0xF5);     /* (-,b3,-,b1) */ \
-		__m128i prod02 = _mm_mul_epu32(vec1, vec2);         /* (-,a2*b2,-,a0*b0) */ \
-		__m128i prod13 = _mm_mul_epu32(a13, b13);           /* (-,a3*b3,-,a1*b1) */ \
-		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
-		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
-		return _mm_unpacklo_epi64(prod01, prod23);          /* (ab3,ab2,ab1,ab0) */ \
-	}
-
-#define VEC_SSE2_MUL_64x2(sign) \
-	VEC_DECL_MUL(sign, 64, 2) \
-	{ \
-		__m128i ac = _mm_mul_epu32(vec1, vec2); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
-		__m128i b  = _mm_srli_epi64(vec1, 32);  /* b = vec1 >> 32; */ \
-		__m128i bc = _mm_mul_epu32(b, vec2);    /* bc = b * (vec2 & UINT32_MAX); */ \
-		__m128i d  = _mm_srli_epi64(vec2, 32);  /* d = vec2 >> 32; */ \
-		__m128i ad = _mm_mul_epu32(vec1, d);    /* ad = (vec1 & UINT32_MAX) * d; */ \
-		__m128i hi = _mm_add_epi64(bc, ad);     /* hi = bc + ad; */ \
-		hi = _mm_slli_epi64(hi, 32);            /* hi <<= 32; */ \
-		return _mm_add_epi64(hi, ac);           /* return ac + hi; */ \
-	}
-
-#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
-	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
-	{ \
-		return _mm_load_si128((const __m128i *)in); \
-	} \
-	\
-	VEC_DECL_LOAD(sign, bits, size) \
-	{ \
-		return _mm_loadu_si128((const __m128i *)in); \
-	} \
-	\
-	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
-	{ \
-		_mm_store_si128((__m128i *)out, vec); \
-	} \
-	\
-	VEC_DECL_STORE(sign, bits, size) \
-	{ \
-		_mm_storeu_si128((__m128i *)out, vec); \
-	} \
-	\
-	VEC_DECL_ADD(sign, bits, size) \
-	{ \
-		return _mm_add_epi##bits(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_SUB(sign, bits, size) \
-	{ \
-		return _mm_sub_epi##bits(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_AND(sign, bits, size) \
-	{ \
-		return _mm_and_si128(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_OR(sign, bits, size) \
-	{ \
-		return _mm_or_si128(vec1, vec2); \
-	} \
-	\
-	VEC_DECL_XOR(sign, bits, size) \
-	{ \
-		return _mm_xor_si128(vec1, vec2); \
-	} \
-	\
-	VEC_SSE2_MUL_##bits##x##size(sign) \
-	\
-	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
-	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
-	VEC_GENERIC_AVG(sign, bits, size)
-
-#define VEC_DEFINE_COMPARISONS_SIGNED(bits, size) \
-	VEC_DECL_CMPEQ(, bits, size) \
-	{ \
-		return _mm_cmpeq_epi##bits(vec1, vec2); \
-	} \
-	VEC_DECL_CMPLT(, bits, size) \
-	{ \
-		return _mm_cmplt_epi##bits(vec1, vec2); \
-	} \
-	VEC_DECL_CMPGT(, bits, size) \
-	{ \
-		return _mm_cmpgt_epi##bits(vec1, vec2); \
-	} \
-	VEC_GENERIC_THAN_OR_EQUAL(, bits, size)
-
-#ifndef VEC_VUINT16X8
-# define VEC_VUINT16X8
-typedef __m128i vuint16x8;
-# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(_mm_setr_epi16(h, g, f, e, d, c, b, a))
-# define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 16, 8)
-VEC_GENERIC_COMPARISONS(u, U, 16, 8)
-#endif
-
-#ifndef VEC_VUINT32X4
-# define VEC_VUINT32X4
-typedef __m128i vuint32x4;
-# define VUINT32x4_CONSTANT(a, b, c, d) \
-	(_mm_setr_epi32(d, c, b, a))
-# define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 32, 4)
-VEC_GENERIC_COMPARISONS(u, U, 32, 4)
-#endif
-
-#ifndef VEC_VUINT64X2
-# define VEC_VUINT64X2
-typedef __m128i vuint64x2;
-VEC_FUNC_KEYWORDS vuint64x2 VUINT64x2_CONSTANT(uint64_t a, uint64_t b)
-{
-	return _mm_setr_epi32(b, b >> 32, a, a >> 32);
-}
-# define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(u, U, 64, 2)
-VEC_GENERIC_COMPARISONS(u, U, 64, 2)
-#endif
-
-#ifndef VEC_VINT8X16
-# define VEC_VINT8X16
-typedef __m128i vint8x16;
-# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	(_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a))
-# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 8, 16)
-VEC_DEFINE_COMPARISONS_SIGNED(8, 16)
-#endif
-
-#ifndef VEC_VINT16X8
-# define VEC_VINT16X8
-typedef __m128i vint16x8;
-# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(_mm_setr_epi16(h, g, f, e, d, c, b, a))
-# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 16, 8)
-VEC_DEFINE_COMPARISONS_SIGNED(16, 8)
-#endif
-
-#ifndef VEC_VINT32X4
-# define VEC_VINT32X4
-typedef __m128i vint32x4;
-# define VINT32x4_CONSTANT(a, b, c, d) \
-	(_mm_setr_epi32(d, c, b, a))
-# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
-VEC_DEFINE_OPERATIONS(, , 32, 4)
-VEC_DEFINE_COMPARISONS_SIGNED(32, 4)
-#endif
-
-#ifndef VEC_VINT64X2
-# define VEC_VINT64X2
-typedef __m128i vint64x2;
-# define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
-VEC_FUNC_KEYWORDS vint64x2 VINT64x2_CONSTANT(int64_t a, int64_t b)
-{
-	return _mm_setr_epi32(b, vec_rshift(b, 32), a, vec_rshift(a, 32));
-}
-VEC_DEFINE_OPERATIONS(, , 64, 2)
-VEC_GENERIC_COMPARISONS(, , 64, 2)
-#endif
-
-#undef VEC_DEFINE_OPERATIONS
-#undef VEC_DEFINE_COMPARISONS_SIGNED
-
-/* multiply */
-#undef VEC_SSE2_MUL_8x16
-#undef VEC_SSE2_MUL_16x8
-#undef VEC_SSE2_MUL_32x4
-#undef VEC_SSE2_MUL_64x2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/x86/avx2.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,257 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_X86_AVX2_H_
+#define VEC_IMPL_X86_AVX2_H_
+
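+/* AVX2 has no 8-bit multiply and no per-element 8/16-bit variable shifts, so the
+ * macros below emulate those by widening to 16- or 32-bit lanes, performing the
+ * operation there, and repacking the low part of each lane. */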
+#define VEC_AVX2_OPERATION_8x32_16x16(op, sign) \
+	do { \
+		/* unpack and multiply */ \
+		__m256i dst_even = _mm256_##op##_epi16(vec1.avx2, vec2.avx2); \
+		__m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1.avx2, 8), _mm256_srli_epi16(vec2.avx2, 8)); \
+	\
+		/* repack */ \
+		return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \
+			_mm256_slli_epi16(dst_odd, 8), \
+			_mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \
+		)}; \
+	} while (0)
+
+#define VEC_AVX2_OPERATION_8x32_32x8(op, sign) \
+	do { \
+		/* unpack */ \
+		__m256i dst_1 = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \
+		__m256i dst_2 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 8), _mm256_srli_epi32(vec2.avx2, 8)); \
+		__m256i dst_3 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \
+		__m256i dst_4 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 24), _mm256_srli_epi32(vec2.avx2, 24)); \
+	\
+		/* repack: byte n of each 32-bit lane comes from the low byte of dst_(n+1) */ \
+		return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \
+			_mm256_or_si256( \
+				_mm256_slli_epi32(dst_4, 24), \
+				_mm256_srli_epi32(_mm256_slli_epi32(dst_3, 24), 8) \
+			), \
+			_mm256_or_si256( \
+				_mm256_srli_epi32(_mm256_slli_epi32(dst_2, 24), 16), \
+				_mm256_srli_epi32(_mm256_slli_epi32(dst_1, 24), 24) \
+			) \
+		)}; \
+	} while (0)
+
+#define VEC_AVX2_OPERATION_16x16(op, sign) \
+	do { \
+		/* unpack into 32-bit lanes and operate */ \
+		__m256i dst_even = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \
+		__m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \
+	\
+		/* repack */ \
+		return (v##sign##int16x16){ .avx2 = _mm256_or_si256( \
+			_mm256_slli_epi32(dst_odd, 16), \
+			_mm256_srli_epi32(_mm256_slli_epi32(dst_even, 16), 16) \
+		)}; \
+	} while (0)
+
+// shifting
+
+#define VEC_AVX2_LSHIFT_8x32(sign) \
+	VEC_AVX2_OPERATION_8x32_32x8(sllv, sign)
+
+#define VEC_AVX2_LSHIFT_16x16(sign) \
+	VEC_AVX2_OPERATION_16x16(sllv, sign)
+
+#define VEC_AVX2_LSHIFT_32x8(sign) \
+	do { \
+		return (v##sign##int32x8){ .avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2) }; \
+	} while (0)
+
+#define VEC_AVX2_LSHIFT_64x4(sign) \
+	do { \
+		return (v##sign##int64x4){ .avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2) }; \
+	} while (0)
+
+#define VEC_AVX2_RSHIFT_8x32(sign, aORl) \
+	VEC_AVX2_OPERATION_8x32_32x8(sr##aORl##v, sign)
+
+#define VEC_AVX2_RSHIFT_16x16(sign, aORl) \
+	VEC_AVX2_OPERATION_16x16(sr##aORl##v, sign)
+
+#define VEC_AVX2_RSHIFT_32x8(sign, aORl) \
+	do { \
+		return (v##sign##int32x8){ .avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2) }; \
+	} while (0)
+
+#define VEC_AVX2_aRSHIFT_64x4(sign) \
+	do { \
+		/* AVX2 has no 64-bit arithmetic right shift; defer to the fallback */ \
+		return v##sign##int64x4_fallback_rshift(vec1, vec2); \
+	} while (0)
+
+#define VEC_AVX2_lRSHIFT_64x4(sign) \
+	do { \
+		return (v##sign##int64x4){ .avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2) }; \
+	} while (0)
+
+#define VEC_AVX2_RSHIFT_64x4(sign, aORl) \
+	VEC_AVX2_##aORl##RSHIFT_64x4(sign)
+
+// multiplication
+
+#define VEC_AVX2_MUL_8x32(sign) \
+	VEC_AVX2_OPERATION_8x32_16x16(mullo, sign)
+
+#define VEC_AVX2_MUL_16x16(sign) \
+	do { \
+		return (v##sign##int16x16){ .avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2) }; \
+	} while (0)
+
+#define VEC_AVX2_MUL_32x8(sign) \
+	do { \
+		return (v##sign##int32x8) { .avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2) }; \
+	} while (0)
+
+#define VEC_AVX2_MUL_64x4(sign) \
+	do { \
+		__m256i ac = _mm256_mul_epu32(vec1.avx2, vec2.avx2); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
+		__m256i b  = _mm256_srli_epi64(vec1.avx2, 32);       /* b = vec1 >> 32; */ \
+		__m256i bc = _mm256_mul_epu32(b, vec2.avx2);         /* bc = b * (vec2 & UINT32_MAX); */ \
+		__m256i d  = _mm256_srli_epi64(vec2.avx2, 32);       /* d = vec2 >> 32; */ \
+		__m256i ad = _mm256_mul_epu32(vec1.avx2, d);         /* ad = (vec1 & UINT32_MAX) * d; */ \
+		__m256i hi = _mm256_add_epi64(bc, ad);               /* hi = bc + ad; */ \
+		hi = _mm256_slli_epi64(hi, 32);                      /* hi <<= 32; */ \
+		return (v##sign##int64x4) { .avx2 = _mm256_add_epi64(hi, ac) }; /* return ac + hi; */ \
+	} while (0)
+
+// operations
+
+#define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const sign##int##bits##_t in[size]) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx2 = _mm256_load_si256((const __m256i *)in) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const sign##int##bits##_t in[size]) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx2 = _mm256_loadu_si256((const __m256i *)in) }; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		_mm256_store_si256((__m256i *)out, vec.avx2); \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		_mm256_storeu_si256((__m256i *)out, vec.avx2); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX2_MUL_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX2_LSHIFT_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX2_RSHIFT_##bits##x##size(sign, a); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX2_RSHIFT_##bits##x##size(sign, l); \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \
+		.load_aligned  = v##sign##int##bits##x##size##_avx2_load_aligned, \
+		.load          = v##sign##int##bits##x##size##_avx2_load, \
+		.store_aligned = v##sign##int##bits##x##size##_avx2_store_aligned, \
+		.store         = v##sign##int##bits##x##size##_avx2_store, \
+		.add           = v##sign##int##bits##x##size##_avx2_add, \
+		.sub           = v##sign##int##bits##x##size##_avx2_sub, \
+		.mul           = v##sign##int##bits##x##size##_avx2_mul, \
+		.and           = v##sign##int##bits##x##size##_avx2_and, \
+		.or            = v##sign##int##bits##x##size##_avx2_or, \
+		.xor           = v##sign##int##bits##x##size##_avx2_xor, \
+		.lshift        = v##sign##int##bits##x##size##_avx2_lshift, \
+		.rshift        = v##sign##int##bits##x##size##_avx2_rshift, \
+		.lrshift       = v##sign##int##bits##x##size##_avx2_lrshift, \
+	};
+
+#define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \
+	VEC_AVX2_DEFINE_OPERATIONS_SIGN( , bits, size) \
+	VEC_AVX2_DEFINE_OPERATIONS_SIGN(u, bits, size)
+
+VEC_AVX2_DEFINE_OPERATIONS(8, 32)
+VEC_AVX2_DEFINE_OPERATIONS(16, 16)
+VEC_AVX2_DEFINE_OPERATIONS(32, 8)
+VEC_AVX2_DEFINE_OPERATIONS(64, 4)
+
+#undef VEC_AVX2_DEFINE_OPERATIONS
+#undef VEC_AVX2_DEFINE_OPERATIONS_SIGN
+#undef VEC_AVX2_MUL_8x32
+#undef VEC_AVX2_MUL_16x16
+#undef VEC_AVX2_MUL_32x8
+#undef VEC_AVX2_MUL_64x4
+#undef VEC_AVX2_OPERATION_8x32_16x16
+#undef VEC_AVX2_OPERATION_8x32_32x8
+#undef VEC_AVX2_OPERATION_16x16
+#undef VEC_AVX2_LSHIFT_8x32
+#undef VEC_AVX2_LSHIFT_16x16
+#undef VEC_AVX2_LSHIFT_32x8
+#undef VEC_AVX2_LSHIFT_64x4
+#undef VEC_AVX2_RSHIFT_8x32
+#undef VEC_AVX2_RSHIFT_16x16
+#undef VEC_AVX2_RSHIFT_32x8
+#undef VEC_AVX2_aRSHIFT_64x4
+#undef VEC_AVX2_lRSHIFT_64x4
+#undef VEC_AVX2_RSHIFT_64x4
+
+#endif /* VEC_IMPL_X86_AVX2_H_ */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/x86/avx512f.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,254 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_X86_AVX512F_H_
+#define VEC_IMPL_X86_AVX512F_H_
+
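+/* AVX-512F alone only provides 32- and 64-bit integer arithmetic (8/16-bit
+ * element operations need AVX-512BW), so the 8- and 16-bit macros below emulate
+ * those on 32-bit lanes and repack the results. */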
+#define VEC_AVX512F_OPERATION_8x64(op, sign) \
+	do { \
+		/* unpack into 32-bit lanes and operate */ \
+		__m512i dst_1 = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \
+		__m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 8), _mm512_srli_epi32(vec2.avx512f, 8)); \
+		__m512i dst_3 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \
+		__m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 24), _mm512_srli_epi32(vec2.avx512f, 24)); \
+	\
+		/* repack: byte n of each 32-bit lane comes from the low byte of dst_(n+1) */ \
+		return (v##sign##int8x64){ .avx512f = _mm512_or_si512( \
+			_mm512_or_si512( \
+				_mm512_slli_epi32(dst_4, 24), \
+				_mm512_srli_epi32(_mm512_slli_epi32(dst_3, 24), 8) \
+			), \
+			_mm512_or_si512( \
+				_mm512_srli_epi32(_mm512_slli_epi32(dst_2, 24), 16), \
+				_mm512_srli_epi32(_mm512_slli_epi32(dst_1, 24), 24) \
+			) \
+		)}; \
+	} while (0)
+
+#define VEC_AVX512F_OPERATION_16x32(op, sign) \
+	do { \
+		/* unpack into 32-bit lanes and operate */ \
+		__m512i dst_even = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \
+		__m512i dst_odd = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \
+	\
+		/* repack */ \
+		return (v##sign##int16x32){ .avx512f = _mm512_or_si512( \
+			_mm512_slli_epi32(dst_odd, 16), \
+			_mm512_srli_epi32(_mm512_slli_epi32(dst_even, 16), 16) \
+		)}; \
+	} while (0)
+
+#define VEC_AVX512F_ADD_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(add, sign)
+
+#define VEC_AVX512F_ADD_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(add, sign)
+
+#define VEC_AVX512F_ADD_32x16(sign) \
+	do { \
+		return (v##sign##int32x16) { .avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_ADD_64x8(sign) \
+	do { \
+		return (v##sign##int64x8) { .avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_SUB_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(sub, sign)
+
+#define VEC_AVX512F_SUB_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(sub, sign)
+
+#define VEC_AVX512F_SUB_32x16(sign) \
+	do { \
+		return (v##sign##int32x16) { .avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_SUB_64x8(sign) \
+	do { \
+		return (v##sign##int64x8) { .avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_MUL_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(mullo, sign)
+
+#define VEC_AVX512F_MUL_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(mullo, sign)
+
+#define VEC_AVX512F_MUL_32x16(sign) \
+	do { \
+		return (v##sign##int32x16) { .avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_MUL_64x8(sign) \
+	do { \
+		__m512i ac = _mm512_mul_epu32(vec1.avx512f, vec2.avx512f); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
+		__m512i b  = _mm512_srli_epi64(vec1.avx512f, 32);          /* b = vec1 >> 32; */ \
+		__m512i bc = _mm512_mul_epu32(b, vec2.avx512f);            /* bc = b * (vec2 & UINT32_MAX); */ \
+		__m512i d  = _mm512_srli_epi64(vec2.avx512f, 32);          /* d = vec2 >> 32; */ \
+		__m512i ad = _mm512_mul_epu32(vec1.avx512f, d);            /* ad = (vec1 & UINT32_MAX) * d; */ \
+		__m512i hi = _mm512_add_epi64(bc, ad);                     /* hi = bc + ad; */ \
+		hi = _mm512_slli_epi64(hi, 32);                            /* hi <<= 32; */ \
+		return (v##sign##int64x8) { .avx512f = _mm512_add_epi64(hi, ac) }; /* return ac + hi; */ \
+	} while (0)
+
+#define VEC_AVX512F_LSHIFT_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(sllv, sign)
+
+#define VEC_AVX512F_LSHIFT_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(sllv, sign)
+
+#define VEC_AVX512F_LSHIFT_32x16(sign) \
+	do { \
+		return (v##sign##int32x16){ .avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_LSHIFT_64x8(sign) \
+	do { \
+		return (v##sign##int64x8){ .avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) \
+	VEC_AVX512F_OPERATION_8x64(sr##aORl##v, sign)
+
+#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) \
+	VEC_AVX512F_OPERATION_16x32(sr##aORl##v, sign)
+
+#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \
+	do { \
+		return (v##sign##int32x16){ .avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \
+	do { \
+		return (v##sign##int64x8){ .avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f) }; \
+	} while (0)
+
+#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const sign##int##bits##_t in[size]) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx512f = _mm512_load_si512((const __m512i *)in) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const sign##int##bits##_t in[size]) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx512f = _mm512_loadu_si512((const __m512i *)in) }; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		_mm512_store_si512((__m512i *)out, vec.avx512f); \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		_mm512_storeu_si512((__m512i *)out, vec.avx512f); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_ADD_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_SUB_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_MUL_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_LSHIFT_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_RSHIFT_##bits##x##size(sign, a); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \
+		.load_aligned  = v##sign##int##bits##x##size##_avx512f_load_aligned, \
+		.load          = v##sign##int##bits##x##size##_avx512f_load, \
+		.store_aligned = v##sign##int##bits##x##size##_avx512f_store_aligned, \
+		.store         = v##sign##int##bits##x##size##_avx512f_store, \
+		.add           = v##sign##int##bits##x##size##_avx512f_add, \
+		.sub           = v##sign##int##bits##x##size##_avx512f_sub, \
+		.mul           = v##sign##int##bits##x##size##_avx512f_mul, \
+		.and           = v##sign##int##bits##x##size##_avx512f_and, \
+		.or            = v##sign##int##bits##x##size##_avx512f_or, \
+		.xor           = v##sign##int##bits##x##size##_avx512f_xor, \
+		.lshift        = v##sign##int##bits##x##size##_avx512f_lshift, \
+		.rshift        = v##sign##int##bits##x##size##_avx512f_rshift, \
+		.lrshift       = v##sign##int##bits##x##size##_avx512f_lrshift, \
+	};
+
+#define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \
+	VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size) \
+	VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size)
+
+VEC_AVX512F_DEFINE_OPERATIONS(8, 64)
+VEC_AVX512F_DEFINE_OPERATIONS(16, 32)
+VEC_AVX512F_DEFINE_OPERATIONS(32, 16)
+VEC_AVX512F_DEFINE_OPERATIONS(64, 8)
+
+#undef VEC_AVX512F_DEFINE_OPERATIONS
+#undef VEC_AVX512F_DEFINE_OPERATIONS_SIGN
+#undef VEC_AVX512F_MUL_8x64
+#undef VEC_AVX512F_MUL_16x32
+#undef VEC_AVX512F_MUL_32x16
+#undef VEC_AVX512F_MUL_64x8
+
+#undef VEC_AVX512F_LSHIFT_8x64
+#undef VEC_AVX512F_LSHIFT_16x32
+#undef VEC_AVX512F_LSHIFT_32x16
+#undef VEC_AVX512F_LSHIFT_64x8
+
+#undef VEC_AVX512F_RSHIFT_8x64
+#undef VEC_AVX512F_RSHIFT_16x32
+#undef VEC_AVX512F_RSHIFT_32x16
+#undef VEC_AVX512F_RSHIFT_64x8
+
+#endif /* VEC_IMPL_X86_AVX512F_H_ */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/x86/mmx.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,185 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_X86_MMX_H_
+#define VEC_IMPL_X86_MMX_H_
+
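+/* Plain MMX draws no aligned/unaligned distinction here: loads and stores both
+ * go through a memcpy of the __m64, and 8-bit multiplication and shifts are
+ * emulated on 16-bit lanes below. */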
+#define VEC_MMX_OPERATION_8x8(op, sign) \
+	do { \
+		/* unpack into 16-bit lanes and operate */ \
+		__m64 dst_even = _mm_##op##_pi16(vec1.mmx, vec2.mmx); \
+		__m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1.mmx, 8), _mm_srli_pi16(vec2.mmx, 8)); \
+	\
+		/* repack */ \
+		return (v##sign##int8x8){ .mmx = _mm_or_si64( \
+			_mm_slli_pi16(dst_odd, 8), \
+			_mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \
+		)}; \
+	} while (0)
+
+// shifting
+#define VEC_MMX_LSHIFT_8x8(sign) \
+	VEC_MMX_OPERATION_8x8(sll, sign)
+
+#define VEC_MMX_LSHIFT_16x4(sign) \
+	do { \
+		return (v##sign##int16x4){ .mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx) }; \
+	} while (0)
+
+#define VEC_MMX_LSHIFT_32x2(sign) \
+	do { \
+		return (v##sign##int32x2){ .mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx) }; \
+	} while (0)
+
+#define VEC_MMX_RSHIFT_8x8(sign, aORl) \
+	VEC_MMX_OPERATION_8x8(sr##aORl, sign)
+
+#define VEC_MMX_RSHIFT_16x4(sign, aORl) \
+	do { \
+		return (v##sign##int16x4){ .mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx) }; \
+	} while (0)
+
+#define VEC_MMX_RSHIFT_32x2(sign, aORl) \
+	do { \
+		return (v##sign##int32x2){ .mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx) }; \
+	} while (0)
+
+// multiplication
+#define VEC_MMX_MUL_8x8(sign) \
+	VEC_MMX_OPERATION_8x8(mullo, sign)
+
+#define VEC_MMX_MUL_16x4(sign) \
+	do { \
+		/* we have a real instruction for this */ \
+		return (v##sign##int16x4){ .mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx) }; \
+	} while (0)
+
+#define VEC_MMX_MUL_32x2(sign) \
+	do { \
+		__m64 ac = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \
+		__m64 b  = _mm_srli_pi32(vec1.mmx, 16); \
+		__m64 bc = _mm_mullo_pi16(b, vec2.mmx); \
+		__m64 d  = _mm_srli_pi32(vec2.mmx, 16); \
+		__m64 ad = _mm_mullo_pi16(vec1.mmx, d); \
+		__m64 hi = _mm_add_pi32(bc, ad); \
+		hi = _mm_slli_pi32(hi, 16); \
+		return (v##sign##int32x2) { .mmx = _mm_add_pi32(hi, ac) }; /* return ac + hi; */ \
+	} while (0)
+
+#define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const sign##int##bits##_t in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		memcpy(&vec.mmx, in, sizeof(vec.mmx)); \
+		return vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		memcpy(out, &vec.mmx, sizeof(vec.mmx)); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_MMX_MUL_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .mmx = _mm_and_si64(vec1.mmx, vec2.mmx) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .mmx = _mm_or_si64(vec1.mmx, vec2.mmx) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .mmx = _mm_xor_si64(vec1.mmx, vec2.mmx) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_MMX_LSHIFT_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_MMX_RSHIFT_##bits##x##size(sign, a); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_MMX_RSHIFT_##bits##x##size(sign, l); \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \
+		.load_aligned  = v##sign##int##bits##x##size##_mmx_load_aligned, \
+		.load          = v##sign##int##bits##x##size##_mmx_load_aligned, \
+		.store_aligned = v##sign##int##bits##x##size##_mmx_store_aligned, \
+		.store         = v##sign##int##bits##x##size##_mmx_store_aligned, \
+		.add           = v##sign##int##bits##x##size##_mmx_add, \
+		.sub           = v##sign##int##bits##x##size##_mmx_sub, \
+		.mul           = v##sign##int##bits##x##size##_mmx_mul, \
+		.and           = v##sign##int##bits##x##size##_mmx_and, \
+		.or            = v##sign##int##bits##x##size##_mmx_or, \
+		.xor           = v##sign##int##bits##x##size##_mmx_xor, \
+		.lshift        = v##sign##int##bits##x##size##_mmx_lshift, \
+		.rshift        = v##sign##int##bits##x##size##_mmx_rshift, \
+		.lrshift       = v##sign##int##bits##x##size##_mmx_lrshift, \
+	};
+
+#define VEC_MMX_DEFINE_OPERATIONS(bits, size) \
+	VEC_MMX_DEFINE_OPERATIONS_SIGN( , bits, size) \
+	VEC_MMX_DEFINE_OPERATIONS_SIGN(u, bits, size)
+
+VEC_MMX_DEFINE_OPERATIONS(8, 8)
+VEC_MMX_DEFINE_OPERATIONS(16, 4)
+VEC_MMX_DEFINE_OPERATIONS(32, 2)
+
+#undef VEC_MMX_DEFINE_OPERATIONS
+#undef VEC_MMX_DEFINE_OPERATIONS_SIGN
+#undef VEC_MMX_MUL_8x8
+#undef VEC_MMX_MUL_16x4
+#undef VEC_MMX_MUL_32x2
+#undef VEC_MMX_OPERATION_8x8
+#undef VEC_MMX_LSHIFT_8x8
+#undef VEC_MMX_LSHIFT_16x4
+#undef VEC_MMX_LSHIFT_32x2
+#undef VEC_MMX_RSHIFT_8x8
+#undef VEC_MMX_RSHIFT_16x4
+#undef VEC_MMX_RSHIFT_32x2
+
+#endif /* VEC_IMPL_X86_MMX_H_ */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/x86/sse2.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,230 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_X86_SSE2_H_
+#define VEC_IMPL_X86_SSE2_H_
+
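+/* SSE2 has no 8-bit multiply or shift and no 64-bit arithmetic right shift, so
+ * those cases are emulated on 16-bit lanes or deferred to the fallback
+ * implementation below. */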
+#define VEC_SSE2_OPERATION_8x16(op, sign) \
+	do { \
+		/* unpack into 16-bit lanes and operate */ \
+		__m128i dst_even = _mm_##op##_epi16(vec1.sse, vec2.sse); \
+		__m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \
+	\
+		/* repack */ \
+		return (v##sign##int8x16){ .sse = _mm_or_si128( \
+			_mm_slli_epi16(dst_odd, 8), \
+			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
+		)}; \
+	} while (0)
+
+// shifting
+#define VEC_SSE2_LSHIFT_8x16(sign) \
+	VEC_SSE2_OPERATION_8x16(sll, sign)
+
+#define VEC_SSE2_LSHIFT_16x8(sign) \
+	do { \
+		return (v##sign##int16x8){ .sse = _mm_sll_epi16(vec1.sse, vec2.sse) }; \
+	} while (0)
+
+#define VEC_SSE2_LSHIFT_32x4(sign) \
+	do { \
+		return (v##sign##int32x4){ .sse = _mm_sll_epi32(vec1.sse, vec2.sse) }; \
+	} while (0)
+
+#define VEC_SSE2_LSHIFT_64x2(sign) \
+	do { \
+		return (v##sign##int64x2){ .sse = _mm_sll_epi64(vec1.sse, vec2.sse) }; \
+	} while (0)
+
+#define VEC_SSE2_RSHIFT_8x16(sign, aORl) \
+	VEC_SSE2_OPERATION_8x16(sr##aORl, sign)
+
+#define VEC_SSE2_RSHIFT_16x8(sign, aORl) \
+	do { \
+		return (v##sign##int16x8){ .sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse) }; \
+	} while (0)
+
+#define VEC_SSE2_RSHIFT_32x4(sign, aORl) \
+	do { \
+		return (v##sign##int32x4){ .sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse) }; \
+	} while (0)
+
+#define VEC_SSE2_aRSHIFT_64x2(sign) \
+	do { \
+		/* SSE2 has no 64-bit arithmetic right shift; defer to the fallback */ \
+		return v##sign##int64x2_fallback_rshift(vec1, vec2); \
+	} while (0)
+
+#define VEC_SSE2_lRSHIFT_64x2(sign) \
+	do { \
+		return (v##sign##int64x2){ .sse = _mm_srl_epi64(vec1.sse, vec2.sse) }; \
+	} while (0)
+
+#define VEC_SSE2_RSHIFT_64x2(sign, aORl) \
+	VEC_SSE2_##aORl##RSHIFT_64x2(sign)
+
+// shared between SSE2 variations
+#define VEC_SSE2_MUL_8x16(sign) \
+	VEC_SSE2_OPERATION_8x16(mullo, sign)
+
+#define VEC_SSE2_MUL_16x8(sign) \
+	do { \
+		/* we have a real instruction for this */ \
+		return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \
+	} while (0)
+
+#define VEC_SSE2_MUL_32x4(sign) \
+	do { \
+		/* this was stolen from... somewhere :) */ \
+		__m128i a13    = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \
+		__m128i b13    = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \
+		__m128i prod02 = _mm_mul_epu32(vec1.sse, vec2.sse); /* (-,a2*b2,-,a0*b0) */ \
+		__m128i prod13 = _mm_mul_epu32(a13, b13);           /* (-,a3*b3,-,a1*b1) */ \
+		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
+		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
+		return (v##sign##int32x4) { .sse = _mm_unpacklo_epi64(prod01, prod23) }; /* (ab3,ab2,ab1,ab0) */ \
+	} while (0)
+
+#define VEC_SSE2_MUL_64x2(sign) \
+	do { \
+		__m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
+		__m128i b  = _mm_srli_epi64(vec1.sse, 32);      /* b = vec1 >> 32; */ \
+		__m128i bc = _mm_mul_epu32(b, vec2.sse);        /* bc = b * (vec2 & UINT32_MAX); */ \
+		__m128i d  = _mm_srli_epi64(vec2.sse, 32);      /* d = vec2 >> 32; */ \
+		__m128i ad = _mm_mul_epu32(vec1.sse, d);        /* ad = (vec1 & UINT32_MAX) * d; */ \
+		__m128i hi = _mm_add_epi64(bc, ad);             /* hi = bc + ad; */ \
+		hi = _mm_slli_epi64(hi, 32);                    /* hi <<= 32; */ \
+		return (v##sign##int64x2) { .sse = _mm_add_epi64(hi, ac) }; /* return ac + hi; */ \
+	} while (0)
+
+#define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const sign##int##bits##_t in[size]) \
+	{ \
+		return (v##sign##int##bits##x##size) { .sse = _mm_load_si128((const __m128i *)in) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const sign##int##bits##_t in[size]) \
+	{ \
+		return (v##sign##int##bits##x##size) { .sse = _mm_loadu_si128((const __m128i *)in) }; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		_mm_store_si128((__m128i *)out, vec.sse); \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		_mm_storeu_si128((__m128i *)out, vec.sse); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .sse = _mm_add_epi##bits(vec1.sse, vec2.sse) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .sse = _mm_sub_epi##bits(vec1.sse, vec2.sse) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_MUL_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return (v##sign##int##bits##x##size) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_LSHIFT_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_RSHIFT_##bits##x##size(sign, a); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_RSHIFT_##bits##x##size(sign, l); \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \
+		.load_aligned  = v##sign##int##bits##x##size##_sse2_load_aligned, \
+		.load          = v##sign##int##bits##x##size##_sse2_load, \
+		.store_aligned = v##sign##int##bits##x##size##_sse2_store_aligned, \
+		.store         = v##sign##int##bits##x##size##_sse2_store, \
+		.add           = v##sign##int##bits##x##size##_sse2_add, \
+		.sub           = v##sign##int##bits##x##size##_sse2_sub, \
+		.mul           = v##sign##int##bits##x##size##_sse2_mul, \
+		.and           = v##sign##int##bits##x##size##_sse2_and, \
+		.or            = v##sign##int##bits##x##size##_sse2_or, \
+		.xor           = v##sign##int##bits##x##size##_sse2_xor, \
+		.lshift        = v##sign##int##bits##x##size##_sse2_lshift, \
+		.rshift        = v##sign##int##bits##x##size##_sse2_rshift, \
+		.lrshift       = v##sign##int##bits##x##size##_sse2_lrshift, \
+	};
+
+#define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \
+	VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size) \
+	VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size)
+
+// SSE is *only* 128-bit
+VEC_SSE2_DEFINE_OPERATIONS(8, 16)
+VEC_SSE2_DEFINE_OPERATIONS(16, 8)
+VEC_SSE2_DEFINE_OPERATIONS(32, 4)
+VEC_SSE2_DEFINE_OPERATIONS(64, 2)
+
+#undef VEC_SSE2_DEFINE_OPERATIONS
+#undef VEC_SSE2_DEFINE_OPERATIONS_SIGN
+#undef VEC_SSE2_MUL_8x16
+#undef VEC_SSE2_MUL_16x8
+#undef VEC_SSE2_MUL_32x4
+#undef VEC_SSE2_MUL_64x2
+#undef VEC_SSE2_OPERATION_8x16
+#undef VEC_SSE2_LSHIFT_8x16
+#undef VEC_SSE2_LSHIFT_16x8
+#undef VEC_SSE2_LSHIFT_32x4
+#undef VEC_SSE2_LSHIFT_64x2
+#undef VEC_SSE2_RSHIFT_8x16
+#undef VEC_SSE2_RSHIFT_16x8
+#undef VEC_SSE2_RSHIFT_32x4
+#undef VEC_SSE2_aRSHIFT_64x2
+#undef VEC_SSE2_lRSHIFT_64x2
+#undef VEC_SSE2_RSHIFT_64x2
+
+#endif /* VEC_IMPL_X86_SSE2_H_ */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/x86/sse41.h	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,55 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_X86_SSE41_H_
+#define VEC_IMPL_X86_SSE41_H_
+
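+/* SSE4.1 only adds a native 32-bit multiply (_mm_mullo_epi32) over SSE2 here;
+ * every other operation reuses the SSE2 functions. */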
+#define VEC_SSE41_DEFINE_OPERATIONS(sign) \
+	static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \
+	{ \
+		return (v##sign##int32x4){ .sse = _mm_mullo_epi32(vec1.sse, vec2.sse) }; \
+	} \
+	\
+	static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \
+		.load_aligned  = v##sign##int32x4_sse2_load_aligned, \
+		.load          = v##sign##int32x4_sse2_load, \
+		.store_aligned = v##sign##int32x4_sse2_store_aligned, \
+		.store         = v##sign##int32x4_sse2_store, \
+		.add           = v##sign##int32x4_sse2_add, \
+		.sub           = v##sign##int32x4_sse2_sub, \
+		.mul           = v##sign##int32x4_sse41_mul, \
+		.and           = v##sign##int32x4_sse2_and, \
+		.or            = v##sign##int32x4_sse2_or, \
+		.xor           = v##sign##int32x4_sse2_xor, \
+		.lshift        = v##sign##int32x4_sse2_lshift, \
+		.rshift        = v##sign##int32x4_sse2_rshift, \
+		.lrshift       = v##sign##int32x4_sse2_lrshift, \
+	};
+
+VEC_SSE41_DEFINE_OPERATIONS()
+VEC_SSE41_DEFINE_OPERATIONS(u)
+
+#undef VEC_SSE41_DEFINE_OPERATIONS
+
+#endif /* VEC_IMPL_X86_SSE41_H_ */
--- a/include/vec/vec.h	Tue Nov 19 15:55:01 2024 -0500
+++ b/include/vec/vec.h	Wed Nov 20 04:10:37 2024 -0500
@@ -29,6 +29,10 @@
 #include <string.h>
 #include <limits.h>
 
+#define VEC_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define VEC_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define VEC_CLAMP(x, min, max) (VEC_MIN(VEC_MAX((x), (min)), (max)))
+
 #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
 	(((a) >= (x)) && \
 	 ((a) > x || (b) >= (y)) && \
@@ -39,14 +43,11 @@
 
 /* GCC/clang attributes */
 #if defined(__has_attribute)
-# if __has_attribute(__always_inline__)
-#  define VEC_ALWAYS_INLINE __attribute__((__always_inline__))
-# endif
 # if __has_attribute(__aligned__)
 #  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
 # endif
 # if __has_attribute(__vector_size__)
-#  define VEC_HAVE_GNUC_VECTORS
+#  define VEC_COMPILER_HAS_GNUC_VECTORS
 # endif
 #endif
 
@@ -56,26 +57,6 @@
 # endif
 #endif
 
-/* FIXME: gcc 4.2 on Mac OS X doesn't have always_inline,
- * even though docs and many online sources say that it
- * should have it. */
-
-#ifndef VEC_ALWAYS_INLINE
-# define VEC_ALWAYS_INLINE
-#endif
-
-/* Allow users to define all of the symbols externally in
- * one translation unit, or as a shared library. */
-#ifdef VEC_EXTERN
-# ifdef VEC_EXTERN_DEFINE
-#  define VEC_FUNC_KEYWORDS extern inline
-# else
-#  define VEC_FUNC_KEYWORDS inline
-# endif
-#else
-# define VEC_FUNC_KEYWORDS static inline VEC_ALWAYS_INLINE
-#endif
-
 #if (__STDC_VERSION__ >= 201112L)
 # define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
 #else
@@ -97,131 +78,226 @@
 /* --------------------------------------------------------------- */
 /* Detect compiler SIMD support */
 
+#define VEC_GENERIC_ALIGNMENT 1
+#define VEC_ALTIVEC_ALIGNMENT 16
+#define VEC_SSE2_ALIGNMENT    16
+#define VEC_AVX2_ALIGNMENT    32
+#define VEC_AVX512F_ALIGNMENT 64
+
+// 64-bit vector types start out with the generic alignment
+#define VINT8x8_ALIGNMENT   VEC_GENERIC_ALIGNMENT
+#define VINT16x4_ALIGNMENT  VEC_GENERIC_ALIGNMENT
+#define VINT32x2_ALIGNMENT  VEC_GENERIC_ALIGNMENT
+#define VUINT8x8_ALIGNMENT  VEC_GENERIC_ALIGNMENT
+#define VUINT16x4_ALIGNMENT VEC_GENERIC_ALIGNMENT
+#define VUINT32x2_ALIGNMENT VEC_GENERIC_ALIGNMENT
+
+#define VINT8x16_ALIGNMENT  VINT8x8_ALIGNMENT
+#define VINT16x8_ALIGNMENT  VINT16x4_ALIGNMENT
+#define VINT32x4_ALIGNMENT  VINT32x2_ALIGNMENT
+#define VINT64x2_ALIGNMENT  VEC_GENERIC_ALIGNMENT
+#define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT
+#define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT
+#define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT
+#define VUINT64x2_ALIGNMENT VEC_GENERIC_ALIGNMENT
+
+#define VINT8x32_ALIGNMENT   VINT8x16_ALIGNMENT
+#define VINT16x16_ALIGNMENT  VINT16x8_ALIGNMENT
+#define VINT32x8_ALIGNMENT   VINT32x4_ALIGNMENT
+#define VINT64x4_ALIGNMENT   VINT64x2_ALIGNMENT
+#define VUINT8x32_ALIGNMENT  VUINT8x16_ALIGNMENT
+#define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
+#define VUINT32x8_ALIGNMENT  VUINT32x4_ALIGNMENT
+#define VUINT64x4_ALIGNMENT  VUINT64x2_ALIGNMENT
+
+#define VINT8x64_ALIGNMENT   VINT8x32_ALIGNMENT
+#define VINT16x32_ALIGNMENT  VINT16x16_ALIGNMENT
+#define VINT32x16_ALIGNMENT  VINT32x8_ALIGNMENT
+#define VINT64x8_ALIGNMENT   VINT64x4_ALIGNMENT
+#define VUINT8x64_ALIGNMENT  VUINT8x32_ALIGNMENT
+#define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
+#define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
+#define VUINT64x8_ALIGNMENT  VUINT64x4_ALIGNMENT
+
+#ifndef VEC_SUPPRESS_HW
+
 // IIRC `__VEC__' is also defined, but I don't know for sure.
 // IBM says that `__ALTIVEC__' is standard though.
 #ifdef __ALTIVEC__
 # include <altivec.h>
 # define VEC_COMPILER_HAS_ALTIVEC
+# if defined(__POWER8__) && defined(__VSX__)
+#  define VEC_COMPILER_HAS_ALTIVEC_VSX
+# endif
+# if VINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VINT8x16_ALIGNMENT
+#  define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+# if VINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VINT16x8_ALIGNMENT
+#  define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+# if VINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VINT32x4_ALIGNMENT
+#  define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+# if VINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VINT64x2_ALIGNMENT
+#  define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+# if VUINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VUINT8x16_ALIGNMENT
+#  define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+# if VUINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VUINT16x8_ALIGNMENT
+#  define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+# if VUINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VUINT32x4_ALIGNMENT
+#  define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+# if VUINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
+#  undef VUINT64x2_ALIGNMENT
+#  define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+# endif
+#endif
 
-# define VINT8x16_ALIGNMENT 16
-# define VINT16x8_ALIGNMENT 16
-# define VINT32x4_ALIGNMENT 16
-# define VINT64x2_ALIGNMENT 16
+#ifdef __MMX__
+# include <mmintrin.h>
+# define VEC_COMPILER_HAS_MMX
 #endif
 
 #ifdef __SSE2__
-# include <immintrin.h>
+# include <emmintrin.h>
 # define VEC_COMPILER_HAS_SSE2
-# ifdef __SSE42__
-#  define VEC_COMPILER_HAS_SSE42
+# ifdef __SSE4_1__
+#  define VEC_COMPILER_HAS_SSE41
+# endif
+# if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VINT8x16_ALIGNMENT
+#  define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
+# endif
+# if VINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VINT16x8_ALIGNMENT
+#  define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
+# endif
+# if VINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VINT32x4_ALIGNMENT
+#  define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
 # endif
-
-# define VINT8x16_ALIGNMENT 16
-# define VINT16x8_ALIGNMENT 16
-# define VINT32x4_ALIGNMENT 16
-# define VINT64x2_ALIGNMENT 16
-#endif
-
-#ifndef VINT8x16_ALIGNMENT
-# define VINT8x16_ALIGNMENT 1
-#endif
-#ifndef VINT16x8_ALIGNMENT
-# define VINT16x8_ALIGNMENT 1
-#endif
-#ifndef VINT32x4_ALIGNMENT
-# define VINT32x4_ALIGNMENT 1
-#endif
-#ifndef VINT64x2_ALIGNMENT
-# define VINT64x2_ALIGNMENT 1
-#endif
-#ifndef VUINT8x16_ALIGNMENT
-# define VUINT8x16_ALIGNMENT 1
-#endif
-#ifndef VUINT16x8_ALIGNMENT
-# define VUINT16x8_ALIGNMENT 1
-#endif
-#ifndef VUINT32x4_ALIGNMENT
-# define VUINT32x4_ALIGNMENT 1
-#endif
-#ifndef VUINT64x2_ALIGNMENT
-# define VUINT64x2_ALIGNMENT 1
+# if VINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VINT64x2_ALIGNMENT
+#  define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
+# endif
+# if VUINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VUINT8x16_ALIGNMENT
+#  define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
+# endif
+# if VUINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VUINT16x8_ALIGNMENT
+#  define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
+# endif
+# if VUINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VUINT32x4_ALIGNMENT
+#  define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
+# endif
+# if VUINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
+#  undef VUINT64x2_ALIGNMENT
+#  define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
+# endif
 #endif
 
-// generic 256-bit is just doubled 128-bit
-#ifndef VINT8x32_ALIGNMENT
-# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT
-#endif
-#ifndef VINT16x16_ALIGNMENT
-# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT
-#endif
-#ifndef VINT32x8_ALIGNMENT
-# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT
-#endif
-#ifndef VINT64x4_ALIGNMENT
-# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT
-#endif
-#ifndef VUINT8x32_ALIGNMENT
-# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT
-#endif
-#ifndef VUINT16x16_ALIGNMENT
-# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
-#endif
-#ifndef VUINT32x8_ALIGNMENT
-# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT
-#endif
-#ifndef VUINT64x4_ALIGNMENT
-# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT
+#ifdef __AVX2__
+# include <immintrin.h>
+# define VEC_COMPILER_HAS_AVX2
+# if VINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VINT8x32_ALIGNMENT
+#  define VINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
+# if VINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VINT16x16_ALIGNMENT
+#  define VINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
+# if VINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VINT32x8_ALIGNMENT
+#  define VINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
+# if VINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VINT64x4_ALIGNMENT
+#  define VINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
+# if VUINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VUINT8x32_ALIGNMENT
+#  define VUINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
+# if VUINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VUINT16x16_ALIGNMENT
+#  define VUINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
+# if VUINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VUINT32x8_ALIGNMENT
+#  define VUINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
+# if VUINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
+#  undef VUINT64x4_ALIGNMENT
+#  define VUINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
+# endif
 #endif
 
-// generic 512-bit is just doubled 256-bit
-#ifndef VINT8x64_ALIGNMENT
-# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
-#endif
-#ifndef VINT16x32_ALIGNMENT
-# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
-#endif
-#ifndef VINT32x16_ALIGNMENT
-# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
-#endif
-#ifndef VINT64x8_ALIGNMENT
-# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
-#endif
-#ifndef VUINT8x64_ALIGNMENT
-# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
-#endif
-#ifndef VUINT16x32_ALIGNMENT
-# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
-#endif
-#ifndef VUINT32x16_ALIGNMENT
-# define VUINT32x16_ALIGNMENT VUINT32x16_ALIGNMENT
-#endif
-#ifndef VUINT64x8_ALIGNMENT
-# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT
+#ifdef __AVX512F__
+# include <immintrin.h>
+# define VEC_COMPILER_HAS_AVX512F
+# if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VINT8x64_ALIGNMENT
+#  define VINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
+# if VINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VINT16x32_ALIGNMENT
+#  define VINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
+# if VINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VINT32x16_ALIGNMENT
+#  define VINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
+# if VINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VINT64x8_ALIGNMENT
+#  define VINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
+# if VUINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VUINT8x64_ALIGNMENT
+#  define VUINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
+# if VUINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VUINT16x32_ALIGNMENT
+#  define VUINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
+# if VUINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VUINT32x16_ALIGNMENT
+#  define VUINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
+# if VUINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
+#  undef VUINT64x8_ALIGNMENT
+#  define VUINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
+# endif
 #endif
 
-/* --------------------------------------------------------------- */
-/* Detect CPU SIMD support */
-
-// stubs for now... will be implemented sometime
-#define VEC_CPU_have_SSE2() (0)
-#define VEC_CPU_have_SSE42() (0)
-#define VEC_CPU_have_ALTIVEC() (0)
-#define VEC_CPU_have_ALTIVEC_VSX() (0)
+#endif
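/* Aside (not part of the patch): the VEC_COMPILER_HAS_* macros above only record which
 * intrinsics the compiler was allowed to emit; whether the running CPU actually has them
 * is presumably decided at runtime via vec_init(), declared further down. A small sketch
 * of probing the compile-time side (the helper name is hypothetical): */

#include <stdio.h>

static void vec_print_compile_time_support(void)
{
#ifdef VEC_COMPILER_HAS_SSE2
	puts("compiled with SSE2 intrinsics available");
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
	puts("compiled with AVX-512F intrinsics available");
#endif
	printf("vint32x4 arrays want %d-byte alignment\n", VINT32x4_ALIGNMENT);
}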
 
 /* --------------------------------------------------------------- */
 /* bit shift */
 
-VEC_FUNC_KEYWORDS uintmax_t vec_ulrshift(uintmax_t x, unsigned int y)
+inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y)
 {
 	return x >> y;
 }
 
-VEC_FUNC_KEYWORDS uintmax_t vec_ullshift(uintmax_t x, unsigned int y)
+inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y)
 {
 	return x << y;
 }
 
-VEC_FUNC_KEYWORDS intmax_t vec_lrshift(intmax_t x, unsigned int y)
+inline intmax_t vec_lrshift(intmax_t x, unsigned int y)
 {
 	// reinterpret as unsigned integer and then shift
 	union {
@@ -234,7 +310,7 @@
 	return xx.d;
 }
 
-VEC_FUNC_KEYWORDS intmax_t vec_llshift(intmax_t x, unsigned int y)
+inline intmax_t vec_llshift(intmax_t x, unsigned int y)
 {
 	// reinterpret as unsigned integer and then shift
 	union {
@@ -247,12 +323,12 @@
 	return xx.d;
 }
 
-VEC_FUNC_KEYWORDS uintmax_t vec_urshift(uintmax_t x, unsigned int y)
+inline uintmax_t vec_urshift(uintmax_t x, unsigned int y)
 {
 	return x >> y;
 }
 
-VEC_FUNC_KEYWORDS uintmax_t vec_ulshift(uintmax_t x, unsigned int y)
+inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y)
 {
 	return x << y;
 }
@@ -283,7 +359,7 @@
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
 **/
-VEC_FUNC_KEYWORDS intmax_t vec_rshift(intmax_t x, unsigned int y)
+inline intmax_t vec_rshift(intmax_t x, unsigned int y)
 {
 	static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);
 
@@ -302,7 +378,7 @@
 	return xx.d;
 }
 
-VEC_FUNC_KEYWORDS intmax_t vec_lshift(intmax_t x, unsigned int y)
+inline intmax_t vec_lshift(intmax_t x, unsigned int y)
 {
 	static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);
 
@@ -320,110 +396,246 @@
 	return xx.d;
 }
 
+#ifdef VEC_IMPLEMENTATION
+extern inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y);
+extern inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y);
+extern inline intmax_t vec_lrshift(intmax_t x, unsigned int y);
+extern inline intmax_t vec_llshift(intmax_t x, unsigned int y);
+extern inline uintmax_t vec_urshift(uintmax_t x, unsigned int y);
+extern inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y);
+extern inline intmax_t vec_rshift(intmax_t x, unsigned int y);
+extern inline intmax_t vec_lshift(intmax_t x, unsigned int y);
+#endif
+
 /* --------------------------------------------------------------- */
 /* Array alignment macros */
 
-#include <stdio.h>
-
+/* the alignment must be specified in bytes, must be a power of two, and
+ * should be a multiple of the type size. it is always assumed that the type
+ * itself will be on a boundary of its size, which may or may not be true */
 #ifdef VEC_ALIGNED
 # define VEC_ALIGNED_ARRAY(type, var, length, align) \
 	VEC_ALIGNED(align) type var[length]
 # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
 	(sizeof(var))
 #else
-/* the alignment must be specified in bytes and must be a multiple of the
- * type size. it is always assumed that the type will be on a boundary of
- * its size, which may or may not be true */
 # define VEC_ALIGNED_ARRAY(type, var, length, align) \
-	VEC_STATIC_ASSERT(align % sizeof(type) == 0 && align != 0, "vec: alignment needs to be a multiple of the type size and non-zero"); \
-	type vec_##var##_unaligned_[(length) + (align / sizeof(type)) - 1]; \
+	VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \
+	type vec_##var##_unaligned_[(length) + (align / sizeof(type))]; \
 	type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + (align - 1)) & ~(align - 1)); \
 	VEC_ASSERT(((uintptr_t)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
 # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
-	(sizeof(vec_##var##_unaligned_) - ((align) - 1))
+	(sizeof(vec_##var##_unaligned_) - (align - 1))
 #endif
 
-#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
-	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var))
+#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
+	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var))
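/* Aside (not part of the patch): a minimal usage sketch of the macros above
 * (the function name is hypothetical). VEC_ALIGNED_ARRAY declares a local array
 * whose start is rounded up to the requested power-of-two alignment, and the
 * SIZEOF/LENGTH macros describe the usable region behind that aligned start. */
static void example_aligned_array(void)
{
	VEC_ALIGNED_ARRAY(int32_t, arr, 4, 16); /* `arr` is usable as a 16-byte-aligned int32_t buffer */

	for (int i = 0; i < 4; i++)
		arr[i] = i;

	/* VEC_ALIGNED_ARRAY_LENGTH(arr, 16) evaluates to 4 here */
}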
 
 // ------------------------------------------------------------
 // predefined variants for each vector type
 
+#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 8, VINT8x8_ALIGNMENT)
+#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
+#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT)
+#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0)
+
+#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 4, VINT16x4_ALIGNMENT)
+#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
+#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT)
+#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0)
+
+#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 2, VINT32x2_ALIGNMENT)
+#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
+#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT)
+#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0)
+
+#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 8, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0)
+
+#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 4, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0)
+
+#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 2, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0)
+
 #define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
 #define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
 
 #define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
 #define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0)
 
 #define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
 #define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
 
 #define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
 #define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
 
 #define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
 #define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
 
 #define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
 #define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
 
 #define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
 #define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
 
 #define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
 #define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
 
 #define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT)
 #define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0)
 
 #define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT)
 #define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
 
 #define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT)
 #define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0)
 
 #define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT)
 #define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0)
 
 #define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT)
 #define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0)
 
 #define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT)
 #define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
 
 #define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT)
 #define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0)
 
 #define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT)
 #define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0)
 
 #define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT)
 #define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0)
 
-#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x16_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT)
 #define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
 
 #define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT)
 #define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0)
 
 #define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT)
 #define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0)
 
 #define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 64, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT)
 #define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0)
 
-#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x16_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT)
 #define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
 
 #define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT)
 #define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0)
 
 #define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT)
 #define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0)
 
 /* --------------------------------------------------------------- */
 /* Defines the structures for each vector type */
 
+// 64-bit
+typedef union {
+#ifdef VEC_COMPILER_HAS_MMX
+	__m64 mmx;
+#endif
+
+	uint8_t generic[8];
+} vuint8x8;
+
+typedef union {
+#ifdef VEC_COMPILER_HAS_MMX
+	__m64 mmx;
+#endif
+
+	uint16_t generic[4];
+} vuint16x4;
+
+typedef union {
+#ifdef VEC_COMPILER_HAS_MMX
+	__m64 mmx;
+#endif
+
+	uint32_t generic[2];
+} vuint32x2;
+
+typedef union {
+#ifdef VEC_COMPILER_HAS_MMX
+	__m64 mmx;
+#endif
+
+	int8_t generic[8];
+} vint8x8;
+
+typedef union {
+#ifdef VEC_COMPILER_HAS_MMX
+	__m64 mmx;
+#endif
+
+	int16_t generic[4];
+} vint16x4;
+
+typedef union {
+#ifdef VEC_COMPILER_HAS_MMX
+	__m64 mmx;
+#endif
+
+	int32_t generic[2];
+} vint32x2;
+
 // 128-bit
 typedef union {
 #ifdef VEC_COMPILER_HAS_SSE2
@@ -432,7 +644,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector unsigned char altivec;
 #endif
-	uint8_t generic[16];
+	vuint8x8 generic[2];
 } vuint8x16;
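/* Aside (not part of the patch): the 128-bit unions now expose their scalar view as
 * two of the 64-bit halves defined above (vuint8x16's `generic` member is a
 * vuint8x8[2]), so plain element access goes through a half first. A sketch,
 * assuming the first half holds elements 0..7 (the helper name is hypothetical): */
static uint8_t vuint8x16_get_lane_sketch(vuint8x16 v, int lane)
{
	return v.generic[lane / 8].generic[lane % 8];
}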
 
 typedef union {
@@ -442,7 +654,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector unsigned short altivec;
 #endif
-	uint16_t generic[8];
+	vuint16x4 generic[2];
 } vuint16x8;
 
 typedef union {
@@ -452,7 +664,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector unsigned int altivec;
 #endif
-	uint32_t generic[4];
+	vuint32x2 generic[2];
 } vuint32x4;
 
 typedef union {
@@ -472,7 +684,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector signed char altivec;
 #endif
-	int8_t generic[16];
+	vint8x8 generic[2];
 } vint8x16;
 
 typedef union {
@@ -482,7 +694,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector signed short altivec;
 #endif
-	int16_t generic[8];
+	vint16x4 generic[2];
 } vint16x8;
 
 typedef union {
@@ -492,7 +704,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector signed int altivec;
 #endif
-	int32_t generic[4];
+	vint32x2 generic[2];
 } vint32x4;
 
 typedef union {
@@ -507,1162 +719,615 @@
 
 // 256-bit
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vuint8x16 generic[2];
 } vuint8x32;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vuint16x8 generic[2];
 } vuint16x16;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vuint32x4 generic[2];
 } vuint32x8;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vuint64x2 generic[2];
 } vuint64x4;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vint8x16 generic[2];
 } vint8x32;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vint16x8 generic[2];
 } vint16x16;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vint32x4 generic[2];
 } vint32x8;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX2
+	__m256i avx2;
+#endif
 	vint64x2 generic[2];
 } vint64x4;
 
 // 512-bit
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vuint8x32 generic[2];
 } vuint8x64;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vuint16x16 generic[2];
 } vuint16x32;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vuint32x8 generic[2];
 } vuint32x16;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vuint64x4 generic[2];
 } vuint64x8;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vint8x32 generic[2];
 } vint8x64;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vint16x16 generic[2];
 } vint16x32;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vint32x8 generic[2];
 } vint32x16;
 
 typedef union {
+#ifdef VEC_COMPILER_HAS_AVX512F
+	__m512i avx512f;
+#endif
 	vint64x4 generic[2];
 } vint64x8;
 
-// --------------------------------------------------------------------------------
-// okay, now onto the actual functions:
-//
-// we have generic variations of every major operation EXCEPT aligned load and
-// aligned store. this means that a vector implementation can be created with
-// only aligned load and aligned store implemented, which sucks, but it werks
+// ---------------------------------------------------------------------------------
+// function declarations
+
+int vec_init(void);
+
+#define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]); \
+	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
+	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
 
-#define VEC_GENERIC_OPERATION(op, sign, csign, bits, size) \
-	do { \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \
-	\
-		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
-		v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \
-	\
-		for (int i = 0; i < size; i++) varr1[i] = (op); \
-	\
-		return v##sign##int##bits##x##size##_load_aligned(varr1); \
-	} while (0)
+#define VEC_DECLARE_OPERATIONS(bits, size) \
+	VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \
+	VEC_DECLARE_OPERATIONS_SIGN(u, bits, size)
+
+// 64-bit
+VEC_DECLARE_OPERATIONS(8, 8)
+VEC_DECLARE_OPERATIONS(16, 4)
+VEC_DECLARE_OPERATIONS(32, 2)
+
+// 128-bit
+VEC_DECLARE_OPERATIONS(8, 16)
+VEC_DECLARE_OPERATIONS(16, 8)
+VEC_DECLARE_OPERATIONS(32, 4)
+VEC_DECLARE_OPERATIONS(64, 2)
 
-#define VEC_GENERIC_ADD(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size)
-#define VEC_GENERIC_SUB(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size)
-#define VEC_GENERIC_MUL(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size)
-#define VEC_GENERIC_DIV(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size)
-#define VEC_GENERIC_AND(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size)
-#define VEC_GENERIC_OR(sign, csign, bits, size)  VEC_GENERIC_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size)
-#define VEC_GENERIC_XOR(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size)
+// 256-bit
+VEC_DECLARE_OPERATIONS(8, 32)
+VEC_DECLARE_OPERATIONS(16, 16)
+VEC_DECLARE_OPERATIONS(32, 8)
+VEC_DECLARE_OPERATIONS(64, 4)
 
-#define VEC_GENERIC_CMP(op, sign, csign, bits, size) \
-	VEC_GENERIC_OPERATION((varr1[i] op varr1[i]) ? csign##INT##bits##_MAX : 0, sign, csign, bits, size)
+// 512-bit
+VEC_DECLARE_OPERATIONS(8, 64)
+VEC_DECLARE_OPERATIONS(16, 32)
+VEC_DECLARE_OPERATIONS(32, 16)
+VEC_DECLARE_OPERATIONS(64, 8)
 
-#define VEC_GENERIC_CMPLT(sign, csign, bits, size) VEC_GENERIC_CMP(<,  sign, csign, bits, size)
-#define VEC_GENERIC_CMPLE(sign, csign, bits, size) VEC_GENERIC_CMP(<=, sign, csign, bits, size)
-#define VEC_GENERIC_CMPEQ(sign, csign, bits, size) VEC_GENERIC_CMP(==, sign, csign, bits, size)
-#define VEC_GENERIC_CMPGE(sign, csign, bits, size) VEC_GENERIC_CMP(>=, sign, csign, bits, size)
-#define VEC_GENERIC_CMPGT(sign, csign, bits, size) VEC_GENERIC_CMP(>,  sign, csign, bits, size)
+#undef VEC_DECLARE_OPERATIONS
+#undef VEC_DECLARE_OPERATIONS_SIGN
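/* Aside (not part of the patch): for one concrete instantiation, the macros above
 * declare e.g.
 *
 *	vuint32x4 vuint32x4_add(vuint32x4 vec1, vuint32x4 vec2);
 *
 * A short caller sketch (the function and the program around it are hypothetical;
 * vec_init() is presumably what selects the backend for the running CPU): */

#include <stdio.h>

static void example_vuint32x4_add(void)
{
	VUINT32x4_ALIGNED_ARRAY(in);
	VUINT32x4_ALIGNED_ARRAY(out);

	for (int i = 0; i < 4; i++)
		in[i] = i + 1;

	vec_init(); /* choose an implementation before doing any vector work */

	vuint32x4 a = vuint32x4_load_aligned(in);
	vuint32x4 b = vuint32x4_splat(10);
	vuint32x4_store_aligned(vuint32x4_add(a, b), out);

	for (int i = 0; i < 4; i++)
		printf("%u\n", (unsigned)out[i]); /* prints 11, 12, 13, 14 */
}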
+
+// ---------------------------------------------------------------------------------
+// okay, now we can actually implement the functions
+
+#ifdef VEC_IMPLEMENTATION
 
-#define VEC_GENERIC_SHIFT(op, sign, csign, bits, size) \
-	do { \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
-		VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \
-	\
-		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
-		vuint##bits##x##size##_store_aligned(vec2, varr2); \
-	\
-		for (int i = 0; i < size; i++) varr1[i] = (op); \
-	\
-		return v##sign##int##bits##x##size##_load_aligned(varr1); \
-	} while (0)
+// Fallback functions; these need to be defined before everything else.
+#include "impl/fallback.h"
 
-#define VEC_GENERIC_LSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size)
-#define VEC_GENERIC_RSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size)
-#define VEC_GENERIC_LRSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size)
+// okay, these are filled in for each supported backend
+#define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \
+	typedef struct { \
+		v##sign##int##bits##x##size (*splat)(sign##int##bits##_t x); \
+		v##sign##int##bits##x##size (*load_aligned)(const sign##int##bits##_t in[size]); \
+		v##sign##int##bits##x##size (*load)(const sign##int##bits##_t in[size]); \
+		void (*store_aligned)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
+		void (*store)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
+		v##sign##int##bits##x##size (*add)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*sub)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*mul)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*div)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*avg)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*and)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*or)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*xor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*not)(v##sign##int##bits##x##size vec); \
+		v##sign##int##bits##x##size (*cmplt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*cmple)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*cmpeq)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*cmpge)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*cmpgt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	} v##sign##int##bits##x##size##_impl;
 
-#ifdef VEC_COMPILER_HAS_SSE2
-// these are shared between SSE2 variations
-# define VEC_SSE2_MUL_8x16(sign) \
-	do { \
-		/* unpack and multiply */ \
-		__m128i dst_even = _mm_mullo_epi16(vec1.sse, vec2.sse); \
-		__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \
-	\
-		/* repack */ \
-		return (v##sign##int8x16){ .sse = _mm_or_si128( \
-			_mm_slli_epi16(dst_odd, 8), \
-			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
-		)}; \
-	} while (0)
+#define VEC_DEFINE_IMPL_STRUCT(bits, size) \
+	VEC_DEFINE_IMPL_STRUCT_SIGN( , bits, size) \
+	VEC_DEFINE_IMPL_STRUCT_SIGN(u, bits, size)
 
-# define VEC_SSE2_MUL_16x8(sign) \
-	do { \
-		/* we have a real instruction for this */ \
-		return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \
-	} while (0)
+// 64-bit
+VEC_DEFINE_IMPL_STRUCT(8, 8)
+VEC_DEFINE_IMPL_STRUCT(16, 4)
+VEC_DEFINE_IMPL_STRUCT(32, 2)
+
+// 128-bit
+VEC_DEFINE_IMPL_STRUCT(8, 16)
+VEC_DEFINE_IMPL_STRUCT(16, 8)
+VEC_DEFINE_IMPL_STRUCT(32, 4)
+VEC_DEFINE_IMPL_STRUCT(64, 2)
 
-# define VEC_SSE2_MUL_32x4(sign) \
-	do { \
-		/* this was stolen from... somewhere :) */ \
-		__m128i a13    = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \
-		__m128i b13    = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \
-		__m128i prod02 = _mm_mul_epu32(vec1, vec2);         /* (-,a2*b2,-,a0*b0) */ \
-		__m128i prod13 = _mm_mul_epu32(a13, b13);           /* (-,a3*b3,-,a1*b1) */ \
-		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
-		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
-		return (v##sign##int32x4) {.sse = _mm_unpacklo_epi64(prod01, prod23)}; /* (ab3,ab2,ab1,ab0) */ \
-	} while (0)
+// 256-bit
+VEC_DEFINE_IMPL_STRUCT(8, 32)
+VEC_DEFINE_IMPL_STRUCT(16, 16)
+VEC_DEFINE_IMPL_STRUCT(32, 8)
+VEC_DEFINE_IMPL_STRUCT(64, 4)
 
-# define VEC_SSE2_MUL_64x2(sign) \
-	do { \
-		__m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
-		__m128i b  = _mm_srli_epi64(vec1.sse, 32);      /* b = vec1 >> 32; */ \
-		__m128i bc = _mm_mul_epu32(b, vec2.sse);        /* bc = b * (vec2 & UINT32_MAX); */ \
-		__m128i d  = _mm_srli_epi64(vec2.sse, 32);      /* d = vec2 >> 32; */ \
-		__m128i ad = _mm_mul_epu32(vec1.sse, d);        /* ad = (vec1 & UINT32_MAX) * d; */ \
-		__m128i hi = _mm_add_epi64(bc, ad);             /* hi = bc + ad; */ \
-		hi = _mm_slli_epi64(hi, 32);                    /* hi <<= 32; */ \
-		return (v##sign##int64x2) {.sse = _mm_add_epi64(hi, ac); } /* return ac + hi; */ \
-	} while (0)
+// 512-bit
+VEC_DEFINE_IMPL_STRUCT(8, 64)
+VEC_DEFINE_IMPL_STRUCT(16, 32)
+VEC_DEFINE_IMPL_STRUCT(32, 16)
+VEC_DEFINE_IMPL_STRUCT(64, 8)
+
+#undef VEC_DEFINE_IMPL_STRUCT
+#undef VEC_DEFINE_IMPL_STRUCT_SIGN
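/* Aside (not part of the patch): each of these v*_impl structs is a function-pointer
 * table that a backend header fills in (e.g. vuint32x4_impl_sse41 in impl/x86/sse41.h
 * above). The selection logic itself is not shown in this header; the sketch below only
 * illustrates the general dispatch shape and uses hypothetical names (vuint32x4_backend,
 * vuint32x4_add_dispatch_sketch): */
static const vuint32x4_impl *vuint32x4_backend; /* presumably installed by vec_init() */

static vuint32x4 vuint32x4_add_dispatch_sketch(vuint32x4 vec1, vuint32x4 vec2)
{
	/* each public operation would simply call through the selected table */
	return vuint32x4_backend->add(vec1, vec2);
}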
+
+// ------------------------------------------------------------------------
+
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+# include "impl/ppc/altivec.h"
 #endif
 
-// --------------------------------------------------------------------------------
-// vuint8x16 implementation
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load_aligned(const uint8_t in[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vuint8x16) { .sse = _mm_load_si128((__m128i *)in) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_ld(0, in);
-	} else
-#endif
-	{
-		vuint8x16 vec;
-		memcpy(vec.generic, in, sizeof(vec.generic));
-		return vec;
-	}
-
-	VEC_ASSERT(0, "No suitable load_aligned variant found");
-
-	return (vuint8x16){ 0 };
-}
-
-VEC_FUNC_KEYWORDS void vuint8x16_store_aligned(vuint8x16 vec, uint8_t out[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		_mm_store_si128((__m128i *)out, vec.sse);
-		return;
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		vec_st(vec.altivec, 0, out);
-		return;
-	} else
-#endif
-	{
-		memcpy(out, vec.generic, sizeof(vec.generic));
-		return;
-	}
-
-	VEC_ASSERT(0, "No suitable aligned store variant found");
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_splat(uint8_t x)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
+#ifdef VEC_COMPILER_HAS_AVX512F
+# include "impl/x86/avx512f.h"
 #endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return (vuint8x16){ .altivec = vec_splat_u8(x) };
-	} else
-#endif
-	{
-		return (vuint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} };
-	}
-
-	// okay, we don't have a regular thing. call the load function with a splatted array
-	VUINT8x16_ALIGNED_ARRAY(arr);
-	for (int i = 0; i < 16; i++) arr[i] = x;
-	return vuint8x16_load_aligned(arr);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load(const uint8_t in[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vuint8x16) { .sse = _mm_loadu_si128((__m128i *)in) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in));
-	} else
-#endif
-	{
-		vuint8x16 vec;
-		memcpy(vec.generic, in, sizeof(vec.generic));
-		return vec;
-	}
-
-	// ok, we don't have unaligned load, copy the array
-	// and call the aligned load function
-	VUINT8x16_ALIGNED_ARRAY(aligned_in);
-	memcpy(aligned_in, in, 16);
-	return vuint8x16_load_aligned(aligned_in);
-}
-
-VEC_FUNC_KEYWORDS void vuint8x16_store(vuint8x16 vec, uint8_t out[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		_mm_storeu_si128((__m128i *)out, vec.sse);
-		return;
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// noop
-	} else
-#endif
-	{
-		memcpy(out, vec.generic, sizeof(vec.generic));
-		return;
-	}
 
-	// no unaligned store? use the aligned version
-	VUINT8x16_ALIGNED_ARRAY(aligned_out);
-	vuint8x16_store_aligned(vec, aligned_out);
-
-	// then copy to the output buffer
-	memcpy(out, aligned_out, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_add(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vuint8x16) { .sse = _mm_add_epi8(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_add(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] += vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_ADD(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_sub(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vuint8x16) { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_sub(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] -= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_SUB(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_mul(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		VEC_SSE2_MUL_8x16(u);
-	} else
+#ifdef VEC_COMPILER_HAS_AVX2
+# include "impl/x86/avx2.h"
 #endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-# ifdef vec_mul // this isn't available on older compilers
-		return vec_mul(vec1.altivec, vec2.altivec);
-# endif
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i];
-		return vec1;
-	}
 
-	VEC_GENERIC_MUL(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_div(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-	if (VEC_CPU_have_ALTIVEC_VSX()) {
-		return vec_div(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_DIV(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_and(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vuint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_and(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_AND(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_or(vuint8x16 vec1, vuint8x16 vec2)
-{
 #ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vuint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_or(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_OR(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_xor(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vuint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_xor(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_XOR(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lshift(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		//noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_sl(vec1, vec2);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulshift(vec1.generic[i], vec2.generic[i]);
-		return vec1;
-	}
-
-	VEC_GENERIC_LSHIFT(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_rshift(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		//noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_sl(vec1, vec2);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_urshift(vec1.generic[i], vec2.generic[i]);
-		return vec1;
-	}
-
-	VEC_GENERIC_RSHIFT(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lrshift(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		//noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_sl(vec1, vec2);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulrshift(vec1.generic[i], vec2.generic[i]);
-		return vec1;
-	}
-
-	VEC_GENERIC_LRSHIFT(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_avg(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_avg(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (uint8_t)(vec1.generic[i] + vec2.generic[i]) / 2;
-		return vec1;
-	}
-
-	return vuint8x16_div(vuint8x16_add(vec1, vec2), vuint8x16_splat(2));
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmplt(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] < vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPLT(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmple(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] <= vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPLE(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpeq(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] == vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPEQ(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpgt(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] > vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPGT(u, U, 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpge(vuint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] >= vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPGE(u, U, 8, 16);
-}
-
-// --------------------------------------------------------------------------------
-// vint8x16 implementation
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_load_aligned(const int8_t in[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vint8x16) { .sse = _mm_load_si128((__m128i *)in) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return (vint8x16) { .altivec = vec_ld(0, in) };
-	} else
+# include "impl/x86/sse2.h"
 #endif
-	{
-		vint8x16 vec;
-		memcpy(vec.generic, in, sizeof(vec.generic));
-		return vec;
-	}
-
-	VEC_ASSERT(0, "No suitable load_aligned variant found");
-
-	return (vint8x16){ 0 };
-}
-
-VEC_FUNC_KEYWORDS void vint8x16_store_aligned(vint8x16 vec, int8_t out[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		_mm_store_si128((__m128i *)out, vec.sse);
-		return;
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		vec_st(vec.altivec, 0, out);
-		return;
-	} else
-#endif
-	{
-		memcpy(out, vec.generic, sizeof(vec.generic));
-		return;
-	}
-
-	VEC_ASSERT(0, "No suitable aligned store variant found");
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_splat(int8_t x)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return (vint8x16){ .altivec = vec_splat_s8(x) };
-	} else
-#endif
-	{
-		return (vint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} };
-	}
-
-	// okay, we don't have a regular thing. call the load function with a splatted array
-	VINT8x16_ALIGNED_ARRAY(arr);
-	for (int i = 0; i < 16; i++) arr[i] = x;
-	return vint8x16_load_aligned(arr);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_load(const int8_t in[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vint8x16) { .sse = _mm_loadu_si128((__m128i *)in) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return (vint8x16) { .altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)) };
-	} else
-#endif
-	{
-		vint8x16 vec;
-		memcpy(vec.generic, in, sizeof(vec.generic));
-		return vec;
-	}
-
-	// ok, we don't have unaligned load, copy the array
-	// and call the aligned load function
-	VINT8x16_ALIGNED_ARRAY(aligned_in);
-	memcpy(aligned_in, in, 16);
-	return vint8x16_load_aligned(aligned_in);
-}
-
-VEC_FUNC_KEYWORDS void vint8x16_store(vint8x16 vec, int8_t out[16])
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		_mm_storeu_si128((__m128i *)out, vec.sse);
-		return;
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// noop
-	} else
-#endif
-	{
-		memcpy(out, vec.generic, sizeof(vec.generic));
-		return;
-	}
-
-	// no unaligned store? use the aligned version
-	VINT8x16_ALIGNED_ARRAY(aligned_out);
-	vint8x16_store_aligned(vec, aligned_out);
-
-	// then copy to the output buffer
-	memcpy(out, aligned_out, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_add(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vint8x16) { .sse = _mm_add_epi8(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return (vint8x16) { .altivec = vec_add(vec1.altivec, vec2.altivec) };
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] += vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_ADD(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_sub(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vint8x16) { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return (vint8x16) { .altivec = vec_sub(vec1.altivec, vec2.altivec) };
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] -= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_SUB(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_mul(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		VEC_SSE2_MUL_8x16();
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-# ifdef vec_mul // this isn't available on older compilers
-		return (vint8x16) { .altivec = vec_mul(vec1.altivec, vec2.altivec) };
-# endif
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_MUL(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_div(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-	if (VEC_CPU_have_ALTIVEC_VSX()) {
-		return (vint8x16) { .altivec = vec_div(vec1.altivec, vec2.altivec) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// noop
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_DIV(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_and(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return (vint8x16) {.altivec = vec_and(vec1.altivec, vec2.altivec) };
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_ADD(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_or(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_or(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i];
-		return vec1;
-	}
 
-	VEC_GENERIC_OR(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_xor(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		return (vint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) };
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_xor(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i];
-		return vec1;
-	}
-
-	VEC_GENERIC_XOR(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_lshift(vint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		//noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_sl(vec1, vec2);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lshift(vec1.generic[i], vec2.generic[i]);
-		return vec1;
-	}
-
-	VEC_GENERIC_LSHIFT(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_rshift(vint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		//noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_sl(vec1, vec2);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_rshift(vec1.generic[i], vec2.generic[i]);
-		return vec1;
-	}
-
-	VEC_GENERIC_RSHIFT(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_lrshift(vint8x16 vec1, vuint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		//noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_sl(vec1, vec2);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lrshift(vec1.generic[i], vec2.generic[i]);
-		return vec1;
-	}
-
-	VEC_GENERIC_LRSHIFT(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_avg(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		return vec_avg(vec1.altivec, vec2.altivec);
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (int8_t)(vec1.generic[i] + vec2.generic[i]) / 2;
-		return vec1;
-	}
-
-	return vint8x16_div(vint8x16_add(vec1, vec2), vint8x16_splat(2));
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmplt(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
+// depends on SSE2 functions; the only thing SSE4.1 provides for us
+// is a native 32-bit multiply
+#ifdef VEC_COMPILER_HAS_SSE41
+# include "impl/x86/sse41.h"
 #endif
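
For context on the SSE4.1 comment above: SSE2 has no packed multiply that keeps the low 32 bits of each lane, so an SSE2-only path has to emulate it, while SSE4.1 does it in one intrinsic. A rough sketch of the two approaches (the helper names are illustrative, not taken from sse2.h or sse41.h):

	#include <emmintrin.h>  /* SSE2 */
	#include <smmintrin.h>  /* SSE4.1 */

	/* SSE4.1: a single native low-32-bit multiply. */
	static __m128i mul32_sse41(__m128i a, __m128i b)
	{
		return _mm_mullo_epi32(a, b);
	}

	/* SSE2: widening multiplies on the even and odd lanes, then
	 * gather the low 32 bits of each 64-bit product. */
	static __m128i mul32_sse2(__m128i a, __m128i b)
	{
		__m128i even = _mm_mul_epu32(a, b);
		__m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),
		                             _mm_srli_si128(b, 4));
		return _mm_unpacklo_epi32(
			_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
			_mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
	}
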
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] < vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
 
-	VEC_GENERIC_CMPLT(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmple(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] <= vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPLE(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpeq(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho :)
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] == vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPEQ(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpgt(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
+#ifdef VEC_COMPILER_HAS_MMX
+# include "impl/x86/mmx.h"
 #endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] > vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
 
-	VEC_GENERIC_CMPGT(, , 8, 16);
-}
-
-VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpge(vint8x16 vec1, vint8x16 vec2)
-{
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (VEC_CPU_have_SSE2()) {
-		// noop
-	} else
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (VEC_CPU_have_ALTIVEC()) {
-		// these functions exist, no internet rn tho
-	} else
-#endif
-	{
-		for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] >= vec2.generic[i]) ? UINT8_MAX : 0;
-		return vec1;
-	}
-
-	VEC_GENERIC_CMPGE(, , 8, 16);
-}
-
-/* ----------------------------------------------------------------- */
-/* bitwise NOT is just an XOR with UINT[BITS]_MAX */
-
-#define DEFINE_NOT_OPERATION(sign, bits, size) \
-	VEC_FUNC_KEYWORDS v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
-	{ \
-		return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((sign##int##bits##_t)UINT##bits##_MAX)); \
-	}
-
-DEFINE_NOT_OPERATION(, 8, 16)
-DEFINE_NOT_OPERATION(u, 8, 16)
-
-DEFINE_NOT_OPERATION(, 8, 32)
-DEFINE_NOT_OPERATION(, 16, 16)
-DEFINE_NOT_OPERATION(, 32, 8)
-DEFINE_NOT_OPERATION(, 64, 4)
-DEFINE_NOT_OPERATION(u, 8, 32)
-DEFINE_NOT_OPERATION(u, 16, 16)
-DEFINE_NOT_OPERATION(u, 32, 8)
-DEFINE_NOT_OPERATION(u, 64, 4)
-
-DEFINE_NOT_OPERATION(, 8, 64)
-DEFINE_NOT_OPERATION(, 16, 32)
-DEFINE_NOT_OPERATION(, 32, 16)
-DEFINE_NOT_OPERATION(, 64, 8)
-DEFINE_NOT_OPERATION(u, 8, 64)
-DEFINE_NOT_OPERATION(u, 16, 32)
-DEFINE_NOT_OPERATION(u, 32, 16)
-DEFINE_NOT_OPERATION(u, 64, 8)
-
-#undef DEFINE_NOT_OPERATION
+#include "impl/generic.h"
 
 /* ---------------------------------------------------------------- */
 
-/* cleanup */
-#undef VEC_OPERATION_DECL
-#undef VEC_OPERATION_THIS_DECL
-#undef VEC_TWOWAY_DECL
+#include "impl/cpu.h" // runtime CPU feature detection
+
+// 64-bit
+static vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
+static vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
+static vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
+static vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
+static vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
+static vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
+
+// 128-bit
+static vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
+static vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
+static vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
+static vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
+static vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
+static vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
+static vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
+static vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
+
+// 256-bit
+static vint8x32_impl  *vint8x32_impl_cpu  = &vint8x32_impl_generic;
+static vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
+static vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
+static vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
+static vint32x8_impl  *vint32x8_impl_cpu  = &vint32x8_impl_generic;
+static vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
+static vint64x4_impl  *vint64x4_impl_cpu  = &vint64x4_impl_generic;
+static vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
+
+// 512-bit
+static vint8x64_impl  *vint8x64_impl_cpu  = &vint8x64_impl_generic;
+static vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
+static vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
+static vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
+static vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
+static vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
+static vint64x8_impl  *vint64x8_impl_cpu  = &vint64x8_impl_generic;
+static vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
+
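
Each *_impl_cpu pointer above selects a table of function pointers. The real struct definitions live in impl/generic.h and the per-ISA headers (not shown in this hunk); going by how the dispatchers below use them, each table is assumed to look roughly like this, with a NULL entry meaning "use the fallback":

	/* sketch only -- shape inferred from the dispatch code below */
	typedef struct {
		vint32x4 (*splat)(int32_t x);
		vint32x4 (*load_aligned)(const int32_t in[4]);
		vint32x4 (*load)(const int32_t in[4]);
		void     (*store_aligned)(vint32x4 vec, int32_t out[4]);
		void     (*store)(vint32x4 vec, int32_t out[4]);
		vint32x4 (*add)(vint32x4 vec1, vint32x4 vec2);
		vint32x4 (*sub)(vint32x4 vec1, vint32x4 vec2);
		vint32x4 (*mul)(vint32x4 vec1, vint32x4 vec2);
		/* ...div, avg, and, or, xor, not, the comparisons, and the
		 * shifts (which take a vuint32x4 shift count) follow... */
	} vint32x4_impl;
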
+int vec_init(void)
+{
+	// This function is NOT thread-safe. However, once vec has been
+	// initialized, all of the vector functions are thread-safe.
+	//
+	// It is possible to use vec without ever calling vec_init();
+	// everything then stays on the generic implementation, with no
+	// hardware vectorization at all (beyond whatever the compiler
+	// manages to auto-vectorize on its own).
+
+	vec_get_CPU_features();
 
-#undef VEC_DECL_SPLAT
-#undef VEC_DECL_LOAD
-#undef VEC_DECL_STORE
-#undef VEC_DECL_ADD
-#undef VEC_DECL_SUB
-#undef VEC_DECL_MUL
-#undef VEC_DECL_DIV
-#undef VEC_DECL_AND
-#undef VEC_DECL_OR
-#undef VEC_DECL_XOR
-#undef VEC_DECL_AVG
-#undef VEC_DECL_SHIFT
-#undef VEC_DECL_NOT
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+	if (vec_CPU_have_ALTIVEC()) {
+		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
+		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
+		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
+		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
+		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
+		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
+#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
+		if (vec_CPU_have_ALTIVEC_VSX()) {
+			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
+			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
+		}
+#endif
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512F
+	if (vec_CPU_have_AVX512F()) {
+		vint8x64_impl_cpu  = &vint8x64_impl_avx512f;
+		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
+		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
+		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
+		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
+		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
+		vint64x8_impl_cpu  = &vint64x8_impl_avx512f;
+		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_AVX2
+	if (vec_CPU_have_AVX2()) {
+		vint8x32_impl_cpu  = &vint8x32_impl_avx2;
+		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
+		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
+		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
+		vint32x8_impl_cpu  = &vint32x8_impl_avx2;
+		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
+		vint64x4_impl_cpu  = &vint64x4_impl_avx2;
+		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_SSE2
+	if (vec_CPU_have_SSE2()) {
+		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
+		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
+		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
+		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
+# ifdef VEC_COMPILER_HAS_SSE41
+		if (vec_CPU_have_SSE41()) {
+			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
+		} else
+# endif
+		{
+			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
+		}
+		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
+		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_MMX
+	if (vec_CPU_have_MMX()) {
+		vint8x8_impl_cpu  = &vint8x8_impl_mmx;
+		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
+		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
+		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
+		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
+		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
+	}
+#endif
+
+	// anything not covered above simply stays on the generic
+	// implementations it was initialized with
+	return 0;
+}
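
A minimal sketch of how this is meant to be used (mirroring test.c further down, which is also where the VINT32x4_ALIGNED_ARRAY helper appears): call vec_init() once at startup, after which every operation routes through whatever implementation was detected:

	#include "vec/vec.h"

	int main(void)
	{
		vec_init(); /* not thread-safe; call once, up front */

		VINT32x4_ALIGNED_ARRAY(a);
		VINT32x4_ALIGNED_ARRAY(b);
		for (int i = 0; i < 4; i++) {
			a[i] = i;
			b[i] = 10 * i;
		}

		vint32x4 va = vint32x4_load_aligned(a);
		vint32x4 vb = vint32x4_load_aligned(b);
		vint32x4 vsum = vint32x4_add(va, vb); /* SSE2/AltiVec/... if detected */

		vint32x4_store_aligned(vsum, a); /* a now holds {0, 11, 22, 33} */
		return 0;
	}
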
 
-#undef VEC_DECL_CMPLT
-#undef VEC_DECL_CMPGT
-#undef VEC_DECL_CMPEQ
-#undef VEC_DECL_CMPLE
-#undef VEC_DECL_CMPGE
+/* ---------------------------------------------------------------- */
 
-#undef VEC_GENERIC_SPLAT
-#undef VEC_GENERIC_DIVIDE
-#undef VEC_GENERIC_SHIFT
-#undef VEC_GENERIC_SHIFTS
-#undef VEC_GENERIC_LSHIFT
-#undef VEC_GENERIC_RSHIFT
-#undef VEC_GENERIC_LRSHIFT
-#undef VEC_GENERIC_AVG
-#undef VEC_GENERIC_THAN_OR_EQUAL
-#undef VEC_GENERIC_COMPARISON
-#undef VEC_GENERIC_COMPARISONS
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->splat) \
+			return v##sign##int##bits##x##size##_impl_cpu->splat(x); \
+	\
+		return v##sign##int##bits##x##size##_fallback_splat(x); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \
+			return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \
+	\
+		VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \
+		return (v##sign##int##bits##x##size){0}; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->load) \
+			return v##sign##int##bits##x##size##_impl_cpu->load(in); \
+	\
+		return v##sign##int##bits##x##size##_fallback_load(in); \
+	} \
+	\
+	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \
+			v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \
+			return; \
+		} \
+	\
+		VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \
+	} \
+	\
+	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->store) { \
+			v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \
+			return; \
+		} \
+	\
+		v##sign##int##bits##x##size##_fallback_store(vec, out); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->add) \
+			return v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_add(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->sub) \
+			return v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_sub(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->mul) \
+			return v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_mul(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->div) \
+			return v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_div(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->avg) \
+			return v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_avg(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->and) \
+			return v##sign##int##bits##x##size##_impl_cpu->and(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->or) \
+			return v##sign##int##bits##x##size##_impl_cpu->or(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->xor) \
+			return v##sign##int##bits##x##size##_impl_cpu->xor(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->not) \
+			return v##sign##int##bits##x##size##_impl_cpu->not(vec); \
+	\
+		return v##sign##int##bits##x##size##_fallback_not(vec); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->cmplt) \
+			return v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_cmplt(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->cmple) \
+			return v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_cmple(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->cmpeq) \
+			return v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_cmpeq(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->cmpge) \
+			return v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_cmpge(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->cmpgt) \
+			return v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_cmpgt(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->lshift) \
+			return v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_lshift(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->rshift) \
+			return v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_rshift(vec1, vec2); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		if (v##sign##int##bits##x##size##_impl_cpu->lrshift) \
+			return v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \
+	\
+		return v##sign##int##bits##x##size##_fallback_lrshift(vec1, vec2); \
+	}
 
-#undef VEC_VINT8X16
-#undef VEC_VINT16X8
-#undef VEC_VINT32X4
-#undef VEC_VINT64X2
-#undef VEC_VUINT8X16
-#undef VEC_VUINT16X8
-#undef VEC_VUINT32X4
-#undef VEC_VUINT64X2
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
+
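
To make the macro concrete, VEC_DEFINE_OPERATIONS(32, 4) expands the signed add wrapper to roughly:

	vint32x4 vint32x4_add(vint32x4 vec1, vint32x4 vec2)
	{
		if (vint32x4_impl_cpu->add)
			return vint32x4_impl_cpu->add(vec1, vec2);

		return vint32x4_fallback_add(vec1, vec2);
	}
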
+// 64-bit
+VEC_DEFINE_OPERATIONS(8, 8)
+VEC_DEFINE_OPERATIONS(16, 4)
+VEC_DEFINE_OPERATIONS(32, 2)
+
+// 128-bit
+VEC_DEFINE_OPERATIONS(8, 16)
+VEC_DEFINE_OPERATIONS(16, 8)
+VEC_DEFINE_OPERATIONS(32, 4)
+VEC_DEFINE_OPERATIONS(64, 2)
 
-#undef VEC_VINT8X32
-#undef VEC_VINT16X16
-#undef VEC_VINT32X8
-#undef VEC_VINT64X4
-#undef VEC_VUINT8X32
-#undef VEC_VUINT16X16
-#undef VEC_VUINT32X8
-#undef VEC_VUINT64X4
+// 256-bit
+VEC_DEFINE_OPERATIONS(8, 32)
+VEC_DEFINE_OPERATIONS(16, 16)
+VEC_DEFINE_OPERATIONS(32, 8)
+VEC_DEFINE_OPERATIONS(64, 4)
 
-#undef VEC_VINT8X64
-#undef VEC_VINT16X32
-#undef VEC_VINT32X16
-#undef VEC_VINT64X8
-#undef VEC_VUINT8X64
-#undef VEC_VUINT16X32
-#undef VEC_VUINT32X16
-#undef VEC_VUINT64X8
+// 512-bit
+VEC_DEFINE_OPERATIONS(8, 64)
+VEC_DEFINE_OPERATIONS(16, 32)
+VEC_DEFINE_OPERATIONS(32, 16)
+VEC_DEFINE_OPERATIONS(64, 8)
+
+#undef VEC_DEFINE_OPERATIONS
+#undef VEC_DEFINE_OPERATIONS_SIGN
+
+#endif /* VEC_IMPLEMENTATION */
 
 #endif /* VEC_VEC_H_ */
--- a/src/vec.c	Tue Nov 19 15:55:01 2024 -0500
+++ b/src/vec.c	Wed Nov 20 04:10:37 2024 -0500
@@ -1,3 +1,2 @@
-#define VEC_EXTERN
-#define VEC_EXTERN_DEFINE
+#define VEC_IMPLEMENTATION
 #include "vec/vec.h"
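
This is the usual single-header arrangement: the shared library is just the one translation unit above that defines VEC_IMPLEMENTATION before including the header, and everything else includes vec/vec.h plainly (as test.c below does). Presumably a project that does not link libvec would do the same thing itself:

	/* in exactly one .c file: */
	#define VEC_IMPLEMENTATION
	#include "vec/vec.h"

	/* in every other file: */
	#include "vec/vec.h"
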
--- a/test/Makefile	Tue Nov 19 15:55:01 2024 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,30 +0,0 @@
-CFLAGS += -std=c99 -I../include
-
-# binary files
-BINS = test-gcc test-generic test-host
-OBJS = $(BINS:=.o)
-
-.PHONY: all clean test
-
-all: $(BINS)
-
-# suppress the platform-dependent hardware stuff so we only have
-# GCC vector extensions
-test-gcc: CFLAGS += -DVEC_SUPPRESS_HW
-
-# also suppress GCC extensions, leaving only the defaults
-test-generic: CFLAGS += -DVEC_SUPPRESS_HW -DVEC_SUPPRESS_GCC
-
-$(OBJS): main.c
-	$(CC) $(CFLAGS) -o $@ -c $^
-
-$(BINS): %: %.o
-	$(CC) $(LDFLAGS) -o $@ $^
-
-clean:
-	$(RM) $(BINS) $(OBJS)
-
-test: clean $(BINS)
-	./test-gcc
-	./test-generic
-	./test-host
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/Makefile.ppc	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,3 @@
+CFLAGS += -maltivec
+
+include Makefile.template
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/Makefile.template	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,40 @@
+CFLAGS += -g -O2 -std=c99 -I../include
+
+HEADERS = ../include/vec/vec.h \
+	../include/vec/impl/ppc/altivec.h \
+	../include/vec/impl/x86/avx2.h \
+	../include/vec/impl/x86/avx512f.h \
+	../include/vec/impl/x86/mmx.h \
+	../include/vec/impl/x86/sse2.h \
+	../include/vec/impl/x86/sse41.h \
+	../include/vec/impl/cpu.h \
+	../include/vec/impl/fallback.h \
+	../include/vec/impl/generic.h
+BINS = test-generic test-host
+OBJS = vec-generic.o vec-host.o test.o
+
+.PHONY: all clean test
+
+all: $(BINS)
+
+vec-generic.o: ../src/vec.c
+	$(CC) $(CFLAGS) -DVEC_SUPPRESS_HW=1 -c -o $@ $<
+
+vec-host.o: ../src/vec.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+test.o: test.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+test-generic: vec-generic.o test.o
+	$(CC) $(LDFLAGS) -o $@ $^
+
+test-host: vec-host.o test.o
+	$(CC) $(LDFLAGS) -o $@ $^
+
+clean:
+	$(RM) $(BINS) $(OBJS)
+
+test: clean $(BINS)
+	./test-generic
+	./test-host
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/Makefile.x86	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,3 @@
+CFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f
+
+include Makefile.template
\ No newline at end of file
--- a/test/main.c	Tue Nov 19 15:55:01 2024 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,106 +0,0 @@
-#define VEC_EXTERN
-#define VEC_EXTERN_DEFINE
-#include "vec/vec.h"
-
-#include <stdio.h>
-#include <inttypes.h>
-
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
-
-static const int8_t testval8[] = {
-	INT8_C(-80), INT8_C(-3), INT8_C(25), INT8_C(0x7F),
-	INT8_C(-42), INT8_C(27), INT8_C(24), INT8_C(0x40),
-};
-
-static const uint8_t testvalu8[] = {
-	UINT8_C(0x00), UINT8_C(0xFF), UINT8_C(0xFE), UINT8_C(0x7F),
-	UINT8_C(0xC0), UINT8_C(0x80), UINT8_C(0x20), UINT8_C(0x50),
-};
-
-static const int16_t testval16[] = {
-	INT16_C(-8000), INT16_C(-30), INT16_MAX, INT16_C(0x4000),
-	INT16_C(-42),   INT16_C(250), INT16_MIN, INT16_C(0x500),
-};
-
-static const uint16_t testvalu16[] = {
-	UINT16_C(0x0000), UINT16_C(0xFFFF), UINT16_C(0xFEA), UINT16_C(0x7FF),
-	UINT16_C(0x7FFF), UINT16_C(0x8000), UINT16_C(0x20B), UINT16_C(0x50C),
-};
-
-static const int32_t testval32[] = {
-	INT32_C(-1000000),   INT32_C(-3), INT32_C(0x00000000), INT32_C(0xFFFFFFFF),
-	INT32_C(     -42),   INT32_C(27), INT32_C(0xABCDEF03), INT32_C(0x00000FFF),
-	INT32_C(0xFFFFFFFF), INT32_C( 0), INT32_C(0xFFFFFFFE), INT32_C(         1),
-};
-
-static const uint32_t testvalu32[] = {
-	UINT32_C(0x00000000), UINT32_C(0xDEADBEEF), UINT32_C(42), UINT32_C(0x12340000),
-	UINT32_C(0xFFFFFFFF), UINT32_C(0xFEDCBA98), UINT32_C(17), UINT32_C(0x00012345),
-	UINT32_C(0xFFFFFFFF), UINT32_C(0xFFFFFFFE), UINT32_C( 0), UINT32_C(         1),
-};
-
-static const int64_t testval64[] = {
-	INT64_MAX, INT64_C(-3),     INT64_C(0x00000000),   INT64_C(0xFFFFFFFFF),
-	INT64_MIN, INT64_C(645366), INT64_C(0x12345ABCDE), INT64_C(0xF00000FFF),
-};
-
-static const uint64_t testvalu64[] = {
-	UINT64_MAX,     UINT64_C(0x44354365), UINT64_C(0x00000000),   UINT64_C(0xFFFFFFFFF),
-	UINT64_C(0xff), UINT64_C(645366),     UINT64_C(0x12345ABCDE), UINT64_C(0xF00000FFF),
-};
-
-#define VTEST(sign, csign, bits, size) \
-	static inline v##sign##int##bits##x##size vtest##sign##bits##x##size(const size_t start) \
-	{ \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(x); \
-		for (size_t i = 0; i < size; i++) \
-			x[i] = testval##sign##bits[(start + i) % ARRAY_SIZE(testval##sign##bits)]; \
-		return v##sign##int##bits##x##size##_load_aligned(x); \
-	}
-
-#define VPRINT(sign, csign, psign, bits, size) \
-	static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \
-	{ \
-		fputs("vector: ", file); \
-	\
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(v); \
-	\
-		v##sign##int##bits##x##size##_store_aligned(vec, v); \
-	\
-		fprintf(file, "%" PRI ## psign ## bits, v[0]); \
-	\
-		for (int i = 1; i < size; i++) \
-			fprintf(file, ", %" PRI ## psign ## bits, v[i]); \
-	\
-		fputs("\n", file); \
-	\
-	}
-
-#define DEF_VEC_TEST_FUNCS(bits, size) \
-	VTEST(, , bits, size)     VTEST(u, U, bits, size) \
-	VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size)
-
-DEF_VEC_TEST_FUNCS(8, 16)
-
-#undef DEF_VEC_TEST_FUNCS
-#undef VPRINT
-#undef VTEST
-
-// ------------------------------------------------------------
-
-#include "test_align.h"
-#include "test_arith.h"
-#include "test_compare.h"
-
-// ------------------------------------------------------------
-
-int main(void)
-{
-	int ret = 0;
-
-	ret |= test_align();
-	ret |= test_arith();
-	ret |= test_compare();
-
-	return ret;
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test.c	Wed Nov 20 04:10:37 2024 -0500
@@ -0,0 +1,123 @@
+#include "vec/vec.h"
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+
+static const int8_t testval8[] = {
+	INT8_C(-80), INT8_C(-3), INT8_C(25), INT8_C(0x7F),
+	INT8_C(-42), INT8_C(27), INT8_C(24), INT8_C(0x40),
+};
+
+static const uint8_t testvalu8[] = {
+	UINT8_C(0x00), UINT8_C(0xFF), UINT8_C(0xFE), UINT8_C(0x7F),
+	UINT8_C(0xC0), UINT8_C(0x80), UINT8_C(0x20), UINT8_C(0x50),
+};
+
+static const int16_t testval16[] = {
+	INT16_C(-8000), INT16_C(-30), INT16_MAX, INT16_C(0x4000),
+	INT16_C(-42),   INT16_C(250), INT16_MIN, INT16_C(0x500),
+};
+
+static const uint16_t testvalu16[] = {
+	UINT16_C(0x0000), UINT16_C(0xFFFF), UINT16_C(0xFEA), UINT16_C(0x7FF),
+	UINT16_C(0x7FFF), UINT16_C(0x8000), UINT16_C(0x20B), UINT16_C(0x50C),
+};
+
+static const int32_t testval32[] = {
+	INT32_C(-1000000),   INT32_C(-3), INT32_C(0x00000000), INT32_C(0xFFFFFFFF),
+	INT32_C(     -42),   INT32_C(27), INT32_C(0xABCDEF03), INT32_C(0x00000FFF),
+	INT32_C(0xFFFFFFFF), INT32_C( 0), INT32_C(0xFFFFFFFE), INT32_C(         1),
+};
+
+static const uint32_t testvalu32[] = {
+	UINT32_C(0x00000000), UINT32_C(0xDEADBEEF), UINT32_C(42), UINT32_C(0x12340000),
+	UINT32_C(0xFFFFFFFF), UINT32_C(0xFEDCBA98), UINT32_C(17), UINT32_C(0x00012345),
+	UINT32_C(0xFFFFFFFF), UINT32_C(0xFFFFFFFE), UINT32_C( 0), UINT32_C(         1),
+};
+
+static const int64_t testval64[] = {
+	INT64_MAX, INT64_C(-3),     INT64_C(0x00000000),   INT64_C(0xFFFFFFFFF),
+	INT64_MIN, INT64_C(645366), INT64_C(0x12345ABCDE), INT64_C(0xF00000FFF),
+};
+
+static const uint64_t testvalu64[] = {
+	UINT64_MAX,     UINT64_C(0x44354365), UINT64_C(0x00000000),   UINT64_C(0xFFFFFFFFF),
+	UINT64_C(0xff), UINT64_C(645366),     UINT64_C(0x12345ABCDE), UINT64_C(0xF00000FFF),
+};
+
+#define VTEST(sign, csign, bits, size) \
+	static inline v##sign##int##bits##x##size vtest##sign##bits##x##size(const size_t start) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(x); \
+		for (size_t i = 0; i < size; i++) \
+			x[i] = testval##sign##bits[(start + i) % ARRAY_SIZE(testval##sign##bits)]; \
+		return v##sign##int##bits##x##size##_load_aligned(x); \
+	}
+
+#define VPRINT(sign, csign, psign, bits, size) \
+	static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \
+	{ \
+		fputs("vector: ", file); \
+	\
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(v); \
+	\
+		v##sign##int##bits##x##size##_store_aligned(vec, v); \
+	\
+		fprintf(file, "%" PRI ## psign ## bits, v[0]); \
+	\
+		for (int i = 1; i < size; i++) \
+			fprintf(file, ", %" PRI ## psign ## bits, v[i]); \
+	\
+		fputs("\n", file); \
+	\
+	}
+
+#define DEF_VEC_TEST_FUNCS(bits, size) \
+	VTEST(, , bits, size)     VTEST(u, U, bits, size) \
+	VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size)
+
+DEF_VEC_TEST_FUNCS(8, 8)
+DEF_VEC_TEST_FUNCS(16, 4)
+DEF_VEC_TEST_FUNCS(32, 2)
+
+DEF_VEC_TEST_FUNCS(8, 16)
+DEF_VEC_TEST_FUNCS(16, 8)
+DEF_VEC_TEST_FUNCS(32, 4)
+DEF_VEC_TEST_FUNCS(64, 2)
+
+DEF_VEC_TEST_FUNCS(8, 32)
+DEF_VEC_TEST_FUNCS(16, 16)
+DEF_VEC_TEST_FUNCS(32, 8)
+DEF_VEC_TEST_FUNCS(64, 4)
+
+DEF_VEC_TEST_FUNCS(8, 64)
+DEF_VEC_TEST_FUNCS(16, 32)
+DEF_VEC_TEST_FUNCS(32, 16)
+DEF_VEC_TEST_FUNCS(64, 8)
+
+#undef DEF_VEC_TEST_FUNCS
+#undef VPRINT
+#undef VTEST
+
+// ------------------------------------------------------------
+
+#include "test_align.h"
+#include "test_arith.h"
+#include "test_compare.h"
+
+// ------------------------------------------------------------
+
+int main(void)
+{
+	int ret = 0;
+
+	vec_init();
+
+	ret |= test_align();
+	ret |= test_arith();
+	ret |= test_compare();
+
+	return ret;
+}
--- a/test/test_align.h	Tue Nov 19 15:55:01 2024 -0500
+++ b/test/test_align.h	Wed Nov 20 04:10:37 2024 -0500
@@ -31,7 +31,24 @@
 	RUN_TEST( ,  , bits, size) \
 	RUN_TEST(u, U, bits, size)
 
+	RUN_TESTS(8, 8)
+	RUN_TESTS(16, 4)
+	RUN_TESTS(32, 2)
+
 	RUN_TESTS(8, 16)
+	RUN_TESTS(16, 8)
+	RUN_TESTS(32, 4)
+	RUN_TESTS(64, 2)
+
+	RUN_TESTS(8, 32)
+	RUN_TESTS(16, 16)
+	RUN_TESTS(32, 8)
+	RUN_TESTS(64, 4)
+
+	RUN_TESTS(8, 64)
+	RUN_TESTS(16, 32)
+	RUN_TESTS(32, 16)
+	RUN_TESTS(64, 8)
 
 #undef RUN_TESTS
 #undef RUN_TEST
--- a/test/test_arith.h	Tue Nov 19 15:55:01 2024 -0500
+++ b/test/test_arith.h	Wed Nov 20 04:10:37 2024 -0500
@@ -69,7 +69,24 @@
 	CREATE_TESTS_SIGN(, d, , bits, size) \
 	CREATE_TESTS_SIGN(u, u, U, bits, size)
 
+CREATE_TESTS(8, 8)
+CREATE_TESTS(16, 4)
+CREATE_TESTS(32, 2)
+
 CREATE_TESTS(8, 16)
+CREATE_TESTS(16, 8)
+CREATE_TESTS(32, 4)
+CREATE_TESTS(64, 2)
+
+CREATE_TESTS(8, 32)
+CREATE_TESTS(16, 16)
+CREATE_TESTS(32, 8)
+CREATE_TESTS(64, 4)
+
+CREATE_TESTS(8, 64)
+CREATE_TESTS(16, 32)
+CREATE_TESTS(32, 16)
+CREATE_TESTS(64, 8)
 
 #undef CREATE_TESTS_SIGN
 #undef CREATE_TESTS
@@ -109,7 +126,24 @@
 	RUN_TESTS_SIGN( , bits, size) \
 	RUN_TESTS_SIGN(u, bits, size)
 
+	RUN_TESTS(8, 8)
+	RUN_TESTS(16, 4)
+	RUN_TESTS(32, 2)
+
 	RUN_TESTS(8, 16)
+	RUN_TESTS(16, 8)
+	RUN_TESTS(32, 4)
+	RUN_TESTS(64, 2)
+
+	RUN_TESTS(8, 32)
+	RUN_TESTS(16, 16)
+	RUN_TESTS(32, 8)
+	RUN_TESTS(64, 4)
+
+	RUN_TESTS(8, 64)
+	RUN_TESTS(16, 32)
+	RUN_TESTS(32, 16)
+	RUN_TESTS(64, 8)
 
 #undef RUN_TESTS_SIGN
 #undef RUN_TESTS
--- a/test/test_compare.h	Tue Nov 19 15:55:01 2024 -0500
+++ b/test/test_compare.h	Wed Nov 20 04:10:37 2024 -0500
@@ -32,7 +32,24 @@
 
 #define CREATE_TESTS(bits, size) CREATE_TESTS_SIGN(, d, bits, size) CREATE_TESTS_SIGN(u, u, bits, size)
 
+CREATE_TESTS(8, 8)
+CREATE_TESTS(16, 4)
+CREATE_TESTS(32, 2)
+
 CREATE_TESTS(8, 16)
+CREATE_TESTS(16, 8)
+CREATE_TESTS(32, 4)
+CREATE_TESTS(64, 2)
+
+CREATE_TESTS(8, 32)
+CREATE_TESTS(16, 16)
+CREATE_TESTS(32, 8)
+CREATE_TESTS(64, 4)
+
+CREATE_TESTS(8, 64)
+CREATE_TESTS(16, 32)
+CREATE_TESTS(32, 16)
+CREATE_TESTS(64, 8)
 
 #undef CREATE_TESTS_SIGN
 #undef CREATE_TESTS
@@ -59,7 +76,24 @@
 	RUN_TESTS_SIGN( , bits, size) \
 	RUN_TESTS_SIGN(u, bits, size)
 
+	RUN_TESTS(8, 8)
+	RUN_TESTS(16, 4)
+	RUN_TESTS(32, 2)
+
 	RUN_TESTS(8, 16)
+	RUN_TESTS(16, 8)
+	RUN_TESTS(32, 4)
+	RUN_TESTS(64, 2)
+
+	RUN_TESTS(8, 32)
+	RUN_TESTS(16, 16)
+	RUN_TESTS(32, 8)
+	RUN_TESTS(64, 4)
+
+	RUN_TESTS(8, 64)
+	RUN_TESTS(16, 32)
+	RUN_TESTS(32, 16)
+	RUN_TESTS(64, 8)
 
 #undef RUN_TESTS_SIGN
 #undef RUN_TESTS