changeset 39:f9ca85d2f14c

*: rearrange some things; add avx512bw support
author Paper <paper@tflc.us>
date Sat, 26 Apr 2025 15:31:39 -0400
parents fd42f9b1b95e
children 55cadb1fac4b
files README include/vec/cpu.h include/vec/defs.h include/vec/impl/cpu.h include/vec/impl/generic.h include/vec/impl/ppc/altivec.h include/vec/impl/x86/avx2.h include/vec/impl/x86/avx512bw.h include/vec/impl/x86/avx512f.h include/vec/impl/x86/sse2.h include/vec/mem.h include/vec/vec.h test/Makefile.template test/Makefile.x86 test/test.c test/test_benchmark.h utils/gengeneric.c
diffstat 17 files changed, 1203 insertions(+), 743 deletions(-) [+]
line wrap: on
line diff
--- a/README	Sat Apr 26 02:54:44 2025 -0400
+++ b/README	Sat Apr 26 15:31:39 2025 -0400
@@ -138,9 +138,9 @@
 multiple translation units and pass different command line arguments
 to the compiler to enable SSE2/AVX2/Altivec etc, and detect the vector
 modes the CPU supports at runtime. vec provides an optional public API
-specifically for this use-case within `vec/impl/cpu.h`; bear in mind
-though that it is not thread-safe, so if your program is multithreaded
-you'll want to cache the results on startup.
+specifically for this use-case within `vec/cpu.h`; bear in mind though
+that it is not thread-safe, so if your program is multithreaded you'll want
+to cache the results on startup.
 
 The CPU vector detection API is extremely simple, and self-explanatory.
 You call `vec_get_CPU_features()', and it returns a bit-mask of the
@@ -177,6 +177,9 @@
 
 The heap-based API is based off the good old C malloc API:
 
+	/* heap allocation stuff is only defined here: */
+	#include "vec/mem.h"
+
 	vec_int32 *q = vec_malloc(1024 * sizeof(vec_int32));
 
 	/* q is now aligned, and ready for use with a vector aligned load
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/cpu.h	Sat Apr 26 15:31:39 2025 -0400
@@ -0,0 +1,516 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024-2025 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_CPU_H_
+#define VEC_CPU_H_
+
+#include "defs.h"
+
+/* Detect CPU SIMD support. Much of this code was stolen from SDL.
+ *
+ * Simple DirectMedia Layer
+ * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
+# include <sys/sysctl.h> // For AltiVec check
+#elif defined(__OpenBSD__) && defined(__powerpc__)
+# include <sys/types.h>
+# include <sys/sysctl.h> // For AltiVec check
+# include <machine/cpu.h>
+#elif defined(__FreeBSD__) && defined(__powerpc__)
+# include <machine/cpu.h>
+# include <sys/auxv.h>
+#elif defined(__ALTIVEC__)
+# include <signal.h>
+# include <setjmp.h>
+#endif
+
+#ifdef __FreeBSD__
+# include <sys/param.h>
+#endif
+
+#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
+# include <unistd.h>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+# include <elf.h>
+
+/*#include <asm/hwcap.h>*/
+# ifndef AT_HWCAP
+# define AT_HWCAP 16
+# endif
+# ifndef AT_PLATFORM
+#  define AT_PLATFORM 15
+# endif
+# ifndef HWCAP_NEON
+#  define HWCAP_NEON (1 << 12)
+# endif
+#endif
+
+static inline int vec_CPU_have_CPUID(void)
+{
+	int has_CPUID = 0;
+
+#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
+	__asm__ (
+"        pushfl                      # Get original EFLAGS             \n"
+"        popl    %%eax                                                 \n"
+"        movl    %%eax,%%ecx                                           \n"
+"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
+"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
+"        popfl                       # Replace current EFLAGS value    \n"
+"        pushfl                      # Get new EFLAGS                  \n"
+"        popl    %%eax               # Store new EFLAGS in EAX         \n"
+"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
+"        jz      1f                  # Processor=80486                 \n"
+"        movl    $1,%0               # We have CPUID support           \n"
+"1:                                                                    \n"
+	: "=m" (has_CPUID)
+	:
+	: "%eax", "%ecx"
+	);
+#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
+/* Technically, if this is being compiled under __x86_64__ then it has
+   CPUid by definition.  But it's nice to be able to prove it.  :)      */
+	__asm__ (
+"        pushfq                      # Get original EFLAGS             \n"
+"        popq    %%rax                                                 \n"
+"        movq    %%rax,%%rcx                                           \n"
+"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
+"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
+"        popfq                       # Replace current EFLAGS value    \n"
+"        pushfq                      # Get new EFLAGS                  \n"
+"        popq    %%rax               # Store new EFLAGS in EAX         \n"
+"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
+"        jz      1f                  # Processor=80486                 \n"
+"        movl    $1,%0               # We have CPUID support           \n"
+"1:                                                                    \n"
+	: "=m" (has_CPUID)
+	:
+	: "%rax", "%rcx"
+	);
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+	__asm {
+		pushfd                      ; Get original EFLAGS
+		pop     eax
+		mov     ecx, eax
+		xor     eax, 200000h        ; Flip ID bit in EFLAGS
+		push    eax                 ; Save new EFLAGS value on stack
+		popfd                       ; Replace current EFLAGS value
+		pushfd                      ; Get new EFLAGS
+		pop     eax                 ; Store new EFLAGS in EAX
+		xor     eax, ecx            ; Can not toggle ID bit,
+		jz      done                ; Processor=80486
+		mov     has_CPUID,1         ; We have CPUID support
+done:
+	}
+#elif defined(_MSC_VER) && defined(_M_X64)
+	has_CPUID = 1;
+#elif defined(__sun) && defined(__i386)
+	__asm (
+"       pushfl                 \n"
+"       popl    %eax           \n"
+"       movl    %eax,%ecx      \n"
+"       xorl    $0x200000,%eax \n"
+"       pushl   %eax           \n"
+"       popfl                  \n"
+"       pushfl                 \n"
+"       popl    %eax           \n"
+"       xorl    %ecx,%eax      \n"
+"       jz      1f             \n"
+"       movl    $1,-8(%ebp)    \n"
+"1:                            \n"
+	);
+#elif defined(__sun) && defined(__amd64)
+	__asm (
+"       pushfq                 \n"
+"       popq    %rax           \n"
+"       movq    %rax,%rcx      \n"
+"       xorl    $0x200000,%eax \n"
+"       pushq   %rax           \n"
+"       popfq                  \n"
+"       pushfq                 \n"
+"       popq    %rax           \n"
+"       xorl    %ecx,%eax      \n"
+"       jz      1f             \n"
+"       movl    $1,-8(%rbp)    \n"
+"1:                            \n"
+	);
+#endif
+
+	return has_CPUID;
+}
+
+#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm__ __volatile__( \
+		"        pushl %%ebx        \n" \
+		"        xorl %%ecx,%%ecx   \n" \
+		"        cpuid              \n" \
+		"        movl %%ebx, %%esi  \n" \
+		"        popl %%ebx         \n" \
+		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
+		: "a"(func))
+#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm__ __volatile__( \
+		"        pushq %%rbx        \n" \
+		"        xorq %%rcx,%%rcx   \n" \
+		"        cpuid              \n" \
+		"        movq %%rbx, %%rsi  \n" \
+		"        popq %%rbx         \n" \
+		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
+		: "a"(func))
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm { \
+		__asm mov eax, func \
+		__asm xor ecx, ecx \
+		__asm cpuid \
+		__asm mov a, eax \
+		__asm mov b, ebx \
+		__asm mov c, ecx \
+		__asm mov d, edx \
+	}
+#elif (defined(_MSC_VER) && defined(_M_X64))
+// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	do { \
+		int CPUInfo[4]; \
+		__cpuidex(CPUInfo, func, 0); \
+		a = CPUInfo[0]; \
+		b = CPUInfo[1]; \
+		c = CPUInfo[2]; \
+		d = CPUInfo[3]; \
+	} while (0)
+#else
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	do { \
+		a = b = c = d = 0; \
+		(void)a; \
+		(void)b; \
+		(void)c; \
+		(void)d; \
+	} while (0)
+#endif
+
+// ---------------------------------------------------------------
+
+static int vec_CPU_CPUIDFeatures[4];
+static int vec_CPU_CPUIDMaxFunction = 0;
+static int vec_CPU_OSSavesYMM = 0;
+static int vec_CPU_OSSavesZMM = 0;
+
+static inline void vec_CPU_get_CPUID_features(void)
+{
+	static int checked = 0;
+	if (!checked) {
+		checked = 1;
+		if (vec_CPU_have_CPUID()) {
+			int a, b, c, d;
+			VEC_CPU_CPUID(0, a, b, c, d);
+			vec_CPU_CPUIDMaxFunction = a;
+			if (vec_CPU_CPUIDMaxFunction >= 1) {
+				VEC_CPU_CPUID(1, a, b, c, d);
+				vec_CPU_CPUIDFeatures[0] = a;
+				vec_CPU_CPUIDFeatures[1] = b;
+				vec_CPU_CPUIDFeatures[2] = c;
+				vec_CPU_CPUIDFeatures[3] = d;
+
+				// Check to make sure we can call xgetbv
+				if (c & 0x08000000) {
+					// Call xgetbv to see if YMM (etc) register state is saved
+#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
+					__asm__(".byte 0x0f, 0x01, 0xd0"
+							: "=a"(a)
+							: "c"(0)
+							: "%edx");
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
+					a = (int)_xgetbv(0);
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+					__asm {
+						xor ecx, ecx
+						_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
+						mov a, eax
+					}
+#endif
+					vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
+					vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
+				}
+			}
+		}
+	}
+}
+
+#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
+static jmp_buf vec_jmpbuf;
+static void vec_CPU_illegal_instruction(int sig)
+{
+	longjmp(vec_jmpbuf, 1);
+}
+#endif
+
+static int vec_CPU_have_ALTIVEC(void)
+{
+	volatile int altivec = 0;
+#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
+	int selectors[2] = {
+# ifdef __OpenBSD__
+		CTL_MACHDEP, CPU_ALTIVEC
+# else
+		CTL_HW, HW_VECTORUNIT
+# endif
+	};
+	int hasVectorUnit = 0;
+	vec_uintsize length = sizeof(hasVectorUnit);
+	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+	if (!error)
+		altivec = (hasVectorUnit != 0);
+#elif defined(__FreeBSD__) && defined(__powerpc__)
+	unsigned long cpufeatures = 0;
+	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
+#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
+	void (*handler)(int sig);
+	handler = signal(SIGILL, vec_CPU_illegal_instruction);
+	if (!setjmp(vec_jmpbuf)) {
+		vector unsigned char vec;
+		vec_and(vec, vec);
+		altivec = 1;
+	}
+	signal(SIGILL, handler);
+#endif
+	return altivec;
+}
+
+static int vec_CPU_have_ALTIVEC_VSX(void)
+{
+	volatile int vsx = 0;
+#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
+# warning Compiling UNTESTED code for VSX.
+	void (*handler)(int sig);
+	handler = signal(SIGILL, vec_CPU_illegal_instruction);
+	if (!setjmp(vec_jmpbuf)) {
+		// this is completely untested
+		//__asm__ __volatile__("mtspr 256, %0\n\t"
+		//			 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
+		//vsx = 1;
+	}
+	signal(SIGILL, handler);
+#endif
+	return vsx;
+}
+
+#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000)
+#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000)
+#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000)
+#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001)
+#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000)
+#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000)
+#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))
+
+static inline int vec_CPU_have_AVX2(void)
+{
+	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
+		int a, b, c, d;
+		VEC_CPU_CPUID(7, a, b, c, d);
+		return b & 0x00000020;
+		(void)a, (void)c, (void)d;
+	}
+	return 0;
+}
+
+static inline int vec_CPU_have_AVX512F(void)
+{
+	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
+		int a, b, c, d;
+		VEC_CPU_CPUID(7, a, b, c, d);
+		return b & 0x00010000;
+		(void)a, (void)c, (void)d;
+	}
+	return 0;
+}
+
+#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
+static int readProcAuxvForNeon(void)
+{
+	int neon = 0;
+	int fd;
+
+	fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+	if (fd >= 0) {
+		Elf32_auxv_t aux;
+		while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
+			if (aux.a_type == AT_HWCAP) {
+				neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
+				break;
+			}
+		}
+		close(fd);
+	}
+	return neon;
+}
+#endif
+
+static int vec_CPU_have_NEON(void)
+{
+/* The way you detect NEON is a privileged instruction on ARM, so you have
+   to query the OS kernel in a platform-specific way. :/ */
+#if defined(SDL_CPUINFO_DISABLED)
+	return 0; /* disabled */
+#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
+/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
+/* Seems to have been removed */
+#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
+#endif
+	/* All WinRT ARM devices are required to support NEON, but just in case. */
+	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
+#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
+	return 1; /* ARMv8 always has non-optional NEON support. */
+#elif defined(__VITA__)
+	return 1;
+#elif defined(__3DS__)
+	return 0;
+#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
+	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
+	return 1; /* all Apple ARMv7 chips and later have NEON. */
+#elif defined(__APPLE__)
+	return 0; /* assume anything else from Apple doesn't have NEON. */
+#elif !defined(__arm__)
+	return 0; /* not an ARM CPU at all. */
+#elif defined(__OpenBSD__)
+	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
+#elif defined(HAVE_ELF_AUX_INFO)
+	unsigned long hasneon = 0;
+	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
+		return 0;
+
+	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
+#elif defined(__QNXNTO__)
+	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
+#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
+#elif defined(__linux__)
+	return readProcAuxvForNeon();
+#elif defined(__ANDROID__)
+	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
+	{
+		AndroidCpuFamily cpu_family = android_getCpuFamily();
+		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
+			uint64_t cpu_features = android_getCpuFeatures();
+			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
+				return 1;
+			}
+		}
+		return 0;
+	}
+#elif defined(__RISCOS__)
+	/* Use the VFPSupport_Features SWI to access the MVFR registers */
+	{
+		_kernel_swi_regs regs;
+		regs.r[0] = 0;
+		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
+			if ((regs.r[2] & 0xFFF000) == 0x111000) {
+				return 1;
+			}
+		}
+		return 0;
+	}
+#else
+#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
+	return 0;
+#endif
+}
+
+enum {
+	VEC_CPU_HAS_ALTIVEC = (1 << 0),
+	VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1),
+	VEC_CPU_HAS_MMX = (1 << 2),
+	VEC_CPU_HAS_SSE = (1 << 3),
+	VEC_CPU_HAS_SSE2 = (1 << 4),
+	VEC_CPU_HAS_SSE3 = (1 << 5),
+	VEC_CPU_HAS_SSE41 = (1 << 6),
+	VEC_CPU_HAS_SSE42 = (1 << 7),
+	VEC_CPU_HAS_AVX = (1 << 8),
+	VEC_CPU_HAS_AVX2 = (1 << 9),
+	VEC_CPU_HAS_AVX512F = (1 << 10),
+	VEC_CPU_HAS_NEON = (1 << 11),
+};
+
+#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF)
+
+VEC_FUNC_IMPL uint32_t vec_get_CPU_features(void)
+{
+	static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;
+	if (vec_CPU_features == VEC_CPU_FEATURES_RESET) {
+		vec_CPU_get_CPUID_features();
+		vec_CPU_features = 0;
+		if (vec_CPU_have_ALTIVEC())
+			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
+		if (vec_CPU_have_ALTIVEC_VSX())
+			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
+		if (vec_CPU_have_MMX())
+			vec_CPU_features |= VEC_CPU_HAS_MMX;
+		if (vec_CPU_have_SSE())
+			vec_CPU_features |= VEC_CPU_HAS_SSE;
+		if (vec_CPU_have_SSE2())
+			vec_CPU_features |= VEC_CPU_HAS_SSE2;
+		if (vec_CPU_have_SSE3())
+			vec_CPU_features |= VEC_CPU_HAS_SSE3;
+		if (vec_CPU_have_SSE41())
+			vec_CPU_features |= VEC_CPU_HAS_SSE41;
+		if (vec_CPU_have_SSE42())
+			vec_CPU_features |= VEC_CPU_HAS_SSE42;
+		if (vec_CPU_have_AVX())
+			vec_CPU_features |= VEC_CPU_HAS_AVX;
+		if (vec_CPU_have_AVX2())
+			vec_CPU_features |= VEC_CPU_HAS_AVX2;
+		if (vec_CPU_have_AVX512F())
+			vec_CPU_features |= VEC_CPU_HAS_AVX512F;
+		if (vec_CPU_have_NEON())
+			vec_CPU_features |= VEC_CPU_HAS_NEON;
+	}
+	return vec_CPU_features;
+}
+
+#endif /* VEC_CPU_H_ */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/defs.h	Sat Apr 26 15:31:39 2025 -0400
@@ -0,0 +1,126 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024-2025 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_DEFS_H_
+#define VEC_DEFS_H_
+
+#include <string.h>
+#include <stdlib.h>
+
+#ifdef VEC_CUSTOM_INTEGER_TYPEDEF
+/* we already have custom integer typedefs; */
+# include "impl/integer.h"
+#else
+# if __cplusplus >= (201103L)
+#  include <cstdint>
+#  include <cstddef>
+typedef std::size_t    vec_uintsize;
+
+typedef std::uint8_t   vec_uint8;
+typedef std::uint16_t  vec_uint16;
+typedef std::uint32_t  vec_uint32;
+typedef std::uint64_t  vec_uint64;
+typedef std::uintmax_t vec_uintmax;
+typedef std::uintptr_t vec_uintptr;
+
+typedef std::int8_t    vec_int8;
+typedef std::int16_t   vec_int16;
+typedef std::int32_t   vec_int32;
+typedef std::int64_t   vec_int64;
+typedef std::intmax_t  vec_intmax;
+# elif __STDC_VERSION__ >= 199901L
+#  include <stdint.h>
+#  include <stddef.h>
+typedef uint8_t   vec_uint8;
+typedef uint16_t  vec_uint16;
+typedef uint32_t  vec_uint32;
+typedef uint64_t  vec_uint64;
+typedef uintmax_t vec_uintmax;
+typedef uintptr_t vec_uintptr;
+typedef size_t    vec_uintsize;
+typedef int8_t    vec_int8;
+typedef int16_t   vec_int16;
+typedef int32_t   vec_int32;
+typedef int64_t   vec_int64;
+typedef intmax_t  vec_intmax;
+# else
+#  error Unable to find integer types with known size.
+# endif
+#endif
+
+#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
+	(((a) >= (x)) && \
+	 ((a) > x || (b) >= (y)) && \
+	 ((a) > x || (b) > (y) || (c) >= (z)))
+
+#ifdef __GNUC__
+# define VEC_GNUC_ATLEAST(x, y, z) \
+	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)
+#else
+# define VEC_GNUC_ATLEAST(x, y, z) (0)
+#endif
+
+/* GCC/clang attributes */
+#if defined(__has_attribute)
+# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x)
+#else
+# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch)
+#endif
+
+#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
+# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg)
+#elif (__STDC_VERSION__ >= 201112L)
+# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
+#else
+# define VEC_STATIC_ASSERT(x, msg) \
+	extern int (*vec_impl_Static_assert_function_(void)) \
+		[!!sizeof (struct { int __error_if_negative: (x) ? 2 : -1; })]
+#endif
+
+#if VEC_GNUC_HAS_ATTRIBUTE(__always_inline__, 4, 0, 0)
+# define VEC_ALWAYS_INLINE __attribute__((__always_inline__))
+#else
+# define VEC_ALWAYS_INLINE
+#endif
+
+#define VEC_FUNC_IMPL static inline VEC_ALWAYS_INLINE
+
+/* --------------------------------------------------------------- */
+/* Get maximum value for type */
+
+#define VEC_TYPE_SIGNED(t) (((t)(-1)) < ((t)0))
+
+#define VEC_MAX_EX(t, TOPBIT) \
+	(((0x1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL) | \
+      ((TOPBIT) << ((sizeof(t) * 8ULL) - 4ULL)))
+
+#define VEC_MAX_OF_UNSIGNED(t) VEC_MAX_EX(t, 0xFULL)
+#define VEC_MAX_OF_SIGNED(t) VEC_MAX_EX(t, 0x7ULL)
+
+#define VEC_MAX_OF_TYPE(t) \
+	((unsigned long long)(VEC_TYPE_SIGNED(t) \
+		? VEC_MAX_OF_SIGNED(t) \
+		: VEC_MAX_OF_UNSIGNED(t)))
+
+#endif /* VEC_DEFS_H_ */
--- a/include/vec/impl/cpu.h	Sat Apr 26 02:54:44 2025 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,514 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in C99
- * 
- * Copyright (c) 2024-2025 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-#ifndef VEC_IMPL_CPU_H_
-#define VEC_IMPL_CPU_H_
-
-/* Detect CPU SIMD support. Much of this code was stolen from SDL.
- *
- * Simple DirectMedia Layer
- * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
-*/
-
-#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
-# include <sys/sysctl.h> // For AltiVec check
-#elif defined(__OpenBSD__) && defined(__powerpc__)
-# include <sys/types.h>
-# include <sys/sysctl.h> // For AltiVec check
-# include <machine/cpu.h>
-#elif defined(__FreeBSD__) && defined(__powerpc__)
-# include <machine/cpu.h>
-# include <sys/auxv.h>
-#elif defined(__ALTIVEC__)
-# include <signal.h>
-# include <setjmp.h>
-#endif
-
-#ifdef __FreeBSD__
-# include <sys/param.h>
-#endif
-
-#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
-# include <unistd.h>
-# include <sys/types.h>
-# include <sys/stat.h>
-# include <fcntl.h>
-# include <elf.h>
-
-/*#include <asm/hwcap.h>*/
-# ifndef AT_HWCAP
-# define AT_HWCAP 16
-# endif
-# ifndef AT_PLATFORM
-#  define AT_PLATFORM 15
-# endif
-# ifndef HWCAP_NEON
-#  define HWCAP_NEON (1 << 12)
-# endif
-#endif
-
-static inline int vec_CPU_have_CPUID(void)
-{
-	int has_CPUID = 0;
-
-#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
-	__asm__ (
-"        pushfl                      # Get original EFLAGS             \n"
-"        popl    %%eax                                                 \n"
-"        movl    %%eax,%%ecx                                           \n"
-"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
-"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
-"        popfl                       # Replace current EFLAGS value    \n"
-"        pushfl                      # Get new EFLAGS                  \n"
-"        popl    %%eax               # Store new EFLAGS in EAX         \n"
-"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
-"        jz      1f                  # Processor=80486                 \n"
-"        movl    $1,%0               # We have CPUID support           \n"
-"1:                                                                    \n"
-	: "=m" (has_CPUID)
-	:
-	: "%eax", "%ecx"
-	);
-#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
-/* Technically, if this is being compiled under __x86_64__ then it has
-   CPUid by definition.  But it's nice to be able to prove it.  :)      */
-	__asm__ (
-"        pushfq                      # Get original EFLAGS             \n"
-"        popq    %%rax                                                 \n"
-"        movq    %%rax,%%rcx                                           \n"
-"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
-"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
-"        popfq                       # Replace current EFLAGS value    \n"
-"        pushfq                      # Get new EFLAGS                  \n"
-"        popq    %%rax               # Store new EFLAGS in EAX         \n"
-"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
-"        jz      1f                  # Processor=80486                 \n"
-"        movl    $1,%0               # We have CPUID support           \n"
-"1:                                                                    \n"
-	: "=m" (has_CPUID)
-	:
-	: "%rax", "%rcx"
-	);
-#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
-	__asm {
-		pushfd                      ; Get original EFLAGS
-		pop     eax
-		mov     ecx, eax
-		xor     eax, 200000h        ; Flip ID bit in EFLAGS
-		push    eax                 ; Save new EFLAGS value on stack
-		popfd                       ; Replace current EFLAGS value
-		pushfd                      ; Get new EFLAGS
-		pop     eax                 ; Store new EFLAGS in EAX
-		xor     eax, ecx            ; Can not toggle ID bit,
-		jz      done                ; Processor=80486
-		mov     has_CPUID,1         ; We have CPUID support
-done:
-	}
-#elif defined(_MSC_VER) && defined(_M_X64)
-	has_CPUID = 1;
-#elif defined(__sun) && defined(__i386)
-	__asm (
-"       pushfl                 \n"
-"       popl    %eax           \n"
-"       movl    %eax,%ecx      \n"
-"       xorl    $0x200000,%eax \n"
-"       pushl   %eax           \n"
-"       popfl                  \n"
-"       pushfl                 \n"
-"       popl    %eax           \n"
-"       xorl    %ecx,%eax      \n"
-"       jz      1f             \n"
-"       movl    $1,-8(%ebp)    \n"
-"1:                            \n"
-	);
-#elif defined(__sun) && defined(__amd64)
-	__asm (
-"       pushfq                 \n"
-"       popq    %rax           \n"
-"       movq    %rax,%rcx      \n"
-"       xorl    $0x200000,%eax \n"
-"       pushq   %rax           \n"
-"       popfq                  \n"
-"       pushfq                 \n"
-"       popq    %rax           \n"
-"       xorl    %ecx,%eax      \n"
-"       jz      1f             \n"
-"       movl    $1,-8(%rbp)    \n"
-"1:                            \n"
-	);
-#endif
-
-	return has_CPUID;
-}
-
-#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	__asm__ __volatile__( \
-		"        pushl %%ebx        \n" \
-		"        xorl %%ecx,%%ecx   \n" \
-		"        cpuid              \n" \
-		"        movl %%ebx, %%esi  \n" \
-		"        popl %%ebx         \n" \
-		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
-		: "a"(func))
-#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	__asm__ __volatile__( \
-		"        pushq %%rbx        \n" \
-		"        xorq %%rcx,%%rcx   \n" \
-		"        cpuid              \n" \
-		"        movq %%rbx, %%rsi  \n" \
-		"        popq %%rbx         \n" \
-		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
-		: "a"(func))
-#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	__asm { \
-		__asm mov eax, func \
-		__asm xor ecx, ecx \
-		__asm cpuid \
-		__asm mov a, eax \
-		__asm mov b, ebx \
-		__asm mov c, ecx \
-		__asm mov d, edx \
-	}
-#elif (defined(_MSC_VER) && defined(_M_X64))
-// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	do { \
-		int CPUInfo[4]; \
-		__cpuidex(CPUInfo, func, 0); \
-		a = CPUInfo[0]; \
-		b = CPUInfo[1]; \
-		c = CPUInfo[2]; \
-		d = CPUInfo[3]; \
-	} while (0)
-#else
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	do { \
-		a = b = c = d = 0; \
-		(void)a; \
-		(void)b; \
-		(void)c; \
-		(void)d; \
-	} while (0)
-#endif
-
-// ---------------------------------------------------------------
-
-static int vec_CPU_CPUIDFeatures[4];
-static int vec_CPU_CPUIDMaxFunction = 0;
-static int vec_CPU_OSSavesYMM = 0;
-static int vec_CPU_OSSavesZMM = 0;
-
-static inline void vec_CPU_get_CPUID_features(void)
-{
-	static int checked = 0;
-	if (!checked) {
-		checked = 1;
-		if (vec_CPU_have_CPUID()) {
-			int a, b, c, d;
-			VEC_CPU_CPUID(0, a, b, c, d);
-			vec_CPU_CPUIDMaxFunction = a;
-			if (vec_CPU_CPUIDMaxFunction >= 1) {
-				VEC_CPU_CPUID(1, a, b, c, d);
-				vec_CPU_CPUIDFeatures[0] = a;
-				vec_CPU_CPUIDFeatures[1] = b;
-				vec_CPU_CPUIDFeatures[2] = c;
-				vec_CPU_CPUIDFeatures[3] = d;
-
-				// Check to make sure we can call xgetbv
-				if (c & 0x08000000) {
-					// Call xgetbv to see if YMM (etc) register state is saved
-#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
-					__asm__(".byte 0x0f, 0x01, 0xd0"
-							: "=a"(a)
-							: "c"(0)
-							: "%edx");
-#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
-					a = (int)_xgetbv(0);
-#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
-					__asm {
-						xor ecx, ecx
-						_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
-						mov a, eax
-					}
-#endif
-					vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
-					vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
-				}
-			}
-		}
-	}
-}
-
-#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
-static jmp_buf vec_jmpbuf;
-static void vec_CPU_illegal_instruction(int sig)
-{
-	longjmp(vec_jmpbuf, 1);
-}
-#endif
-
-static int vec_CPU_have_ALTIVEC(void)
-{
-	volatile int altivec = 0;
-#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
-	int selectors[2] = {
-# ifdef __OpenBSD__
-		CTL_MACHDEP, CPU_ALTIVEC
-# else
-		CTL_HW, HW_VECTORUNIT
-# endif
-	};
-	int hasVectorUnit = 0;
-	vec_uintsize length = sizeof(hasVectorUnit);
-	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
-	if (!error)
-		altivec = (hasVectorUnit != 0);
-#elif defined(__FreeBSD__) && defined(__powerpc__)
-	unsigned long cpufeatures = 0;
-	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
-	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
-#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
-	void (*handler)(int sig);
-	handler = signal(SIGILL, vec_CPU_illegal_instruction);
-	if (!setjmp(vec_jmpbuf)) {
-		vector unsigned char vec;
-		vec_and(vec, vec);
-		altivec = 1;
-	}
-	signal(SIGILL, handler);
-#endif
-	return altivec;
-}
-
-static int vec_CPU_have_ALTIVEC_VSX(void)
-{
-	volatile int vsx = 0;
-#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
-# warning Compiling UNTESTED code for VSX.
-	void (*handler)(int sig);
-	handler = signal(SIGILL, vec_CPU_illegal_instruction);
-	if (!setjmp(vec_jmpbuf)) {
-		// this is completely untested
-		//__asm__ __volatile__("mtspr 256, %0\n\t"
-		//			 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
-		//vsx = 1;
-	}
-	signal(SIGILL, handler);
-#endif
-	return vsx;
-}
-
-#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000)
-#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000)
-#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000)
-#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001)
-#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000)
-#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000)
-#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))
-
-static inline int vec_CPU_have_AVX2(void)
-{
-	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
-		int a, b, c, d;
-		VEC_CPU_CPUID(7, a, b, c, d);
-		return b & 0x00000020;
-		(void)a, (void)c, (void)d;
-	}
-	return 0;
-}
-
-static inline int vec_CPU_have_AVX512F(void)
-{
-	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
-		int a, b, c, d;
-		VEC_CPU_CPUID(7, a, b, c, d);
-		return b & 0x00000020;
-		(void)a, (void)c, (void)d;
-	}
-	return 0;
-}
-
-#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
-static int readProcAuxvForNeon(void)
-{
-	int neon = 0;
-	int fd;
-
-	fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
-	if (fd >= 0) {
-		Elf32_auxv_t aux;
-		while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
-			if (aux.a_type == AT_HWCAP) {
-				neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
-				break;
-			}
-		}
-		close(fd);
-	}
-	return neon;
-}
-#endif
-
-static int vec_CPU_have_NEON(void)
-{
-/* The way you detect NEON is a privileged instruction on ARM, so you have
-   query the OS kernel in a platform-specific way. :/ */
-#if defined(SDL_CPUINFO_DISABLED)
-	return 0; /* disabled */
-#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
-/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
-/* Seems to have been removed */
-#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
-#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
-#endif
-	/* All WinRT ARM devices are required to support NEON, but just in case. */
-	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
-#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
-	return 1; /* ARMv8 always has non-optional NEON support. */
-#elif defined(__VITA__)
-	return 1;
-#elif defined(__3DS__)
-	return 0;
-#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
-	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
-	return 1; /* all Apple ARMv7 chips and later have NEON. */
-#elif defined(__APPLE__)
-	return 0; /* assume anything else from Apple doesn't have NEON. */
-#elif !defined(__arm__)
-	return 0; /* not an ARM CPU at all. */
-#elif defined(__OpenBSD__)
-	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
-#elif defined(HAVE_ELF_AUX_INFO)
-	unsigned long hasneon = 0;
-	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
-		return 0;
-
-	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
-#elif defined(__QNXNTO__)
-	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
-#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
-	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
-#elif defined(__linux__)
-	return readProcAuxvForNeon();
-#elif defined(__ANDROID__)
-	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
-	{
-		AndroidCpuFamily cpu_family = android_getCpuFamily();
-		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
-			uint64_t cpu_features = android_getCpuFeatures();
-			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
-				return 1;
-			}
-		}
-		return 0;
-	}
-#elif defined(__RISCOS__)
-	/* Use the VFPSupport_Features SWI to access the MVFR registers */
-	{
-		_kernel_swi_regs regs;
-		regs.r[0] = 0;
-		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
-			if ((regs.r[2] & 0xFFF000) == 0x111000) {
-				return 1;
-			}
-		}
-		return 0;
-	}
-#else
-#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
-	return 0;
-#endif
-}
-
-enum {
-	VEC_CPU_HAS_ALTIVEC = (1 << 0),
-	VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1),
-	VEC_CPU_HAS_MMX = (1 << 2),
-	VEC_CPU_HAS_SSE = (1 << 3),
-	VEC_CPU_HAS_SSE2 = (1 << 4),
-	VEC_CPU_HAS_SSE3 = (1 << 5),
-	VEC_CPU_HAS_SSE41 = (1 << 6),
-	VEC_CPU_HAS_SSE42 = (1 << 7),
-	VEC_CPU_HAS_AVX = (1 << 8),
-	VEC_CPU_HAS_AVX2 = (1 << 9),
-	VEC_CPU_HAS_AVX512F = (1 << 10),
-	VEC_CPU_HAS_NEON = (1 << 11),
-};
-
-#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF)
-
-VEC_FUNC_IMPL uint32_t vec_get_CPU_features(void)
-{
-	static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;
-	if (vec_CPU_features == VEC_CPU_FEATURES_RESET) {
-		vec_CPU_get_CPUID_features();
-		vec_CPU_features = 0;
-		if (vec_CPU_have_ALTIVEC())
-			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
-		if (vec_CPU_have_ALTIVEC_VSX())
-			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
-		if (vec_CPU_have_MMX())
-			vec_CPU_features |= VEC_CPU_HAS_MMX;
-		if (vec_CPU_have_SSE())
-			vec_CPU_features |= VEC_CPU_HAS_SSE;
-		if (vec_CPU_have_SSE2())
-			vec_CPU_features |= VEC_CPU_HAS_SSE2;
-		if (vec_CPU_have_SSE3())
-			vec_CPU_features |= VEC_CPU_HAS_SSE3;
-		if (vec_CPU_have_SSE41())
-			vec_CPU_features |= VEC_CPU_HAS_SSE41;
-		if (vec_CPU_have_SSE42())
-			vec_CPU_features |= VEC_CPU_HAS_SSE42;
-		if (vec_CPU_have_AVX())
-			vec_CPU_features |= VEC_CPU_HAS_AVX;
-		if (vec_CPU_have_AVX2())
-			vec_CPU_features |= VEC_CPU_HAS_AVX2;
-		if (vec_CPU_have_AVX512F())
-			vec_CPU_features |= VEC_CPU_HAS_AVX512F;
-		if (vec_CPU_have_NEON())
-			vec_CPU_features |= VEC_CPU_HAS_NEON;
-	}
-	return vec_CPU_features;
-}
-
-#endif /* VEC_IMPL_CPU_H_ */
--- a/include/vec/impl/generic.h	Sat Apr 26 02:54:44 2025 -0400
+++ b/include/vec/impl/generic.h	Sat Apr 26 15:31:39 2025 -0400
@@ -28,10 +28,6 @@
 #ifndef VEC_IMPL_GENERIC_H_
 #define VEC_IMPL_GENERIC_H_
 
-#include <string.h>
-
-// -----------------------------------------------------------------
-
 #define VEC_GENERIC_OPERATION(op, sign, bits, size) \
 	do { \
 		int i; \
--- a/include/vec/impl/ppc/altivec.h	Sat Apr 26 02:54:44 2025 -0400
+++ b/include/vec/impl/ppc/altivec.h	Sat Apr 26 15:31:39 2025 -0400
@@ -27,8 +27,6 @@
 #ifndef VEC_IMPL_PPC_ALTIVEC_H_
 #define VEC_IMPL_PPC_ALTIVEC_H_
 
-#include <altivec.h>
-
 /* GCC 4.2.1 on Mac OS X doesn't have these for some reason */
 #ifdef vec_mul
 # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
--- a/include/vec/impl/x86/avx2.h	Sat Apr 26 02:54:44 2025 -0400
+++ b/include/vec/impl/x86/avx2.h	Sat Apr 26 15:31:39 2025 -0400
@@ -157,6 +157,14 @@
 /* -------------------------------------------------------------------- */
 /* generic ops */
 
+#define VEC_AVX2_SPLAT(sign, bits, size) \
+	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_set1_epi##bits(x); \
+		return vec; \
+	}
+
 #define VEC_AVX2_LOAD_ALIGNED(sign, bits, size) \
 	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
@@ -195,6 +203,11 @@
 /* ------------------------------------------------------------------------ */
 /* 8x32 */
 
+#ifndef VINT8x32_SPLAT_DEFINED
+VEC_AVX2_SPLAT(/* nothing */, 8, 32)
+# define VINT8x32_SPLAT_DEFINED
+#endif
+
 #ifndef VINT8x32_LOAD_ALIGNED_DEFINED
 VEC_AVX2_LOAD_ALIGNED(/* nothing */, 8, 32)
 # define VINT8x32_LOAD_ALIGNED_DEFINED
@@ -242,6 +255,11 @@
 
 /* u8x32 */
 
+#ifndef VUINT8x32_SPLAT_DEFINED
+VEC_AVX2_SPLAT(u, 8, 32)
+# define VUINT8x32_SPLAT_DEFINED
+#endif
+
 #ifndef VUINT8x32_LOAD_ALIGNED_DEFINED
 VEC_AVX2_LOAD_ALIGNED(u, 8, 32)
 # define VUINT8x32_LOAD_ALIGNED_DEFINED
@@ -290,6 +308,11 @@
 /* ------------------------------------------------------------------------ */
 /* 16x16 */
 
+#ifndef VINT16x16_SPLAT_DEFINED
+VEC_AVX2_SPLAT(/* nothing */, 16, 16)
+# define VINT16x16_SPLAT_DEFINED
+#endif
+
 #ifndef VINT16x16_LOAD_ALIGNED_DEFINED
 VEC_AVX2_LOAD_ALIGNED(/* nothing */, 16, 16)
 # define VINT16x16_LOAD_ALIGNED_DEFINED
@@ -337,6 +360,11 @@
 
 /* u16x16 */
 
+#ifndef VUINT16x16_SPLAT_DEFINED
+VEC_AVX2_SPLAT(u, 16, 16)
+# define VUINT16x16_SPLAT_DEFINED
+#endif
+
 #ifndef VUINT16x16_LOAD_ALIGNED_DEFINED
 VEC_AVX2_LOAD_ALIGNED(u, 16, 16)
 # define VUINT16x16_LOAD_ALIGNED_DEFINED
@@ -385,6 +413,11 @@
 /* ------------------------------------------------------------------------ */
 /* 32x8 */
 
+#ifndef VINT32x8_SPLAT_DEFINED
+VEC_AVX2_SPLAT(/* nothing */, 32, 8)
+# define VINT32x8_SPLAT_DEFINED
+#endif
+
 #ifndef VINT32x8_LOAD_ALIGNED_DEFINED
 VEC_AVX2_LOAD_ALIGNED(/* nothing */, 32, 8)
 # define VINT32x8_LOAD_ALIGNED_DEFINED
@@ -432,6 +465,11 @@
 
 /* u32x8 */
 
+#ifndef VUINT32x8_SPLAT_DEFINED
+VEC_AVX2_SPLAT(u, 32, 8)
+# define VUINT32x8_SPLAT_DEFINED
+#endif
+
 #ifndef VUINT32x8_LOAD_ALIGNED_DEFINED
 VEC_AVX2_LOAD_ALIGNED(u, 32, 8)
 # define VUINT32x8_LOAD_ALIGNED_DEFINED
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/x86/avx512bw.h	Sat Apr 26 15:31:39 2025 -0400
@@ -0,0 +1,166 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024-2025 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_X86_AVX512BW_H_
+#define VEC_IMPL_X86_AVX512BW_H_
+
+#define VEC_AVX512BW_OP(SIGN, BITS, SIZE, INTLSIGN, SECONDSIGN, OP, NAME) \
+	VEC_FUNC_IMPL v##SIGN##int##BITS##x##SIZE v##SIGN##int##BITS##x##SIZE##_##NAME(v##SIGN##int##BITS##x##SIZE vec1, v##SECONDSIGN##int##BITS##x##SIZE vec2) \
+	{ \
+		vec1.avx512f = _mm512_##OP##_ep##INTLSIGN##BITS(vec1.avx512f, vec2.avx512f); \
+	\
+		return vec1; \
+	}
+
+/* -- 8x64 */
+
+#ifndef VINT8x64_MIN_DEFINED
+VEC_AVX512BW_OP(, 8, 64, i, , min, min)
+# define VINT8x64_MIN_DEFINED
+#endif
+
+#ifndef VINT8x64_MAX_DEFINED
+VEC_AVX512BW_OP(, 8, 64, i, , max, max)
+# define VINT8x64_MAX_DEFINED
+#endif
+
+#ifndef VINT8x64_ADD_DEFINED
+VEC_AVX512BW_OP(, 8, 64, i, , add, add)
+# define VINT8x64_ADD_DEFINED
+#endif
+
+#ifndef VINT8x64_SUB_DEFINED
+VEC_AVX512BW_OP(, 8, 64, i, , sub, sub)
+# define VINT8x64_SUB_DEFINED
+#endif
+
+/* -- unsigned */
+
+#ifndef VUINT8x64_MIN_DEFINED
+VEC_AVX512BW_OP(u, 8, 64, u, u, min, min)
+# define VUINT8x64_MIN_DEFINED
+#endif
+
+#ifndef VUINT8x64_MAX_DEFINED
+VEC_AVX512BW_OP(u, 8, 64, u, u, max, max)
+# define VUINT8x64_MAX_DEFINED
+#endif
+
+#ifndef VUINT8x64_ADD_DEFINED
+VEC_AVX512BW_OP(u, 8, 64, i, u, add, add)
+# define VUINT8x64_ADD_DEFINED
+#endif
+
+#ifndef VUINT8x64_SUB_DEFINED
+VEC_AVX512BW_OP(u, 8, 64, i, u, sub, sub)
+# define VUINT8x64_SUB_DEFINED
+#endif
+
+/* -- 16x32 */
+
+#ifndef VINT16x32_MIN_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, , min, min)
+# define VINT16x32_MIN_DEFINED
+#endif
+
+#ifndef VINT16x32_MAX_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, , max, max)
+# define VINT16x32_MAX_DEFINED
+#endif
+
+#ifndef VINT16x32_ADD_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, , add, add)
+# define VINT16x32_ADD_DEFINED
+#endif
+
+#ifndef VINT16x32_SUB_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, , sub, sub)
+# define VINT16x32_SUB_DEFINED
+#endif
+
+#ifndef VINT16x32_MUL_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, , mullo, mul)
+# define VINT16x32_MUL_DEFINED
+#endif
+
+#ifndef VINT16x32_LSHIFT_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, u, sllv, lshift)
+# define VINT16x32_LSHIFT_DEFINED
+#endif
+
+#ifndef VINT16x32_RSHIFT_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, u, srav, rshift)
+# define VINT16x32_RSHIFT_DEFINED
+#endif
+
+#ifndef VINT16x32_LRSHIFT_DEFINED
+VEC_AVX512BW_OP(, 16, 32, i, u, srlv, lrshift)
+# define VINT16x32_LRSHIFT_DEFINED
+#endif
+
+/* -- unsigned */
+
+#ifndef VUINT16x32_MIN_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, u, u, min, min)
+# define VUINT16x32_MIN_DEFINED
+#endif
+
+#ifndef VUINT16x32_MAX_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, u, u, max, max)
+# define VUINT16x32_MAX_DEFINED
+#endif
+
+#ifndef VUINT16x32_ADD_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, i, u, add, add)
+# define VUINT16x32_ADD_DEFINED
+#endif
+
+#ifndef VUINT16x32_SUB_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, i, u, sub, sub)
+# define VUINT16x32_SUB_DEFINED
+#endif
+
+#ifndef VUINT16x32_MUL_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, i, u, mullo, mul)
+# define VUINT16x32_MUL_DEFINED
+#endif
+
+#ifndef VUINT16x32_LSHIFT_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, i, u, sllv, lshift)
+# define VUINT16x32_LSHIFT_DEFINED
+#endif
+
+#ifndef VUINT16x32_RSHIFT_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, i, u, srlv, rshift)
+# define VUINT16x32_RSHIFT_DEFINED
+#endif
+
+#ifndef VUINT16x32_LRSHIFT_DEFINED
+VEC_AVX512BW_OP(u, 16, 32, i, u, srlv, lrshift)
+# define VUINT16x32_LRSHIFT_DEFINED
+#endif
+
+/* no mul for 8-bit */
+
+#endif /* VEC_IMPL_X86_AVX512BW_H_ */
\ No newline at end of file
--- a/include/vec/impl/x86/avx512f.h	Sat Apr 26 02:54:44 2025 -0400
+++ b/include/vec/impl/x86/avx512f.h	Sat Apr 26 15:31:39 2025 -0400
@@ -133,6 +133,14 @@
 
 /* ------------------------------------------------------------------------ */
 
+#define VEC_AVX512F_SPLAT(sign, bits, size) \
+	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.avx512f = _mm512_set1_epi##bits(x); \
+		return vec; \
+	}
+
 #define VEC_AVX512F_LOAD_ALIGNED(sign, bits, size) \
 	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
@@ -168,6 +176,183 @@
 		return vec1; \
 	}
 
+/* ------------------------------------------------------------------------ */
+/* 8x64; there is very little we can do here */
+
+#ifndef VINT8x64_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(, 8, 64)
+# define VINT8x64_SPLAT_DEFINED
+#endif
+
+#ifndef VINT8x64_LOAD_ALIGNED_DEFINED
+VEC_AVX512F_LOAD_ALIGNED(, 8, 64)
+# define VINT8x64_LOAD_ALIGNED_DEFINED
+#endif
+
+#ifndef VINT8x64_LOAD_DEFINED
+VEC_AVX512F_LOAD(, 8, 64)
+# define VINT8x64_LOAD_DEFINED
+#endif
+
+#ifndef VINT8x64_STORE_ALIGNED_DEFINED
+VEC_AVX512F_STORE_ALIGNED(, 8, 64)
+# define VINT8x64_STORE_ALIGNED_DEFINED
+#endif
+
+#ifndef VINT8x64_STORE_DEFINED
+VEC_AVX512F_STORE(, 8, 64)
+# define VINT8x64_STORE_DEFINED
+#endif
+
+#ifndef VINT8x64_AND_DEFINED
+VEC_AVX512F_BITWISE(and, , 8, 64)
+# define VINT8x64_AND_DEFINED
+#endif
+
+#ifndef VINT8x64_OR_DEFINED
+VEC_AVX512F_BITWISE(or, , 8, 64)
+# define VINT8x64_OR_DEFINED
+#endif
+
+#ifndef VINT8x64_XOR_DEFINED
+VEC_AVX512F_BITWISE(xor, , 8, 64)
+# define VINT8x64_XOR_DEFINED
+#endif
+
+/* ---- unsigned */
+
+#ifndef VUINT8x64_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(u, 8, 64)
+# define VUINT8x64_SPLAT_DEFINED
+#endif
+
+#ifndef VUINT8x64_LOAD_ALIGNED_DEFINED
+VEC_AVX512F_LOAD_ALIGNED(u, 8, 64)
+# define VUINT8x64_LOAD_ALIGNED_DEFINED
+#endif
+
+#ifndef VUINT8x64_LOAD_DEFINED
+VEC_AVX512F_LOAD(u, 8, 64)
+# define VUINT8x64_LOAD_DEFINED
+#endif
+
+#ifndef VUINT8x64_STORE_ALIGNED_DEFINED
+VEC_AVX512F_STORE_ALIGNED(u, 8, 64)
+# define VUINT8x64_STORE_ALIGNED_DEFINED
+#endif
+
+#ifndef VUINT8x64_STORE_DEFINED
+VEC_AVX512F_STORE(u, 8, 64)
+# define VUINT8x64_STORE_DEFINED
+#endif
+
+#ifndef VUINT8x64_AND_DEFINED
+VEC_AVX512F_BITWISE(and, u, 8, 64)
+# define VUINT8x64_AND_DEFINED
+#endif
+
+#ifndef VUINT8x64_OR_DEFINED
+VEC_AVX512F_BITWISE(or, u, 8, 64)
+# define VUINT8x64_OR_DEFINED
+#endif
+
+#ifndef VUINT8x64_XOR_DEFINED
+VEC_AVX512F_BITWISE(xor, u, 8, 64)
+# define VUINT8x64_XOR_DEFINED
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* 16x32; there is very little we can do here */
+
+#ifndef VINT16x32_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(, 16, 32)
+# define VINT16x32_SPLAT_DEFINED
+#endif
+
+#ifndef VINT16x32_LOAD_ALIGNED_DEFINED
+VEC_AVX512F_LOAD_ALIGNED(, 16, 32)
+# define VINT16x32_LOAD_ALIGNED_DEFINED
+#endif
+
+#ifndef VINT16x32_LOAD_DEFINED
+VEC_AVX512F_LOAD(, 16, 32)
+# define VINT16x32_LOAD_DEFINED
+#endif
+
+#ifndef VINT16x32_STORE_ALIGNED_DEFINED
+VEC_AVX512F_STORE_ALIGNED(, 16, 32)
+# define VINT16x32_STORE_ALIGNED_DEFINED
+#endif
+
+#ifndef VINT16x32_STORE_DEFINED
+VEC_AVX512F_STORE(, 16, 32)
+# define VINT16x32_STORE_DEFINED
+#endif
+
+#ifndef VINT16x32_AND_DEFINED
+VEC_AVX512F_BITWISE(and, , 16, 32)
+# define VINT16x32_AND_DEFINED
+#endif
+
+#ifndef VINT16x32_OR_DEFINED
+VEC_AVX512F_BITWISE(or, , 16, 32)
+# define VINT16x32_OR_DEFINED
+#endif
+
+#ifndef VINT16x32_XOR_DEFINED
+VEC_AVX512F_BITWISE(xor, , 16, 32)
+# define VINT16x32_XOR_DEFINED
+#endif
+
+/* ---- unsigned */
+
+#ifndef VUINT16x32_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(u, 16, 32)
+# define VUINT16x32_SPLAT_DEFINED
+#endif
+
+#ifndef VUINT16x32_LOAD_ALIGNED_DEFINED
+VEC_AVX512F_LOAD_ALIGNED(u, 16, 32)
+# define VUINT16x32_LOAD_ALIGNED_DEFINED
+#endif
+
+#ifndef VUINT16x32_LOAD_DEFINED
+VEC_AVX512F_LOAD(u, 16, 32)
+# define VUINT16x32_LOAD_DEFINED
+#endif
+
+#ifndef VUINT16x32_STORE_ALIGNED_DEFINED
+VEC_AVX512F_STORE_ALIGNED(u, 16, 32)
+# define VUINT16x32_STORE_ALIGNED_DEFINED
+#endif
+
+#ifndef VUINT16x32_STORE_DEFINED
+VEC_AVX512F_STORE(u, 16, 32)
+# define VUINT16x32_STORE_DEFINED
+#endif
+
+#ifndef VUINT16x32_AND_DEFINED
+VEC_AVX512F_BITWISE(and, u, 16, 32)
+# define VUINT16x32_AND_DEFINED
+#endif
+
+#ifndef VUINT16x32_OR_DEFINED
+VEC_AVX512F_BITWISE(or, u, 16, 32)
+# define VUINT16x32_OR_DEFINED
+#endif
+
+#ifndef VUINT16x32_XOR_DEFINED
+VEC_AVX512F_BITWISE(xor, u, 16, 32)
+# define VUINT16x32_XOR_DEFINED
+#endif
+
+/* ------------------------------------------------------------------------ */
+
+#ifndef VINT32x16_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(, 32, 16)
+# define VINT32x16_SPLAT_DEFINED
+#endif
+
 #ifndef VINT32x16_LOAD_ALIGNED_DEFINED
 VEC_AVX512F_LOAD_ALIGNED(, 32, 16)
 # define VINT32x16_LOAD_ALIGNED_DEFINED
@@ -218,6 +403,11 @@
 # define VINT32x16_XOR_DEFINED
 #endif
 
+#ifndef VUINT32x16_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(u, 32, 16)
+# define VUINT32x16_SPLAT_DEFINED
+#endif
+
 #ifndef VUINT32x16_LOAD_ALIGNED_DEFINED
 VEC_AVX512F_LOAD_ALIGNED(u, 32, 16)
 # define VUINT32x16_LOAD_ALIGNED_DEFINED
@@ -268,6 +458,11 @@
 # define VUINT32x16_XOR_DEFINED
 #endif
 
+#ifndef VINT64x8_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(, 64, 8)
+# define VINT64x8_SPLAT_DEFINED
+#endif
+
 #ifndef VINT64x8_LOAD_ALIGNED_DEFINED
 VEC_AVX512F_LOAD_ALIGNED(, 64, 8)
 # define VINT64x8_LOAD_ALIGNED_DEFINED
@@ -318,6 +513,11 @@
 # define VINT64x8_XOR_DEFINED
 #endif
 
+#ifndef VUINT64x8_SPLAT_DEFINED
+VEC_AVX512F_SPLAT(u, 64, 8)
+# define VUINT64x8_SPLAT_DEFINED
+#endif
+
 #ifndef VUINT64x8_LOAD_ALIGNED_DEFINED
 VEC_AVX512F_LOAD_ALIGNED(u, 64, 8)
 # define VUINT64x8_LOAD_ALIGNED_DEFINED
--- a/include/vec/impl/x86/sse2.h	Sat Apr 26 02:54:44 2025 -0400
+++ b/include/vec/impl/x86/sse2.h	Sat Apr 26 15:31:39 2025 -0400
@@ -25,8 +25,6 @@
 #ifndef VEC_IMPL_X86_SSE2_H_
 #define VEC_IMPL_X86_SSE2_H_
 
-#include <emmintrin.h>
-
 /* eh */
 #define VEC_SSE2_SET1_8(x)  _mm_set1_epi8(x)
 #define VEC_SSE2_SET1_16(x) _mm_set1_epi16(x)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/mem.h	Sat Apr 26 15:31:39 2025 -0400
@@ -0,0 +1,123 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024-2025 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_MEM_H_
+#define VEC_MEM_H_
+
+#include "defs.h"
+
+#define VEC_MALLOC_ALIGNMENT (256)
+
+VEC_STATIC_ASSERT(!(VEC_MALLOC_ALIGNMENT & (VEC_MALLOC_ALIGNMENT - 1))
+	&& (VEC_MALLOC_ALIGNMENT > 0),
+	"VEC_MALLOC_ALIGNMENT must be a power of two");
+
+typedef unsigned char vec_alignment_type;
+
+VEC_STATIC_ASSERT(VEC_MALLOC_ALIGNMENT - 1 <= VEC_MAX_OF_TYPE(vec_alignment_type),
+	"VEC_MALLOC_ALIGNMENT cannot fit in the pointer alignment data"); /* stored offset is at most VEC_MALLOC_ALIGNMENT - 1 */
+
+#define VEC_MALLOC_ADDITIONAL_SIZE (sizeof(vec_alignment_type) + (VEC_MALLOC_ALIGNMENT - 1))
+#define VEC_MALLOC_MAX_SIZE (SIZE_MAX - VEC_MALLOC_ADDITIONAL_SIZE)
+
+VEC_FUNC_IMPL void *vec_internal_align_ptr_(void *q)
+{
+	uintptr_t off;
+	vec_alignment_type stored;
+
+	/* round up past the offset byte so it always lands inside the block;
+	 * q itself may already be aligned, and an offset of zero would put
+	 * the byte one before the allocation (heap underflow) */
+	off = (((uintptr_t)q + sizeof(stored) + (VEC_MALLOC_ALIGNMENT - 1)) & ~(uintptr_t)(VEC_MALLOC_ALIGNMENT - 1)) - (uintptr_t)q;
+	stored = (vec_alignment_type)(off - 1); /* off can equal VEC_MALLOC_ALIGNMENT; off - 1 always fits */
+	q = (char *)q + off;
+	memcpy((char *)q - sizeof(stored), &stored, sizeof(stored));
+	return q;
+}
+
+/* reverses vec_internal_align_ptr_ (the stored byte is the offset minus one) */
+VEC_FUNC_IMPL void *vec_internal_unalign_ptr_(void *q)
+{
+	vec_alignment_type stored;
+
+	memcpy(&stored, (char *)q - sizeof(stored), sizeof(stored));
+	return (char *)q - stored - 1;
+}
+
+VEC_FUNC_IMPL void *vec_malloc(size_t size)
+{
+	void *q;
+
+	if (size > VEC_MALLOC_MAX_SIZE)
+		return NULL;
+
+	/* allocate space for the diff (we have to do this,
+	 * for realloc has no way of knowing the original ptr) */
+	q = malloc(size + VEC_MALLOC_ADDITIONAL_SIZE);
+	if (!q)
+		return NULL;
+
+	return vec_internal_align_ptr_(q);
+}
+
+VEC_FUNC_IMPL void *vec_calloc(size_t count, size_t nmemb)
+{
+	size_t size;
+	void *q;
+
+	size = count * nmemb;
+	if ((size && size / count != nmemb)
+		|| size > VEC_MALLOC_MAX_SIZE)
+		return NULL; /* nope */
+
+	q = calloc(size + VEC_MALLOC_ADDITIONAL_SIZE, 1);
+	if (!q)
+		return NULL;
+
+	return vec_internal_align_ptr_(q);
+}
+
+VEC_FUNC_IMPL void *vec_realloc(void *ptr, size_t newsize)
+{
+	char *base, *q, *r; size_t diff;
+	if (!ptr)
+		return vec_malloc(newsize); /* realloc(NULL, n) behaves as malloc(n) */
+	if (newsize > VEC_MALLOC_MAX_SIZE)
+		return NULL;
+	base = (char *)vec_internal_unalign_ptr_(ptr);
+	diff = (size_t)((char *)ptr - base); /* alignment offset of the old block */
+	q = realloc(base, newsize + VEC_MALLOC_ADDITIONAL_SIZE); /* size was missing `newsize' */
+	if (!q)
+		return NULL; /* the old block is left valid on failure */
+	r = (char *)vec_internal_align_ptr_(q); /* the new base may need a different offset... */
+	memmove(r, q + diff, newsize); /* ...so slide the payload to the newly aligned spot */
+	return r;
+}
+
+VEC_FUNC_IMPL void vec_free(void *ptr)
+{
+	if (ptr)
+		free(vec_internal_unalign_ptr_(ptr));
+}
+
+#endif /* VEC_MEM_H_ */
--- a/include/vec/vec.h	Sat Apr 26 02:54:44 2025 -0400
+++ b/include/vec/vec.h	Sat Apr 26 15:31:39 2025 -0400
@@ -25,116 +25,7 @@
 #ifndef VEC_VEC_H_
 #define VEC_VEC_H_
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#ifdef VEC_HAVE_IMPL_INTEGER_H
-# include "impl/integer.h"
-#else
-# if __cplusplus >= (201103L)
-#  include <cstdint>
-#  include <cstddef>
-typedef std::size_t    vec_uintsize;
-
-typedef std::uint8_t   vec_uint8;
-typedef std::uint16_t  vec_uint16;
-typedef std::uint32_t  vec_uint32;
-typedef std::uint64_t  vec_uint64;
-typedef std::uintmax_t vec_uintmax;
-typedef std::uintptr_t vec_uintptr;
-
-typedef std::int8_t    vec_int8;
-typedef std::int16_t   vec_int16;
-typedef std::int32_t   vec_int32;
-typedef std::int64_t   vec_int64;
-typedef std::intmax_t  vec_intmax;
-# elif __STDC_VERSION__ >= 199901L
-#  include <stdint.h>
-#  include <stddef.h>
-typedef uint8_t   vec_uint8;
-typedef uint16_t  vec_uint16;
-typedef uint32_t  vec_uint32;
-typedef uint64_t  vec_uint64;
-typedef uintmax_t vec_uintmax;
-typedef uintptr_t vec_uintptr;
-typedef size_t    vec_uintsize;
-typedef int8_t    vec_int8;
-typedef int16_t   vec_int16;
-typedef int32_t   vec_int32;
-typedef int64_t   vec_int64;
-typedef intmax_t  vec_intmax;
-# else
-#  error Unable to find integer types with known size.
-# endif
-#endif
-
-#include <string.h>
-#include <stdlib.h>
-
-#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
-	(((a) >= (x)) && \
-	 ((a) > x || (b) >= (y)) && \
-	 ((a) > x || (b) > (y) || (c) >= (z)))
-
-#ifdef __GNUC__
-# define VEC_GNUC_ATLEAST(x, y, z) \
-	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)
-#else
-# define VEC_GNUC_ATLEAST(x, y, z) (0)
-#endif
-
-/* GCC/clang attributes */
-#if defined(__has_attribute)
-# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x)
-#else
-# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch)
-#endif
-
-#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
-# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg)
-#elif (__STDC_VERSION__ >= 201112L)
-# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
-#else
-# define VEC_STATIC_ASSERT(x, msg) \
-	extern int (*vec_impl_Static_assert_function_(void)) \
-		[!!sizeof (struct { int __error_if_negative: (x) ? 2 : -1; })]
-#endif
-
-#ifndef VEC_ASSERT
-# ifndef VEC_DISABLE_ASSERTIONS
-#  include <assert.h>
-#  define VEC_ASSERT(x, msg) assert(msg && x)
-# else
-#  define VEC_ASSERT(x, msg)
-# endif
-#endif
-
-#if VEC_GNUC_HAS_ATTRIBUTE(__always_inline__, 4, 0, 0)
-# define VEC_ALWAYS_INLINE __attribute__((__always_inline__))
-#else
-# define VEC_ALWAYS_INLINE
-#endif
-
-#define VEC_FUNC_IMPL static inline VEC_ALWAYS_INLINE
-
-/* --------------------------------------------------------------- */
-/* Get maximum value for type */
-
-#define VEC_TYPE_SIGNED(t) (((t)(-1)) < ((t)0))
-
-#define VEC_MAX_EX(t, TOPBIT) \
-	(((0x1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL) | \
-      ((TOPBIT) << ((sizeof(t) * 8ULL) - 4ULL)))
-
-#define VEC_MAX_OF_UNSIGNED(t) VEC_MAX_EX(t, 0xFULL)
-#define VEC_MAX_OF_SIGNED(t) VEC_MAX_EX(t, 0x7ULL)
-
-#define VEC_MAX_OF_TYPE(t) \
-	((unsigned long long)(VEC_TYPE_SIGNED(t) \
-		? VEC_MAX_OF_SIGNED(t) \
-		: VEC_MAX_OF_UNSIGNED(t)))
+#include "defs.h"
 
 /* --------------------------------------------------------------- */
 /* Detect compiler SIMD support */
@@ -386,6 +277,9 @@
 
 #ifdef __AVX512F__
 # include <immintrin.h>
+# ifdef __AVX512BW__
+#  define VEC_COMPILER_HAS_AVX512BW
+# endif
 # define VEC_COMPILER_HAS_AVX512F
 # if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
 #  undef VINT8x64_ALIGNMENT
@@ -423,6 +317,10 @@
 
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* --------------------------------------------------------------- */
 /* bit shift */
 
@@ -497,7 +395,6 @@
 	}; \
 	unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \
-	type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \
-	VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
+	type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr
 # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
 	(sizeof(vec_unaligned_##var##_) - (align - 1))
 #endif
@@ -1024,6 +921,10 @@
 /* ------------------------------------------------------------------------ */
 /* finally; we can import the real implementations */
 
+#ifdef VEC_COMPILER_HAS_AVX512BW
+# include "impl/x86/avx512bw.h"
+#endif
+
 #ifdef VEC_COMPILER_HAS_AVX512F
 # include "impl/x86/avx512f.h"
 #endif
@@ -1051,97 +952,6 @@
 #include "impl/generic.h"
 
 /* ------------------------------------------------------------------------ */
-/* very minimal aligned malloc */
-
-#define VEC_MALLOC_ALIGNMENT (64)
-
-VEC_STATIC_ASSERT(!(VEC_MALLOC_ALIGNMENT & (VEC_MALLOC_ALIGNMENT - 1))
-	&& (VEC_MALLOC_ALIGNMENT > 0),
-	"VEC_MALLOC_ALIGNMENT must be a power of two");
-
-typedef unsigned char vec_alignment_type;
-
-#define VEC_MALLOC_ADDITIONAL_SIZE (sizeof(vec_alignment_type) + (VEC_MALLOC_ALIGNMENT - 1))
-#define VEC_MALLOC_MAX_SIZE (SIZE_MAX - VEC_MALLOC_ADDITIONAL_SIZE)
-
-VEC_FUNC_IMPL void *vec_internal_align_ptr_(void *q)
-{
-	vec_alignment_type diff;
-
-	diff = (((uintptr_t)q + (VEC_MALLOC_ALIGNMENT - 1)) & ~(VEC_MALLOC_ALIGNMENT - 1)) - (uintptr_t)q;
-	q = (char *)q + diff;
-
-	memcpy((char *)q - sizeof(diff), &diff, sizeof(diff));
-
-	return q;
-}
-
-/* reverses vec_align_ptr */
-VEC_FUNC_IMPL void *vec_internal_unalign_ptr_(void *q)
-{
-	vec_alignment_type diff;
-
-	memcpy(&diff, (char *)q - sizeof(diff), sizeof(diff));
-	q = (char *)q - diff;
-
-	return q;
-}
-
-VEC_FUNC_IMPL void *vec_malloc(size_t size)
-{
-	void *q;
-
-	if (size > VEC_MALLOC_MAX_SIZE)
-		return NULL;
-
-	/* allocate space for the diff (we have to do this,
-	 * for realloc has no way of knowing the original ptr) */
-	q = malloc(size + VEC_MALLOC_ADDITIONAL_SIZE);
-	if (!q)
-		return NULL;
-
-	return vec_internal_align_ptr_(q);
-}
-
-VEC_FUNC_IMPL void *vec_calloc(size_t count, size_t nmemb)
-{
-	size_t size;
-	void *q;
-
-	size = count * nmemb;
-	if ((size && size / count != nmemb)
-		|| size > VEC_MALLOC_MAX_SIZE)
-		return NULL; /* nope */
-
-	q = calloc(size + VEC_MALLOC_ADDITIONAL_SIZE, 1);
-	if (!q)
-		return NULL;
-
-	return vec_internal_align_ptr_(q);
-}
-
-VEC_FUNC_IMPL void *vec_realloc(void *ptr, size_t newsize)
-{
-	void *q;
-
-	if (!ptr)
-		return vec_malloc(newsize);
-
-	if (newsize > VEC_MALLOC_MAX_SIZE)
-		return NULL;
-
-	q = realloc(vec_internal_unalign_ptr_(ptr), VEC_MALLOC_ADDITIONAL_SIZE);
-	if (!q)
-		return NULL;
-
-	return vec_internal_align_ptr_(q);
-}
-
-VEC_FUNC_IMPL void vec_free(void *ptr)
-{
-	if (ptr)
-		free(vec_internal_unalign_ptr_(ptr));
-}
 
 #ifdef __cplusplus
 }
--- a/test/Makefile.template	Sat Apr 26 02:54:44 2025 -0400
+++ b/test/Makefile.template	Sat Apr 26 15:31:39 2025 -0400
@@ -3,19 +3,26 @@
 CXXFLAGS += $(CPPFLAGS) -std=c++11
 
 HEADERS = ../include/vec/vec.h \
+	../include/vec/cpu.h \
+	../include/vec/mem.h \
+	../include/vec/defs.h \
 	../include/vec/impl/ppc/altivec.h \
 	../include/vec/impl/x86/avx2.h \
 	../include/vec/impl/x86/avx512f.h \
+	../include/vec/impl/x86/avx512bw.h \
 	../include/vec/impl/x86/mmx.h \
 	../include/vec/impl/x86/sse2.h \
+	../include/vec/impl/x86/sse3.h \
 	../include/vec/impl/x86/sse41.h \
-	../include/vec/impl/cpu.h \
+	../include/vec/impl/x86/sse42.h \
 	../include/vec/impl/generic.h \
 	test_align.h \
 	test_arith.h \
 	test_compare.h \
 	test_shift.h \
-	test_benchmark.h
+	test_benchmark.h \
+	test_benchmark_vec.c \
+	test_benchmark_simple.c
 BINS = test-generic test-host test-cxx
 OBJS = test.o test-cxx.o test_benchmark_simple.o test_benchmark_vec.o
 
--- a/test/Makefile.x86	Sat Apr 26 02:54:44 2025 -0400
+++ b/test/Makefile.x86	Sat Apr 26 15:31:39 2025 -0400
@@ -1,3 +1,3 @@
-CPPFLAGS += -mmmx -msse2 -mavx512f
+CPPFLAGS += -mmmx -msse2 -msse3 -msse4.1 -msse4.2 -mavx512f -mavx512bw
 
 include Makefile.template
\ No newline at end of file
--- a/test/test.c	Sat Apr 26 02:54:44 2025 -0400
+++ b/test/test.c	Sat Apr 26 15:31:39 2025 -0400
@@ -1,4 +1,5 @@
 #include "vec/vec.h"
+#include "vec/mem.h"
 
 #include <stdio.h>
 #include <string.h>
--- a/test/test_benchmark.h	Sat Apr 26 02:54:44 2025 -0400
+++ b/test/test_benchmark.h	Sat Apr 26 15:31:39 2025 -0400
@@ -2,10 +2,6 @@
 /* ------------------------------------------------------------------------ */
 /* simple benchmark for getting the min/max range of an audio sample. */
 
-/* prevent GCC from optimizing these function calls away - i think there's
- * probably a better way to do this, but I haven't found it yet :) */
-
-
 extern void test_benchmark_sample_minmax_simple_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax);
 extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax);
 
@@ -14,19 +10,19 @@
 	int32_t min, max;
 	clock_t start, end;
 	int i;
-	int16_t *q = vec_malloc(16000000u * 2u);
+	int16_t *q = vec_malloc(16000001u * 2u);
 
-	printf("\nsigned 16-bit audio sample min/max - 1 thousand passes - 16000000 samples\n\n");
+	printf("\nsigned 16-bit audio sample min/max - 1 thousand passes - 16000001 samples\n\n");
 
 	/* generate random sample values */
-	for (i = 0; i < 16000000; i++)
+	for (i = 0; i < 16000001; i++)
 		q[i] = rand();
 
 	start = clock();
 	for (i = 0; i < 1000; i++) {
 		min = INT32_MAX;
 		max = INT32_MIN;
-		test_benchmark_sample_minmax_vec_impl(q, 16000000u, &min, &max);
+		test_benchmark_sample_minmax_vec_impl(q, 16000001u, &min, &max);
 	}
 	end = clock();
 
@@ -36,7 +32,7 @@
 	for (i = 0; i < 1000; i++) {
 		min = INT32_MAX;
 		max = INT32_MIN;
-		test_benchmark_sample_minmax_simple_impl(q, 16000000u, &min, &max);
+		test_benchmark_sample_minmax_simple_impl(q, 16000001u, &min, &max);
 	}
 	end = clock();
 
--- a/utils/gengeneric.c	Sat Apr 26 02:54:44 2025 -0400
+++ b/utils/gengeneric.c	Sat Apr 26 15:31:39 2025 -0400
@@ -66,10 +66,6 @@
 	"#ifndef VEC_IMPL_GENERIC_H_\n"
 	"#define VEC_IMPL_GENERIC_H_\n"
 	"\n"
-	"#include <string.h>\n"
-	"\n"
-	"// -----------------------------------------------------------------\n"
-	"\n"
 	"#define VEC_GENERIC_OPERATION(op, sign, bits, size) \\\n"
 	"	do { \\\n"
 	"		int i; \\\n"