changeset 23:e26874655738

*: huge refactor, new major release (hahaha). I keep finding things that are broken... The problem NOW was that vec would unintentionally build some functions with extended instruction sets, which is Bad and meant that, for all intents and purposes, the CPU detection was completely broken. vec is no longer header-only either. Boohoo. However, this gives vec a lot more flexibility, since we no longer want or need to care about C++ crap. The NEON and Altivec implementations have not been updated, which means they won't compile; hence they're commented out in the CMake build file.
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 02:52:40 -0500
parents fbcd3fa6f8fc
children e49e70f7012f
files .hgignore CMakeLists.txt README include/vec/cpu.h include/vec/impl/align.h include/vec/impl/arm/neon.h include/vec/impl/cpu.h include/vec/impl/fallback.h include/vec/impl/generic.h include/vec/impl/integer.h.in include/vec/impl/ppc/altivec.h include/vec/impl/x86/avx2.h include/vec/impl/x86/avx512f.h include/vec/impl/x86/mmx.h include/vec/impl/x86/sse2.h include/vec/impl/x86/sse41.h include/vec/types.h.in include/vec/vec.h src/cpu.c src/impl/arm/neon.c src/impl/fallback.c src/impl/generic.c src/impl/ppc/altivec.c src/impl/x86/avx2.c src/impl/x86/avx512f.c src/impl/x86/mmx.c src/impl/x86/sse2.c src/impl/x86/sse41.c src/vec.c test/CMakeLists.txt test/Makefile.ppc test/Makefile.template test/Makefile.x86 test/test.cc test/test_arith.h test/test_shift.h
diffstat 36 files changed, 4205 insertions(+), 3881 deletions(-)
line diff
--- a/.hgignore	Sat Nov 23 04:09:44 2024 +0000
+++ b/.hgignore	Sun Nov 24 02:52:40 2024 -0500
@@ -40,3 +40,4 @@
 
 # Build dir
 ^build/
+^test/build/
--- a/CMakeLists.txt	Sat Nov 23 04:09:44 2024 +0000
+++ b/CMakeLists.txt	Sun Nov 24 02:52:40 2024 -0500
@@ -1,49 +1,114 @@
 cmake_minimum_required(VERSION 3.23)
 
-project(vec VERSION 2.0.0 DESCRIPTION "a tiny C99 SIMD vector library")
-
-add_library(vec SHARED src/vec.c)
+project(vec VERSION 3.0.0 DESCRIPTION "a tiny C99 SIMD vector library")
 
-target_sources(vec PUBLIC
-	$<INSTALL_INTERFACE:vec/vec.h>
-	$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/vec.h>
-	$<INSTALL_INTERFACE:vec/impl/integer.h>
-	$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/vec/impl/integer.h>
-)
+add_library(vec SHARED "src/vec.c;src/cpu.c;src/impl/generic.c;src/impl/fallback.c")
 
 include(CheckCCompilerFlag)
 
 if(MSVC)
-	# TODO ?
+	# Untested!
+
+	if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+		set(COMPILER_HAS_MMX OFF)
+	else()
+		set(COMPILER_HAS_MMX ON)
+		set(COMPILER_MMX_FLAGS "") # none?
+	endif()
+	check_c_compiler_flag("/arch:SSE2" COMPILER_HAS_SSE2)
+	if(COMPILER_HAS_SSE2)
+		set(COMPILER_SSE2_FLAGS "/arch:SSE2")
+	endif()
+	check_c_compiler_flag("/arch:SSE4.2" COMPILER_HAS_SSE41)
+	if(COMPILER_HAS_SSE41)
+		set(COMPILER_SSE41_FLAGS "/arch:SSE4.2")
+	endif()
+	check_c_compiler_flag("/arch:AVX2" COMPILER_HAS_AVX2)
+	if(COMPILER_HAS_AVX2)
+		set(COMPILER_AVX2_FLAGS "/arch:AVX2")
+	endif()
+	check_c_compiler_flag("/arch:AVX512" COMPILER_HAS_AVX512F)
+	if(COMPILER_HAS_AVX512F)
+		set(COMPILER_AVX512F_FLAGS "/arch:AVX512")
+	endif()
+	# TODO we have to try_compile to detect NEON
 else()
-	check_c_compiler_flag("-maltivec" COMPILER_HAS_ALTIVEC)
-	if(COMPILER_HAS_ALTIVEC)
-		target_compile_options(vec PRIVATE "-maltivec")
-	endif()
+	#check_c_compiler_flag("-maltivec" COMPILER_HAS_ALTIVEC)
+	#if(COMPILER_HAS_ALTIVEC)
+	#	set(COMPILER_ALTIVEC_FLAGS "-maltivec")
+	#endif()
+	#check_c_compiler_flag("-mfpu=neon" COMPILER_HAS_NEON)
+	#if(COMPILER_HAS_NEON)
+	#	set(COMPILER_NEON_FLAGS "-mfpu=neon")
+	#endif()
 	check_c_compiler_flag("-mmmx" COMPILER_HAS_MMX)
 	if(COMPILER_HAS_MMX)
-		target_compile_options(vec PRIVATE "-mmmx")
+		set(COMPILER_MMX_FLAGS "-mmmx")
 	endif()
 	check_c_compiler_flag("-msse2" COMPILER_HAS_SSE2)
 	if(COMPILER_HAS_SSE2)
-		target_compile_options(vec PRIVATE "-msse2")
+		set(COMPILER_SSE2_FLAGS "-msse2")
 	endif()
 	check_c_compiler_flag("-msse4.1" COMPILER_HAS_SSE41)
 	if(COMPILER_HAS_SSE41)
-		target_compile_options(vec PRIVATE "-msse4.1")
+		set(COMPILER_SSE41_FLAGS "-msse4.1")
 	endif()
 	check_c_compiler_flag("-mavx2" COMPILER_HAS_AVX2)
 	if(COMPILER_HAS_AVX2)
-		target_compile_options(vec PRIVATE "-mavx2")
+		set(COMPILER_AVX2_FLAGS "-mavx2")
 	endif()
 	check_c_compiler_flag("-mavx512f" COMPILER_HAS_AVX512F)
 	if(COMPILER_HAS_AVX512F)
-		target_compile_options(vec PRIVATE "-mavx512f")
+		set(COMPILER_AVX512F_FLAGS "-mavx512f")
 	endif()
 endif()
 
+if(COMPILER_HAS_ALTIVEC)
+	target_sources(vec PRIVATE "src/impl/ppc/altivec.c")
+	set_source_files_properties("src/impl/ppc/altivec.c" PROPERTIES COMPILE_FLAGS "${COMPILER_ALTIVEC_FLAGS}")
+	target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_ALTIVEC")
+endif()
+
+if(COMPILER_HAS_NEON)
+	target_sources(vec PRIVATE "src/impl/arm/neon.c")
+	set_source_files_properties("src/impl/arm/neon.c" PROPERTIES COMPILE_FLAGS "${COMPILER_NEON_FLAGS}")
+	target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_NEON")
+endif()
+
+if(COMPILER_HAS_MMX)
+	target_sources(vec PRIVATE "src/impl/x86/mmx.c")
+	set_source_files_properties("src/impl/x86/mmx.c" PROPERTIES COMPILE_FLAGS "${COMPILER_MMX_FLAGS}")
+	target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_MMX")
+endif()
+
+if(COMPILER_HAS_SSE2)
+	target_sources(vec PRIVATE "src/impl/x86/sse2.c")
+	set_source_files_properties("src/impl/x86/sse2.c" PROPERTIES COMPILE_FLAGS "${COMPILER_SSE2_FLAGS}")
+	target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_SSE2")
+endif()
+
+if(COMPILER_HAS_SSE41)
+	target_sources(vec PRIVATE "src/impl/x86/sse41.c")
+	set_source_files_properties("src/impl/x86/sse41.c" PROPERTIES COMPILE_FLAGS "${COMPILER_SSE41_FLAGS}")
+	target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_SSE41")
+endif()
+
+if(COMPILER_HAS_AVX2)
+	target_sources(vec PRIVATE "src/impl/x86/avx2.c")
+	set_source_files_properties("src/impl/x86/avx2.c" PROPERTIES COMPILE_FLAGS "${COMPILER_AVX2_FLAGS}")
+	target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_AVX2")
+endif()
+
+if(COMPILER_HAS_AVX512F)
+	target_sources(vec PRIVATE "src/impl/x86/avx512f.c")
+	set_source_files_properties("src/impl/x86/avx512f.c" PROPERTIES COMPILE_FLAGS "${COMPILER_AVX512F_FLAGS}")
+	target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_AVX512F")
+endif()
+
+
 #########################################################################
-# integer types
+# integer types; it's nice to accommodate for older broken systems that
+# may not have stdint.h.
 
 include(CheckTypeSize)
 
@@ -61,6 +126,7 @@
 check_type_size("long"      LONG_SIZE      LANGUAGE C)
 check_type_size("long long" LONG_LONG_SIZE LANGUAGE C)
 check_type_size("uintptr_t" UINTPTR_T_SIZE LANGUAGE C)
+check_type_size("size_t"    SIZE_T_SIZE LANGUAGE C)
 
 if(INT16_T_SIZE EQUAL 2)
 	set(SIZE16 "int16_t")
@@ -68,6 +134,8 @@
 	set(SIZE16 "short")
 elseif(INT_SIZE EQUAL 2)
 	set(SIZE16 "int")
+else()
+	message(FATAL_ERROR "Failed to find a signed 16-bit integer type.")
 endif()
 
 if(UINT16_T_SIZE EQUAL 2)
@@ -78,6 +146,8 @@
 	set(USIZE16 "unsigned short")
 elseif(INT_SIZE EQUAL 2)
 	set(USIZE16 "unsigned int")
+else()
+	message(FATAL_ERROR "Failed to find an unsigned 16-bit integer type.")
 endif()
 
 if(INT32_T_SIZE EQUAL 4)
@@ -88,6 +158,8 @@
 	set(SIZE32 "int")
 elseif(LONG_SIZE EQUAL 4)
 	set(SIZE32 "long")
+else()
+	message(FATAL_ERROR "Failed to find a signed 32-bit integer type.")
 endif()
 
 if(UINT32_T_SIZE EQUAL 4)
@@ -100,6 +172,8 @@
 	set(USIZE32 "unsigned int")
 elseif(LONG_SIZE EQUAL 4)
 	set(USIZE32 "unsigned long")
+else()
+	message(FATAL_ERROR "Failed to find an unsigned 32-bit integer type.")
 endif()
 
 if(INT64_T_SIZE EQUAL 8)
@@ -112,6 +186,8 @@
 	set(SIZE64 "long")
 elseif(LONG_LONG_SIZE EQUAL 8)
 	set(SIZE64 "long long")
+else()
+	message(FATAL_ERROR "Failed to find a signed 64-bit integer type.")
 endif()
 
 if(UINT64_T_SIZE EQUAL 8)
@@ -126,28 +202,46 @@
 	set(USIZE64 "unsigned long")
 elseif(LONG_LONG_SIZE EQUAL 8)
 	set(USIZE64 "unsigned long long")
+else()
+	message(FATAL_ERROR "Failed to find an unsigned 64-bit integer type.")
 endif()
 
 if(CMAKE_SIZEOF_VOID_P EQUAL UINTPTR_T_SIZE)
 	set(USIZEPTR "uintptr_t")
-elseif(CMAKE_SIZEOF_VOID_P EQUAL 1)
+elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 1)
 	set(USIZEPTR "unsigned char")
-elseif(CMAKE_SIZEOF_VOID_P EQUAL 2)
+elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 2)
 	set(USIZEPTR "${USIZE16}")
-elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 4)
 	set(USIZEPTR "${USIZE32}")
-elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
+elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 8)
 	set(USIZEPTR "${USIZE64}")
+else()
+	message(FATAL_ERROR "Failed to find an integer type that can fit a pointer.")
 endif()
 
-configure_file(include/vec/impl/integer.h.in include/vec/impl/integer.h @ONLY)
+if(NOT SIZE_T_SIZE EQUAL 0 AND NOT SIZE_T_SIZE EQUAL "")
+	set(USIZESIZE "size_t")
+else()
+	# should be close enough I guess
+	set(USIZESIZE "${USIZEPTR}")
+endif()
 
-target_compile_definitions(vec PRIVATE "VEC_HAVE_IMPL_INTEGER_H")
+configure_file(include/vec/types.h.in include/vec/types.h @ONLY)
 
 #########################################################################
 
+target_sources(vec PUBLIC
+	$<INSTALL_INTERFACE:vec/vec.h>
+	$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/vec.h>
+	$<INSTALL_INTERFACE:vec/types.h>
+	$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/vec/types.h>
+	$<INSTALL_INTERFACE:vec/cpu.h>
+	$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/cpu.h>
+)
+
 target_compile_features(vec PRIVATE $<IF:$<COMPILE_FEATURES:c_std_11>,c_std_11,c_std_99>)
-target_include_directories(vec PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_BINARY_DIR}/include/vec")
+target_include_directories(vec PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_BINARY_DIR}/include")
 
 # Installing
 
--- a/README	Sat Nov 23 04:09:44 2024 +0000
+++ b/README	Sun Nov 24 02:52:40 2024 -0500
@@ -1,4 +1,4 @@
-vec - a tiny SIMD vector header-only library written in C99
+vec - a tiny SIMD vector library written in C99
 
 it comes with an extremely basic API that is similar to other intrinsics
 libraries; each type is in the exact same format:
@@ -12,6 +12,13 @@
 on processors where vec has an implementation and falls back to array-based
 implementations where they are not.
 
+to initialize vec, you MUST call `vec_init()' when your program starts up.
+
+note that `vec_init()' is NOT thread-safe, and things can and will
+blow up if you call it simultaneously from different threads (i.e. you
+try to only initialize it when you need to... please just initialize
+it on startup so you don't have to worry about that!!!)
+
 all of these have many operations that are prefixed with the name of the
 type and an underscore, for example:
 
@@ -106,10 +113,3 @@
 		the result vector if the value in `vec1' is greater
 		than or equal to the corresponding value in `vec2',
 		else all of the bits are turned off.
-
-to initialize vec, you MUST call `vec_init()' when your programs starts up.
-
-note that `vec_init()' is NOT thread-safe, and things can and will
-blow up if you call it simultaneously from different threads (i.e. you
-try to only initialize it when you need to... please just initialize
-it on startup so you don't have to worry about that!!!)
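The README now front-loads the init-then-use flow. A minimal usage sketch of that flow follows; vec_init() is the only function named in the text above, while the vint32x4 type and the _splat/_add operations are assumptions that simply follow the "type name plus underscore" convention the README describes.

#include "vec/vec.h"

int main(void)
{
	vec_init(); /* must be called once at startup; not thread-safe */

	/* assumed names, following the type-prefix naming convention */
	vint32x4 a = vint32x4_splat(2);  /* all four lanes = 2 */
	vint32x4 b = vint32x4_splat(3);  /* all four lanes = 3 */
	vint32x4 c = vint32x4_add(a, b); /* lane-wise sum: 5, 5, 5, 5 */

	(void)c;
	return 0;
}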
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/cpu.h	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,50 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+// Sure, this can be a public API.
+
+#ifndef VEC_CPU_H_
+#define VEC_CPU_H_
+
+#include "vec/vec.h"
+
+enum {
+	VEC_CPU_HAS_ALTIVEC = (1 << 0),
+	VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1),
+	VEC_CPU_HAS_MMX = (1 << 2),
+	VEC_CPU_HAS_SSE = (1 << 3),
+	VEC_CPU_HAS_SSE2 = (1 << 4),
+	VEC_CPU_HAS_SSE3 = (1 << 5),
+	VEC_CPU_HAS_SSE41 = (1 << 6),
+	VEC_CPU_HAS_SSE42 = (1 << 7),
+	VEC_CPU_HAS_AVX = (1 << 8),
+	VEC_CPU_HAS_AVX2 = (1 << 9),
+	VEC_CPU_HAS_AVX512F = (1 << 10),
+	VEC_CPU_HAS_NEON = (1 << 11),
+};
+
+// NOT thread-safe.
+vec_uint32 vec_get_CPU_features(void);
+
+#endif /* VEC_CPU_H_ */
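Since cpu.h is now a public header, callers can inspect the same feature bits the library uses. A short example using only the names declared above (note the documented caveat that vec_get_CPU_features() is not thread-safe):

#include <stdio.h>
#include "vec/cpu.h"

static void report_simd_support(void)
{
	vec_uint32 feat = vec_get_CPU_features(); /* NOT thread-safe */

	if (feat & VEC_CPU_HAS_SSE2)
		printf("SSE2 supported\n");
	if (feat & VEC_CPU_HAS_AVX2)
		printf("AVX2 supported\n");
	if (feat & VEC_CPU_HAS_AVX512F)
		printf("AVX-512F supported\n");
	if (feat & VEC_CPU_HAS_NEON)
		printf("NEON supported\n");
	if (feat & VEC_CPU_HAS_ALTIVEC)
		printf("AltiVec supported\n");
}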
--- a/include/vec/impl/align.h	Sat Nov 23 04:09:44 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,267 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in C99
- * 
- * Copyright (c) 2024 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-#ifndef VEC_IMPL_ALIGN_H_
-#define VEC_IMPL_ALIGN_H_
-
-// Array alignment macros
-
-#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
-# define VEC_ALIGNAS(x) alignas(x)
-#elif (__STDC_VERSION__ >= 201112L)
-# define VEC_ALIGNAS(x) _Alignas(x)
-#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
-# define VEC_ALIGNAS(x) __attribute__((__aligned__(x)))
-#endif
-
-/* the alignment must be specified in bytes and must be a multiple of the
- * type size. it is always assumed that the type will be on a boundary of
- * its size, which may or may not be true */
-#ifdef VEC_ALIGNAS
-# define VEC_ALIGNED_ARRAY(type, var, length, align) \
-	VEC_ALIGNAS(align) type var[length]
-# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
-	(sizeof(var))
-#else
-// use unions to get an aligned offset without triggering strict aliasing
-# define VEC_ALIGNED_ARRAY(type, var, length, align) \
-	VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \
-	union vec_aligned_union_##var##_ { \
-		type arr[length]; \
-		unsigned char bytes[sizeof(type) * length]; \
-	}; \
-	unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \
-	type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \
-	VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
-# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
-	(sizeof(vec_unaligned_##var##_) - (align - 1))
-#endif
-
-#define VEC_ALIGNED_ARRAY_LENGTH(var) \
-	(VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var))
-
-//////////////////////////////////////////////////////////////////////////////////////
-// predefined variants for each vector type
-
-//////////////////////////////////////////////////////////////////////////////////////
-// 16-bit
-
-#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT)
-#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT)
-#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT)
-#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0)
-
-#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT)
-#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT)
-#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT)
-#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0)
-
-//////////////////////////////////////////////////////////////////////////////////////
-// 32-bit
-
-#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT)
-#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT)
-#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT)
-#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0)
-
-#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT)
-#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT)
-#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT)
-#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0)
-
-#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT)
-#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT)
-#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT)
-#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0)
-
-#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT)
-#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT)
-#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT)
-#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0)
-
-//////////////////////////////////////////////////////////////////////////////////////
-// 64-bit
-
-#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT)
-#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
-#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT)
-#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0)
-
-#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT)
-#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
-#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT)
-#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0)
-
-#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT)
-#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
-#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT)
-#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0)
-
-#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT)
-#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
-#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT)
-#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0)
-
-#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT)
-#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
-#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT)
-#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0)
-
-#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT)
-#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
-#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT)
-#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0)
-
-//////////////////////////////////////////////////////////////////////////////////////
-// 128-bit
-
-#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
-
-#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
-#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0)
-
-#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
-
-#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
-
-#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
-
-#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
-
-#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
-
-#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
-
-//////////////////////////////////////////////////////////////////////////////////////
-// 256-bit
-
-#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT)
-#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
-#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT)
-#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0)
-
-#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT)
-#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
-#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT)
-#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
-
-#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT)
-#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
-#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT)
-#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0)
-
-#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT)
-#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
-#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT)
-#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0)
-
-#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT)
-#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
-#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT)
-#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0)
-
-#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT)
-#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
-#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT)
-#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
-
-#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT)
-#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
-#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT)
-#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0)
-
-#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT)
-#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
-#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT)
-#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0)
-
-//////////////////////////////////////////////////////////////////////////////////////
-// 512-bit
-
-#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT)
-#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
-#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT)
-#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0)
-
-#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT)
-#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
-#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT)
-#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
-
-#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT)
-#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
-#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT)
-#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0)
-
-#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT)
-#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
-#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT)
-#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0)
-
-#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT)
-#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
-#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT)
-#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0)
-
-#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT)
-#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
-#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT)
-#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
-
-#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT)
-#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
-#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT)
-#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0)
-
-#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT)
-#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
-#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT)
-#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0)
-
-//////////////////////////////////////////////////////////////////////////////////////
-
-#endif /* VEC_IMPL_ALIGN_H_ */
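For reference, the deleted align.h above provided stack-allocated aligned arrays either through alignas() or, on older compilers, through the union-and-round-up fallback. A short illustration of how those (now removed) macros were meant to be used; the vint32x4_load_aligned name is an assumption following the library's naming scheme.

#include "vec/vec.h"

void old_alignment_example(void)
{
	/* declares tmp as a vec_int32[4] aligned to VINT32x4_ALIGNMENT,
	 * via alignas where available or the union fallback otherwise */
	VINT32x4_ALIGNED_ARRAY(tmp);

	for (int i = 0; i < 4; i++)
		tmp[i] = i;

	/* VINT32x4_PTR_ALIGNED(tmp) holds here, so an aligned load is safe */
	vint32x4 v = vint32x4_load_aligned(tmp);
	(void)v;
}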
--- a/include/vec/impl/arm/neon.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/arm/neon.h	Sun Nov 24 02:52:40 2024 -0500
@@ -25,465 +25,22 @@
 #ifndef VEC_IMPL_ARM_NEON_H_
 #define VEC_IMPL_ARM_NEON_H_
 
-#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vld1_##sign##bits(in); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		vstore_lane_##bits(sign, vec.neon, out); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_neon_load_aligned, \
-		v##sign##int##bits##x##size##_neon_load_aligned, \
-		v##sign##int##bits##x##size##_neon_store_aligned, \
-		v##sign##int##bits##x##size##_neon_store_aligned, \
-		v##sign##int##bits##x##size##_neon_add, \
-		v##sign##int##bits##x##size##_neon_sub, \
-		v##sign##int##bits##x##size##_neon_mul, \
-		/* .div = */ NULL, \
-		/* .avg = */ NULL, \
-		v##sign##int##bits##x##size##_neon_and, \
-		v##sign##int##bits##x##size##_neon_or, \
-		v##sign##int##bits##x##size##_neon_xor, \
-		/* .not = */ NULL, \
-		v##sign##int##bits##x##size##_neon_lshift, \
-		/* .rshift = */ NULL, \
-		/* .lrshift = */ NULL, \
-	};
-
-#define VEC_DEFINE_OPERATIONS(bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
-
-// Ok, we'll start out with the 64-bit types.
+#include "vec/vec.h"
 
-#define vadd_8  vadd_s8
-#define vadd_16 vadd_s16
-#define vadd_32 vadd_s32
-#define vsub_8  vsub_s8
-#define vsub_16 vsub_s16
-#define vsub_32 vsub_s32
-#define vmul_8  vmul_s8
-#define vmul_16 vmul_s16
-#define vmul_32 vmul_s32
-#define vshl_8  vshl_s8
-#define vshl_16 vshl_s16
-#define vshl_32 vshl_s32
-#define veor_8  veor_s8
-#define veor_16 veor_s16
-#define veor_32 veor_s32
-#define vorr_8  vorr_s8
-#define vorr_16 vorr_s16
-#define vorr_32 vorr_s32
-#define vand_8  vand_s8
-#define vand_16 vand_s16
-#define vand_32 vand_s32
-#define vld1_8  vld1_s8
-#define vld1_16 vld1_s16
-#define vld1_32 vld1_s32
-#define vget_lane_8  vget_lane_s8
-#define vget_lane_16 vget_lane_s16
-#define vget_lane_32 vget_lane_s32
-#define vstore_lane_8(sign, vec, out) \
-	do { \
-		out[0] = vget_lane_##sign##8(vec, 0); \
-		out[1] = vget_lane_##sign##8(vec, 1); \
-		out[2] = vget_lane_##sign##8(vec, 2); \
-		out[3] = vget_lane_##sign##8(vec, 3); \
-		out[4] = vget_lane_##sign##8(vec, 4); \
-		out[5] = vget_lane_##sign##8(vec, 5); \
-		out[6] = vget_lane_##sign##8(vec, 6); \
-		out[7] = vget_lane_##sign##8(vec, 7); \
-	} while (0)
-#define vstore_lane_16(sign, vec, out) \
-	do { \
-		out[0] = vget_lane_##sign##16(vec, 0); \
-		out[1] = vget_lane_##sign##16(vec, 1); \
-		out[2] = vget_lane_##sign##16(vec, 2); \
-		out[3] = vget_lane_##sign##16(vec, 3); \
-	} while (0)
-#define vstore_lane_32(sign, vec, out) \
-	do { \
-		out[0] = vget_lane_##sign##32(vec, 0); \
-		out[1] = vget_lane_##sign##32(vec, 1); \
-	} while (0)
-#define vreinterpret_8_u8(x) vreinterpret_s8_u8(x)
-#define vreinterpret_16_u16(x) vreinterpret_s16_u16(x)
-#define vreinterpret_32_u32(x) vreinterpret_s32_u32(x)
-
-VEC_DEFINE_OPERATIONS(8, 8)
-VEC_DEFINE_OPERATIONS(16, 4)
-VEC_DEFINE_OPERATIONS(32, 2)
-
-#undef vadd_8
-#undef vadd_16
-#undef vadd_32
-#undef vsub_8
-#undef vsub_16
-#undef vsub_32
-#undef vmul_8
-#undef vmul_16
-#undef vmul_32
-#undef vshl_8
-#undef vshl_16
-#undef vshl_32
-#undef veor_8
-#undef veor_16
-#undef veor_32
-#undef vorr_8
-#undef vorr_16
-#undef vorr_32
-#undef vand_8
-#undef vand_16
-#undef vand_32
-#undef vld1_8
-#undef vld1_16
-#undef vld1_32
-#undef vget_lane_8 
-#undef vget_lane_16
-#undef vget_lane_32
-#undef vstore_lane_8
-#undef vstore_lane_16
-#undef vstore_lane_32
-#undef vreinterpret_8_u8
-#undef vreinterpret_16_u16
-#undef vreinterpret_32_u32
-
-///////////////////////////////////////////////////////////////////////////////
-// 128-bit
-
-// Now we can go ahead and do the 128-bit ones.
-
-// NEON doesn't have native 64-bit multiplication, so we have
-// to do it ourselves
-static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b)
-{
-    const uint32x2_t ac = vreinterpret_u32_s32(vmovn_s64(a));
-    const uint32x2_t pr = vreinterpret_u32_s32(vmovn_s64(b));
-
-    const int32x4_t hi = vmulq_s32(vreinterpretq_s32_s64(b), vreinterpretq_s32_s64(a));
-
-    return vreinterpretq_s64_u64(vmlal_u32(vreinterpretq_u64_s64(vshlq_n_s64(vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s32(hi))), 32)), ac, pr));
-}
-
-static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b)
-{
-    const uint32x2_t ac = vmovn_u64(a);
-    const uint32x2_t pr = vmovn_u64(b);
-
-    const uint32x4_t hi = vmulq_u32(vreinterpretq_u32_u64(b), vreinterpretq_u32_u64(a));
-
-    return vmlal_u32(vshlq_n_u64(vpaddlq_u32(hi), 32), ac, pr);
-}
+extern const vint8x8_impl   vint8x8_impl_neon;
+extern const vint16x4_impl  vint16x4_impl_neon;
+extern const vint32x2_impl  vint32x2_impl_neon;
+extern const vuint8x8_impl  vuint8x8_impl_neon;
+extern const vuint16x4_impl vuint16x4_impl_neon;
+extern const vuint32x2_impl vuint32x2_impl_neon;
 
-#define vadd_8  vaddq_s8
-#define vadd_16 vaddq_s16
-#define vadd_32 vaddq_s32
-#define vadd_64 vaddq_s64
-#define vadd_u8  vaddq_u8
-#define vadd_u16 vaddq_u16
-#define vadd_u32 vaddq_u32
-#define vadd_u64 vaddq_u64
-#define vsub_8  vsubq_s8
-#define vsub_16 vsubq_s16
-#define vsub_32 vsubq_s32
-#define vsub_64 vsubq_s64
-#define vsub_u8  vsubq_u8
-#define vsub_u16 vsubq_u16
-#define vsub_u32 vsubq_u32
-#define vsub_u64 vsubq_u64
-#define vmul_8  vmulq_s8
-#define vmul_16 vmulq_s16
-#define vmul_32 vmulq_s32
-#define vmul_64 vmulq_s64
-#define vmul_u8  vmulq_u8
-#define vmul_u16 vmulq_u16
-#define vmul_u32 vmulq_u32
-#define vmul_u64 vmulq_u64
-#define vshl_8  vshlq_s8
-#define vshl_16 vshlq_s16
-#define vshl_32 vshlq_s32
-#define vshl_64 vshlq_s64
-#define vshl_u8  vshlq_u8
-#define vshl_u16 vshlq_u16
-#define vshl_u32 vshlq_u32
-#define vshl_u64 vshlq_u64
-#define veor_8  veorq_s8
-#define veor_16 veorq_s16
-#define veor_32 veorq_s32
-#define veor_64 veorq_s64
-#define veor_u8  veorq_u8
-#define veor_u16 veorq_u16
-#define veor_u32 veorq_u32
-#define veor_u64 veorq_u64
-#define vorr_8  vorrq_s8
-#define vorr_16 vorrq_s16
-#define vorr_32 vorrq_s32
-#define vorr_64 vorrq_s64
-#define vorr_u8  vorrq_u8
-#define vorr_u16 vorrq_u16
-#define vorr_u32 vorrq_u32
-#define vorr_u64 vorrq_u64
-#define vand_8  vandq_s8
-#define vand_16 vandq_s16
-#define vand_32 vandq_s32
-#define vand_64 vandq_s64
-#define vand_u8  vandq_u8
-#define vand_u16 vandq_u16
-#define vand_u32 vandq_u32
-#define vand_u64 vandq_u64
-#define vld1_8  vld1q_s8
-#define vld1_16 vld1q_s16
-#define vld1_32 vld1q_s32
-#define vld1_64 vld1q_s64
-#define vld1_u8  vld1q_u8
-#define vld1_u16 vld1q_u16
-#define vld1_u32 vld1q_u32
-#define vld1_u64 vld1q_u64
-#define vget_lane_8  vgetq_lane_s8
-#define vget_lane_16 vgetq_lane_s16
-#define vget_lane_32 vgetq_lane_s32
-#define vget_lane_64 vgetq_lane_s64
-#define vget_lane_u8  vgetq_lane_u8
-#define vget_lane_u16 vgetq_lane_u16
-#define vget_lane_u32 vgetq_lane_u32
-#define vget_lane_u64 vgetq_lane_u64
-#define vstore_lane_8(sign, vec, out) \
-	do { \
-		out[0] = vget_lane_##sign##8(vec, 0); \
-		out[1] = vget_lane_##sign##8(vec, 1); \
-		out[2] = vget_lane_##sign##8(vec, 2); \
-		out[3] = vget_lane_##sign##8(vec, 3); \
-		out[4] = vget_lane_##sign##8(vec, 4); \
-		out[5] = vget_lane_##sign##8(vec, 5); \
-		out[6] = vget_lane_##sign##8(vec, 6); \
-		out[7] = vget_lane_##sign##8(vec, 7); \
-		out[8] = vget_lane_##sign##8(vec, 8); \
-		out[9] = vget_lane_##sign##8(vec, 9); \
-		out[10] = vget_lane_##sign##8(vec, 10); \
-		out[11] = vget_lane_##sign##8(vec, 11); \
-		out[12] = vget_lane_##sign##8(vec, 12); \
-		out[13] = vget_lane_##sign##8(vec, 13); \
-		out[14] = vget_lane_##sign##8(vec, 14); \
-		out[15] = vget_lane_##sign##8(vec, 15); \
-	} while (0)
-#define vstore_lane_16(sign, vec, out) \
-	do { \
-		out[0] = vget_lane_##sign##16(vec, 0); \
-		out[1] = vget_lane_##sign##16(vec, 1); \
-		out[2] = vget_lane_##sign##16(vec, 2); \
-		out[3] = vget_lane_##sign##16(vec, 3); \
-		out[4] = vget_lane_##sign##16(vec, 4); \
-		out[5] = vget_lane_##sign##16(vec, 5); \
-		out[6] = vget_lane_##sign##16(vec, 6); \
-		out[7] = vget_lane_##sign##16(vec, 7); \
-	} while (0)
-#define vstore_lane_32(sign, vec, out) \
-	do { \
-		out[0] = vget_lane_##sign##32(vec, 0); \
-		out[1] = vget_lane_##sign##32(vec, 1); \
-		out[2] = vget_lane_##sign##32(vec, 2); \
-		out[3] = vget_lane_##sign##32(vec, 3); \
-	} while (0)
-#define vstore_lane_64(sign, vec, out) \
-	do { \
-		out[0] = vget_lane_##sign##64(vec, 0); \
-		out[1] = vget_lane_##sign##64(vec, 1); \
-	} while (0)
-#define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x)
-#define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x)
-#define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x)
-#define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x)
-
-#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vld1_##sign##bits(in); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		vstore_lane_##bits(sign, vec.neon, out); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_neon_load_aligned, \
-		v##sign##int##bits##x##size##_neon_load_aligned, \
-		v##sign##int##bits##x##size##_neon_store_aligned, \
-		v##sign##int##bits##x##size##_neon_store_aligned, \
-		v##sign##int##bits##x##size##_neon_add, \
-		v##sign##int##bits##x##size##_neon_sub, \
-		v##sign##int##bits##x##size##_neon_mul, \
-		/* .div = */ NULL, \
-		/* .avg = */ NULL, \
-		v##sign##int##bits##x##size##_neon_and, \
-		v##sign##int##bits##x##size##_neon_or, \
-		v##sign##int##bits##x##size##_neon_xor, \
-		/* .not = */ NULL, \
-		v##sign##int##bits##x##size##_neon_lshift, \
-		/* .rshift = */ NULL, \
-		/* .lrshift = */ NULL, \
-	};
-
-#define VEC_DEFINE_OPERATIONS(bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
-
-VEC_DEFINE_OPERATIONS(8, 16)
-VEC_DEFINE_OPERATIONS(16, 8)
-VEC_DEFINE_OPERATIONS(32, 4)
-VEC_DEFINE_OPERATIONS(64, 2)
-
-#undef vadd_8
-#undef vadd_16
-#undef vadd_32
-#undef vadd_64
-#undef vsub_8
-#undef vsub_16
-#undef vsub_32
-#undef vsub_64
-#undef vmul_8
-#undef vmul_16
-#undef vmul_32
-#undef vmul_64
-#undef vshl_8
-#undef vshl_16
-#undef vshl_32
-#undef vshl_64
-#undef veor_8
-#undef veor_16
-#undef veor_32
-#undef veor_64
-#undef vorr_8
-#undef vorr_16
-#undef vorr_32
-#undef vorr_64
-#undef vand_8
-#undef vand_16
-#undef vand_32
-#undef vand_64
-#undef vld1_8
-#undef vld1_16
-#undef vld1_32
-#undef vld1_64
-#undef vget_lane_8 
-#undef vget_lane_16
-#undef vget_lane_32
-#undef vget_lane_64
-#undef vstore_lane_8
-#undef vstore_lane_16
-#undef vstore_lane_32
-#undef vstore_lane_64
-
-#undef VEC_DEFINE_OPERATIONS
-#undef VEC_DEFINE_OPERATIONS_SIGN
+extern const vint8x16_impl  vint8x16_impl_neon;
+extern const vint16x8_impl  vint16x8_impl_neon;
+extern const vint32x4_impl  vint32x4_impl_neon;
+extern const vint64x2_impl  vint64x2_impl_neon;
+extern const vuint8x16_impl vuint8x16_impl_neon;
+extern const vuint16x8_impl vuint16x8_impl_neon;
+extern const vuint32x4_impl vuint32x4_impl_neon;
+extern const vuint64x2_impl vuint64x2_impl_neon;
 
 #endif /* VEC_IMPL_ARM_NEON_H_ */
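neon.h is now only a set of extern declarations; the actual code lives in src/impl/arm/neon.c, which the CMake changes above compile with per-file NEON flags. The sketch below shows the dispatch idea this enables: an impl table gets selected at runtime only when the CPU reports support. It is hypothetical, though; the real wiring lives in src/vec.c, which is not shown in this excerpt, and vint32x4_impl_dispatch and example_pick_impl are invented names.

#include "vec/vec.h"
#include "vec/cpu.h"
#ifdef VEC_COMPILER_HAS_NEON
# include "vec/impl/arm/neon.h"
#endif

/* invented for illustration; the real dispatch state is in src/vec.c */
static const vint32x4_impl *vint32x4_impl_dispatch;

void example_pick_impl(void)
{
	vec_uint32 feat = vec_get_CPU_features();

#ifdef VEC_COMPILER_HAS_NEON
	/* only reachable when neon.c was built at all, and only taken
	 * when the running CPU actually has NEON */
	if (feat & VEC_CPU_HAS_NEON) {
		vint32x4_impl_dispatch = &vint32x4_impl_neon;
		return;
	}
#endif
	(void)feat;
	/* otherwise leave the generic/fallback implementation in place */
}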
--- a/include/vec/impl/cpu.h	Sat Nov 23 04:09:44 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,512 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in C99
- * 
- * Copyright (c) 2024 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-#ifndef VEC_IMPL_CPU_H_
-#define VEC_IMPL_CPU_H_
-
-/* Detect CPU SIMD support. Much of this code was stolen from SDL.
- *
- * Simple DirectMedia Layer
- * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
-*/
-
-#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
-# include <sys/sysctl.h> // For AltiVec check
-#elif defined(__OpenBSD__) && defined(__powerpc__)
-# include <sys/types.h>
-# include <sys/sysctl.h> // For AltiVec check
-# include <machine/cpu.h>
-#elif defined(__FreeBSD__) && defined(__powerpc__)
-# include <machine/cpu.h>
-# include <sys/auxv.h>
-#elif defined(__ALTIVEC__)
-# include <signal.h>
-# include <setjmp.h>
-#endif
-
-#ifdef __FreeBSD__
-# include <sys/param.h>
-#endif
-
-#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
-# include <unistd.h>
-# include <sys/types.h>
-# include <sys/stat.h>
-# include <fcntl.h>
-# include <elf.h>
-
-/*#include <asm/hwcap.h>*/
-# ifndef AT_HWCAP
-# define AT_HWCAP 16
-# endif
-# ifndef AT_PLATFORM
-#  define AT_PLATFORM 15
-# endif
-# ifndef HWCAP_NEON
-#  define HWCAP_NEON (1 << 12)
-# endif
-#endif
-
-static inline int vec_CPU_have_CPUID(void)
-{
-	int has_CPUID = 0;
-
-#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
-	__asm__ (
-"        pushfl                      # Get original EFLAGS             \n"
-"        popl    %%eax                                                 \n"
-"        movl    %%eax,%%ecx                                           \n"
-"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
-"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
-"        popfl                       # Replace current EFLAGS value    \n"
-"        pushfl                      # Get new EFLAGS                  \n"
-"        popl    %%eax               # Store new EFLAGS in EAX         \n"
-"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
-"        jz      1f                  # Processor=80486                 \n"
-"        movl    $1,%0               # We have CPUID support           \n"
-"1:                                                                    \n"
-	: "=m" (has_CPUID)
-	:
-	: "%eax", "%ecx"
-	);
-#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
-/* Technically, if this is being compiled under __x86_64__ then it has
-   CPUid by definition.  But it's nice to be able to prove it.  :)      */
-	__asm__ (
-"        pushfq                      # Get original EFLAGS             \n"
-"        popq    %%rax                                                 \n"
-"        movq    %%rax,%%rcx                                           \n"
-"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
-"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
-"        popfq                       # Replace current EFLAGS value    \n"
-"        pushfq                      # Get new EFLAGS                  \n"
-"        popq    %%rax               # Store new EFLAGS in EAX         \n"
-"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
-"        jz      1f                  # Processor=80486                 \n"
-"        movl    $1,%0               # We have CPUID support           \n"
-"1:                                                                    \n"
-	: "=m" (has_CPUID)
-	:
-	: "%rax", "%rcx"
-	);
-#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
-	__asm {
-		pushfd                      ; Get original EFLAGS
-		pop     eax
-		mov     ecx, eax
-		xor     eax, 200000h        ; Flip ID bit in EFLAGS
-		push    eax                 ; Save new EFLAGS value on stack
-		popfd                       ; Replace current EFLAGS value
-		pushfd                      ; Get new EFLAGS
-		pop     eax                 ; Store new EFLAGS in EAX
-		xor     eax, ecx            ; Can not toggle ID bit,
-		jz      done                ; Processor=80486
-		mov     has_CPUID,1         ; We have CPUID support
-done:
-	}
-#elif defined(_MSC_VER) && defined(_M_X64)
-	has_CPUID = 1;
-#elif defined(__sun) && defined(__i386)
-	__asm (
-"       pushfl                 \n"
-"       popl    %eax           \n"
-"       movl    %eax,%ecx      \n"
-"       xorl    $0x200000,%eax \n"
-"       pushl   %eax           \n"
-"       popfl                  \n"
-"       pushfl                 \n"
-"       popl    %eax           \n"
-"       xorl    %ecx,%eax      \n"
-"       jz      1f             \n"
-"       movl    $1,-8(%ebp)    \n"
-"1:                            \n"
-	);
-#elif defined(__sun) && defined(__amd64)
-	__asm (
-"       pushfq                 \n"
-"       popq    %rax           \n"
-"       movq    %rax,%rcx      \n"
-"       xorl    $0x200000,%eax \n"
-"       pushq   %rax           \n"
-"       popfq                  \n"
-"       pushfq                 \n"
-"       popq    %rax           \n"
-"       xorl    %ecx,%eax      \n"
-"       jz      1f             \n"
-"       movl    $1,-8(%rbp)    \n"
-"1:                            \n"
-	);
-#endif
-
-	return has_CPUID;
-}
-
-#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	__asm__ __volatile__( \
-		"        pushl %%ebx        \n" \
-		"        xorl %%ecx,%%ecx   \n" \
-		"        cpuid              \n" \
-		"        movl %%ebx, %%esi  \n" \
-		"        popl %%ebx         \n" \
-		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
-		: "a"(func))
-#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	__asm__ __volatile__( \
-		"        pushq %%rbx        \n" \
-		"        xorq %%rcx,%%rcx   \n" \
-		"        cpuid              \n" \
-		"        movq %%rbx, %%rsi  \n" \
-		"        popq %%rbx         \n" \
-		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
-		: "a"(func))
-#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	__asm { \
-		__asm mov eax, func \
-		__asm xor ecx, ecx \
-		__asm cpuid \
-		__asm mov a, eax \
-		__asm mov b, ebx \
-		__asm mov c, ecx \
-		__asm mov d, edx \
-	}
-#elif (defined(_MSC_VER) && defined(_M_X64))
-// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	do { \
-		int CPUInfo[4]; \
-		__cpuidex(CPUInfo, func, 0); \
-		a = CPUInfo[0]; \
-		b = CPUInfo[1]; \
-		c = CPUInfo[2]; \
-		d = CPUInfo[3]; \
-	} while (0)
-#else
-# define VEC_CPU_CPUID(func, a, b, c, d) \
-	do { \
-		a = b = c = d = 0; \
-		(void)a; \
-		(void)b; \
-		(void)c; \
-		(void)d; \
-	} while (0)
-#endif
-
-// ---------------------------------------------------------------
-
-static int vec_CPU_CPUIDFeatures[4];
-static int vec_CPU_CPUIDMaxFunction = 0;
-static int vec_CPU_OSSavesYMM = 0;
-static int vec_CPU_OSSavesZMM = 0;
-
-static inline void vec_CPU_get_CPUID_features(void)
-{
-	static int checked = 0;
-	if (!checked) {
-		checked = 1;
-		if (vec_CPU_have_CPUID()) {
-			int a, b, c, d;
-			VEC_CPU_CPUID(0, a, b, c, d);
-			vec_CPU_CPUIDMaxFunction = a;
-			if (vec_CPU_CPUIDMaxFunction >= 1) {
-				VEC_CPU_CPUID(1, a, b, c, d);
-				vec_CPU_CPUIDFeatures[0] = a;
-				vec_CPU_CPUIDFeatures[1] = b;
-				vec_CPU_CPUIDFeatures[2] = c;
-				vec_CPU_CPUIDFeatures[3] = d;
-
-				// Check to make sure we can call xgetbv
-				if (c & 0x08000000) {
-					// Call xgetbv to see if YMM (etc) register state is saved
-#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
-					__asm__(".byte 0x0f, 0x01, 0xd0"
-							: "=a"(a)
-							: "c"(0)
-							: "%edx");
-#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
-					a = (int)_xgetbv(0);
-#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
-					__asm {
-						xor ecx, ecx
-						_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
-						mov a, eax
-					}
-#endif
-					vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
-					vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
-				}
-			}
-		}
-	}
-}
-
-#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
-static jmp_buf vec_jmpbuf;
-static void vec_CPU_illegal_instruction(int sig)
-{
-	longjmp(vec_jmpbuf, 1);
-}
-#endif
-
-static int vec_CPU_have_ALTIVEC(void)
-{
-	volatile int altivec = 0;
-#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
-	int selectors[2] = {
-# ifdef __OpenBSD__
-		CTL_MACHDEP, CPU_ALTIVEC
-# else
-		CTL_HW, HW_VECTORUNIT
-# endif
-	};
-	int hasVectorUnit = 0;
-	vec_uintsize length = sizeof(hasVectorUnit);
-	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
-	if (!error)
-		altivec = (hasVectorUnit != 0);
-#elif defined(__FreeBSD__) && defined(__powerpc__)
-	unsigned long cpufeatures = 0;
-	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
-	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
-#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
-	void (*handler)(int sig);
-	handler = signal(SIGILL, vec_CPU_illegal_instruction);
-	if (!setjmp(vec_jmpbuf)) {
-		vector unsigned char vec;
-		vec_and(vec, vec);
-		altivec = 1;
-	}
-	signal(SIGILL, handler);
-#endif
-	return altivec;
-}
-
-static int vec_CPU_have_ALTIVEC_VSX(void)
-{
-	volatile int vsx = 0;
-#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
-# warning Compiling UNTESTED code for VSX.
-	void (*handler)(int sig);
-	handler = signal(SIGILL, vec_CPU_illegal_instruction);
-	if (!setjmp(vec_jmpbuf)) {
-		// this is completely untested
-		//__asm__ __volatile__("mtspr 256, %0\n\t"
-		//			 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
-		//vsx = 1;
-	}
-	signal(SIGILL, handler);
-#endif
-	return vsx;
-}
-
-#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000)
-#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000)
-#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000)
-#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001)
-#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000)
-#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000)
-#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))
-
-static inline int vec_CPU_have_AVX2(void)
-{
-	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
-		int a, b, c, d;
-		VEC_CPU_CPUID(7, a, b, c, d);
-		(void)a, (void)c, (void)d;
-		return b & 0x00000020; /* CPUID.7:EBX bit 5 = AVX2 */
-	}
-	return 0;
-}
-
-static inline int vec_CPU_have_AVX512F(void)
-{
-	if (vec_CPU_OSSavesZMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
-		int a, b, c, d;
-		VEC_CPU_CPUID(7, a, b, c, d);
-		(void)a, (void)c, (void)d;
-		return b & 0x00010000; /* CPUID.7:EBX bit 16 = AVX512F */
-	}
-	return 0;
-}
-
-#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
-static int readProcAuxvForNeon(void)
-{
-	int neon = 0;
-	int fd;
-
-	fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
-	if (fd >= 0) {
-		Elf32_auxv_t aux;
-		while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
-			if (aux.a_type == AT_HWCAP) {
-				neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
-				break;
-			}
-		}
-		close(fd);
-	}
-	return neon;
-}
-#endif
-
-static int vec_CPU_have_NEON(void)
-{
-/* Detecting NEON requires a privileged instruction on ARM, so you have to
-   query the OS kernel in a platform-specific way. :/ */
-#if defined(SDL_CPUINFO_DISABLED)
-	return 0; /* disabled */
-#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
-/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
-/* The constant seems to have been removed from some SDK headers, so define it ourselves. */
-#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
-#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
-#endif
-	/* All WinRT ARM devices are required to support NEON, but just in case. */
-	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
-#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
-	return 1; /* ARMv8 always has non-optional NEON support. */
-#elif defined(__VITA__)
-	return 1;
-#elif defined(__3DS__)
-	return 0;
-#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
-	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
-	return 1; /* all Apple ARMv7 chips and later have NEON. */
-#elif defined(__APPLE__)
-	return 0; /* assume anything else from Apple doesn't have NEON. */
-#elif !defined(__arm__)
-	return 0; /* not an ARM CPU at all. */
-#elif defined(__OpenBSD__)
-	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
-#elif defined(HAVE_ELF_AUX_INFO)
-	unsigned long hasneon = 0;
-	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
-		return 0;
-
-	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
-#elif defined(__QNXNTO__)
-	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
-#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
-	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
-#elif defined(__linux__)
-	return readProcAuxvForNeon();
-#elif defined(__ANDROID__)
-	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
-	{
-		AndroidCpuFamily cpu_family = android_getCpuFamily();
-		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
-			uint64_t cpu_features = android_getCpuFeatures();
-			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
-				return 1;
-			}
-		}
-		return 0;
-	}
-#elif defined(__RISCOS__)
-	/* Use the VFPSupport_Features SWI to access the MVFR registers */
-	{
-		_kernel_swi_regs regs;
-		regs.r[0] = 0;
-		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
-			if ((regs.r[2] & 0xFFF000) == 0x111000) {
-				return 1;
-			}
-		}
-		return 0;
-	}
-#else
-#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
-	return 0;
-#endif
-}
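On a typical glibc-based Linux/ARM system the getauxval() branch above is the one that fires; reduced to a standalone check it is simply (headers as on 32-bit ARM Linux):

	#include <sys/auxv.h>
	#include <asm/hwcap.h>

	int have_neon = (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;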
-
-enum {
-	VEC_CPU_HAS_ALTIVEC = (1 << 0),
-	VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1),
-	VEC_CPU_HAS_MMX = (1 << 2),
-	VEC_CPU_HAS_SSE = (1 << 3),
-	VEC_CPU_HAS_SSE2 = (1 << 4),
-	VEC_CPU_HAS_SSE3 = (1 << 5),
-	VEC_CPU_HAS_SSE41 = (1 << 6),
-	VEC_CPU_HAS_SSE42 = (1 << 7),
-	VEC_CPU_HAS_AVX = (1 << 8),
-	VEC_CPU_HAS_AVX2 = (1 << 9),
-	VEC_CPU_HAS_AVX512F = (1 << 10),
-	VEC_CPU_HAS_NEON = (1 << 11),
-};
-
-#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF)
-
-static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;
-
-static void vec_get_CPU_features(void)
-{
-	vec_CPU_get_CPUID_features();
-	vec_CPU_features = 0;
-	if (vec_CPU_have_ALTIVEC())
-		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
-	if (vec_CPU_have_ALTIVEC_VSX())
-		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
-	if (vec_CPU_have_MMX())
-		vec_CPU_features |= VEC_CPU_HAS_MMX;
-	if (vec_CPU_have_SSE())
-		vec_CPU_features |= VEC_CPU_HAS_SSE;
-	if (vec_CPU_have_SSE2())
-		vec_CPU_features |= VEC_CPU_HAS_SSE2;
-	if (vec_CPU_have_SSE3())
-		vec_CPU_features |= VEC_CPU_HAS_SSE3;
-	if (vec_CPU_have_SSE41())
-		vec_CPU_features |= VEC_CPU_HAS_SSE41;
-	if (vec_CPU_have_SSE42())
-		vec_CPU_features |= VEC_CPU_HAS_SSE42;
-	if (vec_CPU_have_AVX())
-		vec_CPU_features |= VEC_CPU_HAS_AVX;
-	if (vec_CPU_have_AVX2())
-		vec_CPU_features |= VEC_CPU_HAS_AVX2;
-	if (vec_CPU_have_AVX512F())
-		vec_CPU_features |= VEC_CPU_HAS_AVX512F;
-	if (vec_CPU_have_NEON())
-		vec_CPU_features |= VEC_CPU_HAS_NEON;
-}
-
-#endif /* VEC_IMPL_CPU_H_ */
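vec_CPU_features keeps the VEC_CPU_FEATURES_RESET sentinel until the first detection pass, so the public query (now living in src/cpu.c rather than this header) presumably refreshes it lazily. A minimal sketch under that assumption (the function name is hypothetical):

	static vec_uint32 vec_CPU_features_cached(void)
	{
		if (vec_CPU_features == VEC_CPU_FEATURES_RESET)
			vec_get_CPU_features(); /* fills in vec_CPU_features exactly once */
		return vec_CPU_features;
	}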
--- a/include/vec/impl/fallback.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/fallback.h	Sun Nov 24 02:52:40 2024 -0500
@@ -25,148 +25,29 @@
 #ifndef VEC_IMPL_FALLBACK_H_
 #define VEC_IMPL_FALLBACK_H_
 
-#include <string.h>
-
-// Fallback implementations - this is what an implementation should use if it
-// doesn't support a specific function. Note that the load_aligned and
-// store_aligned functions are not implemented here - this is on purpose;
-// every single implementation *needs* to provide both of them.
-
-#define VEC_FALLBACK_OPERATION(op, sign, csign, bits, size) \
-	do { \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \
-	\
-		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
-		v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \
-	\
-		for (int i = 0; i < size; i++) varr1[i] = (op); \
-	\
-		return v##sign##int##bits##x##size##_load_aligned(varr1); \
-	} while (0)
-
-#define VEC_FALLBACK_CMP(op, sign, csign, bits, size) \
-	VEC_FALLBACK_OPERATION((varr1[i] op varr2[i]) ? UINT##bits##_MAX : 0, sign, csign, bits, size)
-
-#define VEC_FALLBACK_SHIFT(op, sign, csign, bits, size) \
-	do { \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
-		VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \
-	\
-		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
-		vuint##bits##x##size##_store_aligned(vec2, varr2); \
-	\
-		for (int i = 0; i < size; i++) varr1[i] = (op); \
-	\
-		return v##sign##int##bits##x##size##_load_aligned(varr1); \
-	} while (0)
+#include "vec/vec.h"
 
 #define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x) \
-	{ \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
-		for (int i = 0; i < size; i++) arr[i] = x; \
-		return v##sign##int##bits##x##size##_load_aligned(arr); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]) \
-	{ \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
-		memcpy(arr, in, sizeof(vec_##sign##int##bits) * size); \
-		return v##sign##int##bits##x##size##_load_aligned(arr); \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
-		v##sign##int##bits##x##size##_store_aligned(vec, arr); \
-		memcpy(out, arr, sizeof(vec_##sign##int##bits) * size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \
-	{ \
-		return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)UINT##bits##_MAX)); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_CMP(<, sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_CMP(<=, sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_CMP(==, sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_CMP(>=, sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_CMP(>, sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_FALLBACK_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size); \
-	}
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]); \
+	void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
 
 #define VEC_DEFINE_FALLBACK_OPERATIONS(bits, size) \
 	VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( ,  , bits, size) \
@@ -202,9 +83,6 @@
 VEC_DEFINE_FALLBACK_OPERATIONS(32, 16)
 VEC_DEFINE_FALLBACK_OPERATIONS(64, 8)
 
-#undef VEC_FALLBACK_OPERATION
-#undef VEC_FALLBACK_CMP
-#undef VEC_FALLBACK_SHIFT
 #undef VEC_DEFINE_FALLBACK_OPERATIONS
 #undef VEC_DEFINE_FALLBACK_OPERATIONS_SIGN
 
--- a/include/vec/impl/generic.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/generic.h	Sun Nov 24 02:52:40 2024 -0500
@@ -27,114 +27,113 @@
 #ifndef VEC_IMPL_GENERIC_H_
 #define VEC_IMPL_GENERIC_H_
 
-#include <string.h>
-
-// -----------------------------------------------------------------
-
-// TODO implement these so we don't waste stack space by doing the
-// fallbacks
-#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		memcpy(vec.generic, in, sizeof(vec_##sign##int##bits) * size); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		memcpy(out, vec.generic, sizeof(vec_##sign##int##bits) * size); \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_generic_load_aligned, \
-		v##sign##int##bits##x##size##_generic_load_aligned, \
-		v##sign##int##bits##x##size##_generic_store_aligned, \
-		v##sign##int##bits##x##size##_generic_store_aligned, \
-	};
-
-#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \
-	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
-	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
-
-VEC_GENERIC_DEFINE_OPERATIONS(8, 2)
-VEC_GENERIC_DEFINE_OPERATIONS(16, 2)
-VEC_GENERIC_DEFINE_OPERATIONS(32, 2)
-VEC_GENERIC_DEFINE_OPERATIONS(64, 2)
-
-#undef VEC_GENERIC_DEFINE_OPERATIONS
-#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN
+#include "vec/vec.h"
 
-// -----------------------------------------------------------------
-// now we can just keep doubling the same implementation
+#define VEC_DEFINE_GENERIC_OPERATIONS_SIGN(sign, csign, bits, size) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_splat(vec_##sign##int##bits x); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load(const vec_##sign##int##bits in[size]); \
+	void v##sign##int##bits##x##size##_generic_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_not(v##sign##int##bits##x##size vec); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
 
-#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.generic[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \
-		vec.generic[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.generic[0] = v##sign##int##bits##x##halfsize##_load(in); \
-		vec.generic[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[0], out); \
-		v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[1], out + halfsize); \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_generic_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		v##sign##int##bits##x##halfsize##_store(vec.generic[0], out); \
-		v##sign##int##bits##x##halfsize##_store(vec.generic[1], out + halfsize); \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_generic_load_aligned, \
-		v##sign##int##bits##x##size##_generic_load, \
-		v##sign##int##bits##x##size##_generic_store_aligned, \
-		v##sign##int##bits##x##size##_generic_store, \
-	};
+#define VEC_DEFINE_GENERIC_OPERATIONS(bits, size) \
+	VEC_DEFINE_GENERIC_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_GENERIC_OPERATIONS_SIGN(u, U, bits, size)
 
-#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \
-	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size, halfsize) \
-	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize)
+// 16-bit
+VEC_DEFINE_GENERIC_OPERATIONS(8, 2)
 
 // 32-bit
-VEC_GENERIC_DEFINE_OPERATIONS(8, 4, 2)
+VEC_DEFINE_GENERIC_OPERATIONS(8, 4)
+VEC_DEFINE_GENERIC_OPERATIONS(16, 2)
 
 // 64-bit
-VEC_GENERIC_DEFINE_OPERATIONS(8, 8, 4)
-VEC_GENERIC_DEFINE_OPERATIONS(16, 4, 2)
+VEC_DEFINE_GENERIC_OPERATIONS(8, 8)
+VEC_DEFINE_GENERIC_OPERATIONS(16, 4)
+VEC_DEFINE_GENERIC_OPERATIONS(32, 2)
 
 // 128-bit
-VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8)
-VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4)
-VEC_GENERIC_DEFINE_OPERATIONS(32, 4, 2)
+VEC_DEFINE_GENERIC_OPERATIONS(8, 16)
+VEC_DEFINE_GENERIC_OPERATIONS(16, 8)
+VEC_DEFINE_GENERIC_OPERATIONS(32, 4)
+VEC_DEFINE_GENERIC_OPERATIONS(64, 2)
 
 // 256-bit
-VEC_GENERIC_DEFINE_OPERATIONS(8, 32, 16)
-VEC_GENERIC_DEFINE_OPERATIONS(16, 16, 8)
-VEC_GENERIC_DEFINE_OPERATIONS(32, 8, 4)
-VEC_GENERIC_DEFINE_OPERATIONS(64, 4, 2)
+VEC_DEFINE_GENERIC_OPERATIONS(8, 32)
+VEC_DEFINE_GENERIC_OPERATIONS(16, 16)
+VEC_DEFINE_GENERIC_OPERATIONS(32, 8)
+VEC_DEFINE_GENERIC_OPERATIONS(64, 4)
 
 // 512-bit
-VEC_GENERIC_DEFINE_OPERATIONS(8, 64, 32)
-VEC_GENERIC_DEFINE_OPERATIONS(16, 32, 16)
-VEC_GENERIC_DEFINE_OPERATIONS(32, 16, 8)
-VEC_GENERIC_DEFINE_OPERATIONS(64, 8, 4)
+VEC_DEFINE_GENERIC_OPERATIONS(8, 64)
+VEC_DEFINE_GENERIC_OPERATIONS(16, 32)
+VEC_DEFINE_GENERIC_OPERATIONS(32, 16)
+VEC_DEFINE_GENERIC_OPERATIONS(64, 8)
+
+#undef VEC_DEFINE_GENERIC_OPERATIONS
+#undef VEC_DEFINE_GENERIC_OPERATIONS_SIGN
+
+// 16-bit
+extern const vint8x2_impl    vint8x2_impl_generic;
+extern const vuint8x2_impl   vuint8x2_impl_generic;
+
+// 32-bit
+extern const vint8x4_impl    vint8x4_impl_generic;
+extern const vuint8x4_impl   vuint8x4_impl_generic;
+extern const vint16x2_impl   vint16x2_impl_generic;
+extern const vuint16x2_impl  vuint16x2_impl_generic;
+
+// 64-bit
+extern const vint8x8_impl    vint8x8_impl_generic;
+extern const vuint8x8_impl   vuint8x8_impl_generic;
+extern const vint16x4_impl   vint16x4_impl_generic;
+extern const vuint16x4_impl  vuint16x4_impl_generic;
+extern const vint32x2_impl   vint32x2_impl_generic;
+extern const vuint32x2_impl  vuint32x2_impl_generic;
 
-#undef VEC_GENERIC_DEFINE_OPERATIONS
-#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN
+// 128-bit
+extern const vint8x16_impl   vint8x16_impl_generic;
+extern const vuint8x16_impl  vuint8x16_impl_generic;
+extern const vint16x8_impl   vint16x8_impl_generic;
+extern const vuint16x8_impl  vuint16x8_impl_generic;
+extern const vint32x4_impl   vint32x4_impl_generic;
+extern const vuint32x4_impl  vuint32x4_impl_generic;
+extern const vint64x2_impl   vint64x2_impl_generic;
+extern const vuint64x2_impl  vuint64x2_impl_generic;
+
+// 256-bit
+extern const vint8x32_impl   vint8x32_impl_generic;
+extern const vuint8x32_impl  vuint8x32_impl_generic;
+extern const vint16x16_impl  vint16x16_impl_generic;
+extern const vuint16x16_impl vuint16x16_impl_generic;
+extern const vint32x8_impl   vint32x8_impl_generic;
+extern const vuint32x8_impl  vuint32x8_impl_generic;
+extern const vint64x4_impl   vint64x4_impl_generic;
+extern const vuint64x4_impl  vuint64x4_impl_generic;
+
+// 512-bit
+extern const vint8x64_impl   vint8x64_impl_generic;
+extern const vuint8x64_impl  vuint8x64_impl_generic;
+extern const vint16x32_impl  vint16x32_impl_generic;
+extern const vuint16x32_impl vuint16x32_impl_generic;
+extern const vint32x16_impl  vint32x16_impl_generic;
+extern const vuint32x16_impl vuint32x16_impl_generic;
+extern const vint64x8_impl   vint64x8_impl_generic;
+extern const vuint64x8_impl  vuint64x8_impl_generic;
 
 #endif /* VEC_IMPL_GENERIC_H_ */
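Likewise, the generic definitions now live in src/impl/generic.c and the per-type impl tables are exported as constants. A sketch of one scalar definition, assuming the .generic member stays a plain element array for this type as it was before:

	vint32x2 vint32x2_generic_add(vint32x2 vec1, vint32x2 vec2)
	{
		for (int i = 0; i < 2; i++)
			vec1.generic[i] += vec2.generic[i];
		return vec1;
	}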
--- a/include/vec/impl/integer.h.in	Sat Nov 23 04:09:44 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,58 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in plain C99
- * 
- * Copyright (c) 2024 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-#ifndef VEC_IMPL_INTEGER_H_
-#define VEC_IMPL_INTEGER_H_
-
-#cmakedefine HAVE_SYS_TYPES_H
-#cmakedefine HAVE_STDDEF_H
-#cmakedefine HAVE_STDINT_H
-
-#ifdef HAVE_SYS_TYPES_H
-# include <sys/types.h>
-#endif
-#ifdef HAVE_STDDEF_H
-# include <stddef.h>
-#endif
-#ifdef HAVE_STDINT_H
-# include <stdint.h>
-#endif
-
-typedef signed char   vec_int8;
-typedef @SIZE16@      vec_int16;
-typedef @SIZE32@      vec_int32;
-typedef @SIZE64@      vec_int64;
-
-typedef unsigned char vec_uint8;
-typedef @USIZE16@     vec_uint16;
-typedef @USIZE32@     vec_uint32;
-typedef @USIZE64@     vec_uint64;
-
-/* this is only used for bitshifting right now */
-typedef vec_int64     vec_intmax;
-typedef vec_uint64    vec_uintmax;
-
-typedef @USIZEPTR@    vec_uintptr;
-
-#endif /* VEC_IMPL_INTEGER_H_ */
\ No newline at end of file
--- a/include/vec/impl/ppc/altivec.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/ppc/altivec.h	Sun Nov 24 02:52:40 2024 -0500
@@ -27,228 +27,13 @@
 #ifndef VEC_IMPL_PPC_ALTIVEC_H_
 #define VEC_IMPL_PPC_ALTIVEC_H_
 
-#include <altivec.h>
-
-/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */
-#ifdef vec_mul
-# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_mul(vec1.altivec, vec2.altivec); \
-		return vec; \
-	}
-# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \
-	v##sign##int##bits##x##size##_altivec_mul
-#else
-# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size)
-# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) NULL
-#endif
-
-#ifdef vec_splats
-# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(vec_##sign##int##bits x) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_splats(x); \
-		return vec; \
-	}
-# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \
-	v##sign##int##bits##x##size##_altivec_splat
-#else
-# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size)
-# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) NULL
-#endif
-
-#define VEC_ALTIVEC_uRSHIFT vec_sr
-#define VEC_ALTIVEC_RSHIFT vec_sra
-
-#define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_sr(vec1.altivec, vec2.altivec); \
-		return vec; \
-	}
-#define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \
-	v##sign##int##bits##x##size##_altivec_lrshift
-
-#define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size)
-#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) NULL
-
-#define VEC_ALTIVEC_CAST_BOOL_8 (vector signed char)
-#define VEC_ALTIVEC_CAST_BOOL_U8 (vector unsigned char)
-#define VEC_ALTIVEC_CAST_BOOL_16 (vector signed short)
-#define VEC_ALTIVEC_CAST_BOOL_U16 (vector unsigned short)
-#define VEC_ALTIVEC_CAST_BOOL_32 (vector signed int)
-#define VEC_ALTIVEC_CAST_BOOL_U32 (vector unsigned int)
+#include "vec/vec.h"
 
-/* Since altivec conveniently made their API super user friendly, we can just use
- * one giant macro to define literally everything */
-#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_ld(0, in); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		vec_st(vec.altivec, 0, out); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_add(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_sub(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_sl(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_avg(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_and(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_or(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = vec_xor(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits  vec_cmplt(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmplt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpeq(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmpgt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpgt(vec1.altivec, vec2.altivec); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \
-		VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size), \
-		v##sign##int##bits##x##size##_altivec_load_aligned, \
-		v##sign##int##bits##x##size##_altivec_load, \
-		v##sign##int##bits##x##size##_altivec_store_aligned, \
-		/* .store = */ NULL, \
-		v##sign##int##bits##x##size##_altivec_add, \
-		v##sign##int##bits##x##size##_altivec_sub, \
-		VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size), \
-		/* .div = */ NULL, \
-		v##sign##int##bits##x##size##_altivec_avg, \
-		v##sign##int##bits##x##size##_altivec_and, \
-		v##sign##int##bits##x##size##_altivec_or, \
-		v##sign##int##bits##x##size##_altivec_xor, \
-		/* .not = */ NULL, \
-		v##sign##int##bits##x##size##_altivec_lshift, \
-		v##sign##int##bits##x##size##_altivec_rshift, \
-		VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size), \
-		v##sign##int##bits##x##size##_altivec_cmplt, \
-		v##sign##int##bits##x##size##_altivec_cmple, \
-		v##sign##int##bits##x##size##_altivec_cmpeq, \
-		v##sign##int##bits##x##size##_altivec_cmpge, \
-		v##sign##int##bits##x##size##_altivec_cmpgt, \
-	};
-
-#define VEC_DEFINE_OPERATIONS(bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
-
-VEC_DEFINE_OPERATIONS(8, 16)
-VEC_DEFINE_OPERATIONS(16, 8)
-VEC_DEFINE_OPERATIONS(32, 4)
-//#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-//VEC_DEFINE_OPERATIONS(64, 2)
-//#endif
-
-#undef VEC_DEFINE_OPERATIONS
-#undef VEC_DEFINE_OPERATIONS_SIGN
-#undef VEC_ALTIVEC_DEFINE_MUL
-#undef VEC_ALTIVEC_STRUCT_MUL
-#undef VEC_ALTIVEC_DEFINE_LRSHIFT
-#undef VEC_ALTIVEC_STRUCT_LRSHIFT
-#undef VEC_ALTIVEC_DEFINE_uLRSHIFT
-#undef VEC_ALTIVEC_STRUCT_uLRSHIFT
-#undef VEC_ALTIVEC_DEFINE_SPLAT
-#undef VEC_ALTIVEC_STRUCT_SPLAT
-#undef VEC_ALTIVEC_uRSHIFT
-#undef VEC_ALTIVEC_RSHIFT
+extern const vint8x16_impl  vint8x16_impl_altivec;
+extern const vint16x8_impl  vint16x8_impl_altivec;
+extern const vint32x4_impl  vint32x4_impl_altivec;
+extern const vuint8x16_impl vuint8x16_impl_altivec;
+extern const vuint16x8_impl vuint16x8_impl_altivec;
+extern const vuint32x4_impl vuint32x4_impl_altivec;
 
 #endif /* VEC_IMPL_PPC_ALTIVEC_H_ */
--- a/include/vec/impl/x86/avx2.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/x86/avx2.h	Sun Nov 24 02:52:40 2024 -0500
@@ -25,272 +25,15 @@
 #ifndef VEC_IMPL_X86_AVX2_H_
 #define VEC_IMPL_X86_AVX2_H_
 
-#define VEC_AVX2_OPERATION_8x32_16x16(op, sign) \
-	do { \
-		/* unpack and multiply */ \
-		__m256i dst_even = _mm256_##op##_epi16(vec1.avx2, vec2.avx2); \
-		__m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1.avx2, 8), _mm256_srli_epi16(vec2.avx2, 8)); \
-	\
-		/* repack */ \
-		v##sign##int8x32 vec; \
-		vec.avx2 = _mm256_or_si256( \
-			_mm256_slli_epi16(dst_odd, 8), \
-			_mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \
-		); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX2_OPERATION_8x32_32x8(op, sign) \
-	do { \
-		/* unpack */ \
-		__m256i dst_1 = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \
-		__m256i dst_2 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 8), _mm256_srli_epi32(vec2.avx2, 8)); \
-		__m256i dst_3 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \
-		__m256i dst_4 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 24), _mm256_srli_epi32(vec2.avx2, 24)); \
-	\
-		/* repack */ \
-		v##sign##int8x32 vec; \
-		vec.avx2 = _mm256_or_si256( \
-			_mm256_or_si256( \
-				_mm256_slli_epi32(dst_4, 8), \
-				_mm256_srli_epi32(_mm256_slli_epi32(dst_3, 8), 8) \
-			), \
-			_mm256_or_si256( \
-				_mm256_slli_epi32(_mm256_slli_epi32(dst_2, 8), 16), \
-				_mm256_srli_epi32(_mm256_slli_epi32(dst_1, 8), 24) \
-			) \
-		); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX2_OPERATION_16x16(op, sign) \
-	do { \
-		/* unpack and operate */ \
-		__m256i dst_even = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \
-		__m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \
-	\
-		/* repack */ \
-		v##sign##int16x16 vec; \
-		vec.avx2 = _mm256_or_si256( \
-			_mm256_slli_epi32(dst_odd, 16), \
-			_mm256_srli_epi32(_mm256_slli_epi16(dst_even, 16), 16) \
-		); \
-		return vec; \
-	} while (0)
-
-// shifting
-
-#define VEC_AVX2_LSHIFT_8x32(sign) \
-	VEC_AVX2_OPERATION_8x32_32x8(sllv, sign)
-
-#define VEC_AVX2_LSHIFT_16x16(sign) \
-	VEC_AVX2_OPERATION_16x16(sllv, sign)
-
-#define VEC_AVX2_LSHIFT_32x8(sign) \
-	do { \
-		v##sign##int32x8 vec; \
-		vec.avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX2_LSHIFT_64x4(sign) \
-	do { \
-		v##sign##int64x4 vec; \
-		vec.avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX2_RSHIFT_8x32(sign, aORl) \
-	VEC_AVX2_OPERATION_8x32_32x8(sr##aORl##v, sign)
-
-#define VEC_AVX2_RSHIFT_16x16(sign, aORl) \
-	VEC_AVX2_OPERATION_16x16(sr##aORl##v, sign)
-
-#define VEC_AVX2_RSHIFT_32x8(sign, aORl) \
-	do { \
-		v##sign##int32x8 vec; \
-		vec.avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX2_aRSHIFT_64x4(sign) \
-	do { \
-		return v##sign##int64x4_fallback_rshift(vec1, vec2); \
-	} while (0)
-
-#define VEC_AVX2_lRSHIFT_64x4(sign) \
-	do { \
-		v##sign##int64x4 vec; \
-		vec.avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX2_RSHIFT_64x4(sign, aORl) \
-	VEC_AVX2_##aORl##RSHIFT_64x4(sign)
-
-// multiplication
-
-#define VEC_AVX2_MUL_8x32(sign) \
-	VEC_AVX2_OPERATION_8x32_16x16(mullo, sign)
-
-#define VEC_AVX2_MUL_16x16(sign) \
-	do { \
-		v##sign##int16x16 vec; \
-		vec.avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX2_MUL_32x8(sign) \
-	do { \
-		v##sign##int32x8 vec; \
-		vec.avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} while (0)
+#include "vec/vec.h"
 
-#define VEC_AVX2_MUL_64x4(sign) \
-	do { \
-		__m256i ac = _mm256_mul_epu32(vec1.avx2, vec2.avx2); \
-		__m256i b  = _mm256_srli_epi64(vec1.avx2, 32); \
-		__m256i bc = _mm256_mul_epu32(b, vec2.avx2); \
-		__m256i d  = _mm256_srli_epi64(vec2.avx2, 32); \
-		__m256i ad = _mm256_mul_epu32(vec1.avx2, d); \
-		__m256i hi = _mm256_add_epi64(bc, ad); \
-		hi = _mm256_slli_epi64(hi, 32); \
-	\
-		v##sign##int64x4 vec; \
-		vec.avx2 = _mm256_add_epi64(hi, ac); \
-		return vec; \
-	} while (0)
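The 64x4 multiply above compensates for AVX2 having no full 64-bit multiply: with each lane split as vec1 = hi1*2^32 + lo1 and vec2 = hi2*2^32 + lo2, the low 64 bits of the product are lo1*lo2 + ((lo1*hi2 + hi1*lo2) << 32), which is exactly what the three _mm256_mul_epu32 calls plus the final shift and add compute.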
-
-// operations
-
-#define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx2 = _mm256_load_si256((const __m256i *)in); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx2 = _mm256_loadu_si256((const __m256i *)in); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		_mm256_store_si256((__m256i *)out, vec.avx2); \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		_mm256_storeu_si256((__m256i *)out, vec.avx2); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_AVX2_MUL_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_AVX2_LSHIFT_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_AVX2_RSHIFT_##bits##x##size(sign, a); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_AVX2_RSHIFT_##bits##x##size(sign, l); \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_avx2_load_aligned, \
-		v##sign##int##bits##x##size##_avx2_load, \
-		v##sign##int##bits##x##size##_avx2_store_aligned, \
-		v##sign##int##bits##x##size##_avx2_store, \
-		v##sign##int##bits##x##size##_avx2_add, \
-		v##sign##int##bits##x##size##_avx2_sub, \
-		v##sign##int##bits##x##size##_avx2_mul, \
-		/* .div = */ NULL, \
-		/* .avg = */ NULL, \
-		v##sign##int##bits##x##size##_avx2_and, \
-		v##sign##int##bits##x##size##_avx2_or, \
-		v##sign##int##bits##x##size##_avx2_xor, \
-		/* .not = */ NULL, \
-		v##sign##int##bits##x##size##_avx2_lshift, \
-		v##sign##int##bits##x##size##_avx2_rshift, \
-		v##sign##int##bits##x##size##_avx2_lrshift, \
-	};
-
-#define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \
-	VEC_AVX2_DEFINE_OPERATIONS_SIGN( , bits, size) \
-	VEC_AVX2_DEFINE_OPERATIONS_SIGN(u, bits, size)
-
-VEC_AVX2_DEFINE_OPERATIONS(8, 32)
-VEC_AVX2_DEFINE_OPERATIONS(16, 16)
-VEC_AVX2_DEFINE_OPERATIONS(32, 8)
-VEC_AVX2_DEFINE_OPERATIONS(64, 4)
-
-#undef VEC_AVX2_DEFINE_OPERATIONS
-#undef VEC_AVX2_DEFINE_OPERATIONS_SIGN
-#undef VEC_AVX2_MUL_8x32
-#undef VEC_AVX2_MUL_16x16
-#undef VEC_AVX2_MUL_32x8
-#undef VEC_AVX2_MUL_64x4
-#undef VEC_AVX2_OPERATION_8x32_16x16
-#undef VEC_AVX2_OPERATION_8x32_32x8
-#undef VEC_AVX2_OPERATION_16x16
-#undef VEC_AVX2_LSHIFT_8x32
-#undef VEC_AVX2_LSHIFT_16x16
-#undef VEC_AVX2_LSHIFT_32x8
-#undef VEC_AVX2_LSHIFT_64x4
-#undef VEC_AVX2_RSHIFT_8x32
-#undef VEC_AVX2_RSHIFT_16x16
-#undef VEC_AVX2_RSHIFT_32x8
-#undef VEC_AVX2_aRSHIFT_64x4
-#undef VEC_AVX2_lRSHIFT_64x4
-#undef VEC_AVX2_RSHIFT_64x4
+extern const vint8x32_impl   vint8x32_impl_avx2;
+extern const vint16x16_impl  vint16x16_impl_avx2;
+extern const vint32x8_impl   vint32x8_impl_avx2;
+extern const vint64x4_impl   vint64x4_impl_avx2;
+extern const vuint8x32_impl  vuint8x32_impl_avx2;
+extern const vuint16x16_impl vuint16x16_impl_avx2;
+extern const vuint32x8_impl  vuint32x8_impl_avx2;
+extern const vuint64x4_impl  vuint64x4_impl_avx2;
 
 #endif /* VEC_IMPL_X86_AVX2_H_ */
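These exported tables are presumably what the runtime dispatcher (now in src/vec.c) selects between, keyed off the feature bits gathered by the CPU detection code above. A hypothetical sketch, assuming a VEC_COMPILER_HAS_AVX2 guard analogous to VEC_COMPILER_HAS_ALTIVEC:

	const vuint32x8_impl *impl = &vuint32x8_impl_generic;

	if (vec_CPU_features == VEC_CPU_FEATURES_RESET)
		vec_get_CPU_features();

	#ifdef VEC_COMPILER_HAS_AVX2
	if (vec_CPU_features & VEC_CPU_HAS_AVX2)
		impl = &vuint32x8_impl_avx2;
	#endif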
--- a/include/vec/impl/x86/avx512f.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/x86/avx512f.h	Sun Nov 24 02:52:40 2024 -0500
@@ -25,272 +25,15 @@
 #ifndef VEC_IMPL_X86_AVX512F_H_
 #define VEC_IMPL_X86_AVX512F_H_
 
-#define VEC_AVX512F_OPERATION_8x64(op, sign) \
-	do { \
-		/* unpack and operate */ \
-		__m512i dst_1 = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \
-		__m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 8), _mm512_srli_epi32(vec2.avx512f, 8)); \
-		__m512i dst_3 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \
-		__m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 24), _mm512_srli_epi32(vec2.avx512f, 24)); \
-	\
-		/* repack */ \
-		v##sign##int8x64 vec; \
-		vec.avx512f = _mm512_or_si512( \
-			_mm512_or_si512( \
-				_mm512_slli_epi32(dst_4, 8), \
-				_mm512_srli_epi32(_mm512_slli_epi32(dst_3, 8), 8) \
-			), \
-			_mm512_or_si512( \
-				_mm512_slli_epi32(_mm512_slli_epi32(dst_2, 8), 16), \
-				_mm512_srli_epi32(_mm512_slli_epi32(dst_1, 8), 24) \
-			) \
-		); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_OPERATION_16x32(op, sign) \
-	do { \
-		/* unpack and operate */ \
-		__m512i dst_even = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \
-		__m512i dst_odd = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \
-	\
-		/* repack */ \
-		v##sign##int16x32 vec; \
-		vec.avx512f = _mm512_or_si512( \
-			_mm512_slli_epi32(dst_odd, 16), \
-			_mm512_srli_epi32(_mm512_slli_epi32(dst_even, 16), 16) \
-		); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_ADD_8x64(sign) \
-	VEC_AVX512F_OPERATION_8x64(add, sign)
-
-#define VEC_AVX512F_ADD_16x32(sign) \
-	VEC_AVX512F_OPERATION_16x32(add, sign)
-
-#define VEC_AVX512F_ADD_32x16(sign) \
-	do { \
-		v##sign##int32x16 vec; \
-		vec.avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_ADD_64x8(sign) \
-	do { \
-		v##sign##int64x8 vec; \
-		vec.avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_SUB_8x64(sign) \
-	VEC_AVX512F_OPERATION_8x64(sub, sign)
-
-#define VEC_AVX512F_SUB_16x32(sign) \
-	VEC_AVX512F_OPERATION_16x32(sub, sign)
-
-#define VEC_AVX512F_SUB_32x16(sign) \
-	do { \
-		v##sign##int32x16 vec; \
-		vec.avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_SUB_64x8(sign) \
-	do { \
-		v##sign##int64x8 vec; \
-		vec.avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_MUL_8x64(sign) \
-	VEC_AVX512F_OPERATION_8x64(mullo, sign)
-
-#define VEC_AVX512F_MUL_16x32(sign) \
-	VEC_AVX512F_OPERATION_16x32(mullo, sign)
-
-#define VEC_AVX512F_MUL_32x16(sign) \
-	do { \
-		v##sign##int32x16 vec; \
-		vec.avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_MUL_64x8(sign) \
-	do { \
-		__m512i ac = _mm512_mul_epu32(vec1.avx512f, vec2.avx512f); \
-		__m512i b  = _mm512_srli_epi64(vec1.avx512f, 32); \
-		__m512i bc = _mm512_mul_epu32(b, vec2.avx512f); \
-		__m512i d  = _mm512_srli_epi64(vec2.avx512f, 32); \
-		__m512i ad = _mm512_mul_epu32(vec1.avx512f, d); \
-		__m512i hi = _mm512_add_epi64(bc, ad); \
-		hi = _mm512_slli_epi64(hi, 32); \
-	\
-		v##sign##int64x8 vec; \
-		vec.avx512f = _mm512_add_epi64(hi, ac); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_LSHIFT_8x64(sign) \
-	VEC_AVX512F_OPERATION_8x64(sllv, sign)
-
-#define VEC_AVX512F_LSHIFT_16x32(sign) \
-	VEC_AVX512F_OPERATION_16x32(sllv, sign)
-
-#define VEC_AVX512F_LSHIFT_32x16(sign) \
-	do { \
-		v##sign##int32x16 vec; \
-		vec.avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_LSHIFT_64x8(sign) \
-	do { \
-		v##sign##int64x8 vec; \
-		vec.avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) \
-	VEC_AVX512F_OPERATION_8x64(sr##aORl##v, sign)
-
-#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) \
-	VEC_AVX512F_OPERATION_16x32(sr##aORl##v, sign)
+#include "vec/vec.h"
 
-#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \
-	do { \
-		v##sign##int32x16 vec; \
-		vec.avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \
-	do { \
-		v##sign##int64x8 vec; \
-		vec.avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} while (0)
-
-#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx512f = _mm512_load_si512((const __m512i *)in); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx512f = _mm512_loadu_si512((const __m512i *)in); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		_mm512_store_si512((__m512i *)out, vec.avx512f); \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		_mm512_storeu_si512((__m512i *)out, vec.avx512f); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_AVX512F_ADD_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_AVX512F_SUB_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_AVX512F_MUL_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_AVX512F_LSHIFT_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_AVX512F_RSHIFT_##bits##x##size(sign, a); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_avx512f_load_aligned, \
-		v##sign##int##bits##x##size##_avx512f_load, \
-		v##sign##int##bits##x##size##_avx512f_store_aligned, \
-		v##sign##int##bits##x##size##_avx512f_store, \
-		v##sign##int##bits##x##size##_avx512f_add, \
-		v##sign##int##bits##x##size##_avx512f_sub, \
-		v##sign##int##bits##x##size##_avx512f_mul, \
-		/* .div = */ NULL, \
-		/* .avg = */ NULL, \
-		v##sign##int##bits##x##size##_avx512f_and, \
-		v##sign##int##bits##x##size##_avx512f_or, \
-		v##sign##int##bits##x##size##_avx512f_xor, \
-		/* .not = */ NULL, \
-		v##sign##int##bits##x##size##_avx512f_lshift, \
-		v##sign##int##bits##x##size##_avx512f_rshift, \
-		v##sign##int##bits##x##size##_avx512f_lrshift, \
-	};
-
-#define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \
-	VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size) \
-	VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size)
-
-VEC_AVX512F_DEFINE_OPERATIONS(8, 64)
-VEC_AVX512F_DEFINE_OPERATIONS(16, 32)
-VEC_AVX512F_DEFINE_OPERATIONS(32, 16)
-VEC_AVX512F_DEFINE_OPERATIONS(64, 8)
-
-#undef VEC_AVX512F_DEFINE_OPERATIONS
-#undef VEC_AVX512F_DEFINE_OPERATIONS_SIGN
-#undef VEC_AVX512F_MUL_8x64
-#undef VEC_AVX512F_MUL_16x32
-#undef VEC_AVX512F_MUL_32x16
-#undef VEC_AVX512F_MUL_64x8
-
-#undef VEC_AVX512F_LSHIFT_8x64
-#undef VEC_AVX512F_LSHIFT_16x32
-#undef VEC_AVX512F_LSHIFT_32x16
-#undef VEC_AVX512F_LSHIFT_64x8
-
-#undef VEC_AVX512F_RSHIFT_8x64
-#undef VEC_AVX512F_RSHIFT_16x32
-#undef VEC_AVX512F_RSHIFT_32x16
-#undef VEC_AVX512F_RSHIFT_64x8
+extern const vint8x64_impl   vint8x64_impl_avx512f;
+extern const vint16x32_impl  vint16x32_impl_avx512f;
+extern const vint32x16_impl  vint32x16_impl_avx512f;
+extern const vint64x8_impl   vint64x8_impl_avx512f;
+extern const vuint8x64_impl  vuint8x64_impl_avx512f;
+extern const vuint16x32_impl vuint16x32_impl_avx512f;
+extern const vuint32x16_impl vuint32x16_impl_avx512f;
+extern const vuint64x8_impl  vuint64x8_impl_avx512f;
 
 #endif /* VEC_IMPL_X86_AVX512F_H_ */
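The AVX-512F operations now live in extern const tables defined in src/impl/x86/avx512f.c instead of header macros. A minimal sketch of what one such definition presumably looks like, mirroring the removed macros above; the helper name and the bytes[]-based marshalling are assumptions, only the table name comes from this header:

#include <immintrin.h>
#include "vec/vec.h"

/* sketch: unaligned AVX-512 load, handed back through the opaque bytes[]
 * storage of the new vector structs (assumed layout) */
static vint32x16 vint32x16_avx512f_load(const vec_int32 in[16])
{
	vint32x16 vec;
	__m512i x = _mm512_loadu_si512((const void *)in);
	_mm512_storeu_si512(vec.bytes, x);
	return vec;
}

/* the table declared above; slots this sketch doesn't spell out stay
 * NULL/zero-initialized, as .splat/.div/.avg/.not did in the old macros */
const vint32x16_impl vint32x16_impl_avx512f = {
	/* .splat = */ NULL,
	/* .load_aligned = */ NULL,
	/* .load = */ vint32x16_avx512f_load,
	/* ... */
};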
--- a/include/vec/impl/x86/mmx.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/x86/mmx.h	Sun Nov 24 02:52:40 2024 -0500
@@ -25,190 +25,13 @@
 #ifndef VEC_IMPL_X86_MMX_H_
 #define VEC_IMPL_X86_MMX_H_
 
-#define VEC_MMX_OPERATION_8x8(op, sign) \
-	do { \
-		/* unpack and multiply */ \
-		__m64 dst_even = _mm_##op##_pi16(vec1.mmx, vec2.mmx); \
-		__m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1.mmx, 8), _mm_srli_pi16(vec2.mmx, 8)); \
-	\
-		/* repack */ \
-		v##sign##int8x8 vec; \
-		vec.mmx = _mm_or_si64( \
-			_mm_slli_pi16(dst_odd, 8), \
-			_mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \
-		); \
-		return vec; \
-	} while (0)
-
-// shifting
-#define VEC_MMX_LSHIFT_8x8(sign) \
-	VEC_MMX_OPERATION_8x8(sll, sign)
-
-#define VEC_MMX_LSHIFT_16x4(sign) \
-	do { \
-		v##sign##int16x4 vec; \
-		vec.mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} while (0)
-
-#define VEC_MMX_LSHIFT_32x2(sign) \
-	do { \
-		v##sign##int32x2 vec; \
-		vec.mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} while (0)
-
-#define VEC_MMX_RSHIFT_8x8(sign, aORl) \
-	VEC_MMX_OPERATION_8x8(sr##aORl, sign)
-
-#define VEC_MMX_RSHIFT_16x4(sign, aORl) \
-	do { \
-		v##sign##int16x4 vec; \
-		vec.mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} while (0)
-
-#define VEC_MMX_RSHIFT_32x2(sign, aORl) \
-	do { \
-		v##sign##int32x2 vec; \
-		vec.mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} while (0)
-
-// shared between MMX variations
-#define VEC_MMX_MUL_8x8(sign) \
-	VEC_MMX_OPERATION_8x8(mullo, sign)
-
-#define VEC_MMX_MUL_16x4(sign) \
-	do { \
-		/* we have a real instruction for this */ \
-		v##sign##int16x4 vec; \
-		vec.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} while (0)
-
-#define VEC_MMX_MUL_32x2(sign) \
-	do { \
-		__m64 ac = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \
-		__m64 b  = _mm_srli_pi32(vec1.mmx, 16); \
-		__m64 bc = _mm_mullo_pi16(b, vec2.mmx); \
-		__m64 d  = _mm_srli_pi32(vec2.mmx, 16); \
-		__m64 ad = _mm_mullo_pi16(vec1.mmx, d); \
-		__m64 hi = _mm_add_pi32(bc, ad); \
-		hi = _mm_slli_pi32(hi, 16); \
-	\
-		v##sign##int32x2 vec; \
-		vec.mmx = _mm_add_pi32(hi, ac); \
-		return vec; \
-	} while (0)
+#include "vec/vec.h"
 
-#define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		memcpy(&vec.mmx, in, sizeof(vec.mmx)); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		memcpy(out, &vec.mmx, sizeof(vec.mmx)); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_MMX_MUL_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.mmx = _mm_and_si64(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.mmx = _mm_or_si64(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.mmx = _mm_xor_si64(vec1.mmx, vec2.mmx); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_MMX_LSHIFT_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_MMX_RSHIFT_##bits##x##size(sign, a); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_MMX_RSHIFT_##bits##x##size(sign, l); \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_mmx_load_aligned, \
-		v##sign##int##bits##x##size##_mmx_load_aligned, \
-		v##sign##int##bits##x##size##_mmx_store_aligned, \
-		v##sign##int##bits##x##size##_mmx_store_aligned, \
-		v##sign##int##bits##x##size##_mmx_add, \
-		v##sign##int##bits##x##size##_mmx_sub, \
-		v##sign##int##bits##x##size##_mmx_mul, \
-		/* .div = */ NULL, \
-		/* .avg = */ NULL, \
-		v##sign##int##bits##x##size##_mmx_and, \
-		v##sign##int##bits##x##size##_mmx_or, \
-		v##sign##int##bits##x##size##_mmx_xor, \
-		/* .not = */ NULL, \
-		v##sign##int##bits##x##size##_mmx_lshift, \
-		v##sign##int##bits##x##size##_mmx_rshift, \
-		v##sign##int##bits##x##size##_mmx_lrshift, \
-	};
-
-#define VEC_MMX_DEFINE_OPERATIONS(bits, size) \
-	VEC_MMX_DEFINE_OPERATIONS_SIGN( , bits, size) \
-	VEC_MMX_DEFINE_OPERATIONS_SIGN(u, bits, size)
-
-VEC_MMX_DEFINE_OPERATIONS(8, 8)
-VEC_MMX_DEFINE_OPERATIONS(16, 4)
-VEC_MMX_DEFINE_OPERATIONS(32, 2)
-
-#undef VEC_MMX_DEFINE_OPERATIONS
-#undef VEC_MMX_DEFINE_OPERATIONS_SIGN
-#undef VEC_MMX_MUL_8x8
-#undef VEC_MMX_MUL_16x4
-#undef VEC_MMX_MUL_32x2
-#undef VEC_MMX_OPERATION_8x8
-#undef VEC_MMX_LSHIFT_8x8
-#undef VEC_MMX_LSHIFT_16x4
-#undef VEC_MMX_LSHIFT_32x2
-#undef VEC_MMX_RSHIFT_8x8
-#undef VEC_MMX_RSHIFT_16x4
-#undef VEC_MMX_RSHIFT_32x2
+extern const vint8x8_impl vint8x8_impl_mmx;
+extern const vint16x4_impl vint16x4_impl_mmx;
+extern const vint32x2_impl vint32x2_impl_mmx;
+extern const vuint8x8_impl vuint8x8_impl_mmx;
+extern const vuint16x4_impl vuint16x4_impl_mmx;
+extern const vuint32x2_impl vuint32x2_impl_mmx;
 
 #endif /* VEC_IMPL_X86_MMX_H_ */
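With every implementation exposed only as an extern const table like these, runtime CPU detection just has to pick which table a type uses. A purely hypothetical sketch of that idea; the generic table name and the flag argument are illustrative, not the actual src/vec.c or src/cpu.c interface:

#include "vec/impl/x86/mmx.h"

/* hypothetical: fall back to a generic table when MMX isn't available */
extern const vuint32x2_impl vuint32x2_impl_generic;

static const vuint32x2_impl *vuint32x2_active;

static void vuint32x2_pick_impl(int cpu_has_mmx)
{
	vuint32x2_active = cpu_has_mmx ? &vuint32x2_impl_mmx
	                               : &vuint32x2_impl_generic;
}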
--- a/include/vec/impl/x86/sse2.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/x86/sse2.h	Sun Nov 24 02:52:40 2024 -0500
@@ -25,290 +25,42 @@
 #ifndef VEC_IMPL_X86_SSE2_H_
 #define VEC_IMPL_X86_SSE2_H_
 
-#define VEC_SSE2_OPERATION_8x16(op, sign) \
-	do { \
-		/* unpack and multiply */ \
-		__m128i dst_even = _mm_##op##_epi16(vec1.sse, vec2.sse); \
-		__m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \
-	\
-		/* repack */ \
-		v##sign##int8x16 vec; \
-		vec.sse = _mm_or_si128( \
-			_mm_slli_epi16(dst_odd, 8), \
-			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
-		); \
-		return vec; \
-	} while (0)
-
-// shifting
-#define VEC_SSE2_LSHIFT_8x16(sign) \
-	VEC_SSE2_OPERATION_8x16(sll, sign)
-
-#define VEC_SSE2_LSHIFT_16x8(sign) \
-	do { \
-		v##sign##int16x8 vec; \
-		vec.sse = _mm_sll_epi16(vec1.sse, vec2.sse); \
-		return vec; \
-	} while (0)
-
-#define VEC_SSE2_LSHIFT_32x4(sign) \
-	do { \
-		v##sign##int32x4 vec; \
-		vec.sse = _mm_sll_epi32(vec1.sse, vec2.sse); \
-		return vec; \
-	} while (0)
-
-#define VEC_SSE2_LSHIFT_64x2(sign) \
-	do { \
-		v##sign##int64x2 vec; \
-		vec.sse = _mm_sll_epi64(vec1.sse, vec2.sse); \
-		return vec; \
-	} while (0)
-
-#define VEC_SSE2_RSHIFT_8x16(sign, aORl) \
-	VEC_SSE2_OPERATION_8x16(sr##aORl, sign)
-
-#define VEC_SSE2_RSHIFT_16x8(sign, aORl) \
-	do { \
-		v##sign##int16x8 vec; \
-		vec.sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse); \
-		return vec; \
-	} while (0)
-
-#define VEC_SSE2_RSHIFT_32x4(sign, aORl) \
-	do { \
-		v##sign##int32x4 vec; \
-		vec.sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse); \
-		return vec; \
-	} while (0)
-
-#define VEC_SSE2_aRSHIFT_64x2(sign) \
-	do { \
-		return v##sign##int64x2_fallback_rshift(vec1, vec2); \
-	} while (0)
-
-#define VEC_SSE2_lRSHIFT_64x2(sign) \
-	do { \
-		v##sign##int64x2 vec; \
-		vec.sse = _mm_srl_epi64(vec1.sse, vec2.sse); \
-		return vec; \
-	} while (0)
+#include "vec/vec.h"
 
-#define VEC_SSE2_RSHIFT_64x2(sign, aORl) \
-	VEC_SSE2_##aORl##RSHIFT_64x2(sign)
-
-// shared between SSE2 variations
-#define VEC_SSE2_MUL_8x16(sign) \
-	VEC_SSE2_OPERATION_8x16(mullo, sign)
-
-#define VEC_SSE2_MUL_16x8(sign) \
-	do { \
-		/* we have a real instruction for this */ \
-		vec1.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \
-		return vec1; \
-	} while (0)
-
-#define VEC_SSE2_MUL_32x4(sign) \
-	do { \
-		/* this was stolen from... somewhere :) */ \
-		__m128i a13    = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \
-		__m128i b13    = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \
-		__m128i prod02 = _mm_mul_epu32(vec1.sse, vec2.sse); /* (-,a2*b2,-,a0*b0) */ \
-		__m128i prod13 = _mm_mul_epu32(a13, b13);           /* (-,a3*b3,-,a1*b1) */ \
-		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
-		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
-	\
-		vec1.sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
-		return vec1; \
-	} while (0)
-
-#define VEC_SSE2_MUL_64x2(sign) \
-	do { \
-		__m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
-		__m128i b  = _mm_srli_epi64(vec1.sse, 32);      /* b = vec1 >> 32; */ \
-		__m128i bc = _mm_mul_epu32(b, vec2.sse);        /* bc = b * (vec2 & UINT32_MAX); */ \
-		__m128i d  = _mm_srli_epi64(vec2.sse, 32);      /* d = vec2 >> 32; */ \
-		__m128i ad = _mm_mul_epu32(vec1.sse, d);        /* ad = (vec1 & UINT32_MAX) * d; */ \
-		__m128i hi = _mm_add_epi64(bc, ad);             /* hi = bc + ad; */ \
-		hi = _mm_slli_epi64(hi, 32);                    /* hi <<= 32; */ \
-	\
-		vec1.sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \
-		return vec1; \
-	} while (0)
-
-#define VEC_SSE2_CMPEQ_8x16(sign) \
-	do { \
-		vec1.sse = _mm_cmpeq_epi8(vec1.sse, vec2.sse); \
-		return vec1; \
-	} while (0)
-
-#define VEC_SSE2_CMPEQ_16x8(sign) \
-	do { \
-		vec1.sse = _mm_cmpeq_epi16(vec1.sse, vec2.sse); \
-		return vec1; \
-	} while (0)
-
-#define VEC_SSE2_CMPEQ_32x4(sign) \
-	do { \
-		vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \
-		return vec1; \
-	} while (0)
-
-// SSE2 doesn't have an intrinsic for 64x2 equality comparison,
-// so how can we take a 32x4 comparison result and turn it into
-// a 64x2 comparison result?
-//
-// well, Intel conveniently provided an operation where we can
-// shuffle around 32-bit integers (_mm_shuffle_epi32).
-//
-// this means all we have to do is simply do the 32-bit operation,
-// shuffle the parts, and then return a bitwise AND of the result.
+// These are extern only because the SSE 4.1 translation unit needs to access them.
+#define VEC_DEFINE_SSE2_OPERATIONS_SIGN(sign, csign, bits, size) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_splat(vec_##sign##int##bits x); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]); \
+	void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+	void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);
 
-#define VEC_SSE2_CMPEQ_64x2(sign) \
-	do { \
-		vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \
-		vec2.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(1, 1, 3, 3)); \
-		vec1.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(0, 0, 2, 2)); \
-		vec1.sse = _mm_and_si128(vec1.sse, vec2.sse); \
-		return vec1; \
-	} while (0)
+#define VEC_DEFINE_SSE2_OPERATIONS(bits, size) \
+	VEC_DEFINE_SSE2_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_SSE2_OPERATIONS_SIGN(u, U, bits, size)
+
+VEC_DEFINE_SSE2_OPERATIONS(8, 16)
+VEC_DEFINE_SSE2_OPERATIONS(16, 8)
+VEC_DEFINE_SSE2_OPERATIONS(32, 4)
+VEC_DEFINE_SSE2_OPERATIONS(64, 2)
 
-#define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.sse = _mm_load_si128((const __m128i *)in); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.sse = _mm_loadu_si128((const __m128i *)in); \
-		return vec; \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		_mm_store_si128((__m128i *)out, vec.sse); \
-	} \
-	\
-	static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		_mm_storeu_si128((__m128i *)out, vec.sse); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.sse = _mm_add_epi##bits(vec1.sse, vec2.sse); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.sse = _mm_sub_epi##bits(vec1.sse, vec2.sse); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_SSE2_MUL_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.sse = _mm_and_si128(vec1.sse, vec2.sse); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.sse = _mm_or_si128(vec1.sse, vec2.sse); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		v##sign##int##bits##x##size vec; \
-		vec.sse = _mm_xor_si128(vec1.sse, vec2.sse); \
-		return vec; \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_SSE2_LSHIFT_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_SSE2_RSHIFT_##bits##x##size(sign, a); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		VEC_SSE2_RSHIFT_##bits##x##size(sign, l); \
-	} \
-	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		VEC_SSE2_CMPEQ_##bits##x##size(sign); \
-	} \
-	\
-	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \
-		/* .splat = */ NULL, \
-		v##sign##int##bits##x##size##_sse2_load_aligned, \
-		v##sign##int##bits##x##size##_sse2_load, \
-		v##sign##int##bits##x##size##_sse2_store_aligned, \
-		v##sign##int##bits##x##size##_sse2_store, \
-		v##sign##int##bits##x##size##_sse2_add, \
-		v##sign##int##bits##x##size##_sse2_sub, \
-		v##sign##int##bits##x##size##_sse2_mul, \
-		/* .div = */ NULL, \
-		/* .avg = */ NULL, \
-		v##sign##int##bits##x##size##_sse2_and, \
-		v##sign##int##bits##x##size##_sse2_or, \
-		v##sign##int##bits##x##size##_sse2_xor, \
-		/* .not = */ NULL, \
-		v##sign##int##bits##x##size##_sse2_lshift, \
-		v##sign##int##bits##x##size##_sse2_rshift, \
-		v##sign##int##bits##x##size##_sse2_lrshift, \
-		/* .cmplt = */ NULL, \
-		/* .cmple = */ NULL, \
-		v##sign##int##bits##x##size##_sse2_cmpeq, \
-		/* .cmpge = */ NULL, \
-		/* .cmpgt = */ NULL, \
-	};
+#undef VEC_DEFINE_SSE2_OPERATIONS
+#undef VEC_DEFINE_SSE2_OPERATIONS_SIGN
 
-#define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \
-	VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size) \
-	VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size)
-
-// SSE is *only* 128-bit
-VEC_SSE2_DEFINE_OPERATIONS(8, 16)
-VEC_SSE2_DEFINE_OPERATIONS(16, 8)
-VEC_SSE2_DEFINE_OPERATIONS(32, 4)
-VEC_SSE2_DEFINE_OPERATIONS(64, 2)
-
-#undef VEC_SSE2_DEFINE_OPERATIONS
-#undef VEC_SSE2_DEFINE_OPERATIONS_SIGN
-#undef VEC_SSE2_MUL_8x16
-#undef VEC_SSE2_MUL_16x8
-#undef VEC_SSE2_MUL_32x4
-#undef VEC_SSE2_MUL_64x2
-#undef VEC_SSE2_OPERATION_8x16
-#undef VEC_SSE2_LSHIFT_8x16
-#undef VEC_SSE2_LSHIFT_16x8
-#undef VEC_SSE2_LSHIFT_32x4
-#undef VEC_SSE2_LSHIFT_64x2
-#undef VEC_SSE2_RSHIFT_8x16
-#undef VEC_SSE2_RSHIFT_16x8
-#undef VEC_SSE2_RSHIFT_32x4
-#undef VEC_SSE2_aRSHIFT_64x2
-#undef VEC_SSE2_lRSHIFT_64x2
-#undef VEC_SSE2_RSHIFT_64x2
+extern const vint8x16_impl vint8x16_impl_sse2;
+extern const vint16x8_impl vint16x8_impl_sse2;
+extern const vint32x4_impl vint32x4_impl_sse2;
+extern const vint64x2_impl vint64x2_impl_sse2;
+extern const vuint8x16_impl vuint8x16_impl_sse2;
+extern const vuint16x8_impl vuint16x8_impl_sse2;
+extern const vuint32x4_impl vuint32x4_impl_sse2;
+extern const vuint64x2_impl vuint64x2_impl_sse2;
 
 #endif /* VEC_IMPL_X86_SSE2_H_ */
--- a/include/vec/impl/x86/sse41.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/impl/x86/sse41.h	Sun Nov 24 02:52:40 2024 -0500
@@ -25,43 +25,9 @@
 #ifndef VEC_IMPL_X86_SSE41_H_
 #define VEC_IMPL_X86_SSE41_H_
 
-// SSE 4.1 provides a real _mm_mullo_epi32
-#define VEC_SSE41_DEFINE_OPERATIONS(sign) \
-	static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \
-	{ \
-		v##sign##int32x4 vec; \
-		vec.sse = _mm_mullo_epi32(vec1.sse, vec2.sse); \
-		return vec; \
-	} \
-	\
-	static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \
-		/* .splat = */ NULL, \
-		v##sign##int32x4_sse2_load_aligned, \
-		v##sign##int32x4_sse2_load, \
-		v##sign##int32x4_sse2_store_aligned, \
-		v##sign##int32x4_sse2_store, \
-		v##sign##int32x4_sse2_add, \
-		v##sign##int32x4_sse2_sub, \
-		v##sign##int32x4_sse41_mul, \
-		/* .div = */ NULL, \
-		/* .avg = */ NULL, \
-		v##sign##int32x4_sse2_and, \
-		v##sign##int32x4_sse2_or, \
-		v##sign##int32x4_sse2_xor, \
-		/* .not = */ NULL, \
-		v##sign##int32x4_sse2_lshift, \
-		v##sign##int32x4_sse2_rshift, \
-		v##sign##int32x4_sse2_lrshift, \
-		/* .cmplt = */ NULL, \
-		/* .cmple = */ NULL, \
-		v##sign##int32x4_sse2_cmpeq, \
-		/* .cmpge = */ NULL, \
-		/* .cmpgt = */ NULL, \
-	};
+#include "vec/vec.h"
 
-VEC_SSE41_DEFINE_OPERATIONS()
-VEC_SSE41_DEFINE_OPERATIONS(u)
-
-#undef VEC_SSE41_DEFINE_OPERATIONS
+extern const vint32x4_impl  vint32x4_impl_sse41;
+extern const vuint32x4_impl vuint32x4_impl_sse41;
 
 #endif /* VEC_IMPL_X86_SSE41_H_ */
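The SSE2 function declarations are exported above precisely so the SSE 4.1 translation unit can reuse them: judging by the removed header code, SSE 4.1 only needs to override the 32-bit multiply (it has a real _mm_mullo_epi32) and can fill every other slot with the SSE2 functions. A sketch of that override, assuming the new bytes[]-based vector layout (the real src/impl/x86/sse41.c may go through a different member):

#include <smmintrin.h> /* SSE 4.1 */
#include "vec/impl/x86/sse2.h"

static vint32x4 vint32x4_sse41_mul(vint32x4 vec1, vint32x4 vec2)
{
	vint32x4 vec;
	__m128i a = _mm_loadu_si128((const __m128i *)vec1.bytes);
	__m128i b = _mm_loadu_si128((const __m128i *)vec2.bytes);

	/* SSE 4.1 provides a real 32-bit low-part multiply */
	_mm_storeu_si128((__m128i *)vec.bytes, _mm_mullo_epi32(a, b));
	return vec;
}

/* the impl table then mixes vint32x4_sse2_* entries (declared in sse2.h
 * above) with this single override, exactly as the removed header macro did */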
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/types.h.in	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,114 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_TYPES_H_
+#define VEC_TYPES_H_
+
+#cmakedefine HAVE_SYS_TYPES_H
+#cmakedefine HAVE_STDDEF_H
+#cmakedefine HAVE_STDINT_H
+
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+
+typedef signed char   vec_int8;
+typedef @SIZE16@      vec_int16;
+typedef @SIZE32@      vec_int32;
+typedef @SIZE64@      vec_int64;
+
+typedef unsigned char vec_uint8;
+typedef @USIZE16@     vec_uint16;
+typedef @USIZE32@     vec_uint32;
+typedef @USIZE64@     vec_uint64;
+
+/* this is only used for bitshifting right now */
+typedef vec_int64     vec_intmax;
+typedef vec_uint64    vec_uintmax;
+
+typedef @USIZESIZE@   vec_uintsize;
+typedef @USIZEPTR@    vec_uintptr;
+
+// okay, now we have to do this crap.
+#ifdef HAVE_STDINT_H
+# define VEC_INT8_C(x)    INT8_C(x)
+# define VEC_UINT8_C(x)   UINT8_C(x)
+# define VEC_INT16_C(x)   INT16_C(x)
+# define VEC_UINT16_C(x)  UINT16_C(x)
+# define VEC_INT32_C(x)   INT32_C(x)
+# define VEC_UINT32_C(x)  UINT32_C(x)
+# define VEC_INT64_C(x)   INT64_C(x)
+# define VEC_UINT64_C(x)  UINT64_C(x)
+# define VEC_INTMAX_C(x)  INTMAX_C(x)
+# define VEC_UINTMAX_C(x) UINTMAX_C(x)
+
+# define VEC_INT8_MAX     INT8_MAX
+# define VEC_INT8_MIN     INT8_MIN
+# define VEC_UINT8_MAX    UINT8_MAX
+# define VEC_INT16_MAX    INT16_MAX
+# define VEC_INT16_MIN    INT16_MIN
+# define VEC_UINT16_MAX   UINT16_MAX
+# define VEC_INT32_MAX    INT32_MAX
+# define VEC_INT32_MIN    INT32_MIN
+# define VEC_UINT32_MAX   UINT32_MAX
+# define VEC_INT64_MAX    INT64_MAX
+# define VEC_INT64_MIN    INT64_MIN
+# define VEC_UINT64_MAX   UINT64_MAX
+#else
+// These are based on the minimum sizes for each integer type.
+//
+// i.e. long is guaranteed to be at least 32 bits, long long is
+// guaranteed to be at least 64 bits, etc.
+# define VEC_INT8_C(x)    x
+# define VEC_UINT8_C(x)   x##U
+# define VEC_INT16_C(x)   x
+# define VEC_UINT16_C(x)  x##U
+# define VEC_INT32_C(x)   x##L
+# define VEC_UINT32_C(x)  x##UL
+# define VEC_INT64_C(x)   x##LL
+# define VEC_UINT64_C(x)  x##ULL
+# define VEC_INTMAX_C(x)  VEC_INT64_C(x)
+# define VEC_UINTMAX_C(x) VEC_UINT64_C(x)
+
+# define VEC_INT8_MAX     0x7F
+# define VEC_INT8_MIN     (-0x7F - 1)
+# define VEC_UINT8_MAX    0xFFU
+# define VEC_INT16_MAX    0x7FFF
+# define VEC_INT16_MIN    (-0x7FFF - 1)
+# define VEC_UINT16_MAX   0xFFFFU
+# define VEC_INT32_MAX    0x7FFFFFFFL
+# define VEC_INT32_MIN    (-0x7FFFFFFFL - 1L)
+# define VEC_UINT32_MAX   0xFFFFFFFFUL
+# define VEC_INT64_MAX    0x7FFFFFFFFFFFFFFFLL
+# define VEC_INT64_MIN    (-0x7FFFFFFFFFFFFFFFLL - 1LL)
+# define VEC_UINT64_MAX   0xFFFFFFFFFFFFFFFFULL
+#endif
+
+#endif /* VEC_TYPES_H_ */
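Since types.h.in is configured by CMake, the #cmakedefine and @SIZE@ placeholders above resolve at build time. Purely for illustration, on a typical 64-bit platform with <stdint.h> the generated include/vec/types.h would presumably read roughly like this (the real values come from the build system's probes):

/* illustrative configured output only -- not part of this changeset */
#define HAVE_SYS_TYPES_H
#define HAVE_STDDEF_H
#define HAVE_STDINT_H

typedef signed char   vec_int8;
typedef int16_t       vec_int16;
typedef int32_t       vec_int32;
typedef int64_t       vec_int64;

typedef unsigned char vec_uint8;
typedef uint16_t      vec_uint16;
typedef uint32_t      vec_uint32;
typedef uint64_t      vec_uint64;

typedef size_t        vec_uintsize;
typedef uintptr_t     vec_uintptr;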
--- a/include/vec/vec.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/include/vec/vec.h	Sun Nov 24 02:52:40 2024 -0500
@@ -29,52 +29,37 @@
 extern "C" {
 #endif
 
-
-#ifdef VEC_HAVE_IMPL_INTEGER_H
-# include "impl/integer.h"
-#else
-# if __cplusplus >= (201103L)
-#  include <cstdint>
-#  include <cstddef>
-typedef std::size_t    vec_uintsize;
-
-typedef std::uint8_t   vec_uint8;
-typedef std::uint16_t  vec_uint16;
-typedef std::uint32_t  vec_uint32;
-typedef std::uint64_t  vec_uint64;
-typedef std::uintmax_t vec_uintmax;
-typedef std::uintptr_t vec_uintptr;
-
-typedef std::int8_t    vec_int8;
-typedef std::int16_t   vec_int16;
-typedef std::int32_t   vec_int32;
-typedef std::int64_t   vec_int64;
-typedef std::intmax_t  vec_intmax;
-# elif __STDC_VERSION__ >= 199901L
-#  include <stdint.h>
-#  include <stddef.h>
-typedef uint8_t   vec_uint8;
-typedef uint16_t  vec_uint16;
-typedef uint32_t  vec_uint32;
-typedef uint64_t  vec_uint64;
-typedef uintmax_t vec_uintmax;
-typedef uintptr_t vec_uintptr;
-typedef size_t    vec_uintsize;
-typedef int8_t    vec_int8;
-typedef int16_t   vec_int16;
-typedef int32_t   vec_int32;
-typedef int64_t   vec_int64;
-typedef intmax_t  vec_intmax;
-# else
-#  error Unable to find integer types with known size.
-# endif
-#endif
+// generated by the build system; differs on every platform
+#include "vec/types.h"
 
 #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
 	(((a) >= (x)) && \
 	 ((a) > x || (b) >= (y)) && \
 	 ((a) > x || (b) > (y) || (c) >= (z)))
 
+// MSVC sucks and it's a pain in the ass to find out this stuff
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000)
+# define VEC_MSVC_VERSION_MAJOR (_MSC_FULL_VER / 10000000)
+# define VEC_MSVC_VERSION_MINOR ((_MSC_FULL_VER % 10000000) / 100000)
+# define VEC_MSVC_VERSION_PATCH (_MSC_FULL_VER % 100000)
+#elif defined(_MSC_FULL_VER)
+# define VEC_MSVC_VERSION_MAJOR (_MSC_FULL_VER / 1000000)
+# define VEC_MSVC_VERSION_MINOR ((_MSC_FULL_VER % 1000000) / 10000)
+# define VEC_MSVC_VERSION_PATCH (_MSC_FULL_VER % 10000)
+#elif defined(_MSC_VER)
+# define VEC_MSVC_VERSION_MAJOR (_MSC_VER / 100)
+# define VEC_MSVC_VERSION_MINOR (_MSC_VER % 100)
+# define VEC_MSVC_VERSION_PATCH (0)
+#endif
+
+#ifdef VEC_MSVC_VERSION_MAJOR
+# define VEC_MSVC_ATLEAST(x, y, z) \
+	VEC_SEMVER_ATLEAST(VEC_MSVC_VERSION_MAJOR, VEC_MSVC_VERSION_MINOR, VEC_MSVC_VERSION_PATCH, x, y, z)
+#else
+# define VEC_MSVC_ATLEAST(x, y, z) (0)
+#endif
+
+// now we get to GNU C stuff (not necessarily GCC)
 #ifdef __GNUC__
 # define VEC_GNUC_ATLEAST(x, y, z) \
 	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)
@@ -82,13 +67,13 @@
 # define VEC_GNUC_ATLEAST(x, y, z) (0)
 #endif
 
-/* GCC/clang attributes */
 #if defined(__has_attribute)
 # define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x)
 #else
 # define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch)
 #endif
 
+// this isn't used anywhere (yet!) but still useful to have
 #if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
 # define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg)
 #elif (__STDC_VERSION__ >= 201112L)
@@ -99,332 +84,86 @@
 		[!!sizeof (struct { int __error_if_negative: (x) ? 2 : -1; })]
 #endif
 
-#ifndef VEC_ASSERT
-# ifndef VEC_DISABLE_ASSERTIONS
-#  include <assert.h>
-#  define VEC_ASSERT(x, msg) assert(msg && x)
-# else
-#  define VEC_ASSERT(x, msg)
-# endif
-#endif
-
-/* --------------------------------------------------------------- */
-/* Detect compiler SIMD support */
-
-#define VEC_NEON_ALIGNMENT    16
-#define VEC_ALTIVEC_ALIGNMENT 16
-#define VEC_SSE2_ALIGNMENT    16
-#define VEC_AVX2_ALIGNMENT    32
-#define VEC_AVX512F_ALIGNMENT 64
-
-// for the generic implementation
-#define VINT8x2_ALIGNMENT   1
-#define VUINT8x2_ALIGNMENT  1
-
-#define VINT8x4_ALIGNMENT   VINT8x2_ALIGNMENT
-#define VINT16x2_ALIGNMENT  2
-#define VUINT8x4_ALIGNMENT  VUINT8x2_ALIGNMENT
-#define VUINT16x2_ALIGNMENT 2
-
-#define VINT8x8_ALIGNMENT   VINT8x4_ALIGNMENT
-#define VINT16x4_ALIGNMENT  VINT16x2_ALIGNMENT
-#define VINT32x2_ALIGNMENT  4
-#define VUINT8x8_ALIGNMENT  VUINT8x4_ALIGNMENT
-#define VUINT16x4_ALIGNMENT VUINT16x2_ALIGNMENT
-#define VUINT32x2_ALIGNMENT 4
-
-#define VINT8x16_ALIGNMENT  VINT8x8_ALIGNMENT
-#define VINT16x8_ALIGNMENT  VINT16x4_ALIGNMENT
-#define VINT32x4_ALIGNMENT  VINT32x2_ALIGNMENT
-#define VINT64x2_ALIGNMENT  8
-#define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT
-#define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT
-#define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT
-#define VUINT64x2_ALIGNMENT 8
-
-#define VINT8x32_ALIGNMENT   VINT8x16_ALIGNMENT
-#define VINT16x16_ALIGNMENT  VINT16x8_ALIGNMENT
-#define VINT32x8_ALIGNMENT   VINT32x4_ALIGNMENT
-#define VINT64x4_ALIGNMENT   VINT64x2_ALIGNMENT
-#define VUINT8x32_ALIGNMENT  VUINT8x16_ALIGNMENT
-#define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
-#define VUINT32x8_ALIGNMENT  VUINT32x4_ALIGNMENT
-#define VUINT64x4_ALIGNMENT  VUINT64x2_ALIGNMENT
+//////////////////////////////////////////////////////////////////////////////
+// Vector type alignments
 
-#define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
-#define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
-#define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
-#define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
-#define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
-#define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
-#define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
-#define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT
-
-#ifndef VEC_SUPPRESS_HW
+// Current known alignments for each implementation, ordered by
+// architecture and instruction set:
+//
+// /---------------------------------------------------\
+// | Architecture | Instruction Set | Bits | Alignment |
+// |---------------------------------------------------|
+// | ARM          | NEON            | 64   | 8 bytes   |
+// | ARM          | NEON            | 128  | 16 bytes  |
+// | PowerPC      | AltiVec         | 128  | 16 bytes  |
+// | x86          | MMX             | 64   | None?     |
+// | x86          | SSE2            | 128  | 16 bytes  |
+// | x86          | AVX2            | 256  | 32 bytes  |
+// | x86          | AVX512-F        | 512  | 64 bytes  |
+// \---------------------------------------------------/
+//
+// If these ever have to be extended or changed, there absolutely *must*
+// be a new major release of vec, since that would change the ABI...
 
-// IIRC `__VEC__' is also defined, but I don't know for sure.
-// IBM says that `__ALTIVEC__' is standard though.
-#ifdef __ALTIVEC__
-# include <altivec.h>
-# define VEC_COMPILER_HAS_ALTIVEC
-# if defined(__POWER8__) && defined(__VSX__)
-#  define VEC_COMPILER_HAS_ALTIVEC_VSX
-# endif
-# if VINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VINT8x16_ALIGNMENT
-#  define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-# if VINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VINT16x8_ALIGNMENT
-#  define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-# if VINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VINT32x4_ALIGNMENT
-#  define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-# if VINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VINT64x2_ALIGNMENT
-#  define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-# if VUINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VUINT8x16_ALIGNMENT
-#  define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-# if VUINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VUINT16x8_ALIGNMENT
-#  define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-# if VUINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VUINT32x4_ALIGNMENT
-#  define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-# if VUINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
-#  undef VUINT64x2_ALIGNMENT
-#  define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
-# endif
-#endif
+#define VINT8x2_ALIGNMENT   2
+#define VUINT8x2_ALIGNMENT  2
+
+#define VINT8x4_ALIGNMENT   4
+#define VINT16x2_ALIGNMENT  4
+#define VUINT8x4_ALIGNMENT  4
+#define VUINT16x2_ALIGNMENT 4
 
-#ifdef __ARM_NEON
-# include <arm_neon.h>
-# define VEC_COMPILER_HAS_NEON
-# if VINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VINT8x8_ALIGNMENT
-#  define VINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VINT16x4_ALIGNMENT
-#  define VINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VINT32x4_ALIGNMENT
-#  define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VUINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VUINT8x8_ALIGNMENT
-#  define VUINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VUINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VUINT16x4_ALIGNMENT
-#  define VUINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VUINT32x4_ALIGNMENT
-#  define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VINT8x16_ALIGNMENT
-#  define VINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VINT16x8_ALIGNMENT
-#  define VINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VINT32x4_ALIGNMENT
-#  define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VINT64x2_ALIGNMENT
-#  define VINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VUINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VUINT8x16_ALIGNMENT
-#  define VUINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VUINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VUINT16x8_ALIGNMENT
-#  define VUINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VUINT32x4_ALIGNMENT
-#  define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-# if VUINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
-#  undef VUINT64x2_ALIGNMENT
-#  define VUINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
-# endif
-#endif
+// 64-bit
+#define VINT8x8_ALIGNMENT   8
+#define VINT16x4_ALIGNMENT  8
+#define VINT32x2_ALIGNMENT  8
+#define VUINT8x8_ALIGNMENT  8
+#define VUINT16x4_ALIGNMENT 8
+#define VUINT32x2_ALIGNMENT 8
 
-#ifdef __MMX__
-# include <mmintrin.h>
-# define VEC_COMPILER_HAS_MMX
-#endif
+// 128-bit
+#define VINT8x16_ALIGNMENT  16
+#define VINT16x8_ALIGNMENT  16
+#define VINT32x4_ALIGNMENT  16
+#define VINT64x2_ALIGNMENT  16
+#define VUINT8x16_ALIGNMENT 16
+#define VUINT16x8_ALIGNMENT 16
+#define VUINT32x4_ALIGNMENT 16
+#define VUINT64x2_ALIGNMENT 16
 
-#ifdef __SSE2__
-# include <emmintrin.h>
-# define VEC_COMPILER_HAS_SSE2
-# ifdef __SSE4_1__
-#  define VEC_COMPILER_HAS_SSE41
-# endif
-# if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VINT8x16_ALIGNMENT
-#  define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-# if VINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VINT16x8_ALIGNMENT
-#  define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-# if VINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VINT32x4_ALIGNMENT
-#  define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-# if VINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VINT64x2_ALIGNMENT
-#  define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-# if VUINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VUINT8x16_ALIGNMENT
-#  define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-# if VUINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VUINT16x8_ALIGNMENT
-#  define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-# if VUINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VUINT32x4_ALIGNMENT
-#  define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-# if VUINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
-#  undef VUINT64x2_ALIGNMENT
-#  define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
-# endif
-#endif
+// 256-bit
+#define VINT8x32_ALIGNMENT   32
+#define VINT16x16_ALIGNMENT  32
+#define VINT32x8_ALIGNMENT   32
+#define VINT64x4_ALIGNMENT   32
+#define VUINT8x32_ALIGNMENT  32
+#define VUINT16x16_ALIGNMENT 32
+#define VUINT32x8_ALIGNMENT  32
+#define VUINT64x4_ALIGNMENT  32
 
-#ifdef __AVX2__
-# include <immintrin.h>
-# define VEC_COMPILER_HAS_AVX2
-# if VINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VINT8x32_ALIGNMENT
-#  define VINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-# if VINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VINT16x16_ALIGNMENT
-#  define VINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-# if VINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VINT32x8_ALIGNMENT
-#  define VINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-# if VINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VINT64x4_ALIGNMENT
-#  define VINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-# if VUINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VUINT8x32_ALIGNMENT
-#  define VUINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-# if VUINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VUINT16x16_ALIGNMENT
-#  define VUINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-# if VUINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VUINT32x8_ALIGNMENT
-#  define VUINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-# if VUINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
-#  undef VUINT64x4_ALIGNMENT
-#  define VUINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
-# endif
-#endif
+// 512-bit
+#define VINT8x64_ALIGNMENT   64
+#define VINT16x32_ALIGNMENT  64
+#define VINT32x16_ALIGNMENT  64
+#define VINT64x8_ALIGNMENT   64
+#define VUINT8x64_ALIGNMENT  64
+#define VUINT16x32_ALIGNMENT 64
+#define VUINT32x16_ALIGNMENT 64
+#define VUINT64x8_ALIGNMENT  64
 
-#ifdef __AVX512F__
-# include <immintrin.h>
-# define VEC_COMPILER_HAS_AVX512F
-# if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VINT8x64_ALIGNMENT
-#  define VINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-# if VINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VINT16x32_ALIGNMENT
-#  define VINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-# if VINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VINT32x16_ALIGNMENT
-#  define VINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-# if VINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VINT64x8_ALIGNMENT
-#  define VINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-# if VUINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VUINT8x64_ALIGNMENT
-#  define VUINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-# if VUINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VUINT16x32_ALIGNMENT
-#  define VUINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-# if VUINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VUINT32x16_ALIGNMENT
-#  define VUINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-# if VUINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
-#  undef VUINT64x8_ALIGNMENT
-#  define VUINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
-# endif
-#endif
+//////////////////////////////////////////////////////////////////////////////
+// bit shift
 
-#endif
-
-/* --------------------------------------------------------------- */
-/* bit shift */
-
-inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y)
+inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y)
 {
 	return x >> y;
 }
 
-inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y)
+inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y)
 {
 	return x << y;
 }
 
-inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y)
-{
-	// reinterpret as unsigned integer and then shift
-	union {
-		vec_intmax d;
-		vec_uintmax u;
-	} xx;
-
-	xx.d = x;
-	xx.u >>= y;
-	return xx.d;
-}
-
-inline vec_intmax vec_llshift(vec_intmax x, unsigned int y)
-{
-	// reinterpret as unsigned integer and then shift
-	union {
-		vec_intmax d;
-		vec_uintmax u;
-	} xx;
-
-	xx.d = x;
-	xx.u <<= y;
-	return xx.d;
-}
-
 inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y)
 {
 	return x >> y;
@@ -497,334 +236,399 @@
 	return xx.d;
 }
 
-#ifdef VEC_IMPLEMENTATION
-extern inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y);
-extern inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y);
-extern inline vec_intmax vec_llshift(vec_intmax x, unsigned int y);
-extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
-extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
-extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
+//////////////////////////////////////////////////////////////////////////////
+// array alignment
+
+#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
+# define VEC_ALIGNOF(type) alignof(type)
+#elif (__STDC_VERSION__ >= 201112L)
+# define VEC_ALIGNOF(type) _Alignof(type)
+#elif defined(HAVE_STDDEF_H) // already included
+# define VEC_ALIGNOF(type) \
+	(offsetof(struct { char slot1; type slot2; }, slot2))
+#else
+// inline offsetof
+# define VEC_ALIGNOF(type) \
+	((vec_uintsize)((char *)&((struct { char slot1; type slot2; } *)0)->slot2 - (char *)0))
+#endif
+
+#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
+# define VEC_ALIGNAS(x) alignas(x)
+#elif (__STDC_VERSION__ >= 201112L)
+# define VEC_ALIGNAS(x) _Alignas(x)
+#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
+# define VEC_ALIGNAS(x) __attribute__((__aligned__(x)))
+#elif VEC_MSVC_ATLEAST(0, 0, 0) // FIXME which version?
+# define VEC_ALIGNAS(x) __declspec(align(x))
+#else
+# error vec: vec requires compiler alignment support
+#endif
+
+// this wart is here because originally vec didn't require that
+// there be compiler support for alignment. now that we *do*, the
+// #ifdef below is always true (we #error out above otherwise), but
+// the guard is kept around anyway.
+#ifdef VEC_ALIGNAS
+# define VEC_ALIGNED_ARRAY(type, var, length, align) \
+	VEC_ALIGNAS(align) type var[length]
+# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
+	(sizeof(var))
 #endif
 
-/* --------------------------------------------------------------- */
+#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
+	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var))
+
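A brief usage sketch of the generic helpers above (the per-type wrappers that follow expand to exactly this pattern):

/* a 16-byte-aligned scratch buffer of four vec_int32;
 * expands to `VEC_ALIGNAS(16) vec_int32 buf[4]' */
VEC_ALIGNED_ARRAY(vec_int32, buf, 4, 16);

/* VEC_ALIGNED_ARRAY_SIZEOF(buf, 16) == 4 * sizeof(vec_int32)
 * VEC_ALIGNED_ARRAY_LENGTH(buf, 16) == 4 */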
+//////////////////////////////////////////////////////////////////////////////////////
+// predefined variants for each vector type
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 16-bit
+
+#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT)
+#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT)
+#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT)
+#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0)
+
+#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT)
+#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT)
+#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT)
+#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 32-bit
+
+#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT)
+#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT)
+#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT)
+#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0)
+
+#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT)
+#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT)
+#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT)
+#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0)
+
+#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT)
+#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT)
+#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT)
+#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0)
+
+#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT)
+#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT)
+#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT)
+#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 64-bit
+
+#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT)
+#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
+#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT)
+#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0)
+
+#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT)
+#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
+#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT)
+#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0)
+
+#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT)
+#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
+#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT)
+#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0)
+
+#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0)
+
+#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0)
+
+#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 128-bit
+
+#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
+
+#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0)
+
+#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
+
+#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
+
+#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
 
-#include "impl/align.h"
+#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
+
+#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
+
+#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 256-bit
+
+#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT)
+#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0)
+
+#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT)
+#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
+
+#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT)
+#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0)
+
+#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT)
+#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0)
+
+#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0)
+
+#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
 
-/* --------------------------------------------------------------- */
-/* Defines the structures for each vector type */
+#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0)
+
+#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 512-bit
+
+#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT)
+#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0)
+
+#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT)
+#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x32_ALIGNMENT == 0)
+
+#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT)
+#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0)
+
+#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT)
+#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0)
+
+#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0)
+
+#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x32_ALIGNMENT == 0)
+
+#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0)
+
+#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0)
+
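
A minimal usage sketch (illustrative only, not part of this changeset): the *_ALIGNED_ARRAY macros above declare buffers padded and aligned for a given vector type, and the *_PTR_ALIGNED macros let callers pick between the aligned and unaligned entry points at runtime. The include path and helper name below are assumptions.

	#include "vec/vec.h"

	/* hypothetical helper: add eight uint32 lanes from possibly-unaligned memory */
	void add_u32x8(const vec_uint32 *a, const vec_uint32 *b, vec_uint32 out[8])
	{
		vuint32x8 va = VUINT32x8_PTR_ALIGNED(a) ? vuint32x8_load_aligned(a) : vuint32x8_load(a);
		vuint32x8 vb = VUINT32x8_PTR_ALIGNED(b) ? vuint32x8_load_aligned(b) : vuint32x8_load(b);

		vuint32x8_store(vuint32x8_add(va, vb), out);
	}
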
+//////////////////////////////////////////////////////////////////////////////
+// Defines the structures for each vector type
 
 // 16-bit
-typedef union {
-	vec_uint8 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VUINT8x2_ALIGNMENT) vec_uint8 bytes[2];
 } vuint8x2;
 
-typedef union {
-	vec_int8 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VINT8x2_ALIGNMENT) vec_uint8 bytes[2];
 } vint8x2;
 
 // 32-bit
-typedef union {
-	vuint8x2 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VUINT8x4_ALIGNMENT) vec_uint8 bytes[4];
 } vuint8x4;
 
-typedef union {
-	vec_uint16 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VUINT16x2_ALIGNMENT) vec_uint8 bytes[4];
 } vuint16x2;
 
-typedef union {
-	vint8x2 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VINT8x4_ALIGNMENT) vec_uint8 bytes[4];
 } vint8x4;
 
-typedef union {
-	vec_int16 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VINT16x2_ALIGNMENT) vec_uint8 bytes[4];
 } vint16x2;
 
 // 64-bit
-typedef union {
-#ifdef VEC_COMPILER_HAS_MMX
-	__m64 mmx;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	uint8x8_t neon;
-#endif
-
-	vuint8x4 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VUINT8x8_ALIGNMENT) vec_uint8 bytes[8];
 } vuint8x8;
 
-typedef union {
-#ifdef VEC_COMPILER_HAS_MMX
-	__m64 mmx;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	uint16x4_t neon;
-#endif
-
-	vuint16x2 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VUINT16x4_ALIGNMENT) vec_uint8 bytes[8];
 } vuint16x4;
 
-typedef union {
-#ifdef VEC_COMPILER_HAS_MMX
-	__m64 mmx;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	uint32x2_t neon;
-#endif
-
-	vec_uint32 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VUINT32x2_ALIGNMENT) vec_uint8 bytes[8];
 } vuint32x2;
 
-typedef union {
-#ifdef VEC_COMPILER_HAS_MMX
-	__m64 mmx;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	int8x8_t neon;
-#endif
-
-	vint8x4 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VINT8x8_ALIGNMENT) vec_uint8 bytes[8];
 } vint8x8;
 
-typedef union {
-#ifdef VEC_COMPILER_HAS_MMX
-	__m64 mmx;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	int16x4_t neon;
-#endif
-
-	vint16x2 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VINT16x4_ALIGNMENT) vec_uint8 bytes[8];
 } vint16x4;
 
-typedef union {
-#ifdef VEC_COMPILER_HAS_MMX
-	__m64 mmx;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	int32x2_t neon;
-#endif
-
-	vec_int32 generic[2];
+typedef struct {
+	VEC_ALIGNAS(VINT32x2_ALIGNMENT) vec_uint8 bytes[8];
 } vint32x2;
 
 // 128-bit
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	vector unsigned char altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	uint8x16_t neon;
-#endif
-	vuint8x8 generic[2];
+	VEC_ALIGNAS(VUINT8x16_ALIGNMENT) vec_uint8 bytes[16];
 } vuint8x16;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	vector unsigned short altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	uint16x8_t neon;
-#endif
-	vuint16x4 generic[2];
+	VEC_ALIGNAS(VUINT16x8_ALIGNMENT) vec_uint8 bytes[16];
 } vuint16x8;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	vector unsigned int altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	uint32x4_t neon;
-#endif
-	vuint32x2 generic[2];
+	VEC_ALIGNAS(VUINT32x4_ALIGNMENT) vec_uint8 bytes[16];
 } vuint32x4;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-	vector unsigned long long altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	uint64x2_t neon;
-#endif
-	vec_uint64 generic[2];
+	VEC_ALIGNAS(VUINT64x2_ALIGNMENT) vec_uint8 bytes[16];
 } vuint64x2;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	vector signed char altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	int8x16_t neon;
-#endif
-	vint8x8 generic[2];
+	VEC_ALIGNAS(VINT8x16_ALIGNMENT) vec_uint8 bytes[16];
 } vint8x16;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	vector signed short altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	int16x8_t neon;
-#endif
-	vint16x4 generic[2];
+	VEC_ALIGNAS(VINT16x8_ALIGNMENT) vec_uint8 bytes[16];
 } vint16x8;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	vector signed int altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	int32x4_t neon;
-#endif
-	vint32x2 generic[2];
+	VEC_ALIGNAS(VINT32x4_ALIGNMENT) vec_uint8 bytes[16];
 } vint32x4;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_SSE2
-	__m128i sse;
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-	vector signed long long altivec;
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	int64x2_t neon;
-#endif
-	vec_int64 generic[2];
+	VEC_ALIGNAS(VINT64x2_ALIGNMENT) vec_uint8 bytes[16];
 } vint64x2;
 
 // 256-bit
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vuint8x16 generic[2];
+	VEC_ALIGNAS(VUINT8x32_ALIGNMENT) vec_uint8 bytes[32];
 } vuint8x32;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vuint16x8 generic[2];
+	VEC_ALIGNAS(VUINT16x16_ALIGNMENT) vec_uint8 bytes[32];
 } vuint16x16;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vuint32x4 generic[2];
+	VEC_ALIGNAS(VUINT32x8_ALIGNMENT) vec_uint8 bytes[32];
 } vuint32x8;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vuint64x2 generic[2];
+	VEC_ALIGNAS(VUINT64x4_ALIGNMENT) vec_uint8 bytes[32];
 } vuint64x4;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vint8x16 generic[2];
+	VEC_ALIGNAS(VINT8x32_ALIGNMENT) vec_uint8 bytes[32];
 } vint8x32;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vint16x8 generic[2];
+	VEC_ALIGNAS(VINT16x16_ALIGNMENT) vec_uint8 bytes[32];
 } vint16x16;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vint32x4 generic[2];
+	VEC_ALIGNAS(VINT32x8_ALIGNMENT) vec_uint8 bytes[32];
 } vint32x8;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX2
-	__m256i avx2;
-#endif
-	vint64x2 generic[2];
+	VEC_ALIGNAS(VINT64x4_ALIGNMENT) vec_uint8 bytes[32];
 } vint64x4;
 
 // 512-bit
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vuint8x32 generic[2];
+	VEC_ALIGNAS(VUINT8x64_ALIGNMENT) vec_uint8 bytes[64];
 } vuint8x64;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vuint16x16 generic[2];
+	VEC_ALIGNAS(VUINT16x32_ALIGNMENT) vec_uint8 bytes[64];
 } vuint16x32;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vuint32x8 generic[2];
+	VEC_ALIGNAS(VUINT32x16_ALIGNMENT) vec_uint8 bytes[64];
 } vuint32x16;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vuint64x4 generic[2];
+	VEC_ALIGNAS(VUINT64x8_ALIGNMENT) vec_uint8 bytes[64];
 } vuint64x8;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vint8x32 generic[2];
+	VEC_ALIGNAS(VINT8x64_ALIGNMENT) vec_uint8 bytes[64];
 } vint8x64;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vint16x16 generic[2];
+	VEC_ALIGNAS(VINT16x32_ALIGNMENT) vec_uint8 bytes[64];
 } vint16x32;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vint32x8 generic[2];
+	VEC_ALIGNAS(VINT32x16_ALIGNMENT) vec_uint8 bytes[64];
 } vint32x16;
 
 typedef union {
-#ifdef VEC_COMPILER_HAS_AVX512F
-	__m512i avx512f;
-#endif
-	vint64x4 generic[2];
+	VEC_ALIGNAS(VINT64x8_ALIGNMENT) vec_uint8 bytes[64];
 } vint64x8;
 
 // ---------------------------------------------------------------------------------
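
With the unions reduced to plain aligned byte storage, backends are free to reinterpret the buffer however they like; memcpy is the strictly portable way to do so. A sketch under that assumption (the helper names are hypothetical; the real backends under src/impl/ may do this differently):

	#include <emmintrin.h>
	#include <string.h>

	static inline __m128i vuint8x16_to_m128i(vuint8x16 v)
	{
		__m128i r;
		memcpy(&r, v.bytes, sizeof(r)); /* bytes[] is 16 bytes and suitably aligned */
		return r;
	}

	static inline vuint8x16 vuint8x16_from_m128i(__m128i r)
	{
		vuint8x16 v;
		memcpy(v.bytes, &r, sizeof(r));
		return v;
	}
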
@@ -832,77 +636,12 @@
 
 int vec_init(void);
 
-#define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
-	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
-	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
-
-#define VEC_DECLARE_OPERATIONS(bits, size) \
-	VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \
-	VEC_DECLARE_OPERATIONS_SIGN(u, bits, size)
-
-// 16-bit
-VEC_DECLARE_OPERATIONS(8, 2)
-
-// 32-bit
-VEC_DECLARE_OPERATIONS(8, 4)
-VEC_DECLARE_OPERATIONS(16, 2)
-
-// 64-bit
-VEC_DECLARE_OPERATIONS(8, 8)
-VEC_DECLARE_OPERATIONS(16, 4)
-VEC_DECLARE_OPERATIONS(32, 2)
-
-// 128-bit
-VEC_DECLARE_OPERATIONS(8, 16)
-VEC_DECLARE_OPERATIONS(16, 8)
-VEC_DECLARE_OPERATIONS(32, 4)
-VEC_DECLARE_OPERATIONS(64, 2)
-
-// 256-bit
-VEC_DECLARE_OPERATIONS(8, 32)
-VEC_DECLARE_OPERATIONS(16, 16)
-VEC_DECLARE_OPERATIONS(32, 8)
-VEC_DECLARE_OPERATIONS(64, 4)
-
-// 512-bit
-VEC_DECLARE_OPERATIONS(8, 64)
-VEC_DECLARE_OPERATIONS(16, 32)
-VEC_DECLARE_OPERATIONS(32, 16)
-VEC_DECLARE_OPERATIONS(64, 8)
-
-#undef VEC_DECLARE_OPERATIONS
-#undef VEC_DECLARE_OPERATIONS_SIGN
-
-// ---------------------------------------------------------------------------------
-// okay, now we can actually implement the functions
-
-#ifdef VEC_IMPLEMENTATION
-
-// Fallback functions, need to be defined before everything else.
-#include "impl/fallback.h"
+//////////////////////////////////////////////////////////////////////////////
+// these are, for the most part, meant to be used internally
 
 // okay, these are filled in for each supported backend.
 // `and', `or', `xor', and `not' have to be prefixed with
-// `b' because of <iso646.h>
+// `b' because of <iso646.h>/cxxisms
 #define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \
 	typedef struct { \
 		v##sign##int##bits##x##size (*splat)(vec_##sign##int##bits x); \
@@ -966,389 +705,166 @@
 #undef VEC_DEFINE_IMPL_STRUCT
 #undef VEC_DEFINE_IMPL_STRUCT_SIGN
 
-// ------------------------------------------------------------------------
-
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-# include "impl/ppc/altivec.h"
-#endif
-
-#ifdef VEC_COMPILER_HAS_AVX512F
-# include "impl/x86/avx512f.h"
-#endif
-
-#ifdef VEC_COMPILER_HAS_AVX2
-# include "impl/x86/avx2.h"
-#endif
-
-#ifdef VEC_COMPILER_HAS_SSE2
-# include "impl/x86/sse2.h"
-#endif
-
-// depends on SSE2 functions; the only thing SSE4.1 provides for us
-// is a native 32-bit multiply
-#ifdef VEC_COMPILER_HAS_SSE41
-# include "impl/x86/sse41.h"
-#endif
-
-#ifdef VEC_COMPILER_HAS_MMX
-# include "impl/x86/mmx.h"
-#endif
-
-#ifdef VEC_COMPILER_HAS_NEON
-# include "impl/arm/neon.h"
-#endif
-
-#include "impl/generic.h"
-
-/* ---------------------------------------------------------------- */
-
-#include "impl/cpu.h" // CPU detection crap
-
 // 16-bit
-static vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
-static vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;
+extern const vint8x2_impl   *vint8x2_impl_cpu;
+extern const vuint8x2_impl  *vuint8x2_impl_cpu;
 
 // 32-bit
-static vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
-static vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
-static vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
-static vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
+extern const vint8x4_impl   *vint8x4_impl_cpu;
+extern const vuint8x4_impl  *vuint8x4_impl_cpu;
+extern const vint16x2_impl  *vint16x2_impl_cpu;
+extern const vuint16x2_impl *vuint16x2_impl_cpu;
 
 // 64-bit
-static vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
-static vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
-static vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
-static vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
-static vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
-static vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
+extern const vint8x8_impl   *vint8x8_impl_cpu;
+extern const vuint8x8_impl  *vuint8x8_impl_cpu;
+extern const vint16x4_impl  *vint16x4_impl_cpu;
+extern const vuint16x4_impl *vuint16x4_impl_cpu;
+extern const vint32x2_impl  *vint32x2_impl_cpu;
+extern const vuint32x2_impl *vuint32x2_impl_cpu;
 
 // 128-bit
-static vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
-static vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
-static vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
-static vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
-static vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
-static vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
-static vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
-static vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
+extern const vint8x16_impl  *vint8x16_impl_cpu;
+extern const vuint8x16_impl *vuint8x16_impl_cpu;
+extern const vint16x8_impl  *vint16x8_impl_cpu;
+extern const vuint16x8_impl *vuint16x8_impl_cpu;
+extern const vint32x4_impl  *vint32x4_impl_cpu;
+extern const vuint32x4_impl *vuint32x4_impl_cpu;
+extern const vint64x2_impl  *vint64x2_impl_cpu;
+extern const vuint64x2_impl *vuint64x2_impl_cpu;
 
 // 256-bit
-static vint8x32_impl  *vint8x32_impl_cpu  = &vint8x32_impl_generic;
-static vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
-static vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
-static vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
-static vint32x8_impl  *vint32x8_impl_cpu  = &vint32x8_impl_generic;
-static vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
-static vint64x4_impl  *vint64x4_impl_cpu  = &vint64x4_impl_generic;
-static vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
+extern const vint8x32_impl   *vint8x32_impl_cpu;
+extern const vuint8x32_impl  *vuint8x32_impl_cpu;
+extern const vint16x16_impl  *vint16x16_impl_cpu;
+extern const vuint16x16_impl *vuint16x16_impl_cpu;
+extern const vint32x8_impl   *vint32x8_impl_cpu;
+extern const vuint32x8_impl  *vuint32x8_impl_cpu;
+extern const vint64x4_impl   *vint64x4_impl_cpu;
+extern const vuint64x4_impl  *vuint64x4_impl_cpu;
 
 // 512-bit
-static vint8x64_impl  *vint8x64_impl_cpu  = &vint8x64_impl_generic;
-static vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
-static vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
-static vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
-static vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
-static vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
-static vint64x8_impl  *vint64x8_impl_cpu  = &vint64x8_impl_generic;
-static vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
-
-// returns 0 or a negative error code on failure
-int vec_init(void)
-{
-	// This function is NOT thread safe. However, once vec
-	// is initialized, all of the vector functions are thread-safe.
-	//
-	// In fact, it's possible to use vec without calling
-	// vec_init() at all, but it would be completely useless since
-	// it would just use a generic implementation without any
-	// vectorization whatsoever (unless maybe the compiler is
-	// smart enough to optimize it into vectors)
-
-	vec_get_CPU_features();
+extern const vint8x64_impl  *vint8x64_impl_cpu;
+extern const vuint8x64_impl *vuint8x64_impl_cpu;
+extern const vint16x32_impl  *vint16x32_impl_cpu;
+extern const vuint16x32_impl *vuint16x32_impl_cpu;
+extern const vint32x16_impl  *vint32x16_impl_cpu;
+extern const vuint32x16_impl *vuint32x16_impl_cpu;
+extern const vint64x8_impl  *vint64x8_impl_cpu;
+extern const vuint64x8_impl *vuint64x8_impl_cpu;
 
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (vec_CPU_have_ALTIVEC()) {
-		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
-		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
-		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
-		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
-		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
-		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-		if (vec_CPU_have_ALTIVEC_VSX()) {
-			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
-			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
-		}
-#endif
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512F
-	if (vec_CPU_have_AVX512F()) {
-		vint8x64_impl_cpu  = &vint8x64_impl_avx512f;
-		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
-		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
-		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
-		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
-		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
-		vint64x8_impl_cpu  = &vint64x8_impl_avx512f;
-		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX2
-	if (vec_CPU_have_AVX2()) {
-		vint8x32_impl_cpu  = &vint8x32_impl_avx2;
-		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
-		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
-		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
-		vint32x8_impl_cpu  = &vint32x8_impl_avx2;
-		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
-		vint64x4_impl_cpu  = &vint64x4_impl_avx2;
-		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (vec_CPU_have_SSE2()) {
-		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
-		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
-		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
-		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
-# ifdef VEC_COMPILER_HAS_SSE41
-		if (vec_CPU_have_SSE41()) {
-			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
-		} else
-# endif
-		{
-			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
-		}
-		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
-		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_MMX
-	if (vec_CPU_have_MMX()) {
-		vint8x8_impl_cpu  = &vint8x8_impl_mmx;
-		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
-		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
-		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
-		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
-		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	if (vec_CPU_have_NEON()) {
-		// 64-bit
-		vint8x8_impl_cpu  = &vint8x8_impl_neon;
-		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
-		vint16x4_impl_cpu  = &vint16x4_impl_neon;
-		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
-		vint32x2_impl_cpu  = &vint32x2_impl_neon;
-		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
-
-		// 128-bit
-		vint8x16_impl_cpu  = &vint8x16_impl_neon;
-		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
-		vint16x8_impl_cpu  = &vint16x8_impl_neon;
-		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
-		vint32x4_impl_cpu  = &vint32x4_impl_neon;
-		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
-		vint64x2_impl_cpu  = &vint64x2_impl_neon;
-		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
-	}
-#endif
-	{
-		// do nothing, they're already set to generics
-	}
-
-	return 0;
-}
-
-/* ---------------------------------------------------------------- */
+//////////////////////////////////////////////////////////////////////////////
+// declared inline, though with the function-pointer dispatch it's hardly for performance :)
 
 #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
+	{ \
+		return v##sign##int##bits##x##size##_impl_cpu->splat(x); \
+	} \
+	\
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->splat) \
-			return v##sign##int##bits##x##size##_impl_cpu->splat(x); \
+		return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \
+	} \
 	\
-		return v##sign##int##bits##x##size##_fallback_splat(x); \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \
+	{ \
+		return v##sign##int##bits##x##size##_impl_cpu->load(in); \
+	} \
+	\
+	inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \
+	inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
-		v##sign##int##bits##x##size err = {0}; \
+		v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \
+	} \
 	\
-		if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \
-			return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \
-	\
-		VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \
-	\
-		return err; \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \
+	} \
+	\
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->load) \
-			return v##sign##int##bits##x##size##_impl_cpu->load(in); \
+		return v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \
+	} \
 	\
-		return v##sign##int##bits##x##size##_fallback_load(in); \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \
 	} \
 	\
-	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \
-			v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \
-			return; \
-		} \
-	\
-		VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \
+		return v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \
 	} \
 	\
-	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
-	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->store) { \
-			v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \
-			return; \
-		} \
-	\
-		v##sign##int##bits##x##size##_fallback_store(vec, out); \
-	} \
-	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->add) \
-			v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_add(vec1, vec2); \
-	} \
-	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->sub) \
-			v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_sub(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->mul) \
-			v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_mul(vec1, vec2); \
-	} \
-	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->div) \
-			v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_div(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->avg) \
-			v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_avg(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->band) \
-			v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->bor) \
-			v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->bxor) \
-			v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->bnot) \
-			v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \
-	\
-		return v##sign##int##bits##x##size##_fallback_not(vec); \
+		return v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->cmplt) \
-			v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_cmplt(vec1, vec2); \
-	} \
-	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->cmple) \
-			v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_cmple(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->cmpeq) \
-			v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_cmpeq(vec1, vec2); \
-	} \
-	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
-	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->cmpge) \
-			v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_cmpge(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->cmpgt) \
-			v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_cmpgt(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->lshift) \
-			v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_lshift(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->rshift) \
-			v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_rshift(vec1, vec2); \
-	} \
-	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
-	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->lrshift) \
-			v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \
-	\
-		return v##sign##int##bits##x##size##_fallback_lrshift(vec1, vec2); \
+		return v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \
 	}
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
@@ -1388,8 +904,6 @@
 #undef VEC_DEFINE_OPERATIONS
 #undef VEC_DEFINE_OPERATIONS_SIGN
 
-#endif /* VEC_IMPLEMENTATION */
-
 #ifdef __cplusplus
 }
 #endif
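
For reference, each public entry point is now a thin wrapper over the per-type dispatch table; VEC_DEFINE_OPERATIONS_SIGN( , 32, 4), for example, expands to functions along these lines (whitespace aside), which assume the *_impl_cpu pointers have been set up (presumably by vec_init()) before use:

	inline vint32x4 vint32x4_splat(vec_int32 x)
	{
		return vint32x4_impl_cpu->splat(x);
	}

	inline vint32x4 vint32x4_add(vint32x4 vec1, vint32x4 vec2)
	{
		return vint32x4_impl_cpu->add(vec1, vec2);
	}

	inline void vint32x4_store(vint32x4 vec, vec_int32 out[4])
	{
		vint32x4_impl_cpu->store(vec, out);
	}
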
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/cpu.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,497 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+/* Detect CPU SIMD support. Much of this code was stolen from SDL.
+ *
+ * Simple DirectMedia Layer
+ * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "vec/cpu.h"
+
+#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
+# include <sys/sysctl.h> // For AltiVec check
+#elif defined(__OpenBSD__) && defined(__powerpc__)
+# include <sys/types.h>
+# include <sys/sysctl.h> // For AltiVec check
+# include <machine/cpu.h>
+#elif defined(__FreeBSD__) && defined(__powerpc__)
+# include <machine/cpu.h>
+# include <sys/auxv.h>
+#elif defined(__ALTIVEC__)
+# include <signal.h>
+# include <setjmp.h>
+#endif
+
+#ifdef __FreeBSD__
+# include <sys/param.h>
+#endif
+
+#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
+# include <unistd.h>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+# include <elf.h>
+
+/*#include <asm/hwcap.h>*/
+# ifndef AT_HWCAP
+# define AT_HWCAP 16
+# endif
+# ifndef AT_PLATFORM
+#  define AT_PLATFORM 15
+# endif
+# ifndef HWCAP_NEON
+#  define HWCAP_NEON (1 << 12)
+# endif
+#endif
+
+static inline int vec_CPU_have_CPUID(void)
+{
+	int has_CPUID = 0;
+
+#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
+	__asm__ (
+"        pushfl                      # Get original EFLAGS             \n"
+"        popl    %%eax                                                 \n"
+"        movl    %%eax,%%ecx                                           \n"
+"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
+"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
+"        popfl                       # Replace current EFLAGS value    \n"
+"        pushfl                      # Get new EFLAGS                  \n"
+"        popl    %%eax               # Store new EFLAGS in EAX         \n"
+"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
+"        jz      1f                  # Processor=80486                 \n"
+"        movl    $1,%0               # We have CPUID support           \n"
+"1:                                                                    \n"
+	: "=m" (has_CPUID)
+	:
+	: "%eax", "%ecx"
+	);
+#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
+/* Technically, if this is being compiled under __x86_64__ then it has
+   CPUID by definition.  But it's nice to be able to prove it.  :)      */
+	__asm__ (
+"        pushfq                      # Get original EFLAGS             \n"
+"        popq    %%rax                                                 \n"
+"        movq    %%rax,%%rcx                                           \n"
+"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
+"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
+"        popfq                       # Replace current EFLAGS value    \n"
+"        pushfq                      # Get new EFLAGS                  \n"
+"        popq    %%rax               # Store new EFLAGS in EAX         \n"
+"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
+"        jz      1f                  # Processor=80486                 \n"
+"        movl    $1,%0               # We have CPUID support           \n"
+"1:                                                                    \n"
+	: "=m" (has_CPUID)
+	:
+	: "%rax", "%rcx"
+	);
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+	__asm {
+		pushfd                      ; Get original EFLAGS
+		pop     eax
+		mov     ecx, eax
+		xor     eax, 200000h        ; Flip ID bit in EFLAGS
+		push    eax                 ; Save new EFLAGS value on stack
+		popfd                       ; Replace current EFLAGS value
+		pushfd                      ; Get new EFLAGS
+		pop     eax                 ; Store new EFLAGS in EAX
+		xor     eax, ecx            ; Can not toggle ID bit,
+		jz      done                ; Processor=80486
+		mov     has_CPUID,1         ; We have CPUID support
+done:
+	}
+#elif defined(_MSC_VER) && defined(_M_X64)
+	has_CPUID = 1;
+#elif defined(__sun) && defined(__i386)
+	__asm (
+"       pushfl                 \n"
+"       popl    %eax           \n"
+"       movl    %eax,%ecx      \n"
+"       xorl    $0x200000,%eax \n"
+"       pushl   %eax           \n"
+"       popfl                  \n"
+"       pushfl                 \n"
+"       popl    %eax           \n"
+"       xorl    %ecx,%eax      \n"
+"       jz      1f             \n"
+"       movl    $1,-8(%ebp)    \n"
+"1:                            \n"
+	);
+#elif defined(__sun) && defined(__amd64)
+	__asm (
+"       pushfq                 \n"
+"       popq    %rax           \n"
+"       movq    %rax,%rcx      \n"
+"       xorl    $0x200000,%eax \n"
+"       pushq   %rax           \n"
+"       popfq                  \n"
+"       pushfq                 \n"
+"       popq    %rax           \n"
+"       xorl    %ecx,%eax      \n"
+"       jz      1f             \n"
+"       movl    $1,-8(%rbp)    \n"
+"1:                            \n"
+	);
+#endif
+
+	return has_CPUID;
+}
+
+#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm__ __volatile__( \
+		"        pushl %%ebx        \n" \
+		"        xorl %%ecx,%%ecx   \n" \
+		"        cpuid              \n" \
+		"        movl %%ebx, %%esi  \n" \
+		"        popl %%ebx         \n" \
+		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
+		: "a"(func))
+#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm__ __volatile__( \
+		"        pushq %%rbx        \n" \
+		"        xorq %%rcx,%%rcx   \n" \
+		"        cpuid              \n" \
+		"        movq %%rbx, %%rsi  \n" \
+		"        popq %%rbx         \n" \
+		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
+		: "a"(func))
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	__asm { \
+		__asm mov eax, func \
+		__asm xor ecx, ecx \
+		__asm cpuid \
+		__asm mov a, eax \
+		__asm mov b, ebx \
+		__asm mov c, ecx \
+		__asm mov d, edx \
+	}
+#elif (defined(_MSC_VER) && defined(_M_X64))
+// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	do { \
+		int CPUInfo[4]; \
+		__cpuidex(CPUInfo, func, 0); \
+		a = CPUInfo[0]; \
+		b = CPUInfo[1]; \
+		c = CPUInfo[2]; \
+		d = CPUInfo[3]; \
+	} while (0)
+#else
+# define VEC_CPU_CPUID(func, a, b, c, d) \
+	do { \
+		a = b = c = d = 0; \
+		(void)a; \
+		(void)b; \
+		(void)c; \
+		(void)d; \
+	} while (0)
+#endif
+
+// ---------------------------------------------------------------
+
+static int vec_CPU_CPUIDFeatures[4];
+static int vec_CPU_CPUIDMaxFunction = 0;
+static int vec_CPU_OSSavesYMM = 0;
+static int vec_CPU_OSSavesZMM = 0;
+
+static inline void vec_CPU_get_CPUID_features(void)
+{
+	static int checked = 0;
+	if (!checked) {
+		checked = 1;
+		if (vec_CPU_have_CPUID()) {
+			int a, b, c, d;
+			VEC_CPU_CPUID(0, a, b, c, d);
+			vec_CPU_CPUIDMaxFunction = a;
+			if (vec_CPU_CPUIDMaxFunction >= 1) {
+				VEC_CPU_CPUID(1, a, b, c, d);
+				vec_CPU_CPUIDFeatures[0] = a;
+				vec_CPU_CPUIDFeatures[1] = b;
+				vec_CPU_CPUIDFeatures[2] = c;
+				vec_CPU_CPUIDFeatures[3] = d;
+
+				// Check to make sure we can call xgetbv
+				if (c & 0x08000000) {
+					// Call xgetbv to see if YMM (etc) register state is saved
+#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
+					__asm__(".byte 0x0f, 0x01, 0xd0"
+							: "=a"(a)
+							: "c"(0)
+							: "%edx");
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
+					a = (int)_xgetbv(0);
+#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
+					__asm {
+						xor ecx, ecx
+						_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
+						mov a, eax
+					}
+#endif
+					vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
+					vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
+				}
+			}
+		}
+	}
+}
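
For readers decoding the magic numbers: 0x08000000 is the OSXSAVE bit (CPUID.1:ECX bit 27), and the value returned by xgetbv(0) is XCR0, where bits 1-2 (mask 0x6) mean the OS saves XMM/YMM state and bits 5-7 (mask 0xe0) cover the opmask and upper-ZMM state needed for AVX-512. Spelled out with (hypothetical) named constants:

	#define VEC_CPUID1_ECX_OSXSAVE 0x08000000 /* CPUID.1:ECX bit 27: OS uses XSAVE/XGETBV */
	#define VEC_XCR0_SSE_AVX_STATE 0x00000006 /* XCR0 bits 1-2: XMM and YMM state enabled */
	#define VEC_XCR0_AVX512_STATE  0x000000e0 /* XCR0 bits 5-7: opmask, ZMM_Hi256, Hi16_ZMM */
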
+
+#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
+static jmp_buf vec_jmpbuf;
+static void vec_CPU_illegal_instruction(int sig)
+{
+	longjmp(vec_jmpbuf, 1);
+}
+#endif
+
+static int vec_CPU_have_ALTIVEC(void)
+{
+	volatile int altivec = 0;
+#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
+	int selectors[2] = {
+# ifdef __OpenBSD__
+		CTL_MACHDEP, CPU_ALTIVEC
+# else
+		CTL_HW, HW_VECTORUNIT
+# endif
+	};
+	int hasVectorUnit = 0;
+	vec_uintsize length = sizeof(hasVectorUnit);
+	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
+	if (!error)
+		altivec = (hasVectorUnit != 0);
+#elif defined(__FreeBSD__) && defined(__powerpc__)
+	unsigned long cpufeatures = 0;
+	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
+#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
+	void (*handler)(int sig);
+	handler = signal(SIGILL, vec_CPU_illegal_instruction);
+	if (!setjmp(vec_jmpbuf)) {
+		vector unsigned char vec;
+		vec_and(vec, vec);
+		altivec = 1;
+	}
+	signal(SIGILL, handler);
+#endif
+	return altivec;
+}
+
+static int vec_CPU_have_ALTIVEC_VSX(void)
+{
+	volatile int vsx = 0;
+#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
+# warning Compiling UNTESTED code for VSX.
+	void (*handler)(int sig);
+	handler = signal(SIGILL, vec_CPU_illegal_instruction);
+	if (!setjmp(vec_jmpbuf)) {
+		// this is completely untested
+		//__asm__ __volatile__("mtspr 256, %0\n\t"
+		//			 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
+		//vsx = 1;
+	}
+	signal(SIGILL, handler);
+#endif
+	return vsx;
+}
+
+#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000)
+#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000)
+#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000)
+#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001)
+#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000)
+#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000)
+#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))
+
+static inline int vec_CPU_have_AVX2(void)
+{
+	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
+		int a, b, c, d;
+		VEC_CPU_CPUID(7, a, b, c, d);
+		return b & 0x00000020;
+		(void)a, (void)c, (void)d;
+	}
+	return 0;
+}
+
+static inline int vec_CPU_have_AVX512F(void)
+{
+	if (vec_CPU_OSSavesZMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
+		int a, b, c, d;
+		VEC_CPU_CPUID(7, a, b, c, d);
+		return b & 0x00010000; /* AVX-512 Foundation is bit 16 of EBX, not the AVX2 bit */
+		(void)a, (void)c, (void)d;
+	}
+	return 0;
+}
+
+#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
+static int readProcAuxvForNeon(void)
+{
+	int neon = 0;
+	int fd;
+
+	fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+	if (fd >= 0) {
+		Elf32_auxv_t aux;
+		while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
+			if (aux.a_type == AT_HWCAP) {
+				neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
+				break;
+			}
+		}
+		close(fd);
+	}
+	return neon;
+}
+#endif
+
+static int vec_CPU_have_NEON(void)
+{
+/* The way you detect NEON is a privileged instruction on ARM, so you have
+   to query the OS kernel in a platform-specific way. :/ */
+#if defined(SDL_CPUINFO_DISABLED)
+	return 0; /* disabled */
+#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
+/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
+/* Seems to have been removed */
+#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
+#endif
+	/* All WinRT ARM devices are required to support NEON, but just in case. */
+	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
+#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
+	return 1; /* ARMv8 always has non-optional NEON support. */
+#elif defined(__VITA__)
+	return 1;
+#elif defined(__3DS__)
+	return 0;
+#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
+	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
+	return 1; /* all Apple ARMv7 chips and later have NEON. */
+#elif defined(__APPLE__)
+	return 0; /* assume anything else from Apple doesn't have NEON. */
+#elif !defined(__arm__)
+	return 0; /* not an ARM CPU at all. */
+#elif defined(__OpenBSD__)
+	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
+#elif defined(HAVE_ELF_AUX_INFO)
+	unsigned long hasneon = 0;
+	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
+		return 0;
+
+	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
+#elif defined(__QNXNTO__)
+	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
+#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
+#elif defined(__linux__)
+	return readProcAuxvForNeon();
+#elif defined(__ANDROID__)
+	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
+	{
+		AndroidCpuFamily cpu_family = android_getCpuFamily();
+		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
+			uint64_t cpu_features = android_getCpuFeatures();
+			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
+				return 1;
+			}
+		}
+		return 0;
+	}
+#elif defined(__RISCOS__)
+	/* Use the VFPSupport_Features SWI to access the MVFR registers */
+	{
+		_kernel_swi_regs regs;
+		regs.r[0] = 0;
+		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
+			if ((regs.r[2] & 0xFFF000) == 0x111000) {
+				return 1;
+			}
+		}
+		return 0;
+	}
+#else
+#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
+	return 0;
+#endif
+}
+
+#define VEC_CPU_FEATURES_RESET VEC_UINT32_C(0xFFFFFFFF)
+
+static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;
+
+vec_uint32 vec_get_CPU_features(void)
+{
+	if (vec_CPU_features == VEC_CPU_FEATURES_RESET) {
+		vec_CPU_get_CPUID_features();
+		vec_CPU_features = 0;
+		if (vec_CPU_have_ALTIVEC())
+			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
+		if (vec_CPU_have_ALTIVEC_VSX())
+			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
+		if (vec_CPU_have_MMX())
+			vec_CPU_features |= VEC_CPU_HAS_MMX;
+		if (vec_CPU_have_SSE())
+			vec_CPU_features |= VEC_CPU_HAS_SSE;
+		if (vec_CPU_have_SSE2())
+			vec_CPU_features |= VEC_CPU_HAS_SSE2;
+		if (vec_CPU_have_SSE3())
+			vec_CPU_features |= VEC_CPU_HAS_SSE3;
+		if (vec_CPU_have_SSE41())
+			vec_CPU_features |= VEC_CPU_HAS_SSE41;
+		if (vec_CPU_have_SSE42())
+			vec_CPU_features |= VEC_CPU_HAS_SSE42;
+		if (vec_CPU_have_AVX())
+			vec_CPU_features |= VEC_CPU_HAS_AVX;
+		if (vec_CPU_have_AVX2())
+			vec_CPU_features |= VEC_CPU_HAS_AVX2;
+		if (vec_CPU_have_AVX512F())
+			vec_CPU_features |= VEC_CPU_HAS_AVX512F;
+		if (vec_CPU_have_NEON())
+			vec_CPU_features |= VEC_CPU_HAS_NEON;
+	}
+	return vec_CPU_features;
+}
\ No newline at end of file
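
A sketch of how callers might consume the feature mask (the helper name is hypothetical; the VEC_CPU_HAS_* flags are the ones set above):

	#include "vec/cpu.h"

	static const char *describe_cpu(void)
	{
		vec_uint32 f = vec_get_CPU_features();

		if (f & VEC_CPU_HAS_AVX512F) return "AVX-512F";
		if (f & VEC_CPU_HAS_AVX2)    return "AVX2";
		if (f & VEC_CPU_HAS_SSE2)    return "SSE2";
		if (f & VEC_CPU_HAS_ALTIVEC) return "AltiVec";
		if (f & VEC_CPU_HAS_NEON)    return "NEON";
		return "generic";
	}
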
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/arm/neon.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,488 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#include "vec/impl/arm/neon.h"
+
+#include <arm_neon.h>
+
+// There is LOTS of preprocessor hacking here (as if the other files
+// weren't bad enough... lol)
+
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vld1_##sign##bits(in); \
+		return vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		vstore_lane_##bits(sign, vec.neon, out); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_add, \
+		v##sign##int##bits##x##size##_neon_sub, \
+		v##sign##int##bits##x##size##_neon_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_neon_and, \
+		v##sign##int##bits##x##size##_neon_or, \
+		v##sign##int##bits##x##size##_neon_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_neon_lshift, \
+		/* .rshift = */ NULL, \
+		/* .lrshift = */ NULL, \
+	};
+
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
+
+// Ok, we'll start out with the 64-bit types.
+
+#define vadd_8  vadd_s8
+#define vadd_16 vadd_s16
+#define vadd_32 vadd_s32
+#define vsub_8  vsub_s8
+#define vsub_16 vsub_s16
+#define vsub_32 vsub_s32
+#define vmul_8  vmul_s8
+#define vmul_16 vmul_s16
+#define vmul_32 vmul_s32
+#define vshl_8  vshl_s8
+#define vshl_16 vshl_s16
+#define vshl_32 vshl_s32
+#define veor_8  veor_s8
+#define veor_16 veor_s16
+#define veor_32 veor_s32
+#define vorr_8  vorr_s8
+#define vorr_16 vorr_s16
+#define vorr_32 vorr_s32
+#define vand_8  vand_s8
+#define vand_16 vand_s16
+#define vand_32 vand_s32
+#define vld1_8  vld1_s8
+#define vld1_16 vld1_s16
+#define vld1_32 vld1_s32
+#define vget_lane_8  vget_lane_s8
+#define vget_lane_16 vget_lane_s16
+#define vget_lane_32 vget_lane_s32
+#define vstore_lane_8(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##8(vec, 0); \
+		out[1] = vget_lane_##sign##8(vec, 1); \
+		out[2] = vget_lane_##sign##8(vec, 2); \
+		out[3] = vget_lane_##sign##8(vec, 3); \
+		out[4] = vget_lane_##sign##8(vec, 4); \
+		out[5] = vget_lane_##sign##8(vec, 5); \
+		out[6] = vget_lane_##sign##8(vec, 6); \
+		out[7] = vget_lane_##sign##8(vec, 7); \
+	} while (0)
+#define vstore_lane_16(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##16(vec, 0); \
+		out[1] = vget_lane_##sign##16(vec, 1); \
+		out[2] = vget_lane_##sign##16(vec, 2); \
+		out[3] = vget_lane_##sign##16(vec, 3); \
+	} while (0)
+#define vstore_lane_32(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##32(vec, 0); \
+		out[1] = vget_lane_##sign##32(vec, 1); \
+	} while (0)
+#define vreinterpret_8_u8(x) vreinterpret_s8_u8(x)
+#define vreinterpret_16_u16(x) vreinterpret_s16_u16(x)
+#define vreinterpret_32_u32(x) vreinterpret_s32_u32(x)
+
+VEC_DEFINE_OPERATIONS(8, 8)
+VEC_DEFINE_OPERATIONS(16, 4)
+VEC_DEFINE_OPERATIONS(32, 2)
+
+#undef vadd_8
+#undef vadd_16
+#undef vadd_32
+#undef vsub_8
+#undef vsub_16
+#undef vsub_32
+#undef vmul_8
+#undef vmul_16
+#undef vmul_32
+#undef vshl_8
+#undef vshl_16
+#undef vshl_32
+#undef veor_8
+#undef veor_16
+#undef veor_32
+#undef vorr_8
+#undef vorr_16
+#undef vorr_32
+#undef vand_8
+#undef vand_16
+#undef vand_32
+#undef vld1_8
+#undef vld1_16
+#undef vld1_32
+#undef vget_lane_8
+#undef vget_lane_16
+#undef vget_lane_32
+#undef vstore_lane_8
+#undef vstore_lane_16
+#undef vstore_lane_32
+#undef vreinterpret_8_u8
+#undef vreinterpret_16_u16
+#undef vreinterpret_32_u32
+
+///////////////////////////////////////////////////////////////////////////////
+// 128-bit
+
+// Now we can go ahead and do the 128-bit ones.
+
+// NEON doesn't have native 64-bit multiplication, so we have
+// to do it ourselves
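+// Decomposing into 32-bit halves: a*b (mod 2^64) = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)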
+static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b)
+{
+	const uint32x2_t a_lo = vreinterpret_u32_s32(vmovn_s64(a));
+	const uint32x2_t b_lo = vreinterpret_u32_s32(vmovn_s64(b));
+
+	/* swap the 32-bit halves of b so an element-wise multiply yields the
+	 * cross products lo(a)*hi(b) and hi(a)*lo(b) in adjacent lanes */
+	const uint32x4_t cross = vmulq_u32(vreinterpretq_u32_s64(a), vrev64q_u32(vreinterpretq_u32_s64(b)));
+
+	/* sum the cross products, shift them into the high half, then add lo(a)*lo(b) */
+	return vreinterpretq_s64_u64(vmlal_u32(vshlq_n_u64(vpaddlq_u32(cross), 32), a_lo, b_lo));
+}
+
+static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b)
+{
+	const uint32x2_t a_lo = vmovn_u64(a);
+	const uint32x2_t b_lo = vmovn_u64(b);
+
+	const uint32x4_t cross = vmulq_u32(vreinterpretq_u32_u64(a), vrev64q_u32(vreinterpretq_u32_u64(b)));
+
+	return vmlal_u32(vshlq_n_u64(vpaddlq_u32(cross), 32), a_lo, b_lo);
+}
+
+#define vadd_8  vaddq_s8
+#define vadd_16 vaddq_s16
+#define vadd_32 vaddq_s32
+#define vadd_64 vaddq_s64
+#define vadd_u8  vaddq_u8
+#define vadd_u16 vaddq_u16
+#define vadd_u32 vaddq_u32
+#define vadd_u64 vaddq_u64
+#define vsub_8  vsubq_s8
+#define vsub_16 vsubq_s16
+#define vsub_32 vsubq_s32
+#define vsub_64 vsubq_s64
+#define vsub_u8  vsubq_u8
+#define vsub_u16 vsubq_u16
+#define vsub_u32 vsubq_u32
+#define vsub_u64 vsubq_u64
+#define vmul_8  vmulq_s8
+#define vmul_16 vmulq_s16
+#define vmul_32 vmulq_s32
+#define vmul_64 vmulq_s64
+#define vmul_u8  vmulq_u8
+#define vmul_u16 vmulq_u16
+#define vmul_u32 vmulq_u32
+#define vmul_u64 vmulq_u64
+#define vshl_8  vshlq_s8
+#define vshl_16 vshlq_s16
+#define vshl_32 vshlq_s32
+#define vshl_64 vshlq_s64
+#define vshl_u8  vshlq_u8
+#define vshl_u16 vshlq_u16
+#define vshl_u32 vshlq_u32
+#define vshl_u64 vshlq_u64
+#define veor_8  veorq_s8
+#define veor_16 veorq_s16
+#define veor_32 veorq_s32
+#define veor_64 veorq_s64
+#define veor_u8  veorq_u8
+#define veor_u16 veorq_u16
+#define veor_u32 veorq_u32
+#define veor_u64 veorq_u64
+#define vorr_8  vorrq_s8
+#define vorr_16 vorrq_s16
+#define vorr_32 vorrq_s32
+#define vorr_64 vorrq_s64
+#define vorr_u8  vorrq_u8
+#define vorr_u16 vorrq_u16
+#define vorr_u32 vorrq_u32
+#define vorr_u64 vorrq_u64
+#define vand_8  vandq_s8
+#define vand_16 vandq_s16
+#define vand_32 vandq_s32
+#define vand_64 vandq_s64
+#define vand_u8  vandq_u8
+#define vand_u16 vandq_u16
+#define vand_u32 vandq_u32
+#define vand_u64 vandq_u64
+#define vld1_8  vld1q_s8
+#define vld1_16 vld1q_s16
+#define vld1_32 vld1q_s32
+#define vld1_64 vld1q_s64
+#define vld1_u8  vld1q_u8
+#define vld1_u16 vld1q_u16
+#define vld1_u32 vld1q_u32
+#define vld1_u64 vld1q_u64
+#define vget_lane_8  vgetq_lane_s8
+#define vget_lane_16 vgetq_lane_s16
+#define vget_lane_32 vgetq_lane_s32
+#define vget_lane_64 vgetq_lane_s64
+#define vget_lane_u8  vgetq_lane_u8
+#define vget_lane_u16 vgetq_lane_u16
+#define vget_lane_u32 vgetq_lane_u32
+#define vget_lane_u64 vgetq_lane_u64
+#define vstore_lane_8(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##8(vec, 0); \
+		out[1] = vget_lane_##sign##8(vec, 1); \
+		out[2] = vget_lane_##sign##8(vec, 2); \
+		out[3] = vget_lane_##sign##8(vec, 3); \
+		out[4] = vget_lane_##sign##8(vec, 4); \
+		out[5] = vget_lane_##sign##8(vec, 5); \
+		out[6] = vget_lane_##sign##8(vec, 6); \
+		out[7] = vget_lane_##sign##8(vec, 7); \
+		out[8] = vget_lane_##sign##8(vec, 8); \
+		out[9] = vget_lane_##sign##8(vec, 9); \
+		out[10] = vget_lane_##sign##8(vec, 10); \
+		out[11] = vget_lane_##sign##8(vec, 11); \
+		out[12] = vget_lane_##sign##8(vec, 12); \
+		out[13] = vget_lane_##sign##8(vec, 13); \
+		out[14] = vget_lane_##sign##8(vec, 14); \
+		out[15] = vget_lane_##sign##8(vec, 15); \
+	} while (0)
+#define vstore_lane_16(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##16(vec, 0); \
+		out[1] = vget_lane_##sign##16(vec, 1); \
+		out[2] = vget_lane_##sign##16(vec, 2); \
+		out[3] = vget_lane_##sign##16(vec, 3); \
+		out[4] = vget_lane_##sign##16(vec, 4); \
+		out[5] = vget_lane_##sign##16(vec, 5); \
+		out[6] = vget_lane_##sign##16(vec, 6); \
+		out[7] = vget_lane_##sign##16(vec, 7); \
+	} while (0)
+#define vstore_lane_32(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##32(vec, 0); \
+		out[1] = vget_lane_##sign##32(vec, 1); \
+		out[2] = vget_lane_##sign##32(vec, 2); \
+		out[3] = vget_lane_##sign##32(vec, 3); \
+	} while (0)
+#define vstore_lane_64(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##64(vec, 0); \
+		out[1] = vget_lane_##sign##64(vec, 1); \
+	} while (0)
+#define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x)
+#define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x)
+#define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x)
+#define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x)
+
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vld1_##sign##bits(in); \
+		return vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		vstore_lane_##bits(sign, vec.neon, out); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_add, \
+		v##sign##int##bits##x##size##_neon_sub, \
+		v##sign##int##bits##x##size##_neon_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_neon_and, \
+		v##sign##int##bits##x##size##_neon_or, \
+		v##sign##int##bits##x##size##_neon_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_neon_lshift, \
+		/* .rshift = */ NULL, \
+		/* .lrshift = */ NULL, \
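+		/* the comparison members are omitted and default to NULL */ \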
+	};
+
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
+
+VEC_DEFINE_OPERATIONS(8, 16)
+VEC_DEFINE_OPERATIONS(16, 8)
+VEC_DEFINE_OPERATIONS(32, 4)
+VEC_DEFINE_OPERATIONS(64, 2)
+
+#undef vadd_8
+#undef vadd_16
+#undef vadd_32
+#undef vadd_64
+#undef vsub_8
+#undef vsub_16
+#undef vsub_32
+#undef vsub_64
+#undef vmul_8
+#undef vmul_16
+#undef vmul_32
+#undef vmul_64
+#undef vshl_8
+#undef vshl_16
+#undef vshl_32
+#undef vshl_64
+#undef veor_8
+#undef veor_16
+#undef veor_32
+#undef veor_64
+#undef vorr_8
+#undef vorr_16
+#undef vorr_32
+#undef vorr_64
+#undef vand_8
+#undef vand_16
+#undef vand_32
+#undef vand_64
+#undef vld1_8
+#undef vld1_16
+#undef vld1_32
+#undef vld1_64
+#undef vget_lane_8
+#undef vget_lane_16
+#undef vget_lane_32
+#undef vget_lane_64
+#undef vstore_lane_8
+#undef vstore_lane_16
+#undef vstore_lane_32
+#undef vstore_lane_64
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/fallback.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,189 @@
+#include "vec/impl/fallback.h"
+
+#include <string.h>
+
+// Fallback implementations - this is what an implementation should use if it
+// doesn't support a specific function *and* the actual representation in
+// memory is unknown or yields incorrect results from the generic functions.
+// This is *extremely* unlikely; for x86 the layout is exactly the same in
+// memory as the generic functions (i.e. it is literally stored as an array of
+// integers).
+//
+// These functions can probably be removed if every current implementation is
+// found to have the same in-memory layout as the generic implementation.
+
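+// Each fallback operation spills both operands to aligned arrays using the
+// type's aligned store, does the work element-by-element, and reloads the
+// result, so it only depends on aligned load/store being available.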
+#define VEC_FALLBACK_OPERATION(op, sign, csign, bits, size) \
+	do { \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \
+	\
+		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \
+	\
+		for (int i = 0; i < size; i++) varr1[i] = (op); \
+	\
+		return v##sign##int##bits##x##size##_load_aligned(varr1); \
+	} while (0)
+
+#define VEC_FALLBACK_CMP(op, sign, csign, bits, size) \
+	VEC_FALLBACK_OPERATION((varr1[i] op varr2[i]) ? VEC_UINT##bits##_MAX : 0, sign, csign, bits, size)
+
+#define VEC_FALLBACK_SHIFT(op, sign, csign, bits, size) \
+	do { \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
+		VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \
+	\
+		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
+		vuint##bits##x##size##_store_aligned(vec2, varr2); \
+	\
+		for (int i = 0; i < size; i++) varr1[i] = (op); \
+	\
+		return v##sign##int##bits##x##size##_load_aligned(varr1); \
+	} while (0)
+
+#define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
+		for (int i = 0; i < size; i++) arr[i] = x; \
+		return v##sign##int##bits##x##size##_load_aligned(arr); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
+		memcpy(arr, in, sizeof(vec_##sign##int##bits) * size); \
+		return v##sign##int##bits##x##size##_load_aligned(arr); \
+	} \
+	\
+	void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
+		v##sign##int##bits##x##size##_store_aligned(vec, arr); \
+		memcpy(out, arr, sizeof(vec_##sign##int##bits) * size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION((varr1[i] + varr2[i] + 1) / 2, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \
+	{ \
+		return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)VEC_UINT##bits##_MAX)); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(<, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(<=, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(==, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(>=, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_CMP(>, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_FALLBACK_SHIFT(vec_lrshift((vec_uint##bits)varr1[i], varr2[i]), sign, csign, bits, size); \
+	}
+
+#define VEC_DEFINE_FALLBACK_OPERATIONS(bits, size) \
+	VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(u, U, bits, size)
+
+// 16-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 2)
+
+// 32-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 4)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 2)
+
+// 64-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 8)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 4)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 2)
+
+// 128-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 16)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 8)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 4)
+VEC_DEFINE_FALLBACK_OPERATIONS(64, 2)
+
+// 256-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 32)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 16)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 8)
+VEC_DEFINE_FALLBACK_OPERATIONS(64, 4)
+
+// 512-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 64)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 32)
+VEC_DEFINE_FALLBACK_OPERATIONS(32, 16)
+VEC_DEFINE_FALLBACK_OPERATIONS(64, 8)
+
+#undef VEC_FALLBACK_OPERATION
+#undef VEC_FALLBACK_CMP
+#undef VEC_FALLBACK_SHIFT
+#undef VEC_DEFINE_FALLBACK_OPERATIONS
+#undef VEC_DEFINE_FALLBACK_OPERATIONS_SIGN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/generic.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,460 @@
+#include "vec/impl/generic.h"
+
+#include <string.h>
+
+// -----------------------------------------------------------------
+
+#define VEC_GENERIC_OPERATION(op, sign, csign, bits, size) \
+	do { \
+		for (int i = 0; i < size; i++) \
+			((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] = (op); \
+	\
+		return vec1; \
+	} while (0)
+
+#define VEC_GENERIC_BUILTIN_OPERATION(op, sign, csign, bits, size) \
+	VEC_GENERIC_OPERATION(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] op ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i], sign, csign, bits, size)
+
+#define VEC_GENERIC_CMP(op, sign, csign, bits, size) \
+	VEC_GENERIC_OPERATION((((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] op ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]) ? VEC_UINT##bits##_MAX : 0, sign, csign, bits, size)
+
+// TODO implement these so we don't waste stack space by doing the
+// generics
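+// Each vector is accessed through a union so its storage can be treated
+// directly as an array of vec_<sign>int<bits> elements, avoiding the
+// store/reload round trip that the fallback implementations need.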
+#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	union v##sign##int##bits##x##size##_impl_data { \
+		v##sign##int##bits##x##size vec; \
+		vec_##sign##int##bits impl[size]; \
+	}; \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_splat(vec_##sign##int##bits x) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		for (int i = 0; i < size; i++) \
+			((union v##sign##int##bits##x##size##_impl_data *)&vec)->impl[i] = x; \
+		return vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		memcpy(&vec, in, sizeof(vec_##sign##int##bits) * size); \
+		return vec; \
+	} \
+	\
+	void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		memcpy(out, &vec, sizeof(vec_##sign##int##bits) * size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_BUILTIN_OPERATION(+, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_BUILTIN_OPERATION(-, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_BUILTIN_OPERATION(*, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_OPERATION(((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i] ? (((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] / ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]) : 0, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_OPERATION((((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] + ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i] + 1) / 2, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_BUILTIN_OPERATION(&, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_BUILTIN_OPERATION(|, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_BUILTIN_OPERATION(^, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_not(v##sign##int##bits##x##size vec) \
+	{ \
+		return v##sign##int##bits##x##size##_generic_xor(vec, v##sign##int##bits##x##size##_generic_splat((vec_##sign##int##bits)VEC_UINT##bits##_MAX)); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_CMP(<, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_CMP(<=, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_CMP(==, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_CMP(>=, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_CMP(>, sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_OPERATION(vec_##sign##lshift(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i], ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]), sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_OPERATION(vec_##sign##rshift(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i], ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]), sign, csign, bits, size); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_GENERIC_OPERATION(vec_lrshift((vec_uint##bits)(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i]), ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]), sign, csign, bits, size); \
+	} \
+	\
+	const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
+		v##sign##int##bits##x##size##_generic_splat, \
+		v##sign##int##bits##x##size##_generic_load_aligned, \
+		v##sign##int##bits##x##size##_generic_load_aligned, \
+		v##sign##int##bits##x##size##_generic_store_aligned, \
+		v##sign##int##bits##x##size##_generic_store_aligned, \
+		v##sign##int##bits##x##size##_generic_add, \
+		v##sign##int##bits##x##size##_generic_sub, \
+		v##sign##int##bits##x##size##_generic_mul, \
+		v##sign##int##bits##x##size##_generic_div, \
+		v##sign##int##bits##x##size##_generic_avg, \
+		v##sign##int##bits##x##size##_generic_and, \
+		v##sign##int##bits##x##size##_generic_or, \
+		v##sign##int##bits##x##size##_generic_xor, \
+		v##sign##int##bits##x##size##_generic_not, \
+		v##sign##int##bits##x##size##_generic_lshift, \
+		v##sign##int##bits##x##size##_generic_rshift, \
+		v##sign##int##bits##x##size##_generic_lrshift, \
+		v##sign##int##bits##x##size##_generic_cmplt, \
+		v##sign##int##bits##x##size##_generic_cmple, \
+		v##sign##int##bits##x##size##_generic_cmpeq, \
+		v##sign##int##bits##x##size##_generic_cmpge, \
+		v##sign##int##bits##x##size##_generic_cmpgt, \
+	};
+
+#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size)
+
+VEC_GENERIC_DEFINE_OPERATIONS(8, 2)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 2)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 2)
+VEC_GENERIC_DEFINE_OPERATIONS(64, 2)
+
+#undef VEC_GENERIC_DEFINE_OPERATIONS
+#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN
+
+// -----------------------------------------------------------------
+// now we can just keep doubling the same implementation
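+// Each larger vector is a union of two half-size vectors, so every operation
+// simply calls the half-size implementation on both halves; only the smallest
+// sizes defined above need explicit element loops.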
+
+#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \
+	union v##sign##int##bits##x##size##_impl_data { \
+		v##sign##int##bits##x##size vec; \
+		v##sign##int##bits##x##halfsize impl[2]; \
+	}; \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_splat(vec_##sign##int##bits x) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.impl[0] = v##sign##int##bits##x##halfsize##_splat(x); \
+		vec.impl[1] = v##sign##int##bits##x##halfsize##_splat(x); \
+		return vec.vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.impl[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \
+		vec.impl[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \
+		return vec.vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.impl[0] = v##sign##int##bits##x##halfsize##_load(in); \
+		vec.impl[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \
+		return vec.vec; \
+	} \
+	\
+	void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vecd = (union v##sign##int##bits##x##size##_impl_data *)&vec; \
+	\
+		v##sign##int##bits##x##halfsize##_store_aligned(vecd->impl[0], out); \
+		v##sign##int##bits##x##halfsize##_store_aligned(vecd->impl[1], out + halfsize); \
+	} \
+	\
+	void v##sign##int##bits##x##size##_generic_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vecd = (union v##sign##int##bits##x##size##_impl_data *)&vec; \
+	\
+		v##sign##int##bits##x##halfsize##_store(vecd->impl[0], out); \
+		v##sign##int##bits##x##halfsize##_store(vecd->impl[1], out + halfsize); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_add(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_add(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_sub(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_sub(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_mul(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_mul(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_div(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_div(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_avg(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_avg(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_and(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_and(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_or(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_or(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_xor(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_xor(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_not(v##sign##int##bits##x##size vec1) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_not(vec1d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_not(vec1d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union vuint##bits##x##size##_impl_data *vec2d = (union vuint##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_lshift(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_lshift(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union vuint##bits##x##size##_impl_data *vec2d = (union vuint##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_rshift(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_rshift(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union vuint##bits##x##size##_impl_data *vec2d = (union vuint##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_lrshift(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_lrshift(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmplt(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmplt(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmple(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmple(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmpeq(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmpeq(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmpge(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmpge(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmpgt(vec1d->impl[0], vec2d->impl[0]); \
+		vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmpgt(vec1d->impl[1], vec2d->impl[1]); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
+		v##sign##int##bits##x##size##_generic_splat, \
+		v##sign##int##bits##x##size##_generic_load_aligned, \
+		v##sign##int##bits##x##size##_generic_load, \
+		v##sign##int##bits##x##size##_generic_store_aligned, \
+		v##sign##int##bits##x##size##_generic_store, \
+		v##sign##int##bits##x##size##_generic_add, \
+		v##sign##int##bits##x##size##_generic_sub, \
+		v##sign##int##bits##x##size##_generic_mul, \
+		v##sign##int##bits##x##size##_generic_div, \
+		v##sign##int##bits##x##size##_generic_avg, \
+		v##sign##int##bits##x##size##_generic_and, \
+		v##sign##int##bits##x##size##_generic_or, \
+		v##sign##int##bits##x##size##_generic_xor, \
+		v##sign##int##bits##x##size##_generic_not, \
+		v##sign##int##bits##x##size##_generic_lshift, \
+		v##sign##int##bits##x##size##_generic_rshift, \
+		v##sign##int##bits##x##size##_generic_lrshift, \
+		v##sign##int##bits##x##size##_generic_cmplt, \
+		v##sign##int##bits##x##size##_generic_cmple, \
+		v##sign##int##bits##x##size##_generic_cmpeq, \
+		v##sign##int##bits##x##size##_generic_cmpge, \
+		v##sign##int##bits##x##size##_generic_cmpgt, \
+	};
+
+#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize) \
+	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size, halfsize)
+
+// 32-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 4, 2)
+
+// 64-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 8, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 4, 2)
+
+// 128-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 4, 2)
+
+// 256-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 32, 16)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 16, 8)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 8, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(64, 4, 2)
+
+// 512-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 64, 32)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 32, 16)
+VEC_GENERIC_DEFINE_OPERATIONS(32, 16, 8)
+VEC_GENERIC_DEFINE_OPERATIONS(64, 8, 4)
+
+#undef VEC_GENERIC_DEFINE_OPERATIONS
+#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/ppc/altivec.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,233 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#include "vec/impl/ppc/altivec.h"
+
+#include <altivec.h>
+
+/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */
+#ifdef vec_mul
+# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_mul(vec1.altivec, vec2.altivec); \
+		return vec; \
+	}
+# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \
+	v##sign##int##bits##x##size##_altivec_mul
+#else
+# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size)
+# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) NULL
+#endif
+
+#ifdef vec_splats
+# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(vec_##sign##int##bits x) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_splats(x); \
+		return vec; \
+	}
+# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \
+	v##sign##int##bits##x##size##_altivec_splat
+#else
+# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size)
+# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) NULL
+#endif
+
+#define VEC_ALTIVEC_uRSHIFT vec_sr
+#define VEC_ALTIVEC_RSHIFT vec_sra
+
+#define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_sr(vec1.altivec, vec2.altivec); \
+		return vec; \
+	}
+#define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \
+	v##sign##int##bits##x##size##_altivec_lrshift
+
+#define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size)
+#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) NULL
+
+#define VEC_ALTIVEC_CAST_BOOL_8 (vector signed char)
+#define VEC_ALTIVEC_CAST_BOOL_U8 (vector unsigned char)
+#define VEC_ALTIVEC_CAST_BOOL_16 (vector signed short)
+#define VEC_ALTIVEC_CAST_BOOL_U16 (vector unsigned short)
+#define VEC_ALTIVEC_CAST_BOOL_32 (vector signed int)
+#define VEC_ALTIVEC_CAST_BOOL_U32 (vector unsigned int)
+
+/* Since altivec conveniently made their API super user friendly, we can just use
+ * one giant macro to define literally everything */
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_ld(0, in); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_perm(vec_ld(0, in), vec_ld(15, in), vec_lvsl(0, in)); \
+		return vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		vec_st(vec.altivec, 0, out); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_add(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_sub(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_sl(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_avg(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_and(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_or(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_xor(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits  vec_cmplt(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmplt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpeq(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmpgt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpgt(vec1.altivec, vec2.altivec); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \
+		VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size), \
+		v##sign##int##bits##x##size##_altivec_load_aligned, \
+		v##sign##int##bits##x##size##_altivec_load, \
+		v##sign##int##bits##x##size##_altivec_store_aligned, \
+		/* .store = */ NULL, \
+		v##sign##int##bits##x##size##_altivec_add, \
+		v##sign##int##bits##x##size##_altivec_sub, \
+		VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size), \
+		/* .div = */ NULL, \
+		v##sign##int##bits##x##size##_altivec_avg, \
+		v##sign##int##bits##x##size##_altivec_and, \
+		v##sign##int##bits##x##size##_altivec_or, \
+		v##sign##int##bits##x##size##_altivec_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_altivec_lshift, \
+		v##sign##int##bits##x##size##_altivec_rshift, \
+		VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size), \
+		v##sign##int##bits##x##size##_altivec_cmplt, \
+		v##sign##int##bits##x##size##_altivec_cmple, \
+		v##sign##int##bits##x##size##_altivec_cmpeq, \
+		v##sign##int##bits##x##size##_altivec_cmpge, \
+		v##sign##int##bits##x##size##_altivec_cmpgt, \
+	};
+
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
+
+VEC_DEFINE_OPERATIONS(8, 16)
+VEC_DEFINE_OPERATIONS(16, 8)
+VEC_DEFINE_OPERATIONS(32, 4)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/x86/avx2.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,219 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#include "vec/impl/x86/avx2.h"
+#include "vec/impl/generic.h"
+
+#include <immintrin.h>
+
+// this does NOT handle sign bits properly, use with caution
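+// (each even element sees its odd neighbour as the high half of the widened
+// lane, so any op that depends on the upper bits or the sign will be wrong)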
+#define VEC_AVX2_OPERATION_8x32_16x16(op, sign) \
+	do { \
+		union v##sign##int8x32_impl_data *vec1d = (union v##sign##int8x32_impl_data *)&vec1; \
+		union v##sign##int8x32_impl_data *vec2d = (union v##sign##int8x32_impl_data *)&vec2; \
+	\
+		/* apply the 16-bit op to the even and odd bytes separately */ \
+		__m256i dst_even = _mm256_##op##_epi16(vec1d->avx2, vec2d->avx2); \
+		__m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1d->avx2, 8), _mm256_srli_epi16(vec2d->avx2, 8)); \
+	\
+		/* repack */ \
+		vec1d->avx2 = _mm256_or_si256( \
+			_mm256_slli_epi16(dst_odd, 8), \
+			_mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \
+		); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX2_OPERATION_16x16(op, sign) \
+	do { \
+		union v##sign##int16x16_impl_data *vec1d = (union v##sign##int16x16_impl_data *)&vec1; \
+		union v##sign##int16x16_impl_data *vec2d = (union v##sign##int16x16_impl_data *)&vec2; \
+	\
+		/* apply the 32-bit op to the even and odd 16-bit elements separately */ \
+		__m256i dst_even = _mm256_##op##_epi32(vec1d->avx2, vec2d->avx2); \
+		__m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1d->avx2, 16), _mm256_srli_epi32(vec2d->avx2, 16)); \
+	\
+		/* repack */ \
+		vec1d->avx2 = _mm256_or_si256( \
+			_mm256_slli_epi32(dst_odd, 16), \
+			_mm256_srli_epi32(_mm256_slli_epi32(dst_even, 16), 16) \
+		); \
+		return vec1d->vec; \
+	} while (0)
+
+// multiplication
+
+#define VEC_AVX2_MUL_8x32(sign) \
+	VEC_AVX2_OPERATION_8x32_16x16(mullo, sign)
+
+#define VEC_AVX2_MUL_16x16(sign) \
+	do { \
+		union v##sign##int16x16_impl_data *vec1d = (union v##sign##int16x16_impl_data *)&vec1; \
+		union v##sign##int16x16_impl_data *vec2d = (union v##sign##int16x16_impl_data *)&vec2; \
+	\
+		vec1d->avx2 = _mm256_mullo_epi16(vec1d->avx2, vec2d->avx2); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX2_MUL_32x8(sign) \
+	do { \
+		union v##sign##int32x8_impl_data *vec1d = (union v##sign##int32x8_impl_data *)&vec1; \
+		union v##sign##int32x8_impl_data *vec2d = (union v##sign##int32x8_impl_data *)&vec2; \
+	\
+		vec1d->avx2 = _mm256_mullo_epi32(vec1d->avx2, vec2d->avx2); \
+		return vec1d->vec; \
+	} while (0)
+
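+// 64-bit multiply built from 32-bit multiplies:
+// a*b (mod 2^64) = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)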
+#define VEC_AVX2_MUL_64x4(sign) \
+	do { \
+		union v##sign##int64x4_impl_data *vec1d = (union v##sign##int64x4_impl_data *)&vec1; \
+		union v##sign##int64x4_impl_data *vec2d = (union v##sign##int64x4_impl_data *)&vec2; \
+	\
+		__m256i ac = _mm256_mul_epu32(vec1d->avx2, vec2d->avx2); \
+		__m256i b  = _mm256_srli_epi64(vec1d->avx2, 32); \
+		__m256i bc = _mm256_mul_epu32(b, vec2d->avx2); \
+		__m256i d  = _mm256_srli_epi64(vec2d->avx2, 32); \
+		__m256i ad = _mm256_mul_epu32(vec1d->avx2, d); \
+		__m256i hi = _mm256_add_epi64(bc, ad); \
+		hi = _mm256_slli_epi64(hi, 32); \
+	\
+		vec1d->avx2 = _mm256_add_epi64(hi, ac); \
+		return vec1d->vec; \
+	} while (0)
+
+// operations
+
+#define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	union v##sign##int##bits##x##size##_impl_data { \
+		v##sign##int##bits##x##size vec; \
+		__m256i avx2; \
+	}; \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.avx2 = _mm256_load_si256((const __m256i *)in); \
+		return vec.vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.avx2 = _mm256_loadu_si256((const __m256i *)in); \
+		return vec.vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		_mm256_store_si256((__m256i *)out, ((union v##sign##int##bits##x##size##_impl_data*)&vec)->avx2); \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		_mm256_storeu_si256((__m256i *)out, ((union v##sign##int##bits##x##size##_impl_data*)&vec)->avx2); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx2 = _mm256_add_epi##bits(vec1d->avx2, vec2d->avx2); \
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx2 = _mm256_sub_epi##bits(vec1d->avx2, vec2d->avx2); \
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX2_MUL_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx2 = _mm256_and_si256(vec1d->avx2, vec2d->avx2); \
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx2 = _mm256_or_si256(vec1d->avx2, vec2d->avx2); \
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx2 = _mm256_xor_si256(vec1d->avx2, vec2d->avx2); \
+		return vec1d->vec; \
+	} \
+	\
+	const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \
+		v##sign##int##bits##x##size##_generic_splat, \
+		v##sign##int##bits##x##size##_avx2_load_aligned, \
+		v##sign##int##bits##x##size##_avx2_load, \
+		v##sign##int##bits##x##size##_avx2_store_aligned, \
+		v##sign##int##bits##x##size##_avx2_store, \
+		v##sign##int##bits##x##size##_avx2_add, \
+		v##sign##int##bits##x##size##_avx2_sub, \
+		v##sign##int##bits##x##size##_avx2_mul, \
+		v##sign##int##bits##x##size##_generic_div, \
+		v##sign##int##bits##x##size##_generic_avg, \
+		v##sign##int##bits##x##size##_avx2_and, \
+		v##sign##int##bits##x##size##_avx2_or, \
+		v##sign##int##bits##x##size##_avx2_xor, \
+		v##sign##int##bits##x##size##_generic_not, \
+		v##sign##int##bits##x##size##_generic_lshift, \
+		v##sign##int##bits##x##size##_generic_rshift, \
+		v##sign##int##bits##x##size##_generic_lrshift, \
+		v##sign##int##bits##x##size##_generic_cmplt, \
+		v##sign##int##bits##x##size##_generic_cmple, \
+		v##sign##int##bits##x##size##_generic_cmpeq, \
+		v##sign##int##bits##x##size##_generic_cmpge, \
+		v##sign##int##bits##x##size##_generic_cmpgt, \
+	};
+
+#define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \
+	VEC_AVX2_DEFINE_OPERATIONS_SIGN( , bits, size) \
+	VEC_AVX2_DEFINE_OPERATIONS_SIGN(u, bits, size)
+
+VEC_AVX2_DEFINE_OPERATIONS(8, 32)
+VEC_AVX2_DEFINE_OPERATIONS(16, 16)
+VEC_AVX2_DEFINE_OPERATIONS(32, 8)
+VEC_AVX2_DEFINE_OPERATIONS(64, 4)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/x86/avx512f.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,342 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#include "vec/impl/x86/avx512f.h"
+#include "vec/impl/generic.h"
+
+#include <immintrin.h>
+
+// this is a stupid amount of work just to do these operations; is it really worth it?
+// also, the same note in avx2.c applies here: these do not handle sign bits properly,
+// which isn't that big of a deal for regular arithmetic operations, but matters quite
+// a bit when doing things like arithmetic shifts.
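+// for example (hypothetical lane values): after the unpack below, an 8-bit lane
+// holding -1 sits in its 32-bit container as 0x000000FF, so a 32-bit arithmetic
+// shift right sees a positive value and shifts in zeros rather than replicating
+// the 8-bit sign bit; this is why arithmetic right shifts of the 8x64 and 16x32
+// types fall back to the generic implementation below.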
+#define VEC_AVX512F_OPERATION_8x64(op, sign) \
+	do { \
+		union v##sign##int8x64_impl_data *vec1d = (union v##sign##int8x64_impl_data *)&vec1; \
+		union v##sign##int8x64_impl_data *vec2d = (union v##sign##int8x64_impl_data *)&vec2; \
+	\
+		/* unpack and operate */ \
+		__m512i dst_1 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 24), 24), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 24), 24)); \
+		__m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 16), 24), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 16), 24)); \
+		__m512i dst_3 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 8), 24), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 8), 24)); \
+		__m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1d->avx512f, 24), _mm512_srli_epi32(vec2d->avx512f, 24)); \
+	\
+		/* repack */ \
+		vec1d->avx512f = _mm512_or_si512( \
+			_mm512_or_si512( \
+				_mm512_srli_epi32(_mm512_slli_epi32(dst_1, 24), 24), \
+				_mm512_srli_epi32(_mm512_slli_epi32(dst_2, 24), 16) \
+			), \
+			_mm512_or_si512( \
+				_mm512_srli_epi32(_mm512_slli_epi32(dst_3, 24), 8), \
+				_mm512_slli_epi32(dst_4, 24) \
+			) \
+		); \
+	\
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_OPERATION_16x32(op, sign) \
+	do { \
+		union v##sign##int16x32_impl_data *vec1d = (union v##sign##int16x32_impl_data *)&vec1; \
+		union v##sign##int16x32_impl_data *vec2d = (union v##sign##int16x32_impl_data *)&vec2; \
+	\
+		/* unpack and operate; it would be nice if we had an _mm512_andi_epi32... */ \
+		__m512i dst_1 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 16), 16), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 16), 16)); \
+		__m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1d->avx512f, 16), _mm512_srli_epi32(vec2d->avx512f, 16)); \
+	\
+		/* repack */ \
+		vec1d->avx512f = _mm512_or_si512( \
+			_mm512_srli_epi32(_mm512_slli_epi32(dst_1, 16), 16), \
+			_mm512_slli_epi32(dst_2, 16) \
+		); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_ADD_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(add, sign)
+
+#define VEC_AVX512F_ADD_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(add, sign)
+
+#define VEC_AVX512F_ADD_32x16(sign) \
+	do { \
+		union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \
+		union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_add_epi32(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_ADD_64x8(sign) \
+	do { \
+		union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \
+		union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_add_epi64(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_SUB_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(sub, sign)
+
+#define VEC_AVX512F_SUB_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(sub, sign)
+
+#define VEC_AVX512F_SUB_32x16(sign) \
+	do { \
+		union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \
+		union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_sub_epi32(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_SUB_64x8(sign) \
+	do { \
+		union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \
+		union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_sub_epi64(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_MUL_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(mullo, sign)
+
+#define VEC_AVX512F_MUL_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(mullo, sign)
+
+#define VEC_AVX512F_MUL_32x16(sign) \
+	do { \
+		union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \
+		union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_mullo_epi32(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_MUL_64x8(sign) \
+	do { \
+		union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \
+		union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \
+	\
+		__m512i ac = _mm512_mul_epu32(vec1d->avx512f, vec2d->avx512f); \
+		__m512i b  = _mm512_srli_epi64(vec1d->avx512f, 32); \
+		__m512i bc = _mm512_mul_epu32(b, vec2d->avx512f); \
+		__m512i d  = _mm512_srli_epi64(vec2d->avx512f, 32); \
+		__m512i ad = _mm512_mul_epu32(vec1d->avx512f, d); \
+		__m512i hi = _mm512_add_epi64(bc, ad); \
+		hi = _mm512_slli_epi64(hi, 32); \
+	\
+		vec1d->avx512f = _mm512_add_epi64(hi, ac); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_LSHIFT_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(sllv, sign)
+
+#define VEC_AVX512F_LSHIFT_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(sllv, sign)
+
+#define VEC_AVX512F_LSHIFT_32x16(sign) \
+	do { \
+		union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \
+		union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_sllv_epi32(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_LSHIFT_64x8(sign) \
+	do { \
+		union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \
+		union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_sllv_epi64(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_lRSHIFT_8x64(sign) \
+	VEC_AVX512F_OPERATION_8x64(srlv, sign)
+
+#define VEC_AVX512F_lRSHIFT_16x32(sign) \
+	VEC_AVX512F_OPERATION_16x32(srlv, sign)
+
+#define VEC_AVX512F_aRSHIFT_8x64(sign) \
+	do { \
+		return v##sign##int8x64_generic_rshift(vec1, vec2); \
+	} while (0)
+
+#define VEC_AVX512F_aRSHIFT_16x32(sign) \
+	do { \
+		return v##sign##int16x32_generic_rshift(vec1, vec2); \
+	} while (0)
+
+#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) VEC_AVX512F_##aORl##RSHIFT_8x64(sign)
+#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) VEC_AVX512F_##aORl##RSHIFT_16x32(sign)
+
+#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \
+	do { \
+		union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \
+		union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_sr##aORl##v_epi32(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \
+	do { \
+		union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \
+		union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_sr##aORl##v_epi64(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_AVX512F_uRSHIFT_8x64(sign, aORl) VEC_AVX512F_RSHIFT_8x64(sign, l)
+#define VEC_AVX512F_uRSHIFT_16x32(sign, aORl) VEC_AVX512F_RSHIFT_16x32(sign, l)
+#define VEC_AVX512F_uRSHIFT_32x16(sign, aORl) VEC_AVX512F_RSHIFT_32x16(sign, l)
+#define VEC_AVX512F_uRSHIFT_64x8(sign, aORl) VEC_AVX512F_RSHIFT_64x8(sign, l)
+
+#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	union v##sign##int##bits##x##size##_impl_data { \
+		v##sign##int##bits##x##size vec; \
+		__m512i avx512f; \
+	}; \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.avx512f = _mm512_load_si512((const __m512i *)in); \
+		return vec.vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.avx512f = _mm512_loadu_si512((const __m512i *)in); \
+		return vec.vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		_mm512_store_si512((__m512i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->avx512f); \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		_mm512_storeu_si512((__m512i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->avx512f); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_ADD_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_SUB_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_MUL_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_and_si512(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_or_si512(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->avx512f = _mm512_xor_si512(vec1d->avx512f, vec2d->avx512f); \
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_LSHIFT_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_##sign##RSHIFT_##bits##x##size(sign, a); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \
+	} \
+	\
+	const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \
+		v##sign##int##bits##x##size##_generic_splat, \
+		v##sign##int##bits##x##size##_avx512f_load_aligned, \
+		v##sign##int##bits##x##size##_avx512f_load, \
+		v##sign##int##bits##x##size##_avx512f_store_aligned, \
+		v##sign##int##bits##x##size##_avx512f_store, \
+		v##sign##int##bits##x##size##_avx512f_add, \
+		v##sign##int##bits##x##size##_avx512f_sub, \
+		v##sign##int##bits##x##size##_avx512f_mul, \
+		v##sign##int##bits##x##size##_generic_div, \
+		v##sign##int##bits##x##size##_generic_avg, \
+		v##sign##int##bits##x##size##_avx512f_and, \
+		v##sign##int##bits##x##size##_avx512f_or, \
+		v##sign##int##bits##x##size##_avx512f_xor, \
+		v##sign##int##bits##x##size##_generic_not, \
+		v##sign##int##bits##x##size##_avx512f_lshift, \
+		v##sign##int##bits##x##size##_avx512f_rshift, \
+		v##sign##int##bits##x##size##_avx512f_lrshift, \
+		v##sign##int##bits##x##size##_generic_cmplt, \
+		v##sign##int##bits##x##size##_generic_cmple, \
+		v##sign##int##bits##x##size##_generic_cmpeq, \
+		v##sign##int##bits##x##size##_generic_cmpge, \
+		v##sign##int##bits##x##size##_generic_cmpgt, \
+	};
+
+#define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \
+	VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size) \
+	VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size)
+
+VEC_AVX512F_DEFINE_OPERATIONS(8, 64)
+VEC_AVX512F_DEFINE_OPERATIONS(16, 32)
+VEC_AVX512F_DEFINE_OPERATIONS(32, 16)
+VEC_AVX512F_DEFINE_OPERATIONS(64, 8)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/x86/mmx.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,172 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#include "vec/vec.h"
+#include "vec/impl/x86/mmx.h"
+#include "vec/impl/generic.h"
+
+#include <mmintrin.h>
+#include <string.h>
+
+#define VEC_MMX_OPERATION_8x8(op, sign) \
+	do { \
+		/* unpack and multiply */ \
+		union v##sign##int8x8_impl_data *vec1d = (union v##sign##int8x8_impl_data *)&vec1; \
+		union v##sign##int8x8_impl_data *vec2d = (union v##sign##int8x8_impl_data *)&vec2; \
+	\
+		__m64 dst_even = _mm_##op##_pi16(vec1d->mmx, vec2d->mmx); \
+		__m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1d->mmx, 8), _mm_srli_pi16(vec2d->mmx, 8)); \
+	\
+		/* repack */ \
+		vec1d->mmx = _mm_or_si64( \
+			_mm_slli_pi16(dst_odd, 8), \
+			_mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \
+		); \
+		return vec1d->vec; \
+	} while (0)
+
+// shared between MMX variations
+#define VEC_MMX_MUL_8x8(sign) \
+	VEC_MMX_OPERATION_8x8(mullo, sign)
+
+#define VEC_MMX_MUL_16x4(sign) \
+	do { \
+		union v##sign##int16x4_impl_data *vec1d = (union v##sign##int16x4_impl_data *)&vec1; \
+		union v##sign##int16x4_impl_data *vec2d = (union v##sign##int16x4_impl_data *)&vec2; \
+	\
+		vec1d->mmx = _mm_mullo_pi16(vec1d->mmx, vec2d->mmx); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_MMX_MUL_32x2(sign) \
+	/* TODO implement this for real */ \
+	do { \
+		return v##sign##int32x2_generic_mul(vec1, vec2); \
+	} while (0)
+
+#define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	union v##sign##int##bits##x##size##_impl_data { \
+		v##sign##int##bits##x##size vec; \
+		__m64 mmx; \
+	}; \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		memcpy(&vec, in, sizeof(vec)); \
+		return vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		memcpy(out, &vec, sizeof(vec)); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->mmx = _mm_add_pi##bits(vec1d->mmx, vec2d->mmx); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->mmx = _mm_sub_pi##bits(vec1d->mmx, vec2d->mmx); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_MMX_MUL_##bits##x##size(sign); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->mmx = _mm_and_si64(vec1d->mmx, vec2d->mmx); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->mmx = _mm_or_si64(vec1d->mmx, vec2d->mmx); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->mmx = _mm_xor_si64(vec1d->mmx, vec2d->mmx); \
+	\
+		return vec1d->vec; \
+	} \
+	\
+	const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \
+		v##sign##int##bits##x##size##_generic_splat, \
+		v##sign##int##bits##x##size##_mmx_load_aligned, \
+		v##sign##int##bits##x##size##_mmx_load_aligned, \
+		v##sign##int##bits##x##size##_mmx_store_aligned, \
+		v##sign##int##bits##x##size##_mmx_store_aligned, \
+		v##sign##int##bits##x##size##_mmx_add, \
+		v##sign##int##bits##x##size##_mmx_sub, \
+		v##sign##int##bits##x##size##_mmx_mul, \
+		v##sign##int##bits##x##size##_generic_div, \
+		v##sign##int##bits##x##size##_generic_avg, \
+		v##sign##int##bits##x##size##_mmx_and, \
+		v##sign##int##bits##x##size##_mmx_or, \
+		v##sign##int##bits##x##size##_mmx_xor, \
+		v##sign##int##bits##x##size##_generic_not, \
+		v##sign##int##bits##x##size##_generic_lshift, \
+		v##sign##int##bits##x##size##_generic_rshift, \
+		v##sign##int##bits##x##size##_generic_lrshift, \
+		v##sign##int##bits##x##size##_generic_cmplt, \
+		v##sign##int##bits##x##size##_generic_cmple, \
+		v##sign##int##bits##x##size##_generic_cmpeq, \
+		v##sign##int##bits##x##size##_generic_cmpge, \
+		v##sign##int##bits##x##size##_generic_cmpgt, \
+	};
+
+#define VEC_MMX_DEFINE_OPERATIONS(bits, size) \
+	VEC_MMX_DEFINE_OPERATIONS_SIGN(u, bits, size) \
+	VEC_MMX_DEFINE_OPERATIONS_SIGN( , bits, size)
+
+VEC_MMX_DEFINE_OPERATIONS(8, 8)
+VEC_MMX_DEFINE_OPERATIONS(16, 4)
+VEC_MMX_DEFINE_OPERATIONS(32, 2)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/x86/sse2.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,263 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#include "vec/impl/x86/sse2.h"
+#include "vec/impl/generic.h"
+
+#include <emmintrin.h>
+
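+// SSE2 has no packed 8-bit multiply, so the 8x16 operation below is built out
+// of 16-bit ops: the even-indexed bytes come out correct in the low byte of
+// each 16-bit lane (the high byte is masked off afterwards), while the
+// odd-indexed bytes are shifted down by 8, operated on, and shifted back into
+// place before the two halves are OR'd together.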
+#define VEC_SSE2_OPERATION_8x16(op, sign) \
+	do { \
+		/* unpack and multiply */ \
+		union v##sign##int8x16_impl_data *vec1d = (union v##sign##int8x16_impl_data *)&vec1; \
+		union v##sign##int8x16_impl_data *vec2d = (union v##sign##int8x16_impl_data *)&vec2; \
+	\
+		__m128i dst_even = _mm_##op##_epi16(vec1d->sse, vec2d->sse); \
+		__m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1d->sse, 8), _mm_srli_epi16(vec2d->sse, 8)); \
+	\
+		/* repack */ \
+		vec1d->sse = _mm_or_si128( \
+			_mm_slli_epi16(dst_odd, 8), \
+			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
+		); \
+		return vec1d->vec; \
+	} while (0)
+
+// shared between SSE2 variations
+#define VEC_SSE2_MUL_8x16(sign) \
+	VEC_SSE2_OPERATION_8x16(mullo, sign)
+
+#define VEC_SSE2_MUL_16x8(sign) \
+	do { \
+		/* we have a real instruction for this */ \
+		union v##sign##int16x8_impl_data *vec1d = (union v##sign##int16x8_impl_data *)&vec1; \
+		union v##sign##int16x8_impl_data *vec2d = (union v##sign##int16x8_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_mullo_epi16(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_SSE2_MUL_32x4(sign) \
+	do { \
+		/* this was stolen from... somewhere :) */ \
+		union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \
+		union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \
+	\
+		__m128i a13    = _mm_shuffle_epi32(vec1d->sse, 0xF5); /* (-,a3,-,a1) */ \
+		__m128i b13    = _mm_shuffle_epi32(vec2d->sse, 0xF5); /* (-,b3,-,b1) */ \
+		__m128i prod02 = _mm_mul_epu32(vec1d->sse, vec2d->sse); /* (-,a2*b2,-,a0*b0) */ \
+		__m128i prod13 = _mm_mul_epu32(a13, b13);           /* (-,a3*b3,-,a1*b1) */ \
+		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
+		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
+	\
+		vec1d->sse = _mm_unpacklo_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_SSE2_MUL_64x2(sign) \
+	do { \
+		union v##sign##int64x2_impl_data *vec1d = (union v##sign##int64x2_impl_data *)&vec1; \
+		union v##sign##int64x2_impl_data *vec2d = (union v##sign##int64x2_impl_data *)&vec2; \
+	\
+		__m128i ac = _mm_mul_epu32(vec1d->sse, vec2d->sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
+		__m128i b  = _mm_srli_epi64(vec1d->sse, 32);      /* b = vec1 >> 32; */ \
+		__m128i bc = _mm_mul_epu32(b, vec2d->sse);        /* bc = b * (vec2 & UINT32_MAX); */ \
+		__m128i d  = _mm_srli_epi64(vec2d->sse, 32);      /* d = vec2 >> 32; */ \
+		__m128i ad = _mm_mul_epu32(vec1d->sse, d);        /* ad = (vec1 & UINT32_MAX) * d; */ \
+		__m128i hi = _mm_add_epi64(bc, ad);             /* hi = bc + ad; */ \
+		hi = _mm_slli_epi64(hi, 32);                    /* hi <<= 32; */ \
+	\
+		vec1d->sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_8x16(sign) \
+	do { \
+		union v##sign##int8x16_impl_data *vec1d = (union v##sign##int8x16_impl_data *)&vec1; \
+		union v##sign##int8x16_impl_data *vec2d = (union v##sign##int8x16_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_cmpeq_epi8(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_16x8(sign) \
+	do { \
+		union v##sign##int16x8_impl_data *vec1d = (union v##sign##int16x8_impl_data *)&vec1; \
+		union v##sign##int16x8_impl_data *vec2d = (union v##sign##int16x8_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_cmpeq_epi16(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_32x4(sign) \
+	do { \
+		union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \
+		union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_cmpeq_epi32(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} while (0)
+
+// SSE2 doesn't have an intrinsic for 64x2 equality comparison,
+// so how can we take a 32x4 comparison result and turn it into
+// a 64x2 comparison result?
+//
+// well, Intel conveniently provided an operation where we can
+// shuffle around 32-bit integers (_mm_shuffle_epi32).
+//
+// this means all we have to do is simply do the 32-bit operation,
+// shuffle the parts, and then return a bitwise AND of the result.
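+//
+// as a sketch of one 64-bit lane: after _mm_cmpeq_epi32 the lane holds
+// (hi_eq, lo_eq), each half being all-ones or all-zeros. the two shuffles
+// broadcast hi_eq and lo_eq across their own lane, so the final AND is
+// all-ones exactly when both 32-bit halves matched, i.e. when the full
+// 64-bit values were equal.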
+
+#define VEC_SSE2_CMPEQ_64x2(sign) \
+	do { \
+		union v##sign##int64x2_impl_data *vec1d = (union v##sign##int64x2_impl_data *)&vec1; \
+		union v##sign##int64x2_impl_data *vec2d = (union v##sign##int64x2_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_cmpeq_epi32(vec1d->sse, vec2d->sse); \
+		vec2d->sse = _mm_shuffle_epi32(vec1d->sse, _MM_SHUFFLE(3, 3, 1, 1)); \
+		vec1d->sse = _mm_shuffle_epi32(vec1d->sse, _MM_SHUFFLE(2, 2, 0, 0)); \
+		vec1d->sse = _mm_and_si128(vec1d->sse, vec2d->sse); \
+	\
+		return vec1d->vec; \
+	} while (0)
+
+#define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	union v##sign##int##bits##x##size##_impl_data { \
+		v##sign##int##bits##x##size vec; \
+		__m128i sse; \
+	}; \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.sse = _mm_load_si128((const __m128i *)in); \
+		return vec.vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data vec; \
+		vec.sse = _mm_loadu_si128((const __m128i *)in); \
+		return vec.vec; \
+	} \
+	\
+	void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		_mm_store_si128((__m128i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->sse); \
+	} \
+	\
+	void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		_mm_storeu_si128((__m128i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->sse); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_add_epi##bits(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_sub_epi##bits(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_MUL_##bits##x##size(sign); \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_and_si128(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_or_si128(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
+		union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_xor_si128(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} \
+	\
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_CMPEQ_##bits##x##size(sign); \
+	} \
+	\
+	const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \
+		v##sign##int##bits##x##size##_generic_splat, \
+		v##sign##int##bits##x##size##_sse2_load_aligned, \
+		v##sign##int##bits##x##size##_sse2_load, \
+		v##sign##int##bits##x##size##_sse2_store_aligned, \
+		v##sign##int##bits##x##size##_sse2_store, \
+		v##sign##int##bits##x##size##_sse2_add, \
+		v##sign##int##bits##x##size##_sse2_sub, \
+		v##sign##int##bits##x##size##_sse2_mul, \
+		v##sign##int##bits##x##size##_generic_div, \
+		v##sign##int##bits##x##size##_generic_avg, \
+		v##sign##int##bits##x##size##_sse2_and, \
+		v##sign##int##bits##x##size##_sse2_or, \
+		v##sign##int##bits##x##size##_sse2_xor, \
+		v##sign##int##bits##x##size##_generic_not, \
+		v##sign##int##bits##x##size##_generic_lshift, \
+		v##sign##int##bits##x##size##_generic_rshift, \
+		v##sign##int##bits##x##size##_generic_lrshift, \
+		v##sign##int##bits##x##size##_generic_cmplt, \
+		v##sign##int##bits##x##size##_generic_cmple, \
+		v##sign##int##bits##x##size##_sse2_cmpeq, \
+		v##sign##int##bits##x##size##_generic_cmpge, \
+		v##sign##int##bits##x##size##_generic_cmpgt, \
+	};
+
+#define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \
+	VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size) \
+	VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size)
+
+// SSE is *only* 128-bit
+VEC_SSE2_DEFINE_OPERATIONS(8, 16)
+VEC_SSE2_DEFINE_OPERATIONS(16, 8)
+VEC_SSE2_DEFINE_OPERATIONS(32, 4)
+VEC_SSE2_DEFINE_OPERATIONS(64, 2)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/impl/x86/sse41.c	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,73 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#include "vec/impl/x86/sse41.h"
+#include "vec/impl/x86/sse2.h"
+#include "vec/impl/generic.h"
+
+#include <immintrin.h>
+
+// SSE 4.1 provides a real _mm_mullo_epi32
+#define VEC_SSE41_DEFINE_OPERATIONS(sign) \
+	union v##sign##int32x4_impl_data { \
+		v##sign##int32x4 vec; \
+		__m128i sse; \
+	}; \
+	\
+	static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \
+	{ \
+		union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \
+		union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \
+	\
+		vec1d->sse = _mm_mullo_epi32(vec1d->sse, vec2d->sse); \
+		return vec1d->vec; \
+	} \
+	\
+	const v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \
+		v##sign##int32x4_generic_splat, \
+		v##sign##int32x4_sse2_load_aligned, \
+		v##sign##int32x4_sse2_load, \
+		v##sign##int32x4_sse2_store_aligned, \
+		v##sign##int32x4_sse2_store, \
+		v##sign##int32x4_sse2_add, \
+		v##sign##int32x4_sse2_sub, \
+		v##sign##int32x4_sse41_mul, \
+		v##sign##int32x4_generic_div, \
+		v##sign##int32x4_generic_avg, \
+		v##sign##int32x4_sse2_and, \
+		v##sign##int32x4_sse2_or, \
+		v##sign##int32x4_sse2_xor, \
+		v##sign##int32x4_generic_not, \
+		v##sign##int32x4_generic_lshift, \
+		v##sign##int32x4_generic_rshift, \
+		v##sign##int32x4_generic_lrshift, \
+		v##sign##int32x4_generic_cmplt, \
+		v##sign##int32x4_generic_cmple, \
+		v##sign##int32x4_sse2_cmpeq, \
+		v##sign##int32x4_generic_cmpge, \
+		v##sign##int32x4_generic_cmpgt, \
+	};
+
+VEC_SSE41_DEFINE_OPERATIONS()
+VEC_SSE41_DEFINE_OPERATIONS(u)
--- a/src/vec.c	Sat Nov 23 04:09:44 2024 +0000
+++ b/src/vec.c	Sun Nov 24 02:52:40 2024 -0500
@@ -1,2 +1,286 @@
-#define VEC_IMPLEMENTATION
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
 #include "vec/vec.h"
+#include "vec/cpu.h"
+#include "vec/impl/generic.h"
+#include "vec/impl/fallback.h"
+#ifdef VEC_COMPILER_HAS_MMX
+# include "vec/impl/x86/mmx.h"
+#endif
+#ifdef VEC_COMPILER_HAS_SSE2
+# include "vec/impl/x86/sse2.h"
+#endif
+#ifdef VEC_COMPILER_HAS_SSE41
+# include "vec/impl/x86/sse41.h"
+#endif
+#ifdef VEC_COMPILER_HAS_AVX2
+# include "vec/impl/x86/avx2.h"
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512F
+# include "vec/impl/x86/avx512f.h"
+#endif
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+# include "vec/impl/ppc/altivec.h"
+#endif
+#ifdef VEC_COMPILER_HAS_NEON
+# include "vec/impl/arm/neon.h"
+#endif
+
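+// these provide the single out-of-line definitions for the scalar shift
+// helpers that are defined inline in the headers (standard C99 extern
+// inline usage).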
+extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
+extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
+extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
+extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
+extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
+extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
+
+// 16-bit
+const vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
+const vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;
+
+// 32-bit
+const vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
+const vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
+const vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
+const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
+
+// 64-bit
+const vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
+const vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
+const vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
+const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
+const vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
+const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
+
+// 128-bit
+const vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
+const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
+const vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
+const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
+const vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
+const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
+const vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
+const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
+
+// 256-bit
+const vint8x32_impl   *vint8x32_impl_cpu   = &vint8x32_impl_generic;
+const vuint8x32_impl  *vuint8x32_impl_cpu  = &vuint8x32_impl_generic;
+const vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
+const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
+const vint32x8_impl   *vint32x8_impl_cpu   = &vint32x8_impl_generic;
+const vuint32x8_impl  *vuint32x8_impl_cpu  = &vuint32x8_impl_generic;
+const vint64x4_impl   *vint64x4_impl_cpu   = &vint64x4_impl_generic;
+const vuint64x4_impl  *vuint64x4_impl_cpu  = &vuint64x4_impl_generic;
+
+// 512-bit
+const vint8x64_impl   *vint8x64_impl_cpu   = &vint8x64_impl_generic;
+const vuint8x64_impl  *vuint8x64_impl_cpu  = &vuint8x64_impl_generic;
+const vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
+const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
+const vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
+const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
+const vint64x8_impl   *vint64x8_impl_cpu   = &vint64x8_impl_generic;
+const vuint64x8_impl  *vuint64x8_impl_cpu  = &vuint64x8_impl_generic;
+
+static int vec_init_spinner = 0;
+
+// returns 0 or a negative error code on failure
+int vec_init(void)
+{
+	// This function is NOT thread safe. However, once vec
+	// is initialized, all of the vector functions are thread-safe.
+	//
+	// In fact, it's possible to use vec without calling
+	// vec_init() at all, but it would be completely useless since
+	// it would just use a generic implementation without any
+	// vectorization whatsoever (unless maybe the compiler is
+	// smart enough to optimize it into vectors). A short usage
+	// sketch follows this function.
+
+	if (vec_init_spinner)
+		return 0; // already initialized, do nothing
+
+	vec_uint32 cpu = vec_get_CPU_features();
+
+#ifdef VEC_COMPILER_HAS_ALTIVEC
+	if (cpu & VEC_CPU_HAS_ALTIVEC) {
+		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
+		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
+		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
+		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
+		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
+		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
+#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
+		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
+			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
+			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
+		}
+#endif
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_AVX512F
+	if (cpu & VEC_CPU_HAS_AVX512F) {
+		vint8x64_impl_cpu  = &vint8x64_impl_avx512f;
+		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
+		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
+		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
+		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
+		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
+		vint64x8_impl_cpu  = &vint64x8_impl_avx512f;
+		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_AVX2
+	if (cpu & VEC_CPU_HAS_AVX2) {
+		vint8x32_impl_cpu  = &vint8x32_impl_avx2;
+		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
+		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
+		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
+		vint32x8_impl_cpu  = &vint32x8_impl_avx2;
+		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
+		vint64x4_impl_cpu  = &vint64x4_impl_avx2;
+		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_SSE2
+	if (cpu & VEC_CPU_HAS_SSE2) {
+		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
+		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
+		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
+		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
+# ifdef VEC_COMPILER_HAS_SSE41
+		if (cpu & VEC_CPU_HAS_SSE41) {
+			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
+		} else
+# endif
+		{
+			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
+			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
+		}
+		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
+		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_MMX
+	if (cpu & VEC_CPU_HAS_MMX) {
+		vint8x8_impl_cpu  = &vint8x8_impl_mmx;
+		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
+		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
+		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
+		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
+		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
+	}
+#endif
+#ifdef VEC_COMPILER_HAS_NEON
+	if (cpu & VEC_CPU_HAS_NEON) {
+		// 64-bit
+		vint8x8_impl_cpu  = &vint8x8_impl_neon;
+		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
+		vint16x4_impl_cpu  = &vint16x4_impl_neon;
+		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
+		vint32x2_impl_cpu  = &vint32x2_impl_neon;
+		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
+
+		// 128-bit
+		vint8x16_impl_cpu  = &vint8x16_impl_neon;
+		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
+		vint16x8_impl_cpu  = &vint16x8_impl_neon;
+		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
+		vint32x4_impl_cpu  = &vint32x4_impl_neon;
+		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
+		vint64x2_impl_cpu  = &vint64x2_impl_neon;
+		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
+	}
+#endif
+	{
+		// do nothing, they're already set to generics
+	}
+
+	vec_init_spinner++;
+
+	return 0;
+}
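+
+/* a minimal usage sketch (not part of the library; values chosen only for
+ * illustration):
+ *
+ *     vec_uint32 a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, out[4];
+ *
+ *     vec_init(); // pick the best implementations for this CPU
+ *
+ *     vuint32x4 v1 = vuint32x4_load(a);
+ *     vuint32x4 v2 = vuint32x4_load(b);
+ *     vuint32x4_store(vuint32x4_add(v1, v2), out); // out = {6, 8, 10, 12}
+ */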
+
+/* ---------------------------------------------------------------- */
+
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
+	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
+
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
+
+// 16-bit
+VEC_DEFINE_OPERATIONS(8, 2)
+
+// 32-bit
+VEC_DEFINE_OPERATIONS(8, 4)
+VEC_DEFINE_OPERATIONS(16, 2)
+
+// 64-bit
+VEC_DEFINE_OPERATIONS(8, 8)
+VEC_DEFINE_OPERATIONS(16, 4)
+VEC_DEFINE_OPERATIONS(32, 2)
+
+// 128-bit
+VEC_DEFINE_OPERATIONS(8, 16)
+VEC_DEFINE_OPERATIONS(16, 8)
+VEC_DEFINE_OPERATIONS(32, 4)
+VEC_DEFINE_OPERATIONS(64, 2)
+
+// 256-bit
+VEC_DEFINE_OPERATIONS(8, 32)
+VEC_DEFINE_OPERATIONS(16, 16)
+VEC_DEFINE_OPERATIONS(32, 8)
+VEC_DEFINE_OPERATIONS(64, 4)
+
+// 512-bit
+VEC_DEFINE_OPERATIONS(8, 64)
+VEC_DEFINE_OPERATIONS(16, 32)
+VEC_DEFINE_OPERATIONS(32, 16)
+VEC_DEFINE_OPERATIONS(64, 8)
+
+#undef VEC_DEFINE_OPERATIONS
+#undef VEC_DEFINE_OPERATIONS_SIGN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/CMakeLists.txt	Sun Nov 24 02:52:40 2024 -0500
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.23)
+
+project(vec-tests)
+
+# add main vec directory
+add_subdirectory(.. vec)
+
+add_executable(vec-tests test.c)
+
+target_link_libraries(vec-tests vec)
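+
+# for example, to configure, build, and run the tests out-of-tree
+# (the executable path may vary by generator):
+#   cmake -S . -B build
+#   cmake --build build
+#   ./build/vec-tests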
--- a/test/Makefile.ppc	Sat Nov 23 04:09:44 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-CPPFLAGS += -maltivec
-
-include Makefile.template
\ No newline at end of file
--- a/test/Makefile.template	Sat Nov 23 04:09:44 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,53 +0,0 @@
-CPPFLAGS += -g -O2 -I../include -Wall -Wpedantic -Werror=strict-aliasing
-CFLAGS += $(CPPFLAGS) -std=c99
-CXXFLAGS += $(CPPFLAGS) -std=c++11
-
-HEADERS = ../include/vec/vec.h \
-	../include/vec/impl/ppc/altivec.h \
-	../include/vec/impl/x86/avx2.h \
-	../include/vec/impl/x86/avx512f.h \
-	../include/vec/impl/x86/mmx.h \
-	../include/vec/impl/x86/sse2.h \
-	../include/vec/impl/x86/sse41.h \
-	../include/vec/impl/cpu.h \
-	../include/vec/impl/fallback.h \
-	../include/vec/impl/generic.h \
-	test_align.h \
-	test_arith.h \
-	test_compare.h \
-	test_shift.h
-BINS = test-generic test-host test-cxx
-OBJS = vec-generic.o vec-host.o test.o test-cxx.o
-
-.PHONY: all clean test
-
-all: $(BINS)
-
-vec-generic.o: ../src/vec.c $(HEADERS)
-	$(CC) $(CFLAGS) -DVEC_SUPPRESS_HW=1 -c -o $@ $<
-
-vec-host.o: ../src/vec.c $(HEADERS)
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-test.o: test.c
-	$(CC) $(CFLAGS) -c -o $@ $<
-
-test-cxx.o: test.cc
-	$(CXX) $(CXXFLAGS) -c -o $@ $<
-
-test-generic: vec-generic.o test.o
-	$(CC) $(LDFLAGS) -o $@ $^
-
-test-host: vec-host.o test.o
-	$(CC) $(LDFLAGS) -o $@ $^
-
-test-cxx: test-cxx.o $(HEADERS)
-	$(CXX) $(LDFLAGS) -o $@ $<
-
-clean:
-	$(RM) $(BINS) $(OBJS)
-
-test: clean $(BINS)
-	./test-generic
-	./test-host
-	./test-cxx
--- a/test/Makefile.x86	Sat Nov 23 04:09:44 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-CPPFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f
-
-include Makefile.template
\ No newline at end of file
--- a/test/test.cc	Sat Nov 23 04:09:44 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,27 +0,0 @@
-#define VEC_IMPLEMENTATION
-#include "vec/vec.h"
-
-#include <iostream>
-
-/* this test makes sure that vec can be included under C++ */
-int main(void)
-{
-	int ret = 0;
-
-	VUINT32x8_ALIGNED_ARRAY(varrin);
-	VUINT32x8_ALIGNED_ARRAY(varrout);
-
-	for (int i = 0; i < 8; i++)
-		varrin[i] = i;
-
-	vuint32x8 vec = vuint32x8_load_aligned(varrin);
-	vec = vuint32x8_add(vec, vec);
-
-	vuint32x8_store_aligned(vec, varrout);
-
-	for (int i = 0; i < 8; i++)
-		if (varrout[i] != (uint32_t)(varrin[i] + varrin[i]))
-			ret |= 1;
-
-	return ret;
-}
\ No newline at end of file
--- a/test/test_arith.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/test/test_arith.h	Sun Nov 24 02:52:40 2024 -0500
@@ -39,8 +39,8 @@
 		v##sign##int##bits##x##size##_store_aligned(c, orig_c); \
 	\
 		for (int i = 0; i < size; i++) { \
-			if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \
-				fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%" PRI ## psign ## bits "] does not equal result [%" PRI ## psign ## bits "]!\n", i, (sign##int##bits##_t)(equiv), orig_c[i]); \
+			if ((vec_##sign##int##bits)(equiv) != orig_c[i]) { \
+				fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%" PRI ## psign ## bits "] does not equal result [%" PRI ## psign ## bits "]!\n", i, (vec_##sign##int##bits)(equiv), orig_c[i]); \
 				print_v##sign##int##bits##x##size(stderr,a); \
 				print_vuint##bits##x##size(stderr,b); \
 				print_v##sign##int##bits##x##size(stderr,c); \
@@ -60,10 +60,10 @@
 	CREATE_TEST(sign, psign, csign, bits, size, and, orig_a[i] & orig_b[i]) \
 	CREATE_TEST(sign, psign, csign, bits, size, or,  orig_a[i] | orig_b[i]) \
 	CREATE_TEST(sign, psign, csign, bits, size, xor, orig_a[i] ^ orig_b[i]) \
-	CREATE_TEST(sign, psign, csign, bits, size, avg, (sign##int##bits##_t)(orig_a[i] + orig_b[i]) / 2) \
+	CREATE_TEST(sign, psign, csign, bits, size, avg, (orig_a[i] + orig_b[i] + 1) / 2) \
 	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \
 	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \
-	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i]))
+	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_lrshift((vec_uint##bits)orig_a[i], orig_b[i]))
 
 #define CREATE_TESTS(bits, size) \
 	CREATE_TESTS_SIGN(, d, , bits, size) \
--- a/test/test_shift.h	Sat Nov 23 04:09:44 2024 +0000
+++ b/test/test_shift.h	Sun Nov 24 02:52:40 2024 -0500
@@ -2,8 +2,6 @@
 {
 	int ret = 0;
 
-	ret |= (vec_ulrshift(0xFFFFFFFF, 16) != 0xFFFF);
-	ret |= (vec_ullshift(0xFFFF, 16) != 0xFFFF0000);
 	ret |= (vec_lrshift(0xFFFFFFFF, 16) != 0xFFFF);
 	ret |= (vec_llshift(0xFFFF, 16) != 0xFFFF0000);
 	ret |= (vec_urshift(0xFFFFFFFF, 16) != 0xFFFF);