changeset 17:41dd962abdd1

*: allow compiling vec in a C++ translation unit; this is stupid, but whatever
author Paper <paper@tflc.us>
date Wed, 20 Nov 2024 12:02:15 -0500
parents 9da2aba90c87
children cf04071d2148
files CMakeLists.txt include/vec/impl/align.h include/vec/impl/cpu.h include/vec/impl/fallback.h include/vec/impl/generic.h include/vec/impl/integer.h.in include/vec/impl/ppc/altivec.h include/vec/impl/x86/avx2.h include/vec/impl/x86/avx512f.h include/vec/impl/x86/mmx.h include/vec/impl/x86/sse2.h include/vec/impl/x86/sse41.h include/vec/vec.h test/Makefile test/Makefile.ppc test/Makefile.template test/Makefile.x86 test/test.c test/test.cc test/test_align.h test/test_arith.h test/test_compare.h test/vec.pc
diffstat 23 files changed, 1390 insertions(+), 501 deletions(-)
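Most of the churn below follows one pattern: C99 compound literals with designated initializers, such as "return (vint32x4) { .altivec = ... };", are not valid C++, so each implementation now declares a local, assigns the member, and returns it. A minimal sketch of the pattern, using a hypothetical vec-like union rather than the real types:

typedef union {
	int generic[4];      /* scalar fallback lane storage */
} vint32x4_example;      /* hypothetical stand-in for the real vec types */

/* C99-only form (rejected by C++ compilers):
 *     return (vint32x4_example) { .generic = { x, x, x, x } };
 * portable form used throughout this changeset: */
static vint32x4_example splat_example(int x)
{
	vint32x4_example vec;
	vec.generic[0] = vec.generic[1] = vec.generic[2] = vec.generic[3] = x;
	return vec;
}
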
--- a/CMakeLists.txt	Wed Nov 20 04:16:56 2024 -0500
+++ b/CMakeLists.txt	Wed Nov 20 12:02:15 2024 -0500
@@ -1,9 +1,16 @@
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.23)
 
 project(vec VERSION 2.0.0 DESCRIPTION "a tiny C99 SIMD vector library")
 
 add_library(vec SHARED src/vec.c)
 
+target_sources(vec PUBLIC
+	$<INSTALL_INTERFACE:vec/vec.h>
+	$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/vec.h>
+	$<INSTALL_INTERFACE:vec/impl/integer.h>
+	$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/vec/impl/integer.h>
+)
+
 include(CheckCCompilerFlag)
 
 if(MSVC)
@@ -35,18 +42,121 @@
 	endif()
 endif()
 
+#########################################################################
+# integer types
 
-set_target_properties(vec PROPERTIES PUBLIC_HEADER include/vec/vec.h C_STANDARD 99)
+include(CheckTypeSize)
+
+check_type_size("int16_t"   INT16_T_SIZE   LANGUAGE C)
+check_type_size("uint16_t"  UINT16_T_SIZE  LANGUAGE C)
+check_type_size("u_int16_t" U_INT16_T_SIZE LANGUAGE C)
+check_type_size("int32_t"   INT32_T_SIZE   LANGUAGE C)
+check_type_size("uint32_t"  UINT32_T_SIZE  LANGUAGE C)
+check_type_size("u_int32_t" U_INT32_T_SIZE LANGUAGE C)
+check_type_size("int64_t"   INT64_T_SIZE   LANGUAGE C)
+check_type_size("uint64_t"  UINT64_T_SIZE  LANGUAGE C)
+check_type_size("u_int64_t" U_INT64_T_SIZE LANGUAGE C)
+check_type_size("short"     SHORT_SIZE     LANGUAGE C)
+check_type_size("int"       INT_SIZE       LANGUAGE C)
+check_type_size("long"      LONG_SIZE      LANGUAGE C)
+check_type_size("long long" LONG_LONG_SIZE LANGUAGE C)
+check_type_size("uintptr_t" UINTPTR_T_SIZE LANGUAGE C)
+
+if(INT16_T_SIZE EQUAL 2)
+	set(SIZE16 "int16_t")
+elseif(SHORT_SIZE EQUAL 2)
+	set(SIZE16 "short")
+elseif(INT_SIZE EQUAL 2)
+	set(SIZE16 "int")
+endif()
+
+if(UINT16_T_SIZE EQUAL 2)
+	set(USIZE16 "uint16_t")
+elseif(U_INT16_T_SIZE EQUAL 2)
+	set(USIZE16 "u_int16_t")
+elseif(SHORT_SIZE EQUAL 2)
+	set(USIZE16 "unsigned short")
+elseif(INT_SIZE EQUAL 2)
+	set(USIZE16 "unsigned int")
+endif()
+
+if(INT32_T_SIZE EQUAL 4)
+	set(SIZE32 "int32_t")
+elseif(SHORT_SIZE EQUAL 4)
+	set(SIZE32 "short")
+elseif(INT_SIZE EQUAL 4)
+	set(SIZE32 "int")
+elseif(LONG_SIZE EQUAL 4)
+	set(SIZE32 "long")
+endif()
 
-target_include_directories(vec PRIVATE include)
+if(UINT32_T_SIZE EQUAL 4)
+	set(USIZE32 "uint32_t")
+elseif(U_INT32_T_SIZE EQUAL 4)
+	set(USIZE32 "u_int32_t")
+elseif(SHORT_SIZE EQUAL 4)
+	set(USIZE32 "unsigned short")
+elseif(INT_SIZE EQUAL 4)
+	set(USIZE32 "unsigned int")
+elseif(LONG_SIZE EQUAL 4)
+	set(USIZE32 "unsigned long")
+endif()
+
+if(INT64_T_SIZE EQUAL 8)
+	set(SIZE64 "int64_t")
+elseif(SHORT_SIZE EQUAL 8)
+	set(SIZE64 "short")
+elseif(INT_SIZE EQUAL 8)
+	set(SIZE64 "int")
+elseif(LONG_SIZE EQUAL 8)
+	set(SIZE64 "long")
+elseif(LONG_LONG_SIZE EQUAL 8)
+	set(SIZE64 "long long")
+endif()
+
+if(UINT64_T_SIZE EQUAL 8)
+	set(USIZE64 "uint64_t")
+elseif(U_INT64_T_SIZE EQUAL 8)
+	set(USIZE64 "u_int64_t")
+elseif(SHORT_SIZE EQUAL 8)
+	set(USIZE64 "unsigned short")
+elseif(INT_SIZE EQUAL 8)
+	set(USIZE64 "unsigned int")
+elseif(LONG_SIZE EQUAL 8)
+	set(USIZE64 "unsigned long")
+elseif(LONG_LONG_SIZE EQUAL 8)
+	set(USIZE64 "unsigned long long")
+endif()
+
+if(CMAKE_SIZEOF_VOID_P EQUAL UINTPTR_T_SIZE)
+	set(USIZEPTR "uintptr_t")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 1)
+	set(USIZEPTR "unsigned char")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 2)
+	set(USIZEPTR "${USIZE16}")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+	set(USIZEPTR "${USIZE32}")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
+	set(USIZEPTR "${USIZE64}")
+endif()
+
+configure_file(include/vec/impl/integer.h.in include/vec/impl/integer.h @ONLY)
+
+target_compile_definitions(vec PRIVATE "VEC_HAVE_IMPL_INTEGER_H")
+
+#########################################################################
+
+target_compile_features(vec PRIVATE $<IF:$<COMPILE_FEATURES:c_std_11>,c_std_11,c_std_99>)
+target_include_directories(vec PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_BINARY_DIR}/include/vec")
 
 # Installing
 
 include(GNUInstallDirs)
 
-install(TARGETS vec
-	LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-	PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(TARGETS vec LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/include/vec/vec.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/vec")
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/include/vec/impl/integer.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/vec/impl")
 
 # pkg-config
 configure_file(vec.pc.in vec.pc @ONLY)
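The block above probes the size of each candidate integer type and feeds the results through configure_file() to produce include/vec/impl/integer.h. As a rough sketch (an assumption, not the literal generated file), on a typical LP64 platform with <stdint.h> available the @SIZE..@ tokens would resolve to something like:

#include <stdint.h>

typedef signed char   vec_int8;
typedef int16_t       vec_int16;
typedef int32_t       vec_int32;
typedef int64_t       vec_int64;

typedef unsigned char vec_uint8;
typedef uint16_t      vec_uint16;
typedef uint32_t      vec_uint32;
typedef uint64_t      vec_uint64;

typedef uintptr_t     vec_uintptr;
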
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/align.h	Wed Nov 20 12:02:15 2024 -0500
@@ -0,0 +1,267 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_ALIGN_H_
+#define VEC_IMPL_ALIGN_H_
+
+// Array alignment macros
+
+#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
+# define VEC_ALIGNAS(x) alignas(x)
+#elif (__STDC_VERSION__ >= 201112L)
+# define VEC_ALIGNAS(x) _Alignas(x)
+#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
+# define VEC_ALIGNAS(x) __attribute__((__aligned__(x)))
+#endif
+
+/* the alignment must be specified in bytes and must be a multiple of the
+ * type size. it is always assumed that the type will be on a boundary of
+ * its size, which may or may not be true */
+#ifdef VEC_ALIGNAS
+# define VEC_ALIGNED_ARRAY(type, var, length, align) \
+	VEC_ALIGNAS(align) type var[length]
+# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
+	(sizeof(var))
+#else
+// use unions to get an aligned offset without triggering strict aliasing
+# define VEC_ALIGNED_ARRAY(type, var, length, align) \
+	VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \
+	union vec_aligned_union_##var##_ { \
+		type arr[length]; \
+		unsigned char bytes[sizeof(type) * length]; \
+	}; \
+	unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \
+	type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \
+	VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
+# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
+	(sizeof(vec_unaligned_##var##_) - (align - 1))
+#endif
+
+#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
+	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var))
+
+//////////////////////////////////////////////////////////////////////////////////////
+// predefined variants for each vector type
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 16-bit
+
+#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT)
+#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT)
+#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT)
+#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0)
+
+#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT)
+#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT)
+#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT)
+#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 32-bit
+
+#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT)
+#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT)
+#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT)
+#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0)
+
+#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT)
+#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT)
+#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT)
+#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0)
+
+#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT)
+#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT)
+#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT)
+#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0)
+
+#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT)
+#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT)
+#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT)
+#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 64-bit
+
+#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT)
+#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
+#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT)
+#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0)
+
+#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT)
+#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
+#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT)
+#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0)
+
+#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT)
+#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
+#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT)
+#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0)
+
+#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT)
+#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0)
+
+#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT)
+#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0)
+
+#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT)
+#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 128-bit
+
+#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
+
+#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0)
+
+#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
+
+#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
+
+#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
+
+#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
+
+#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
+
+#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 256-bit
+
+#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT)
+#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0)
+
+#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT)
+#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
+
+#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT)
+#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0)
+
+#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT)
+#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0)
+
+#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0)
+
+#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
+
+#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0)
+
+#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+// 512-bit
+
+#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT)
+#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0)
+
+#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT)
+#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x32_ALIGNMENT == 0)
+
+#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT)
+#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0)
+
+#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT)
+#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0)
+
+#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0)
+
+#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x32_ALIGNMENT == 0)
+
+#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0)
+
+#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0)
+
+//////////////////////////////////////////////////////////////////////////////////////
+
+#endif /* VEC_IMPL_ALIGN_H_ */
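A short usage sketch for the macros above (assuming the vint8x16 type and the vint8x16_load_aligned()/vint8x16_load() entry points declared in vec.h): the ALIGNED_ARRAY macro gives a stack buffer suitable for the aligned load, and PTR_ALIGNED is the check to make before taking the aligned path with an arbitrary pointer.

#include "vec/vec.h"

static vint8x16 load_ramp(void)
{
	VINT8x16_ALIGNED_ARRAY(buf);   /* vec_int8 buf[16], aligned for vint8x16 */

	for (int i = 0; i < 16; i++)
		buf[i] = (vec_int8)i;

	if (!VINT8x16_PTR_ALIGNED(buf)) {
		/* arbitrary pointers should take the unaligned path instead */
		return vint8x16_load(buf);
	}

	return vint8x16_load_aligned(buf);
}
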
--- a/include/vec/impl/cpu.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/cpu.h	Wed Nov 20 12:02:15 2024 -0500
@@ -280,7 +280,7 @@
 # endif
 	};
 	int hasVectorUnit = 0;
-	size_t length = sizeof(hasVectorUnit);
+	vec_uintsize length = sizeof(hasVectorUnit);
 	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
 	if (!error)
 		altivec = (hasVectorUnit != 0);
@@ -289,14 +289,14 @@
 	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
 	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
 #elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
-    void (*handler)(int sig);
-    handler = signal(SIGILL, vec_CPU_illegal_instruction);
-    if (!setjmp(vec_jmpbuf)) {
-        asm volatile("mtspr 256, %0\n\t"
-                     "vand %%v0, %%v0, %%v0" ::"r"(-1));
-        altivec = 1;
-    }
-    signal(SIGILL, handler);
+	void (*handler)(int sig);
+	handler = signal(SIGILL, vec_CPU_illegal_instruction);
+	if (!setjmp(vec_jmpbuf)) {
+		asm volatile("mtspr 256, %0\n\t"
+					 "vand %%v0, %%v0, %%v0" ::"r"(-1));
+		altivec = 1;
+	}
+	signal(SIGILL, handler);
 #endif
 	return altivec;
 }
@@ -364,7 +364,7 @@
 
 #define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF)
 
-static uint32_t vec_CPU_features = VEC_CPU_FEATURES_RESET;
+static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;
 
 static void vec_get_CPU_features(void)
 {
@@ -374,8 +374,8 @@
 		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
 	if (vec_CPU_have_ALTIVEC_VSX())
 		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
-    if (vec_CPU_have_MMX())
-        vec_CPU_features |= VEC_CPU_HAS_MMX;
+	if (vec_CPU_have_MMX())
+		vec_CPU_features |= VEC_CPU_HAS_MMX;
 	if (vec_CPU_have_SSE())
 		vec_CPU_features |= VEC_CPU_HAS_SSE;
 	if (vec_CPU_have_SSE2())
--- a/include/vec/impl/fallback.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/fallback.h	Wed Nov 20 12:02:15 2024 -0500
@@ -25,6 +25,8 @@
 #ifndef VEC_IMPL_FALLBACK_H_
 #define VEC_IMPL_FALLBACK_H_
 
+#include <string.h>
+
 // Fallback implementations - this is what an implementation should use if it
 // doesn't support a specific function. Note that the load_aligned and
 // store_aligned functions are not implemented here - this is on purpose;
@@ -60,25 +62,25 @@
 	} while (0)
 
 #define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(sign##int##bits##_t x) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x) \
 	{ \
 		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
 		for (int i = 0; i < size; i++) arr[i] = x; \
 		return v##sign##int##bits##x##size##_load_aligned(arr); \
 	} \
 	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]) \
 	{ \
 		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
-		memcpy(arr, in, sizeof(sign##int##bits##_t) * size); \
+		memcpy(arr, in, sizeof(vec_##sign##int##bits) * size); \
 		return v##sign##int##bits##x##size##_load_aligned(arr); \
 	} \
 	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \
 		v##sign##int##bits##x##size##_store_aligned(vec, arr); \
-		memcpy(out, arr, sizeof(sign##int##bits##_t) * size); \
+		memcpy(out, arr, sizeof(vec_##sign##int##bits) * size); \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
@@ -123,7 +125,7 @@
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \
 	{ \
-		return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((sign##int##bits##_t)UINT##bits##_MAX)); \
+		return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)UINT##bits##_MAX)); \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
@@ -170,6 +172,13 @@
 	VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( ,  , bits, size) \
 	VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(u, U, bits, size)
 
+// 16-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 2)
+
+// 32-bit
+VEC_DEFINE_FALLBACK_OPERATIONS(8, 4)
+VEC_DEFINE_FALLBACK_OPERATIONS(16, 2)
+
 // 64-bit
 VEC_DEFINE_FALLBACK_OPERATIONS(8, 8)
 VEC_DEFINE_FALLBACK_OPERATIONS(16, 4)
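For the newly covered small widths, the macro above expands to scalar loops over an aligned scratch array plus the type's own aligned load/store. The 8x2 splat, for example, comes out roughly as:

static vint8x2 vint8x2_fallback_splat(vec_int8 x)
{
	VINT8x2_ALIGNED_ARRAY(arr);         /* aligned vec_int8 arr[2] */
	for (int i = 0; i < 2; i++)
		arr[i] = x;
	return vint8x2_load_aligned(arr);   /* hand the filled array to the real load */
}
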
--- a/include/vec/impl/generic.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/generic.h	Wed Nov 20 12:02:15 2024 -0500
@@ -27,7 +27,6 @@
 #ifndef VEC_IMPL_GENERIC_H_
 #define VEC_IMPL_GENERIC_H_
 
-#include <stdint.h>
 #include <string.h>
 
 // -----------------------------------------------------------------
@@ -35,29 +34,32 @@
 // TODO implement these so we don't waste stack space by doing the
 // fallbacks
 #define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
 		v##sign##int##bits##x##size vec; \
-		memcpy(vec.generic, in, sizeof(sign##int##bits##_t) * size); \
+		memcpy(vec.generic, in, sizeof(vec_##sign##int##bits) * size); \
 		return vec; \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
-		memcpy(out, vec.generic, sizeof(sign##int##bits##_t) * size); \
+		memcpy(out, vec.generic, sizeof(vec_##sign##int##bits) * size); \
 	} \
 	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
-		.load_aligned  = v##sign##int##bits##x##size##_generic_load_aligned, \
-		.store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_generic_load_aligned, \
+		v##sign##int##bits##x##size##_generic_load_aligned, \
+		v##sign##int##bits##x##size##_generic_store_aligned, \
+		v##sign##int##bits##x##size##_generic_store_aligned, \
 	};
 
 #define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \
 	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
 	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
 
-VEC_GENERIC_DEFINE_OPERATIONS(8, 8)
-VEC_GENERIC_DEFINE_OPERATIONS(16, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(8, 2)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 2)
 VEC_GENERIC_DEFINE_OPERATIONS(32, 2)
 VEC_GENERIC_DEFINE_OPERATIONS(64, 2)
 
@@ -68,7 +70,7 @@
 // now we can just keep doubling the same implementation
 
 #define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
 		v##sign##int##bits##x##size vec; \
 		vec.generic[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \
@@ -76,21 +78,31 @@
 		return vec; \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[0], out); \
 		v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[1], out + halfsize); \
 	} \
 	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \
-		.load_aligned  = v##sign##int##bits##x##size##_generic_load_aligned, \
-		.store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_generic_load_aligned, \
+		v##sign##int##bits##x##size##_generic_load_aligned, \
+		v##sign##int##bits##x##size##_generic_store_aligned, \
+		v##sign##int##bits##x##size##_generic_store_aligned, \
 	};
 
 #define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \
 	VEC_GENERIC_DEFINE_OPERATIONS_SIGN( ,  , bits, size, halfsize) \
 	VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize)
 
+// 32-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 4, 2)
+
+// 64-bit
+VEC_GENERIC_DEFINE_OPERATIONS(8, 8, 4)
+VEC_GENERIC_DEFINE_OPERATIONS(16, 4, 2)
+
 // 128-bit
 VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8)
 VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4)
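The doubling macro above composes each wider generic vector out of two half-width vectors held in generic[0] and generic[1]; the 8x4 aligned load, for instance, expands roughly to:

static vint8x4 vint8x4_generic_load_aligned(const vec_int8 in[4])
{
	vint8x4 vec;
	vec.generic[0] = vint8x2_load_aligned(in);      /* low half  */
	vec.generic[1] = vint8x2_load_aligned(in + 2);  /* high half */
	return vec;
}
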
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/integer.h.in	Wed Nov 20 12:02:15 2024 -0500
@@ -0,0 +1,58 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_INTEGER_H_
+#define VEC_IMPL_INTEGER_H_
+
+#cmakedefine HAVE_SYS_TYPES_H
+#cmakedefine HAVE_STDDEF_H
+#cmakedefine HAVE_STDINT_H
+
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif
+#ifdef HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
+
+typedef signed char   vec_int8;
+typedef @SIZE16@      vec_int16;
+typedef @SIZE32@      vec_int32;
+typedef @SIZE64@      vec_int64;
+
+typedef unsigned char vec_uint8;
+typedef @USIZE16@     vec_uint16;
+typedef @USIZE32@     vec_uint32;
+typedef @USIZE64@     vec_uint64;
+
+/* this is only used for bitshifting right now */
+typedef vec_int64     vec_intmax;
+typedef vec_uint64    vec_uintmax;
+
+typedef @USIZEPTR@    vec_uintptr;
+
+#endif /* VEC_IMPL_INTEGER_H_ */
\ No newline at end of file
--- a/include/vec/impl/ppc/altivec.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/ppc/altivec.h	Wed Nov 20 12:02:15 2024 -0500
@@ -27,7 +27,6 @@
 #ifndef VEC_IMPL_PPC_ALTIVEC_H_
 #define VEC_IMPL_PPC_ALTIVEC_H_
 
-#include <stdint.h>
 #include <string.h>
 
 #include <altivec.h>
@@ -39,26 +38,30 @@
 # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_mul(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_mul(vec1.altivec, vec2.altivec); \
+		return vec; \
 	}
 # define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \
-	.mul = v##sign##int##bits##x##size##_altivec_mul,
+	v##sign##int##bits##x##size##_altivec_mul
 #else
 # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size)
-# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size)
+# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) NULL
 #endif
 
 #ifdef vec_splats
 # define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(sign##int##bits##_t x) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(vec_##sign##int##bits x) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_splats(x) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_splats(x); \
+		return vec; \
 	}
 # define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \
-	.splat = v##sign##int##bits##x##size##_altivec_splat,
+	v##sign##int##bits##x##size##_altivec_splat
 #else
 # define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size)
-# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size)
+# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) NULL
 #endif
 
 #define VEC_ALTIVEC_uRSHIFT vec_sr
@@ -67,93 +70,118 @@
 #define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_sr(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_sr(vec1.altivec, vec2.altivec); \
+		return vec; \
 	}
 #define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \
-	.lrshift = v##sign##int##bits##x##size##_altivec_lrshift,
+	v##sign##int##bits##x##size##_altivec_lrshift
 
 #define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size)
-#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size)
+#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) NULL
 
 /* Since altivec conveniently made their API super user friendly, we can just use
  * one giant macro to define literally everything */
 #define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_ld(0, in) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_ld(0, in); \
+		return vec; \
 	} \
 	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)); \
+		return vec; \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		vec_st(vec.altivec, 0, out); \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_add(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_add(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_sub(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_sub(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_sl(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_sl(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_avg(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_avg(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_and(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_and(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_or(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_or(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .altivec = vec_xor(vec1.altivec, vec2.altivec) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.altivec = vec_xor(vec1.altivec, vec2.altivec); \
+		return vec; \
 	} \
 	\
 	VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \
 	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \
-		.load_aligned  = v##sign##int##bits##x##size##_altivec_load_aligned, \
-		.load          = v##sign##int##bits##x##size##_altivec_load, \
-		.store_aligned = v##sign##int##bits##x##size##_altivec_store_aligned, \
-		.add           = v##sign##int##bits##x##size##_altivec_add, \
-		.sub           = v##sign##int##bits##x##size##_altivec_sub, \
-		VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \
-		.lshift        = v##sign##int##bits##x##size##_altivec_lshift, \
-		.rshift        = v##sign##int##bits##x##size##_altivec_rshift, \
-		VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size) \
-		.avg           = v##sign##int##bits##x##size##_altivec_avg, \
-		.and           = v##sign##int##bits##x##size##_altivec_and, \
-		.or            = v##sign##int##bits##x##size##_altivec_or, \
-		.xor           = v##sign##int##bits##x##size##_altivec_xor, \
-		VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \
+		VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size), \
+		v##sign##int##bits##x##size##_altivec_load_aligned, \
+		v##sign##int##bits##x##size##_altivec_load, \
+		v##sign##int##bits##x##size##_altivec_store_aligned, \
+		/* .store = */ NULL, \
+		v##sign##int##bits##x##size##_altivec_add, \
+		v##sign##int##bits##x##size##_altivec_sub, \
+		VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size), \
+		/* .div = */ NULL, \
+		v##sign##int##bits##x##size##_altivec_avg, \
+		v##sign##int##bits##x##size##_altivec_and, \
+		v##sign##int##bits##x##size##_altivec_or, \
+		v##sign##int##bits##x##size##_altivec_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_altivec_lshift, \
+		v##sign##int##bits##x##size##_altivec_rshift, \
+		VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size), \
 	};
 
 #define VEC_DEFINE_OPERATIONS(bits, size) \
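The _impl tables are now filled positionally because designated initializers in aggregate initialization are a C99 feature that C++ before C++20 rejects; the trade-off is that every slot up to the last populated one must appear in declaration order, with NULL standing in for unimplemented operations (the ".name =" comments keep the positions readable). A small self-contained illustration of the idiom, not the actual vec dispatch struct:

typedef struct {
	int (*splat)(int x);
	int (*add)(int a, int b);
	int (*div)(int a, int b);
} example_impl;

static int example_add(int a, int b) { return a + b; }

/* identical in C99 and C++: positional, in declaration order,
 * NULL for the operations this backend does not provide */
static example_impl example = {
	/* .splat = */ NULL,
	/* .add   = */ example_add,
	/* .div   = */ NULL,
};
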
--- a/include/vec/impl/x86/avx2.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/x86/avx2.h	Wed Nov 20 12:02:15 2024 -0500
@@ -32,10 +32,12 @@
 		__m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1.avx2, 8), _mm256_srli_epi16(vec2.avx2, 8)); \
 	\
 		/* repack */ \
-		return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \
+		v##sign##int8x32 vec; \
+		vec.avx2 = _mm256_or_si256( \
 			_mm256_slli_epi16(dst_odd, 8), \
 			_mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \
-		)}; \
+		); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_OPERATION_8x32_32x8(op, sign) \
@@ -47,7 +49,8 @@
 		__m256i dst_4 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 24), _mm256_srli_epi32(vec2.avx2, 24)); \
 	\
 		/* repack */ \
-		return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \
+		v##sign##int8x32 vec; \
+		vec.avx2 = _mm256_or_si256( \
 			_mm256_or_si256( \
 				_mm256_slli_epi32(dst_4, 8), \
 				_mm256_srli_epi32(_mm256_slli_epi32(dst_3, 8), 8) \
@@ -56,7 +59,8 @@
 				_mm256_slli_epi32(_mm256_slli_epi32(dst_2, 8), 16), \
 				_mm256_srli_epi32(_mm256_slli_epi32(dst_1, 8), 24) \
 			) \
-		)}; \
+		); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_OPERATION_16x16(op, sign) \
@@ -66,10 +70,12 @@
 		__m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \
 	\
 		/* repack */ \
-		return (v##sign##int16x16){ .avx2 = _mm256_or_si256( \
+		v##sign##int16x16 vec; \
+		vec.avx2 = _mm256_or_si256( \
 			_mm256_slli_epi32(dst_odd, 16), \
 			_mm256_srli_epi32(_mm256_slli_epi16(dst_even, 16), 16) \
-		)}; \
+		); \
+		return vec; \
 	} while (0)
 
 // shifting
@@ -82,12 +88,16 @@
 
 #define VEC_AVX2_LSHIFT_32x8(sign) \
 	do { \
-		return (v##sign##int32x8){ .avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2) }; \
+		v##sign##int32x8 vec; \
+		vec.avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_LSHIFT_64x4(sign) \
 	do { \
-		return (v##sign##int64x4){ .avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2) }; \
+		v##sign##int64x4 vec; \
+		vec.avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_RSHIFT_8x32(sign, aORl) \
@@ -98,7 +108,9 @@
 
 #define VEC_AVX2_RSHIFT_32x8(sign, aORl) \
 	do { \
-		return (v##sign##int32x8){ .avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2) }; \
+		v##sign##int32x8 vec; \
+		vec.avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_aRSHIFT_64x4(sign) \
@@ -108,7 +120,9 @@
 
 #define VEC_AVX2_lRSHIFT_64x4(sign) \
 	do { \
-		return (v##sign##int64x4){ .avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2) }; \
+		v##sign##int64x4 vec; \
+		vec.avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_RSHIFT_64x4(sign, aORl) \
@@ -121,12 +135,16 @@
 
 #define VEC_AVX2_MUL_16x16(sign) \
 	do { \
-		return (v##sign##int16x16){ .avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2) }; \
+		v##sign##int16x16 vec; \
+		vec.avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_MUL_32x8(sign) \
 	do { \
-		return (v##sign##int32x8) { .avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2) }; \
+		v##sign##int32x8 vec; \
+		vec.avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX2_MUL_64x4(sign) \
@@ -138,40 +156,51 @@
 		__m256i ad = _mm256_mul_epu32(vec1.avx2, d); \
 		__m256i hi = _mm256_add_epi64(bc, ad); \
 		hi = _mm256_slli_epi64(hi, 32); \
-		return (v##sign##int64x4) { .avx2 = _mm256_add_epi64(hi, ac) }; \
+	\
+		v##sign##int64x4 vec; \
+		vec.avx2 = _mm256_add_epi64(hi, ac); \
+		return vec; \
 	} while (0)
 
 // operations
 
 #define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx2 = _mm256_load_si256((const __m256i *)in) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_load_si256((const __m256i *)in); \
+		return vec; \
 	} \
 	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx2 = _mm256_loadu_si256((const __m256i *)in) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_loadu_si256((const __m256i *)in); \
+		return vec; \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		_mm256_store_si256((__m256i *)out, vec.avx2); \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		_mm256_storeu_si256((__m256i *)out, vec.avx2); \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
@@ -181,17 +210,23 @@
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
@@ -210,19 +245,23 @@
 	} \
 	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \
-		.load_aligned  = v##sign##int##bits##x##size##_avx2_load_aligned, \
-		.load          = v##sign##int##bits##x##size##_avx2_load, \
-		.store_aligned = v##sign##int##bits##x##size##_avx2_store_aligned, \
-		.store         = v##sign##int##bits##x##size##_avx2_store, \
-		.add           = v##sign##int##bits##x##size##_avx2_add, \
-		.sub           = v##sign##int##bits##x##size##_avx2_sub, \
-		.mul           = v##sign##int##bits##x##size##_avx2_mul, \
-		.and           = v##sign##int##bits##x##size##_avx2_and, \
-		.or            = v##sign##int##bits##x##size##_avx2_or, \
-		.xor           = v##sign##int##bits##x##size##_avx2_xor, \
-		.lshift        = v##sign##int##bits##x##size##_avx2_lshift, \
-		.rshift        = v##sign##int##bits##x##size##_avx2_rshift, \
-		.lrshift       = v##sign##int##bits##x##size##_avx2_lrshift, \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_avx2_load_aligned, \
+		v##sign##int##bits##x##size##_avx2_load, \
+		v##sign##int##bits##x##size##_avx2_store_aligned, \
+		v##sign##int##bits##x##size##_avx2_store, \
+		v##sign##int##bits##x##size##_avx2_add, \
+		v##sign##int##bits##x##size##_avx2_sub, \
+		v##sign##int##bits##x##size##_avx2_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_avx2_and, \
+		v##sign##int##bits##x##size##_avx2_or, \
+		v##sign##int##bits##x##size##_avx2_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_avx2_lshift, \
+		v##sign##int##bits##x##size##_avx2_rshift, \
+		v##sign##int##bits##x##size##_avx2_lrshift, \
 	};
 
 #define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \
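VEC_AVX2_MUL_64x4 above synthesizes a 64-bit multiply from _mm256_mul_epu32, which only multiplies the low 32-bit halves of each 64-bit lane: ac is the low product, (ad + bc) << 32 supplies the cross terms, and b*d is dropped because it only affects bits at or above 2^64. The same identity in scalar form:

#include <stdint.h>

/* x = b*2^32 + a, y = d*2^32 + c; the low 64 bits of x*y are
 * a*c + ((a*d + b*c) << 32), which is what the AVX2 macro computes per lane */
static uint64_t mul64_from_32bit_pieces(uint64_t x, uint64_t y)
{
	uint64_t a = x & 0xFFFFFFFFu, b = x >> 32;
	uint64_t c = y & 0xFFFFFFFFu, d = y >> 32;
	return a * c + ((a * d + b * c) << 32);
}
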
--- a/include/vec/impl/x86/avx512f.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/x86/avx512f.h	Wed Nov 20 12:02:15 2024 -0500
@@ -34,7 +34,8 @@
 		__m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 24), _mm512_srli_epi32(vec2.avx512f, 24)); \
 	\
 		/* repack */ \
-		return (v##sign##int8x64){ .avx512f = _mm512_or_si512( \
+		v##sign##int8x64 vec; \
+		vec.avx512f = _mm512_or_si512( \
 			_mm512_or_si512( \
 				_mm512_slli_epi32(dst_4, 8), \
 				_mm512_srli_epi32(_mm512_slli_epi32(dst_3, 8), 8) \
@@ -43,7 +44,8 @@
 				_mm512_slli_epi32(_mm512_slli_epi32(dst_2, 8), 16), \
 				_mm512_srli_epi32(_mm512_slli_epi32(dst_1, 8), 24) \
 			) \
-		)}; \
+		); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_OPERATION_16x32(op, sign) \
@@ -53,10 +55,12 @@
 		__m512i dst_odd = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \
 	\
 		/* repack */ \
-		return (v##sign##int16x32){ .avx512f = _mm512_or_si512( \
+		v##sign##int16x32 vec; \
+		vec.avx512f = _mm512_or_si512( \
 			_mm512_slli_epi32(dst_odd, 16), \
 			_mm512_srli_epi32(_mm512_slli_epi32(dst_even, 16), 16) \
-		)}; \
+		); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_ADD_8x64(sign) \
@@ -67,12 +71,16 @@
 
 #define VEC_AVX512F_ADD_32x16(sign) \
 	do { \
-		return (v##sign##int32x16) { .avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int32x16 vec; \
+		vec.avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_ADD_64x8(sign) \
 	do { \
-		return (v##sign##int64x8) { .avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int64x8 vec; \
+		vec.avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_SUB_8x64(sign) \
@@ -83,12 +91,16 @@
 
 #define VEC_AVX512F_SUB_32x16(sign) \
 	do { \
-		return (v##sign##int32x16) { .avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int32x16 vec; \
+		vec.avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_SUB_64x8(sign) \
 	do { \
-		return (v##sign##int64x8) { .avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int64x8 vec; \
+		vec.avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_MUL_8x64(sign) \
@@ -99,7 +111,9 @@
 
 #define VEC_AVX512F_MUL_32x16(sign) \
 	do { \
-		return (v##sign##int32x16) { .avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int32x16 vec; \
+		vec.avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_MUL_64x8(sign) \
@@ -111,7 +125,10 @@
 		__m512i ad = _mm512_mul_epu32(vec1.avx512f, d); \
 		__m512i hi = _mm512_add_epi64(bc, ad); \
 		hi = _mm512_slli_epi64(hi, 32); \
-		return (v##sign##int64x8) { .avx512f = _mm512_add_epi64(hi, ac) }; \
+	\
+		v##sign##int64x8 vec; \
+		vec.avx512f = _mm512_add_epi64(hi, ac); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_LSHIFT_8x64(sign) \
@@ -122,12 +139,16 @@
 
 #define VEC_AVX512F_LSHIFT_32x16(sign) \
 	do { \
-		return (v##sign##int32x16){ .avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int32x16 vec; \
+		vec.avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_LSHIFT_64x8(sign) \
 	do { \
-		return (v##sign##int64x8){ .avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int64x8 vec; \
+		vec.avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_RSHIFT_8x64(sign, aORl) \
@@ -138,31 +159,39 @@
 
 #define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \
 	do { \
-		return (v##sign##int32x16){ .avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int32x16 vec; \
+		vec.avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \
 	do { \
-		return (v##sign##int64x8){ .avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int64x8 vec; \
+		vec.avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} while (0)
 
 #define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx512f = _mm512_load_si512((const __m512i *)in) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx512f = _mm512_load_si512((const __m512i *)in); \
+		return vec; \
 	} \
 	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx512f = _mm512_loadu_si512((const __m512i *)in) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx512f = _mm512_loadu_si512((const __m512i *)in); \
+		return vec; \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		_mm512_store_si512((__m512i *)out, vec.avx512f); \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		_mm512_storeu_si512((__m512i *)out, vec.avx512f); \
 	} \
@@ -184,17 +213,23 @@
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
@@ -213,16 +248,23 @@
 	} \
 	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \
-		.load_aligned  = v##sign##int##bits##x##size##_avx512f_load_aligned, \
-		.load          = v##sign##int##bits##x##size##_avx512f_load, \
-		.store_aligned = v##sign##int##bits##x##size##_avx512f_store_aligned, \
-		.store         = v##sign##int##bits##x##size##_avx512f_store, \
-		.add           = v##sign##int##bits##x##size##_avx512f_add, \
-		.sub           = v##sign##int##bits##x##size##_avx512f_sub, \
-		.mul           = v##sign##int##bits##x##size##_avx512f_mul, \
-		.and           = v##sign##int##bits##x##size##_avx512f_and, \
-		.or            = v##sign##int##bits##x##size##_avx512f_or, \
-		.xor           = v##sign##int##bits##x##size##_avx512f_xor, \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_avx512f_load_aligned, \
+		v##sign##int##bits##x##size##_avx512f_load, \
+		v##sign##int##bits##x##size##_avx512f_store_aligned, \
+		v##sign##int##bits##x##size##_avx512f_store, \
+		v##sign##int##bits##x##size##_avx512f_add, \
+		v##sign##int##bits##x##size##_avx512f_sub, \
+		v##sign##int##bits##x##size##_avx512f_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_avx512f_and, \
+		v##sign##int##bits##x##size##_avx512f_or, \
+		v##sign##int##bits##x##size##_avx512f_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_avx512f_lshift, \
+		v##sign##int##bits##x##size##_avx512f_rshift, \
+		v##sign##int##bits##x##size##_avx512f_lrshift, \
 	};
 
 #define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \
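Note: the pattern repeated throughout this file (and the other x86 backends below) replaces `return (type){ .member = ... };` with a named temporary because compound literals are a C99 feature that standard C++ lacks, and designated initializers only arrive in C++20. A minimal sketch of the before/after, using a hypothetical SSE2-sized wrapper union for brevity; the AVX-512 wrappers above follow the same shape:

#include <emmintrin.h>

/* hypothetical wrapper, shaped like the library's vector unions */
typedef union {
	__m128i sse;
} example_vint32x4;

static example_vint32x4 example_add(example_vint32x4 a, example_vint32x4 b)
{
	/* C99-only form, rejected by standard C++:
	 *   return (example_vint32x4) { .sse = _mm_add_epi32(a.sse, b.sse) };
	 */

	/* portable form used in this changeset: declare, assign, return */
	example_vint32x4 out;
	out.sse = _mm_add_epi32(a.sse, b.sse);
	return out;
}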
--- a/include/vec/impl/x86/mmx.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/x86/mmx.h	Wed Nov 20 12:02:15 2024 -0500
@@ -32,10 +32,12 @@
 		__m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1.mmx, 8), _mm_srli_pi16(vec2.mmx, 8)); \
 	\
 		/* repack */ \
-		return (v##sign##int8x8){ .mmx = _mm_or_si64( \
+		v##sign##int8x8 vec; \
+		vec.mmx = _mm_or_si64( \
 			_mm_slli_pi16(dst_odd, 8), \
 			_mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \
-		)}; \
+		); \
+		return vec; \
 	} while (0)
 
 // shifting
@@ -44,12 +46,16 @@
 
 #define VEC_MMX_LSHIFT_16x4(sign) \
 	do { \
-		return (v##sign##int16x4){ .mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx) }; \
+		v##sign##int16x4 vec; \
+		vec.mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} while (0)
 
 #define VEC_MMX_LSHIFT_32x2(sign) \
 	do { \
-		return (v##sign##int32x2){ .mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx) }; \
+		v##sign##int32x2 vec; \
+		vec.mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} while (0)
 
 #define VEC_MMX_RSHIFT_8x8(sign, aORl) \
@@ -57,12 +63,16 @@
 
 #define VEC_MMX_RSHIFT_16x4(sign, aORl) \
 	do { \
-		return (v##sign##int16x4){ .mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx) }; \
+		v##sign##int16x4 vec; \
+		vec.mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} while (0)
 
 #define VEC_MMX_RSHIFT_32x2(sign, aORl) \
 	do { \
-		return (v##sign##int32x2){ .mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx) }; \
+		v##sign##int32x2 vec; \
+		vec.mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} while (0)
 
 // shared between MMX variations
@@ -72,7 +82,9 @@
 #define VEC_MMX_MUL_16x4(sign) \
 	do { \
 		/* we have a real instruction for this */ \
-		return (v##sign##int16x4){ .mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx) }; \
+		v##sign##int16x4 vec; \
+		vec.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} while (0)
 
 #define VEC_MMX_MUL_32x2(sign) \
@@ -84,30 +96,37 @@
 		__m64 ad = _mm_mullo_pi16(vec1.mmx, d); \
 		__m64 hi = _mm_add_pi32(bc, ad); \
 		hi = _mm_slli_pi32(hi, 16); \
-		return (v##sign##int32x2) { .mmx = _mm_add_pi32(hi, ac) }; /* return ac + hi; */ \
+	\
+		v##sign##int32x2 vec; \
+		vec.mmx = _mm_add_pi32(hi, ac); \
+		return vec; \
 	} while (0)
 
 #define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
 		v##sign##int##bits##x##size vec; \
 		memcpy(&vec.mmx, in, sizeof(vec.mmx)); \
 		return vec; \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		memcpy(out, &vec.mmx, sizeof(vec.mmx)); \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
@@ -117,17 +136,23 @@
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .mmx = _mm_and_si64(vec1.mmx, vec2.mmx) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.mmx = _mm_and_si64(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .mmx = _mm_or_si64(vec1.mmx, vec2.mmx) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.mmx = _mm_or_si64(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .mmx = _mm_xor_si64(vec1.mmx, vec2.mmx) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.mmx = _mm_xor_si64(vec1.mmx, vec2.mmx); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
@@ -146,19 +171,23 @@
 	} \
 	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \
-		.load_aligned  = v##sign##int##bits##x##size##_mmx_load_aligned, \
-		.load          = v##sign##int##bits##x##size##_mmx_load_aligned, \
-		.store_aligned = v##sign##int##bits##x##size##_mmx_store_aligned, \
-		.store         = v##sign##int##bits##x##size##_mmx_store_aligned, \
-		.add           = v##sign##int##bits##x##size##_mmx_add, \
-		.sub           = v##sign##int##bits##x##size##_mmx_sub, \
-		.mul           = v##sign##int##bits##x##size##_mmx_mul, \
-		.and           = v##sign##int##bits##x##size##_mmx_and, \
-		.or            = v##sign##int##bits##x##size##_mmx_or, \
-		.xor           = v##sign##int##bits##x##size##_mmx_xor, \
-		.lshift        = v##sign##int##bits##x##size##_mmx_lshift, \
-		.rshift        = v##sign##int##bits##x##size##_mmx_rshift, \
-		.lrshift       = v##sign##int##bits##x##size##_mmx_lrshift, \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_mmx_load_aligned, \
+		v##sign##int##bits##x##size##_mmx_load_aligned, \
+		v##sign##int##bits##x##size##_mmx_store_aligned, \
+		v##sign##int##bits##x##size##_mmx_store_aligned, \
+		v##sign##int##bits##x##size##_mmx_add, \
+		v##sign##int##bits##x##size##_mmx_sub, \
+		v##sign##int##bits##x##size##_mmx_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_mmx_and, \
+		v##sign##int##bits##x##size##_mmx_or, \
+		v##sign##int##bits##x##size##_mmx_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_mmx_lshift, \
+		v##sign##int##bits##x##size##_mmx_rshift, \
+		v##sign##int##bits##x##size##_mmx_lrshift, \
 	};
 
 #define VEC_MMX_DEFINE_OPERATIONS(bits, size) \
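For reference, the 32-bit multiply emulation above (VEC_MMX_MUL_32x2, and the wider SSE2/AVX-512 variants later in this changeset) relies on one identity: split each operand into halves, and the low half of the full product is lo*lo plus the cross terms shifted up by half a word. A scalar sketch of that identity with 16-bit halves of a 32-bit multiply; the intrinsic sequences are the lane-wise version of this, and the exact instruction choreography differs per ISA:

#include <stdint.h>

/* low 32 bits of a*b, rebuilt from 16-bit halves; all arithmetic is
 * modulo 2^32, so the ah*bh term (which lands above bit 31) drops out */
static uint32_t mul32_from_halves(uint32_t a, uint32_t b)
{
	uint32_t al = a & 0xFFFFu, ah = a >> 16;
	uint32_t bl = b & 0xFFFFu, bh = b >> 16;

	uint32_t lo    = al * bl;                   /* the "ac" term */
	uint32_t cross = (al * bh + ah * bl) << 16; /* the "hi" term */

	return lo + cross; /* equals (uint32_t)(a * b) */
}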
--- a/include/vec/impl/x86/sse2.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/x86/sse2.h	Wed Nov 20 12:02:15 2024 -0500
@@ -32,10 +32,12 @@
 		__m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \
 	\
 		/* repack */ \
-		return (v##sign##int8x16){ .sse = _mm_or_si128( \
+		v##sign##int8x16 vec; \
+		vec.sse = _mm_or_si128( \
 			_mm_slli_epi16(dst_odd, 8), \
 			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
-		)}; \
+		); \
+		return vec; \
 	} while (0)
 
 // shifting
@@ -44,17 +46,23 @@
 
 #define VEC_SSE2_LSHIFT_16x8(sign) \
 	do { \
-		return (v##sign##int16x8){ .sse = _mm_sll_epi16(vec1.sse, vec2.sse) }; \
+		v##sign##int16x8 vec; \
+		vec.sse = _mm_sll_epi16(vec1.sse, vec2.sse); \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_LSHIFT_32x4(sign) \
 	do { \
-		return (v##sign##int32x4){ .sse = _mm_sll_epi32(vec1.sse, vec2.sse) }; \
+		v##sign##int32x4 vec; \
+		vec.sse = _mm_sll_epi32(vec1.sse, vec2.sse); \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_LSHIFT_64x2(sign) \
 	do { \
-		return (v##sign##int64x2){ .sse = _mm_sll_epi64(vec1.sse, vec2.sse) }; \
+		v##sign##int64x2 vec; \
+		vec.sse = _mm_sll_epi64(vec1.sse, vec2.sse); \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_RSHIFT_8x16(sign, aORl) \
@@ -62,12 +70,16 @@
 
 #define VEC_SSE2_RSHIFT_16x8(sign, aORl) \
 	do { \
-		return (v##sign##int16x8){ .sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse) }; \
+		v##sign##int16x8 vec; \
+		vec.sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse); \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_RSHIFT_32x4(sign, aORl) \
 	do { \
-		return (v##sign##int32x4){ .sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse) }; \
+		v##sign##int32x4 vec; \
+		vec.sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse); \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_aRSHIFT_64x2(sign) \
@@ -77,7 +89,9 @@
 
 #define VEC_SSE2_lRSHIFT_64x2(sign) \
 	do { \
-		return (v##sign##int64x2){ .sse = _mm_srl_epi64(vec1.sse, vec2.sse) }; \
+		v##sign##int64x2 vec; \
+		vec.sse = _mm_srl_epi64(vec1.sse, vec2.sse); \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_RSHIFT_64x2(sign, aORl) \
@@ -90,7 +104,9 @@
 #define VEC_SSE2_MUL_16x8(sign) \
 	do { \
 		/* we have a real instruction for this */ \
-		return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \
+		v##sign##int16x8 vec; \
+		vec.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_MUL_32x4(sign) \
@@ -102,7 +118,10 @@
 		__m128i prod13 = _mm_mul_epu32(a13, b13);           /* (-,a3*b3,-,a1*b1) */ \
 		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
 		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
-		return (v##sign##int32x4) { .sse = _mm_unpacklo_epi64(prod01, prod23) }; /* (ab3,ab2,ab1,ab0) */ \
+	\
+		v##sign##int32x4 vec; \
+		vec.sse = _mm_unpacklo_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_MUL_64x2(sign) \
@@ -114,38 +133,49 @@
 		__m128i ad = _mm_mul_epu32(vec1.sse, d);        /* ad = (vec1 & UINT32_MAX) * d; */ \
 		__m128i hi = _mm_add_epi64(bc, ad);             /* hi = bc + ad; */ \
 		hi = _mm_slli_epi64(hi, 32);                    /* hi <<= 32; */ \
-		return (v##sign##int64x2) { .sse = _mm_add_epi64(hi, ac) }; /* return ac + hi; */ \
+	\
+		v##sign##int64x2 vec; \
+		vec.sse = _mm_add_epi64(hi, ac); /* return ac + hi; */ \
+		return vec; \
 	} while (0)
 
 #define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .sse = _mm_load_si128((const __m128i *)in) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.sse = _mm_load_si128((const __m128i *)in); \
+		return vec; \
 	} \
 	\
-	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const sign##int##bits##_t in[size]) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \
 	{ \
-		return (v##sign##int##bits##x##size) { .sse = _mm_loadu_si128((const __m128i *)in) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.sse = _mm_loadu_si128((const __m128i *)in); \
+		return vec; \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		_mm_store_si128((__m128i *)out, vec.sse); \
 	} \
 	\
-	static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		_mm_storeu_si128((__m128i *)out, vec.sse); \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .sse = _mm_add_epi##bits(vec1.sse, vec2.sse) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.sse = _mm_add_epi##bits(vec1.sse, vec2.sse); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .sse = _mm_sub_epi##bits(vec1.sse, vec2.sse) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.sse = _mm_sub_epi##bits(vec1.sse, vec2.sse); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
@@ -155,17 +185,23 @@
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.sse = _mm_and_si128(vec1.sse, vec2.sse); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.sse = _mm_or_si128(vec1.sse, vec2.sse); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		return (v##sign##int##bits##x##size) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; \
+		v##sign##int##bits##x##size vec; \
+		vec.sse = _mm_xor_si128(vec1.sse, vec2.sse); \
+		return vec; \
 	} \
 	\
 	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
@@ -184,19 +220,23 @@
 	} \
 	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \
-		.load_aligned  = v##sign##int##bits##x##size##_sse2_load_aligned, \
-		.load          = v##sign##int##bits##x##size##_sse2_load, \
-		.store_aligned = v##sign##int##bits##x##size##_sse2_store_aligned, \
-		.store         = v##sign##int##bits##x##size##_sse2_store, \
-		.add           = v##sign##int##bits##x##size##_sse2_add, \
-		.sub           = v##sign##int##bits##x##size##_sse2_sub, \
-		.mul           = v##sign##int##bits##x##size##_sse2_mul, \
-		.and           = v##sign##int##bits##x##size##_sse2_and, \
-		.or            = v##sign##int##bits##x##size##_sse2_or, \
-		.xor           = v##sign##int##bits##x##size##_sse2_xor, \
-		.lshift        = v##sign##int##bits##x##size##_sse2_lshift, \
-		.rshift        = v##sign##int##bits##x##size##_sse2_rshift, \
-		.lrshift       = v##sign##int##bits##x##size##_sse2_lrshift, \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_sse2_load_aligned, \
+		v##sign##int##bits##x##size##_sse2_load, \
+		v##sign##int##bits##x##size##_sse2_store_aligned, \
+		v##sign##int##bits##x##size##_sse2_store, \
+		v##sign##int##bits##x##size##_sse2_add, \
+		v##sign##int##bits##x##size##_sse2_sub, \
+		v##sign##int##bits##x##size##_sse2_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_sse2_and, \
+		v##sign##int##bits##x##size##_sse2_or, \
+		v##sign##int##bits##x##size##_sse2_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_sse2_lshift, \
+		v##sign##int##bits##x##size##_sse2_rshift, \
+		v##sign##int##bits##x##size##_sse2_lrshift, \
 	};
 
 #define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \
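One detail worth spelling out in VEC_SSE2_MUL_32x4: the even-lane and odd-lane 64-bit products are interleaved into prod01 and prod23, and the final repack is _mm_unpacklo_epi64, which stitches the two low-result pairs back into lane order. A self-contained sketch of this standard SSE2 idiom; the 0xF5 shuffle constant is an assumption about the lines not shown in the hunk:

#include <emmintrin.h>

/* lane-wise low-32-bit multiply using only SSE2 */
static __m128i sse2_mullo_epi32(__m128i a, __m128i b)
{
	__m128i a13 = _mm_shuffle_epi32(a, 0xF5);            /* move odd lanes into even slots */
	__m128i b13 = _mm_shuffle_epi32(b, 0xF5);
	__m128i prod02 = _mm_mul_epu32(a, b);                /* 64-bit a0*b0, a2*b2 */
	__m128i prod13 = _mm_mul_epu32(a13, b13);            /* 64-bit a1*b1, a3*b3 */
	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); /* lo(a0*b0), lo(a1*b1) in the low half */
	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); /* lo(a2*b2), lo(a3*b3) in the low half */
	return _mm_unpacklo_epi64(prod01, prod23);           /* lanes: ab0, ab1, ab2, ab3 */
}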
--- a/include/vec/impl/x86/sse41.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/impl/x86/sse41.h	Wed Nov 20 12:02:15 2024 -0500
@@ -28,23 +28,29 @@
 #define VEC_SSE41_DEFINE_OPERATIONS(sign) \
 	static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \
 	{ \
-		return (v##sign##int32x4){ .sse = _mm_mullo_epi32(vec1.sse, vec2.sse) }; \
+		v##sign##int32x4 vec; \
+		vec.sse = _mm_mullo_epi32(vec1.sse, vec2.sse); \
+		return vec; \
 	} \
 	\
 	static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \
-		.load_aligned  = v##sign##int32x4_sse2_load_aligned, \
-		.load          = v##sign##int32x4_sse2_load, \
-		.store_aligned = v##sign##int32x4_sse2_store_aligned, \
-		.store         = v##sign##int32x4_sse2_store, \
-		.add           = v##sign##int32x4_sse2_add, \
-		.sub           = v##sign##int32x4_sse2_sub, \
-		.mul           = v##sign##int32x4_sse41_mul, \
-		.and           = v##sign##int32x4_sse2_and, \
-		.or            = v##sign##int32x4_sse2_or, \
-		.xor           = v##sign##int32x4_sse2_xor, \
-		.lshift        = v##sign##int32x4_sse2_lshift, \
-		.rshift        = v##sign##int32x4_sse2_rshift, \
-		.lrshift       = v##sign##int32x4_sse2_lrshift, \
+		/* .splat = */ NULL, \
+		v##sign##int32x4##_sse2_load_aligned, \
+		v##sign##int32x4##_sse2_load, \
+		v##sign##int32x4##_sse2_store_aligned, \
+		v##sign##int32x4##_sse2_store, \
+		v##sign##int32x4##_sse2_add, \
+		v##sign##int32x4##_sse2_sub, \
+		v##sign##int32x4##_sse41_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int32x4##_sse2_and, \
+		v##sign##int32x4##_sse2_or, \
+		v##sign##int32x4##_sse2_xor, \
+		/* .not = */ NULL, \
+		v##sign##int32x4##_sse2_lshift, \
+		v##sign##int32x4##_sse2_rshift, \
+		v##sign##int32x4##_sse2_lrshift, \
 	};
 
 VEC_SSE41_DEFINE_OPERATIONS()
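The other recurring change is dropping designated initializers from the _impl tables. Positional initializers compile in both C and pre-C++20 C++, at the cost that every entry must appear in exactly the order the struct declares its members, with explicit NULLs (kept readable by the /* .name = */ comments) for slots a backend does not fill. A minimal sketch of that trade-off with a hypothetical two-member table:

#include <stddef.h>

typedef int (*example_binop)(int a, int b);

/* hypothetical dispatch table, analogous to the v*int*_impl structs */
typedef struct {
	example_binop add;
	example_binop mul; /* may be NULL when a backend has no fast path */
} example_impl;

static int example_generic_add(int a, int b) { return a + b; }

/* C99 form: order-independent, but rejected by C++ before C++20
 *   static example_impl example_impl_generic = { .add = example_generic_add };
 */

/* portable form: positional, so the order must match the struct exactly */
static example_impl example_impl_generic = {
	example_generic_add,
	/* .mul = */ NULL,
};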
--- a/include/vec/vec.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/include/vec/vec.h	Wed Nov 20 12:02:15 2024 -0500
@@ -25,42 +25,75 @@
 #ifndef VEC_VEC_H_
 #define VEC_VEC_H_
 
-#include <stdint.h>
-#include <string.h>
-#include <limits.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef VEC_HAVE_IMPL_INTEGER_H
+# include "impl/integer.h"
+#else
+# if __cplusplus >= (201103L)
+#  include <cstdint>
+#  include <cstddef>
+typedef std::size_t    vec_uintsize;
+
+typedef std::uint8_t   vec_uint8;
+typedef std::uint16_t  vec_uint16;
+typedef std::uint32_t  vec_uint32;
+typedef std::uint64_t  vec_uint64;
+typedef std::uintmax_t vec_uintmax;
+typedef std::uintptr_t vec_uintptr;
 
-#define VEC_MAX(a, b) (((a) > (b)) ? (a) : (b))
-#define VEC_MIN(a, b) (((a) < (b)) ? (a) : (b))
-#define VEC_CLAMP(x, min, max) (VEC_MIN(VEC_MAX((x), (min)), (max)))
+typedef std::int8_t    vec_int8;
+typedef std::int16_t   vec_int16;
+typedef std::int32_t   vec_int32;
+typedef std::int64_t   vec_int64;
+typedef std::intmax_t  vec_intmax;
+# elif __STDC_VERSION__ >= 199901L
+#  include <stdint.h>
+#  include <stddef.h>
+typedef uint8_t   vec_uint8;
+typedef uint16_t  vec_uint16;
+typedef uint32_t  vec_uint32;
+typedef uint64_t  vec_uint64;
+typedef uintmax_t vec_uintmax;
+typedef uintptr_t vec_uintptr;
+typedef size_t    vec_uintsize;
+typedef int8_t    vec_int8;
+typedef int16_t   vec_int16;
+typedef int32_t   vec_int32;
+typedef int64_t   vec_int64;
+typedef intmax_t  vec_intmax;
+# else
+#  error Unable to find integer types with known size.
+# endif
+#endif
 
 #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
 	(((a) >= (x)) && \
 	 ((a) > x || (b) >= (y)) && \
 	 ((a) > x || (b) > (y) || (c) >= (z)))
 
-#define VEC_GNUC_ATLEAST(x, y, z) \
+#ifdef __GNUC__
+# define VEC_GNUC_ATLEAST(x, y, z) \
 	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)
+#else
+# define VEC_GNUC_ATLEAST(x, y, z) (0)
+#endif
 
 /* GCC/clang attributes */
 #if defined(__has_attribute)
-# if __has_attribute(__aligned__)
-#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
-# endif
-# if __has_attribute(__vector_size__)
-#  define VEC_COMPILER_HAS_GNUC_VECTORS
-# endif
+# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x)
+#else
+# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch)
 #endif
 
-#ifndef VEC_ALIGNED
-# if VEC_GNUC_ATLEAST(2, 7, 0)
-#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
-# endif
-#endif
-
-#if (__STDC_VERSION__ >= 201112L)
+#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
+# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg)
+#elif (__STDC_VERSION__ >= 201112L)
 # define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
 #else
-// C99 static assertion
 # define VEC_STATIC_ASSERT(x, msg) \
 	extern int (*vec_impl_Static_assert_function_(void)) \
 		[!!sizeof (struct { int __error_if_negative: (x) ? 2 : -1; })]
@@ -78,28 +111,35 @@
 /* --------------------------------------------------------------- */
 /* Detect compiler SIMD support */
 
-#define VEC_GENERIC_ALIGNMENT 1
 #define VEC_ALTIVEC_ALIGNMENT 16
 #define VEC_SSE2_ALIGNMENT    16
 #define VEC_AVX2_ALIGNMENT    32
 #define VEC_AVX512F_ALIGNMENT 64
 
-// for the generic implementation, 64-bit
-#define VINT8x8_ALIGNMENT   VEC_GENERIC_ALIGNMENT
-#define VINT16x4_ALIGNMENT  VEC_GENERIC_ALIGNMENT
-#define VINT32x2_ALIGNMENT  VEC_GENERIC_ALIGNMENT
-#define VUINT8x8_ALIGNMENT  VEC_GENERIC_ALIGNMENT
-#define VUINT16x4_ALIGNMENT VEC_GENERIC_ALIGNMENT
-#define VUINT32x2_ALIGNMENT VEC_GENERIC_ALIGNMENT
+// for the generic implementation
+#define VINT8x2_ALIGNMENT   1
+#define VUINT8x2_ALIGNMENT  1
+
+#define VINT8x4_ALIGNMENT   VINT8x2_ALIGNMENT
+#define VINT16x2_ALIGNMENT  2
+#define VUINT8x4_ALIGNMENT  VUINT8x2_ALIGNMENT
+#define VUINT16x2_ALIGNMENT 2
+
+#define VINT8x8_ALIGNMENT   VINT8x4_ALIGNMENT
+#define VINT16x4_ALIGNMENT  VINT16x2_ALIGNMENT
+#define VINT32x2_ALIGNMENT  4
+#define VUINT8x8_ALIGNMENT  VUINT8x4_ALIGNMENT
+#define VUINT16x4_ALIGNMENT VUINT16x2_ALIGNMENT
+#define VUINT32x2_ALIGNMENT 4
 
 #define VINT8x16_ALIGNMENT  VINT8x8_ALIGNMENT
 #define VINT16x8_ALIGNMENT  VINT16x4_ALIGNMENT
 #define VINT32x4_ALIGNMENT  VINT32x2_ALIGNMENT
-#define VINT64x2_ALIGNMENT  VEC_GENERIC_ALIGNMENT
+#define VINT64x2_ALIGNMENT  8
 #define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT
 #define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT
 #define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT
-#define VUINT64x2_ALIGNMENT VEC_GENERIC_ALIGNMENT
+#define VUINT64x2_ALIGNMENT 8
 
 #define VINT8x32_ALIGNMENT   VINT8x16_ALIGNMENT
 #define VINT16x16_ALIGNMENT  VINT16x8_ALIGNMENT
@@ -287,48 +327,48 @@
 /* --------------------------------------------------------------- */
 /* bit shift */
 
-inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y)
+inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y)
 {
 	return x >> y;
 }
 
-inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y)
+inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y)
 {
 	return x << y;
 }
 
-inline intmax_t vec_lrshift(intmax_t x, unsigned int y)
+inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y)
 {
 	// reinterpret as unsigned integer and then shift
 	union {
-		intmax_t d;
-		uintmax_t u;
+		vec_intmax d;
+		vec_uintmax u;
 	} xx;
 
 	xx.d = x;
-	xx.u >> y;
+	xx.u >>= y;
 	return xx.d;
 }
 
-inline intmax_t vec_llshift(intmax_t x, unsigned int y)
+inline vec_intmax vec_llshift(vec_intmax x, unsigned int y)
 {
 	// reinterpret as unsigned integer and then shift
 	union {
-		intmax_t d;
-		uintmax_t u;
+		vec_intmax d;
+		vec_uintmax u;
 	} xx;
 
 	xx.d = x;
-	xx.u << y;
+	xx.u <<= y;
 	return xx.d;
 }
 
-inline uintmax_t vec_urshift(uintmax_t x, unsigned int y)
+inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y)
 {
 	return x >> y;
 }
 
-inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y)
+inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y)
 {
 	return x << y;
 }
@@ -359,13 +399,13 @@
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
 **/
-inline intmax_t vec_rshift(intmax_t x, unsigned int y)
+inline vec_intmax vec_rshift(vec_intmax x, unsigned int y)
 {
-	static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);
+	static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1);
 
 	union {
-		intmax_t d;
-		uintmax_t u;
+		vec_intmax d;
+		vec_uintmax u;
 	} xx;
 
 	xx.d = x;
@@ -378,13 +418,13 @@
 	return xx.d;
 }
 
-inline intmax_t vec_lshift(intmax_t x, unsigned int y)
+inline vec_intmax vec_lshift(vec_intmax x, unsigned int y)
 {
-	static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);
+	static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1);
 
 	union {
-		intmax_t d;
-		uintmax_t u;
+		vec_intmax d;
+		vec_uintmax u;
 	} xx;
 
 	xx.d = x;
@@ -397,203 +437,56 @@
 }
 
 #ifdef VEC_IMPLEMENTATION
-extern inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y);
-extern inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y);
-extern inline intmax_t vec_lrshift(intmax_t x, unsigned int y);
-extern inline intmax_t vec_llshift(intmax_t x, unsigned int y);
-extern inline uintmax_t vec_urshift(uintmax_t x, unsigned int y);
-extern inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y);
-extern inline intmax_t vec_rshift(intmax_t x, unsigned int y);
-extern inline intmax_t vec_lshift(intmax_t x, unsigned int y);
+extern inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y);
+extern inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y);
+extern inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y);
+extern inline vec_intmax vec_llshift(vec_intmax x, unsigned int y);
+extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
+extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
+extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
+extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
 #endif
 
 /* --------------------------------------------------------------- */
-/* Array alignment macros */
 
-/* the alignment must be specified in bytes and must be a multiple of the
- * type size. it is always assumed that the type will be on a boundary of
- * its size, which may or may not be true */
-#ifdef VEC_ALIGNED
-# define VEC_ALIGNED_ARRAY(type, var, length, align) \
-	VEC_ALIGNED(align) type var[length]
-# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
-	(sizeof(var))
-#else
-# define VEC_ALIGNED_ARRAY(type, var, length, align) \
-	VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \
-	type vec_##var##_unaligned_[(length) + (align / sizeof(type))]; \
-	type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + (align - 1)) & ~(align - 1)); \
-	VEC_ASSERT(((uintptr_t)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
-# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
-	(sizeof(vec_##var##_unaligned_) - (align - 1))
-#endif
-
-#define VEC_ALIGNED_ARRAY_LENGTH(var) \
-	(VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var))
-
-// ------------------------------------------------------------
-// predefined variants for each vector type
-
-#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 8, VINT8x8_ALIGNMENT)
-#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
-#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT)
-#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0)
-
-#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 4, VINT16x4_ALIGNMENT)
-#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
-#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT)
-#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0)
-
-#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 2, VINT32x2_ALIGNMENT)
-#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
-#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT)
-#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0)
-
-#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 8, VUINT8x8_ALIGNMENT)
-#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
-#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT)
-#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0)
-
-#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 4, VUINT16x4_ALIGNMENT)
-#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
-#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT)
-#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0)
-
-#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 2, VUINT32x2_ALIGNMENT)
-#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
-#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT)
-#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0)
-
-#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
-
-#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
-#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0)
-
-#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
-
-#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
-
-#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
-
-#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
-
-#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
-
-#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
-
-#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT)
-#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
-#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT)
-#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0)
-
-#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT)
-#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
-#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT)
-#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
-
-#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT)
-#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
-#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT)
-#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0)
-
-#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT)
-#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
-#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT)
-#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0)
-
-#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT)
-#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
-#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT)
-#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0)
-
-#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT)
-#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
-#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT)
-#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
-
-#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT)
-#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
-#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT)
-#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0)
-
-#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT)
-#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
-#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT)
-#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0)
-
-#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT)
-#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
-#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT)
-#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0)
-
-#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x32_ALIGNMENT)
-#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
-#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT)
-#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
-
-#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT)
-#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
-#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT)
-#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0)
-
-#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT)
-#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
-#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT)
-#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0)
-
-#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 64, VUINT8x64_ALIGNMENT)
-#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
-#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT)
-#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0)
-
-#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x32_ALIGNMENT)
-#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
-#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT)
-#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
-
-#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT)
-#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
-#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT)
-#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0)
-
-#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT)
-#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
-#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT)
-#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0)
+#include "impl/align.h"
 
 /* --------------------------------------------------------------- */
 /* Defines the structures for each vector type */
 
+// 16-bit
+typedef union {
+	vec_uint8 generic[2];
+} vuint8x2;
+
+typedef union {
+	vec_int8 generic[2];
+} vint8x2;
+
+// 32-bit
+typedef union {
+	vuint8x2 generic[2];
+} vuint8x4;
+
+typedef union {
+	vec_uint16 generic[2];
+} vuint16x2;
+
+typedef union {
+	vint8x2 generic[2];
+} vint8x4;
+
+typedef union {
+	vec_int16 generic[2];
+} vint16x2;
+
 // 64-bit
 typedef union {
 #ifdef VEC_COMPILER_HAS_MMX
 	__m64 mmx;
 #endif
 
-	uint8_t generic[8];
+	vuint8x4 generic[2];
 } vuint8x8;
 
 typedef union {
@@ -601,7 +494,7 @@
 	__m64 mmx;
 #endif
 
-	uint16_t generic[4];
+	vuint16x2 generic[2];
 } vuint16x4;
 
 typedef union {
@@ -609,7 +502,7 @@
 	__m64 mmx;
 #endif
 
-	uint32_t generic[2];
+	vec_uint32 generic[2];
 } vuint32x2;
 
 typedef union {
@@ -617,7 +510,7 @@
 	__m64 mmx;
 #endif
 
-	int8_t generic[8];
+	vint8x4 generic[2];
 } vint8x8;
 
 typedef union {
@@ -625,7 +518,7 @@
 	__m64 mmx;
 #endif
 
-	int16_t generic[4];
+	vint16x2 generic[2];
 } vint16x4;
 
 typedef union {
@@ -633,7 +526,7 @@
 	__m64 mmx;
 #endif
 
-	int32_t generic[2];
+	vec_int32 generic[2];
 } vint32x2;
 
 // 128-bit
@@ -674,7 +567,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
 	vector unsigned long long altivec;
 #endif
-	uint64_t generic[2];
+	vec_uint64 generic[2];
 } vuint64x2;
 
 typedef union {
@@ -714,7 +607,7 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
 	vector signed long long altivec;
 #endif
-	int64_t generic[2];
+	vec_int64 generic[2];
 } vint64x2;
 
 // 256-bit
@@ -837,11 +730,11 @@
 int vec_init(void);
 
 #define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]); \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]); \
-	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
-	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
+	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
 	v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 	v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
@@ -863,6 +756,13 @@
 	VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \
 	VEC_DECLARE_OPERATIONS_SIGN(u, bits, size)
 
+// 16-bit
+VEC_DECLARE_OPERATIONS(8, 2)
+
+// 32-bit
+VEC_DECLARE_OPERATIONS(8, 4)
+VEC_DECLARE_OPERATIONS(16, 2)
+
 // 64-bit
 VEC_DECLARE_OPERATIONS(8, 8)
 VEC_DECLARE_OPERATIONS(16, 4)
@@ -897,37 +797,46 @@
 // Fallback functions, need to be defined before everything else.
 #include "impl/fallback.h"
 
-// okay, these are filled in for each supported backend
+// okay, these are filled in for each supported backend.
+// `and', `or', `xor', and `not' have to be prefixed with
+// `b' because they are operator keywords in C++ (and <iso646.h> macros in C)
 #define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \
 	typedef struct { \
-		v##sign##int##bits##x##size (*splat)(sign##int##bits##_t x); \
-		v##sign##int##bits##x##size (*load_aligned)(const sign##int##bits##_t in[size]); \
-		v##sign##int##bits##x##size (*load)(const sign##int##bits##_t in[size]); \
-		void (*store_aligned)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
-		void (*store)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
+		v##sign##int##bits##x##size (*splat)(vec_##sign##int##bits x); \
+		v##sign##int##bits##x##size (*load_aligned)(const vec_##sign##int##bits in[size]); \
+		v##sign##int##bits##x##size (*load)(const vec_##sign##int##bits in[size]); \
+		void (*store_aligned)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
+		void (*store)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
 		v##sign##int##bits##x##size (*add)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*sub)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*mul)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*div)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*avg)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-		v##sign##int##bits##x##size (*and)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-		v##sign##int##bits##x##size (*or)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-		v##sign##int##bits##x##size (*xor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-		v##sign##int##bits##x##size (*not)(v##sign##int##bits##x##size vec); \
+		v##sign##int##bits##x##size (*band)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*bor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*bxor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*bnot)(v##sign##int##bits##x##size vec); \
+		v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
+		v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*cmplt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*cmple)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*cmpeq)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*cmpge)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
 		v##sign##int##bits##x##size (*cmpgt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-		v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-		v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-		v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
 	} v##sign##int##bits##x##size##_impl;
 
 #define VEC_DEFINE_IMPL_STRUCT(bits, size) \
 	VEC_DEFINE_IMPL_STRUCT_SIGN( , bits, size) \
 	VEC_DEFINE_IMPL_STRUCT_SIGN(u, bits, size)
 
+// 16-bit
+VEC_DEFINE_IMPL_STRUCT(8, 2)
+
+// 32-bit
+VEC_DEFINE_IMPL_STRUCT(8, 4)
+VEC_DEFINE_IMPL_STRUCT(16, 2)
+
 // 64-bit
 VEC_DEFINE_IMPL_STRUCT(8, 8)
 VEC_DEFINE_IMPL_STRUCT(16, 4)
@@ -988,6 +897,16 @@
 
 #include "impl/cpu.h" // CPU detection crap
 
+// 16-bit
+static vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
+static vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;
+
+// 32-bit
+static vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
+static vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
+static vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
+static vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
+
 // 64-bit
 static vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
 static vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
@@ -1026,6 +945,7 @@
 static vint64x8_impl  *vint64x8_impl_cpu  = &vint64x8_impl_generic;
 static vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
 
+// returns 0 on success, or a negative error code on failure
 int vec_init(void)
 {
 	// This function is NOT thread safe. However, once vec
@@ -1112,12 +1032,14 @@
 	{
 		// do nothing, they're already set to generics
 	}
+
+	return 0;
 }
 
 /* ---------------------------------------------------------------- */
 
 #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
 	{ \
 		if (v##sign##int##bits##x##size##_impl_cpu->splat) \
 			return v##sign##int##bits##x##size##_impl_cpu->splat(x); \
@@ -1125,16 +1047,19 @@
 		return v##sign##int##bits##x##size##_fallback_splat(x); \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \
 	{ \
+		v##sign##int##bits##x##size err = {0}; \
+	\
 		if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \
 			return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \
 	\
 		VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \
-		return (v##sign##int##bits##x##size){0}; \
+	\
+		return err; \
 	} \
 	\
-	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \
+	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \
 	{ \
 		if (v##sign##int##bits##x##size##_impl_cpu->load) \
 			return v##sign##int##bits##x##size##_impl_cpu->load(in); \
@@ -1142,7 +1067,7 @@
 		return v##sign##int##bits##x##size##_fallback_load(in); \
 	} \
 	\
-	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \
 			v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \
@@ -1152,7 +1077,7 @@
 		VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \
 	} \
 	\
-	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
 	{ \
 		if (v##sign##int##bits##x##size##_impl_cpu->store) { \
 			v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \
@@ -1204,32 +1129,32 @@
 	\
 	v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->and) \
-			v##sign##int##bits##x##size##_impl_cpu->and(vec1, vec2); \
+		if (v##sign##int##bits##x##size##_impl_cpu->band) \
+			return v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \
 	\
 		return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \
 	} \
 	\
 	v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->or) \
-			v##sign##int##bits##x##size##_impl_cpu->or(vec1, vec2); \
+		if (v##sign##int##bits##x##size##_impl_cpu->bor) \
+			return v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \
 	\
 		return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \
 	} \
 	\
 	v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->xor) \
-			v##sign##int##bits##x##size##_impl_cpu->xor(vec1, vec2); \
+		if (v##sign##int##bits##x##size##_impl_cpu->bxor) \
+			return v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \
 	\
 		return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \
 	} \
 	\
 	v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
 	{ \
-		if (v##sign##int##bits##x##size##_impl_cpu->not) \
-			v##sign##int##bits##x##size##_impl_cpu->not(vec); \
+		if (v##sign##int##bits##x##size##_impl_cpu->bnot) \
+			return v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \
 	\
 		return v##sign##int##bits##x##size##_fallback_not(vec); \
 	} \
@@ -1302,6 +1227,13 @@
 	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
 	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
 
+// 16-bit
+VEC_DEFINE_OPERATIONS(8, 2)
+
+// 32-bit
+VEC_DEFINE_OPERATIONS(8, 4)
+VEC_DEFINE_OPERATIONS(16, 2)
+
 // 64-bit
 VEC_DEFINE_OPERATIONS(8, 8)
 VEC_DEFINE_OPERATIONS(16, 4)
@@ -1330,4 +1262,8 @@
 
 #endif /* VEC_IMPLEMENTATION */
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* VEC_VEC_H_ */
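Taken together, the vec_* integer typedefs and the dispatch layer leave the public calling convention unchanged. A hedged end-to-end usage sketch follows; the include path and build setup are assumptions, and the function names come from the VEC_DECLARE_OPERATIONS declarations above:

#include <stdio.h>
#include "vec/vec.h"

int main(void)
{
	vec_uint32 a[4]   = { 1, 2, 3, 4 };
	vec_uint32 b[4]   = { 10, 20, 30, 40 };
	vec_uint32 out[4] = { 0 };

	vec_init(); /* selects the best backend for the running CPU */

	vuint32x4 va = vuint32x4_load(a);   /* unaligned load */
	vuint32x4 vb = vuint32x4_load(b);
	vuint32x4 vsum = vuint32x4_add(va, vb);
	vuint32x4_store(vsum, out);         /* unaligned store */

	for (int i = 0; i < 4; i++)
		printf("%u ", (unsigned)out[i]); /* expected: 11 22 33 44 */
	printf("\n");

	return 0;
}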
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/Makefile	Wed Nov 20 12:02:15 2024 -0500
@@ -0,0 +1,230 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.25
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/paper/Documents/src/hg/vec
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/paper/Documents/src/hg/vec/test
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles /home/paper/Documents/src/hg/vec/test//CMakeFiles/progress.marks
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+#=============================================================================
+# Target rules for targets named vec
+
+# Build rule for target.
+vec: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 vec
+.PHONY : vec
+
+# fast build rule for target.
+vec/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/build
+.PHONY : vec/fast
+
+src/vec.o: src/vec.c.o
+.PHONY : src/vec.o
+
+# target to build an object file
+src/vec.c.o:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.o
+.PHONY : src/vec.c.o
+
+src/vec.i: src/vec.c.i
+.PHONY : src/vec.i
+
+# target to preprocess a source file
+src/vec.c.i:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.i
+.PHONY : src/vec.c.i
+
+src/vec.s: src/vec.c.s
+.PHONY : src/vec.s
+
+# target to generate assembly for a file
+src/vec.c.s:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.s
+.PHONY : src/vec.c.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... vec"
+	@echo "... src/vec.o"
+	@echo "... src/vec.i"
+	@echo "... src/vec.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
--- a/test/Makefile.ppc	Wed Nov 20 04:16:56 2024 -0500
+++ b/test/Makefile.ppc	Wed Nov 20 12:02:15 2024 -0500
@@ -1,3 +1,3 @@
-CFLAGS += -maltivec
+CPPFLAGS += -maltivec
 
 include Makefile.template
\ No newline at end of file
--- a/test/Makefile.template	Wed Nov 20 04:16:56 2024 -0500
+++ b/test/Makefile.template	Wed Nov 20 12:02:15 2024 -0500
@@ -1,4 +1,6 @@
-CFLAGS += -g -O2 -std=c99 -I../include
+CPPFLAGS += -g -O2 -I../include -Wall -Wpedantic -Werror=strict-aliasing
+CFLAGS += $(CPPFLAGS) -std=c99
+CXXFLAGS += $(CPPFLAGS) -std=c++11
 
 HEADERS = ../include/vec/vec.h \
 	../include/vec/impl/ppc/altivec.h \
@@ -9,9 +11,12 @@
 	../include/vec/impl/x86/sse41.h \
 	../include/vec/impl/cpu.h \
 	../include/vec/impl/fallback.h \
-	../include/vec/impl/generic.h
-BINS = test-generic test-host
-OBJS = vec-generic.o vec-host.o test.o
+	../include/vec/impl/generic.h \
+	test_align.h \
+	test_arith.h \
+	test_compare.h
+BINS = test-generic test-host test-cxx
+OBJS = vec-generic.o vec-host.o test.o test-cxx.o
 
 .PHONY: all clean test
 
@@ -26,15 +31,22 @@
 test.o: test.c
 	$(CC) $(CFLAGS) -c -o $@ $<
 
+test-cxx.o: test.cc
+	$(CXX) $(CXXFLAGS) -c -o $@ $<
+
 test-generic: vec-generic.o test.o
 	$(CC) $(LDFLAGS) -o $@ $^
 
 test-host: vec-host.o test.o
 	$(CC) $(LDFLAGS) -o $@ $^
 
+test-cxx: test-cxx.o
+	$(CXX) $(LDFLAGS) -o $@ $^
+
 clean:
 	$(RM) $(BINS) $(OBJS)
 
 test: clean $(BINS)
 	./test-generic
 	./test-host
+	./test-cxx
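With the template changes above, the per-arch wrappers keep working as before; running, say, `make -f Makefile.x86 test` should now also build `test-cxx` from `test.cc` with `$(CXX)` in C++11 mode and execute it alongside `test-generic` and `test-host`.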
--- a/test/Makefile.x86	Wed Nov 20 04:16:56 2024 -0500
+++ b/test/Makefile.x86	Wed Nov 20 12:02:15 2024 -0500
@@ -1,3 +1,3 @@
-CFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f
+CPPFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f
 
 include Makefile.template
\ No newline at end of file
--- a/test/test.c	Wed Nov 20 04:16:56 2024 -0500
+++ b/test/test.c	Wed Nov 20 12:02:15 2024 -0500
@@ -1,6 +1,7 @@
 #include "vec/vec.h"
 
 #include <stdio.h>
+#include <string.h>
 #include <inttypes.h>
 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
@@ -78,6 +79,11 @@
 	VTEST(, , bits, size)     VTEST(u, U, bits, size) \
 	VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size)
 
+DEF_VEC_TEST_FUNCS(8, 2)
+
+DEF_VEC_TEST_FUNCS(8, 4)
+DEF_VEC_TEST_FUNCS(16, 2)
+
 DEF_VEC_TEST_FUNCS(8, 8)
 DEF_VEC_TEST_FUNCS(16, 4)
 DEF_VEC_TEST_FUNCS(32, 2)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test.cc	Wed Nov 20 12:02:15 2024 -0500
@@ -0,0 +1,27 @@
+#define VEC_IMPLEMENTATION
+#include "vec/vec.h"
+
+#include <iostream>
+
+/* this test makes sure that vec can be included under C++ */
+int main(void)
+{
+	int ret = 0;
+
+	VUINT32x8_ALIGNED_ARRAY(varrin);
+	VUINT32x8_ALIGNED_ARRAY(varrout);
+
+	for (int i = 0; i < 8; i++)
+		varrin[i] = i;
+
+	vuint32x8 vec = vuint32x8_load_aligned(varrin);
+	vec = vuint32x8_add(vec, vec);
+
+	vuint32x8_store_aligned(vec, varrout);
+
+	for (int i = 0; i < 8; i++)
+		if (varrout[i] != (uint32_t)(varrin[i] + varrin[i]))
+			ret |= 1;
+
+	return ret;
+}
\ No newline at end of file
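Note that `test.cc` defines `VEC_IMPLEMENTATION` before including the header, and the template links `test-cxx` from `test-cxx.o` alone, so this test compiles and links the entire implementation as C++, not just the declarations.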
--- a/test/test_align.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/test/test_align.h	Wed Nov 20 12:02:15 2024 -0500
@@ -31,6 +31,11 @@
 	RUN_TEST( ,  , bits, size) \
 	RUN_TEST(u, U, bits, size)
 
+	RUN_TESTS(8, 2)
+
+	RUN_TESTS(8, 4)
+	RUN_TESTS(16, 2)
+
 	RUN_TESTS(8, 8)
 	RUN_TESTS(16, 4)
 	RUN_TESTS(32, 2)
--- a/test/test_arith.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/test/test_arith.h	Wed Nov 20 12:02:15 2024 -0500
@@ -69,6 +69,11 @@
 	CREATE_TESTS_SIGN(, d, , bits, size) \
 	CREATE_TESTS_SIGN(u, u, U, bits, size)
 
+CREATE_TESTS(8, 2)
+
+CREATE_TESTS(8, 4)
+CREATE_TESTS(16, 2)
+
 CREATE_TESTS(8, 8)
 CREATE_TESTS(16, 4)
 CREATE_TESTS(32, 2)
@@ -91,6 +96,7 @@
 #undef CREATE_TESTS_SIGN
 #undef CREATE_TESTS
 #undef CREATE_TEST
+#undef CREATE_TEST_SHIFT
 
 static int test_arith(void)
 {
@@ -126,6 +132,11 @@
 	RUN_TESTS_SIGN( , bits, size) \
 	RUN_TESTS_SIGN(u, bits, size)
 
+	RUN_TESTS(8, 2)
+
+	RUN_TESTS(8, 4)
+	RUN_TESTS(16, 2)
+
 	RUN_TESTS(8, 8)
 	RUN_TESTS(16, 4)
 	RUN_TESTS(32, 2)
--- a/test/test_compare.h	Wed Nov 20 04:16:56 2024 -0500
+++ b/test/test_compare.h	Wed Nov 20 12:02:15 2024 -0500
@@ -32,6 +32,11 @@
 
 #define CREATE_TESTS(bits, size) CREATE_TESTS_SIGN(, d, bits, size) CREATE_TESTS_SIGN(u, u, bits, size)
 
+CREATE_TESTS(8, 2)
+
+CREATE_TESTS(8, 4)
+CREATE_TESTS(16, 2)
+
 CREATE_TESTS(8, 8)
 CREATE_TESTS(16, 4)
 CREATE_TESTS(32, 2)
@@ -76,6 +81,11 @@
 	RUN_TESTS_SIGN( , bits, size) \
 	RUN_TESTS_SIGN(u, bits, size)
 
+	RUN_TESTS(8, 2)
+
+	RUN_TESTS(8, 4)
+	RUN_TESTS(16, 2)
+
 	RUN_TESTS(8, 8)
 	RUN_TESTS(16, 4)
 	RUN_TESTS(32, 2)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/vec.pc	Wed Nov 20 12:02:15 2024 -0500
@@ -0,0 +1,12 @@
+prefix=/usr/local
+exec_prefix=/usr/local
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: vec
+Description: a tiny C99 SIMD vector library
+Version: 2.0.0
+
+Requires:
+Libs: -L${libdir} -lvec
+Cflags: -I${includedir}
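The pkg-config file rounds out the install story: once `vec.pc` is on pkg-config's search path and the library and headers are installed under the listed prefix, an out-of-tree consumer can be built with something along the lines of `c++ example.cc $(pkg-config --cflags --libs vec)`. A hypothetical consumer sketch, assuming the header only declares the API when `VEC_IMPLEMENTATION` is not defined and the definitions come from `-lvec` (the names below mirror those used in test.cc):

    /* example.cc -- hypothetical out-of-tree consumer of the installed library. */
    #include "vec/vec.h"

    #include <cstdio>

    int main(void)
    {
        VUINT32x8_ALIGNED_ARRAY(arr);

        for (int i = 0; i < 8; i++)
            arr[i] = i;

        vuint32x8 v = vuint32x8_load_aligned(arr);
        v = vuint32x8_add(v, v);        /* doubles each lane */
        vuint32x8_store_aligned(v, arr);

        for (int i = 0; i < 8; i++)
            std::printf("%u\n", (unsigned)arr[i]);

        return 0;
    }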