changeset 4:75ab77f874e2

*: aligned generics, fixed altivec, aligned tests...
author Paper <paper@tflc.us>
date Wed, 23 Oct 2024 10:13:25 -0400
parents 3c5545b1568f
children 1f070512497f
files include/vec/impl/altivec.h include/vec/impl/gcc.h include/vec/impl/generic.h include/vec/impl/sse2.h include/vec/vec.h test/main.c test/test_align.h test/test_arith.h test/test_compare.h
diffstat 9 files changed, 258 insertions(+), 239 deletions(-) [+]
line wrap: on
line diff
--- a/include/vec/impl/altivec.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/altivec.h	Wed Oct 23 10:13:25 2024 -0400
@@ -31,12 +31,46 @@
 
 #define VEC_ALTIVEC_ALIGNMENT 16
 
-/* Since altivec conveniently made their API super user friendly, we can just use
- * one giant macro to define literally everything */
-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+/* GCC 4.2.1 on Mac OS X lacks vec_mul and vec_splats; fall back to the generic implementations when they are not defined */
+#ifdef vec_mul
+# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \
+	VEC_DECL_MUL(sign, bits, size) \
+	{ \
+		return vec_mul(vec1, vec2); \
+	}
+#else
+# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \
+	VEC_GENERIC_MULTIPLY(sign, csign, bits, size)
+#endif
+
+#ifdef vec_splats
+# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \
 	VEC_DECL_SPLAT(sign, bits, size) \
 	{ \
 		return vec_splats(x); \
+	}
+#else
+# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_SPLAT(sign, csign, bits, size)
+#endif
+
+#define VEC_ALTIVEC_uRSHIFT vec_sr
+#define VEC_ALTIVEC_RSHIFT vec_sra
+
+#define VEC_ALTIVEC_uLRSHIFT(sign, csign, bits, size) \
+	VEC_DECL_SHIFT(sign, bits, size, l, r) \
+	{ \
+		return vec_sr(vec1, vec2); \
+	}
+#define VEC_ALTIVEC_LRSHIFT(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r)
+
+/* Since altivec conveniently made their API super user friendly, we can just use
+ * one giant macro to define literally everything */
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
+	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
+	{ \
+		return vec_ld(0, in); \
 	} \
 	\
 	VEC_DECL_LOAD(sign, bits, size) \
@@ -44,6 +78,11 @@
 		return vec_perm(vec_ld(0, in), vec_ld(VEC_ALTIVEC_ALIGNMENT, in), vec_lvsl(0, in)); \
 	} \
 	\
+	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
+	{ \
+		vec_st(vec, 0, out); \
+	} \
+	\
 	VEC_DECL_STORE(sign, bits, size) \
 	{ \
 		VEC_ALIGNED_ARRAY(sign##int##bits##_t, aligned_out, size, VEC_ALTIVEC_ALIGNMENT); \
@@ -61,10 +100,7 @@
 		return vec_sub(vec1, vec2); \
 	} \
 	\
-	VEC_DECL_MUL(sign, bits, size) \
-	{ \
-		return vec_mul(vec1, vec2); \
-	} \
+	VEC_ALTIVEC_MUL(sign, csign, bits, size) \
 	\
 	VEC_DECL_SHIFT(sign, bits, size, , l) \
 	{ \
@@ -73,13 +109,10 @@
 	\
 	VEC_DECL_SHIFT(sign, bits, size, , r) \
 	{ \
-		return vec_sra(vec1, vec2); \
+		return VEC_ALTIVEC_##sign##RSHIFT(vec1, vec2); \
 	} \
 	\
-	VEC_DECL_SHIFT(sign, bits, size, l, r) \
-	{ \
-		return vec_sr(vec1, vec2); \
-	} \
+	VEC_ALTIVEC_##sign##LRSHIFT(sign, csign, bits, size) \
 	\
 	VEC_DECL_AVG(sign, bits, size) \
 	{ \
@@ -101,15 +134,17 @@
 		return vec_xor(vec1, vec2); \
 	} \
 	\
-	VEC_GENERIC_DIVIDE(sign, bits, size)
+	VEC_GENERIC_COMPARISONS(sign, csign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_ALTIVEC_SPLAT(sign, csign, bits, size)
 
 #ifndef VEC_VUINT8X16
 # define VEC_VUINT8X16
 typedef vector unsigned char vuint8x16;
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(u, 8, 16)
 # define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 #endif /* VEC_VUINT8X16 */
 
 #ifndef VEC_VINT8X16
@@ -117,8 +152,8 @@
 typedef vector signed char vint8x16;
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(, 8, 16)
 # define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 #endif /* VEC_VINT8X16 */
 
 #ifndef VEC_VUINT16X8
@@ -126,8 +161,8 @@
 typedef vector unsigned short vuint16x8;
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vuint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(u, 16, 8)
 # define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 #endif /* VEC_VUINT16X8 */
 
 #ifndef VEC_VINT16X8
@@ -135,8 +170,8 @@
 typedef vector signed short vint16x8;
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(, 16, 8)
 # define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 #endif /* VEC_VINT16X8 */
 
 #ifndef VEC_VUINT32X4
@@ -144,8 +179,8 @@
 typedef vector unsigned int vuint32x4;
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	(vuint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(u, 32, 4)
 # define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 #endif /* VEC_VUINT32X4 */
 
 #ifndef VEC_VINT32X4
@@ -153,8 +188,8 @@
 typedef vector signed int vint32x4;
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	(vint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(, 32, 4)
 # define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 #endif /* VEC_VINT32X4 */
 
 #if defined(__POWER8__) && defined(__VSX__)
@@ -164,8 +199,8 @@
 typedef vector unsigned long long vuint64x2;
 #  define VUINT64x2_CONSTANT(a, b) \
 	(vuint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(u, 64, 2)
 #  define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 # endif /* VEC_VUINT64X2 */
 
 # ifndef VEC_VINT64X2
@@ -173,10 +208,12 @@
 typedef vector signed long long vint64x2;
 #  define VINT64x2_CONSTANT(a, b) \
 	(vint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(, 64, 2)
 #  define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 64, 2)
 # endif /* VEC_VINT64X2 */
 
 #endif /* defined(__POWER8__) && defined(__VSX__) */
 
 #undef VEC_DEFINE_OPERATIONS
+#undef VEC_ALTIVEC_MUL
+#undef VEC_ALTIVEC_SPLAT
--- a/include/vec/impl/gcc.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/gcc.h	Wed Oct 23 10:13:25 2024 -0400
@@ -27,7 +27,7 @@
 #include <stdint.h>
 #include <string.h>
 
-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
 	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
 	{ \
 		v##sign##int##bits##x##size vec; \
@@ -100,9 +100,9 @@
 		return vec1 >= vec2; \
 	} \
 	\
-	VEC_GENERIC_DIVIDE(sign, bits, size) \
-	VEC_GENERIC_SPLAT(sign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
 	VEC_GENERIC_AVG(sign, bits, size)
 
 #ifndef VEC_VUINT8X16
@@ -110,8 +110,8 @@
 typedef uint8_t vuint8x16 __attribute__((__vector_size__(16)));
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(u, 8, 16)
-# define VINT8x16_ALIGNED 1
+# define VUINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 #endif
 
 #ifndef VEC_VUINT16X8
@@ -119,8 +119,8 @@
 typedef uint16_t vuint16x8 __attribute__((__vector_size__(16)));
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vuint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(u, 16, 8)
-# define VINT16x8_ALIGNED 1
+# define VUINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 #endif
 
 #ifndef VEC_VUINT32X4
@@ -128,8 +128,8 @@
 typedef uint32_t vuint32x4 __attribute__((__vector_size__(16)));
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	(vuint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(u, 32, 4)
-# define VINT32x4_ALIGNED 1
+# define VUINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 #endif
 
 #ifndef VEC_VUINT64X2
@@ -137,8 +137,8 @@
 typedef uint64_t vuint64x2 __attribute__((__vector_size__(16)));
 # define VUINT64x2_CONSTANT(a, b) \
 	(vuint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(u, 64, 2)
-# define VINT64x2_ALIGNED 1
+# define VUINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 #endif
 
 #ifndef VEC_VINT8X16
@@ -146,8 +146,8 @@
 typedef int8_t vint8x16 __attribute__((__vector_size__(16)));
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(, 8, 16)
-# define VINT8x16_ALIGNED 1
+# define VINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 #endif
 
 #ifndef VEC_VINT16X8
@@ -155,8 +155,8 @@
 typedef int16_t vint16x8 __attribute__((__vector_size__(16)));
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(, 16, 8)
-# define VINT16x8_ALIGNED 1
+# define VINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 #endif
 
 #ifndef VEC_VINT32X4
@@ -164,8 +164,8 @@
 typedef int32_t vint32x4 __attribute__((__vector_size__(16)));
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	(vint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(, 32, 4)
-# define VINT32x4_ALIGNED 1
+# define VINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 #endif
 
 #ifndef VEC_VINT64X2
@@ -173,8 +173,8 @@
 typedef int64_t vint64x2 __attribute__((__vector_size__(16)));
 # define VINT64x2_CONSTANT(a, b) \
 	(vint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(, 64, 2)
-# define VINT64x2_ALIGNED 1
+# define VINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 64, 2)
 #endif
 
 #undef VEC_DEFINE_OPERATIONS
--- a/include/vec/impl/generic.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/generic.h	Wed Oct 23 10:13:25 2024 -0400
@@ -32,7 +32,7 @@
 		sign##int##bits##_t arr[size]; \
 	} v##sign##int##bits##x##size;
 
-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
 	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
 	{ \
 		v##sign##int##bits##x##size vec; \
@@ -91,18 +91,19 @@
 		return vec1; \
 	} \
 	\
-	VEC_GENERIC_SPLAT(sign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, bits, size) \
-	VEC_GENERIC_DIVIDE(sign, bits, size) \
-	VEC_GENERIC_AVG(sign, bits, size)
+	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_GENERIC_AVG(sign, bits, size) \
+	VEC_GENERIC_COMPARISONS(sign, csign, bits, size)
 
 #ifndef VEC_VUINT8X16
 # define VEC_VUINT8X16
 VEC_DEFINE_STRUCT(u, 8, 16)
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
-VEC_DEFINE_OPERATIONS(u, 8, 16)
-VEC_GENERIC_COMPARISONS(u, 8, 16)
+# define VUINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 #endif
 
 #ifndef VEC_VUINT16X8
@@ -110,8 +111,8 @@
 VEC_DEFINE_STRUCT(u, 16, 8)
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } })
-VEC_DEFINE_OPERATIONS(u, 16, 8)
-VEC_GENERIC_COMPARISONS(u, 16, 8)
+# define VUINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 #endif
 
 #ifndef VEC_VUINT32X4
@@ -119,8 +120,8 @@
 VEC_DEFINE_STRUCT(u, 32, 4)
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	((vuint32x4){ .arr = { a, b, c, d } })
-VEC_DEFINE_OPERATIONS(u, 32, 4)
-VEC_GENERIC_COMPARISONS(u, 32, 4)
+# define VUINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 #endif
 
 #ifndef VEC_VUINT64X2
@@ -128,8 +129,8 @@
 VEC_DEFINE_STRUCT(u, 64, 2)
 # define VUINT64x2_CONSTANT(a, b) \
 	((vuint64x2){ .arr = { a, b } })
-VEC_DEFINE_OPERATIONS(u, 64, 2)
-VEC_GENERIC_COMPARISONS(u, 64, 2)
+# define VUINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 #endif
 
 #ifndef VEC_VINT8X16
@@ -137,8 +138,8 @@
 VEC_DEFINE_STRUCT(, 8, 16)
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
-VEC_DEFINE_OPERATIONS(, 8, 16)
-VEC_GENERIC_COMPARISONS(, 8, 16)
+# define VINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 #endif
 
 #ifndef VEC_VINT16X8
@@ -146,8 +147,8 @@
 VEC_DEFINE_STRUCT(, 16, 8)
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	((vint16x8){ .arr = { a, b, c, d, e, f, g, h } })
-VEC_DEFINE_OPERATIONS(, 16, 8)
-VEC_GENERIC_COMPARISONS(, 16, 8)
+# define VINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 #endif
 
 #ifndef VEC_VINT32X4
@@ -155,8 +156,8 @@
 VEC_DEFINE_STRUCT(, 32, 4)
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	((vint32x4){ .arr = { a, b, c, d } })
-VEC_DEFINE_OPERATIONS(, 32, 4)
-VEC_GENERIC_COMPARISONS(, 32, 4)
+# define VINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 #endif
 
 #ifndef VEC_VINT64X2
@@ -164,8 +165,8 @@
 VEC_DEFINE_STRUCT(, 64, 2)
 # define VINT64x2_CONSTANT(a, b) \
 	((vint64x2){ .arr = { a, b } })
-VEC_DEFINE_OPERATIONS(, 64, 2)
-VEC_GENERIC_COMPARISONS(, 64, 2)
+# define VINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 64, 2)
 #endif
 
 #undef VEC_DEFINE_STRUCT
--- a/include/vec/impl/sse2.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/sse2.h	Wed Oct 23 10:13:25 2024 -0400
@@ -24,11 +24,9 @@
 
 #include <emmintrin.h>
 
-#include <string.h> /* memcpy */
-
 #define VEC_SSE2_ALIGNMENT 16
 
-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
 	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
 	{ \
 		return _mm_load_si128((const __m128i *)in); \
@@ -74,9 +72,9 @@
 		return _mm_xor_si128(vec1, vec2); \
 	} \
 	\
-	VEC_GENERIC_SPLAT(sign, bits, size) \
-	VEC_GENERIC_DIVIDE(sign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, bits, size) \
+	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
 	VEC_DECL_MUL(sign, bits, size); \
 	VEC_GENERIC_AVG(sign, bits, size)
 
@@ -100,9 +98,9 @@
 typedef __m128i vuint8x16;
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(u, 8, 16)
-VEC_GENERIC_COMPARISONS(u, 8, 16)
 # define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
+VEC_GENERIC_COMPARISONS(u, U, 8, 16)
 VEC_DECL_MUL(u, 8, 16)
 {
 	// unpack and multiply
@@ -119,9 +117,9 @@
 typedef __m128i vuint16x8;
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(_mm_setr_epi16(h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(u, 16, 8)
-VEC_GENERIC_COMPARISONS(u, 16, 8)
 # define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
+VEC_GENERIC_COMPARISONS(u, U, 16, 8)
 VEC_DECL_MUL(u, 16, 8)
 {
 	return _mm_mullo_epi16(vec1, vec2);
@@ -133,9 +131,9 @@
 typedef __m128i vuint32x4;
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	(_mm_setr_epi32(d, c, b, a))
-VEC_DEFINE_OPERATIONS(u, 32, 4)
-VEC_GENERIC_COMPARISONS(u, 32, 4)
 # define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
+VEC_GENERIC_COMPARISONS(u, U, 32, 4)
 VEC_DECL_MUL(u, 32, 4)
 {
 	/* this was stolen from... somewhere :) */
@@ -156,9 +154,9 @@
 {
 	return _mm_setr_epi32(b, b >> 32, a, a >> 32);
 }
-VEC_DEFINE_OPERATIONS(u, 64, 2)
-VEC_GENERIC_COMPARISONS(u, 64, 2)
 # define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
+VEC_GENERIC_COMPARISONS(u, U, 64, 2)
 VEC_DECL_MUL(u, 64, 2)
 {
 	/* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */
@@ -192,9 +190,9 @@
 typedef __m128i vint8x16;
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(, 8, 16)
+# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 VEC_DEFINE_COMPARISONS_SIGNED(8, 16)
-# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DECL_MUL(, 8, 16)
 {
 	// unpack and multiply
@@ -211,9 +209,9 @@
 typedef __m128i vint16x8;
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(_mm_setr_epi16(h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(, 16, 8)
+# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 VEC_DEFINE_COMPARISONS_SIGNED(16, 8)
-# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DECL_MUL(, 16, 8)
 {
 	return _mm_mullo_epi16(vec1, vec2);
@@ -225,9 +223,9 @@
 typedef __m128i vint32x4;
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	(_mm_setr_epi32(d, c, b, a))
-VEC_DEFINE_OPERATIONS(, 32, 4)
+# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 VEC_DEFINE_COMPARISONS_SIGNED(32, 4)
-# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DECL_MUL(, 32, 4)
 {
 	__m128i a13    = _mm_shuffle_epi32(vec1, 0xF5);     // (-,a3,-,a1)
@@ -247,9 +245,9 @@
 {
 	return _mm_setr_epi32(b, vec_rshift(b, 32), a, vec_rshift(a, 32));
 }
-VEC_DEFINE_OPERATIONS(, 64, 2)
-VEC_GENERIC_COMPARISONS(, 64, 2)
 # define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 64, 2)
+VEC_GENERIC_COMPARISONS(, , 64, 2)
 VEC_DECL_MUL(, 64, 2)
 {
 	/* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */
--- a/include/vec/vec.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/vec.h	Wed Oct 23 10:13:25 2024 -0400
@@ -49,23 +49,15 @@
 # endif
 #endif
 
-#ifndef VEC_HAVE_GNUC_VECTORS
-# if VEC_GNUC_ATLEAST(4, 0, 0)
-#  define VEC_HAVE_GNUC_VECTORS
+#ifndef VEC_ALIGNED
+# if VEC_GNUC_ATLEAST(2, 7, 0)
+#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
 # endif
 #endif
 
-#ifndef VEC_ALIGNED
-# if VEC_GNUC_ATLEAST(2, 7, 0)
-#  define VEC_ALIGNED(x) __attribute__((aligned(x)))
-# endif
-#endif
-
-#ifndef VEC_ALWAYS_INLINE
-# if VEC_GNUC_ATLEAST(3, 1, 0)
-#  define VEC_ALWAYS_INLINE(x) __attribute__((always_inline))
-# endif
-#endif
+/* FIXME: gcc 4.2 on Mac OS X doesn't have always_inline,
+ * even though docs and many online sources say that it
+ * should have it. */
 
 #ifndef VEC_ALWAYS_INLINE
 # define VEC_ALWAYS_INLINE
@@ -80,7 +72,7 @@
 /* allocate more than necessary to align */
 # define VEC_ALIGNED_ARRAY(type, var, length, align) \
 	unsigned char vec_##var##_unaligned_[((length) * sizeof(type)) + (align) - 1]; \
-	type *var = (type *)((((intptr_t)vec_##var##_unaligned_ + (align) - 1) / (align)) * (align))
+	type *var = (type *)(((intptr_t)vec_##var##_unaligned_ + ((align) - 1)) & ~((intptr_t)(align) - 1)) /* mask form: align must be a power of two */
 # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
 	(sizeof(vec_##var##_unaligned_) - ((align) - 1))
 #endif
@@ -172,6 +164,49 @@
 }
 
 /* --------------------------------------------------------------- */
+/* Array alignment macros */
+
+#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
+
+#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0)
+
+#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
+
+#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
+
+#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
+
+#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
+
+#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
+
+#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
+
+/* --------------------------------------------------------------- */
 /* Implementation includes */
 
 #define VEC_OPERATION_DECL(sign, bits, size, ret, op, params) \
@@ -207,44 +242,60 @@
 #define VEC_DECL_CMPGE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpge)
 
 /* Generic variations. */
-#define VEC_GENERIC_SPLAT(sign, bits, size) \
+#define VEC_GENERIC_SPLAT(sign, csign, bits, size) \
 	VEC_DECL_SPLAT(sign, bits, size) \
 	{ \
-		sign##int##bits##_t va[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(va); \
 		for (int i = 0; i < size; i++) va[i] = x; \
-		return v##sign##int##bits##x##size##_load(va); \
+		return v##sign##int##bits##x##size##_load_aligned(va); \
 	}
 
-#define VEC_GENERIC_DIVIDE(sign, bits, size) \
+#define VEC_GENERIC_MULTIPLY(sign, csign, bits, size) \
+	VEC_DECL_MUL(sign, bits, size) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
+	\
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \
+	\
+		for (int i = 0; i < size; i++) vec1a[i] *= vec2a[i]; \
+	\
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
+	}
+
+#define VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
 	VEC_DECL_DIV(sign, bits, size) \
 	{ \
-		sign##int##bits##_t vec1a[size], vec2a[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
 	\
-		v##sign##int##bits##x##size##_store(vec1, vec1a); \
-		v##sign##int##bits##x##size##_store(vec2, vec2a); \
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \
 	\
 		for (int i = 0; i < size; i++) vec1a[i] = (vec2a[i]) ? (vec1a[i] / vec2a[i]) : 0; \
 	\
-		return v##sign##int##bits##x##size##_load(vec1a); \
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
 	}
 
-#define VEC_GENERIC_SHIFT(sign, bits, size, vectype, way) \
+#define VEC_GENERIC_SHIFT(sign, csign, bits, size, vectype, way) \
 	VEC_DECL_SHIFT(sign, bits, size, vectype, way) \
 	{ \
-		sign##int##bits##_t vec1a[size], vec2a[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		VUINT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
 	\
-		v##sign##int##bits##x##size##_store(vec1, vec1a); \
-		vuint##bits##x##size##_store(vec2, vec2a); \
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		vuint##bits##x##size##_store_aligned(vec2, vec2a); \
 	\
 		for (int i = 0; i < size; i++) vec1a[i] = vec_##sign##vectype##way##shift(vec1a[i], vec2a[i]); \
 	\
-		return v##sign##int##bits##x##size##_load(vec1a); \
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
 	}
 
-#define VEC_GENERIC_SHIFTS(sign, bits, size) \
-	VEC_GENERIC_SHIFT(sign, bits, size,  , l) /* left shift */ \
-	VEC_GENERIC_SHIFT(sign, bits, size,  , r) /* arithmetic right shift */ \
-	VEC_GENERIC_SHIFT(sign, bits, size, l, r) /* logical right shift */
+#define VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size,  , l) /* left shift */ \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size,  , r) /* arithmetic right shift */ \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r) /* logical right shift */
 
 #define VEC_GENERIC_AVG(sign, bits, size) \
 	VEC_DECL_AVG(sign, bits, size) \
@@ -264,23 +315,24 @@
 		return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmplt(vec1, vec2)); \
 	}
 
-#define VEC_GENERIC_COMPARISON(sign, bits, size, name, op) \
+#define VEC_GENERIC_COMPARISON(sign, csign, bits, size, name, op) \
 	VEC_DECL_CMP##name(sign, bits, size) \
 	{ \
-		sign##int##bits##_t vec1a[size], vec2a[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
 	\
-		v##sign##int##bits##x##size##_store(vec1, vec1a); \
-		v##sign##int##bits##x##size##_store(vec2, vec2a); \
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \
 	\
 		for (int i = 0; i < size; i++) vec1a[i] = (vec1a[i] op vec2a[i]) ? UINT##bits##_MAX : 0; \
 	\
-		return v##sign##int##bits##x##size##_load(vec1a); \
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
 	}
 
-#define VEC_GENERIC_COMPARISONS(sign, bits, size) \
-	VEC_GENERIC_COMPARISON(sign, bits, size, LT, <) \
-	VEC_GENERIC_COMPARISON(sign, bits, size, GT, >) \
-	VEC_GENERIC_COMPARISON(sign, bits, size, EQ, ==) \
+#define VEC_GENERIC_COMPARISONS(sign, csign, bits, size) \
+	VEC_GENERIC_COMPARISON(sign, csign, bits, size, LT, <) \
+	VEC_GENERIC_COMPARISON(sign, csign, bits, size, GT, >) \
+	VEC_GENERIC_COMPARISON(sign, csign, bits, size, EQ, ==) \
 	VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size)
 
 #ifndef VEC_SUPPRESS_HW
@@ -367,81 +419,4 @@
 #undef VEC_VUINT32X4
 #undef VEC_VUINT64X2
 
-/* ---------------------------------------------------------------- */
-/* user-friendly alignment crap */
-
-#ifndef VINT8x16_ALIGNMENT
-# define VINT8x16_ALIGNMENT 1
-#endif
-
-#ifndef VINT16x8_ALIGNMENT
-# define VINT16x8_ALIGNMENT 1
-#endif
-
-#ifndef VINT32x4_ALIGNMENT
-# define VINT32x4_ALIGNMENT 1
-#endif
-
-#ifndef VINT64x2_ALIGNMENT
-# define VINT64x2_ALIGNMENT 1
-#endif
-
-#ifndef VUINT8x16_ALIGNMENT
-# define VUINT8x16_ALIGNMENT 1
-#endif
-
-#ifndef VUINT16x8_ALIGNMENT
-# define VUINT16x8_ALIGNMENT 1
-#endif
-
-#ifndef VUINT32x4_ALIGNMENT
-# define VUINT32x4_ALIGNMENT 1
-#endif
-
-#ifndef VUINT64x2_ALIGNMENT
-# define VUINT64x2_ALIGNMENT 1
-#endif
-
-/* pointer alignment macros */
-
-#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
-
-#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
-#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
-
-#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
-
-#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
-
-#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
-
-#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
-
-#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
-
-#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
-
 #endif /* VEC_VEC_H_ */
--- a/test/main.c	Tue Oct 22 23:27:15 2024 -0400
+++ b/test/main.c	Wed Oct 23 10:13:25 2024 -0400
@@ -47,30 +47,30 @@
 	UINT64_C(0xff), UINT64_C(645366),     UINT64_C(0x12345ABCDE), UINT64_C(0xF00000FFF),
 };
 
-#define VTEST(sign, bits, size) \
+#define VTEST(sign, csign, bits, size) \
 	static inline v##sign##int##bits##x##size vtest##sign##bits##x##size(const size_t start) \
 	{ \
-		sign##int##bits##_t x[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(x); \
 		for (size_t i = 0; i < size; i++) \
 			x[i] = testval##sign##bits[(start + i) % ARRAY_SIZE(testval##sign##bits)]; \
-		return v##sign##int##bits##x##size##_load(x); \
+		return v##sign##int##bits##x##size##_load_aligned(x); \
 	}
 
-#define VTEST_SIGN(bits, size) VTEST(, bits, size) VTEST(u, bits, size)
+#define VTEST_SIGN(bits, size) VTEST(, , bits, size) VTEST(u, U, bits, size)
 
 VTEST_SIGN(8, 16)
 VTEST_SIGN(16, 8)
 VTEST_SIGN(32, 4)
 VTEST_SIGN(64, 2)
 
-#define DEFINE_PRINT_VECTOR(sign, psign, bits, size) \
+#define DEFINE_PRINT_VECTOR(sign, csign, psign, bits, size) \
 	static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \
 	{ \
 		fputs("vector: ", file); \
 	\
-		int##bits##_t v[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(v); \
 	\
-		v##sign##int##bits##x##size##_store(vec, v); \
+		v##sign##int##bits##x##size##_store_aligned(vec, v); \
 	\
 		fprintf(file, "%" PRI ## psign ## bits, v[0]); \
 	\
@@ -81,7 +81,7 @@
 	\
 	}
 
-#define DEFINE_PRINT_VECTOR_2(bits, size) DEFINE_PRINT_VECTOR(, d, bits, size) DEFINE_PRINT_VECTOR(u, u, bits, size)
+#define DEFINE_PRINT_VECTOR_2(bits, size) DEFINE_PRINT_VECTOR(, , d, bits, size) DEFINE_PRINT_VECTOR(u, U, u, bits, size)
 
 DEFINE_PRINT_VECTOR_2(8, 16)
 DEFINE_PRINT_VECTOR_2(16, 8)
--- a/test/test_align.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/test/test_align.h	Wed Oct 23 10:13:25 2024 -0400
@@ -22,6 +22,9 @@
 	\
 		/* mark success or failure */ \
 		ret |= !!memcmp(vec_arr, vec_arr_out, V##csign##INT##bits##x##size##_ALIGNED_ARRAY_LENGTH(vec_arr)); \
+	\
+		ret |= !V##csign##INT##bits##x##size##_PTR_ALIGNED(vec_arr); \
+		ret |= !V##csign##INT##bits##x##size##_PTR_ALIGNED(vec_arr_out); \
 	} while (0);
 
 #define RUN_TESTS(bits, size) \
--- a/test/test_arith.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/test/test_arith.h	Wed Oct 23 10:13:25 2024 -0400
@@ -1,13 +1,15 @@
-#define CREATE_TEST(sign, psign, bits, size, op, equiv) \
+#define CREATE_TEST(sign, psign, csign, bits, size, op, equiv) \
 	static int test_arith_v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size a, v##sign##int##bits##x##size b) \
 	{ \
-		sign##int##bits##_t orig_a[size], orig_b[size], orig_c[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_b); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_c); \
 	\
 		v##sign##int##bits##x##size c = v##sign##int##bits##x##size##_##op(a, b); \
 	\
-		v##sign##int##bits##x##size##_store(a, orig_a); \
-		v##sign##int##bits##x##size##_store(b, orig_b); \
-		v##sign##int##bits##x##size##_store(c, orig_c); \
+		v##sign##int##bits##x##size##_store_aligned(a, orig_a); \
+		v##sign##int##bits##x##size##_store_aligned(b, orig_b); \
+		v##sign##int##bits##x##size##_store_aligned(c, orig_c); \
 	\
 		for (int i = 0; i < size; i++) { \
 			if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \
@@ -23,17 +25,18 @@
 		return 0; \
 	}
 
-#define CREATE_TEST_SHIFT(sign, psign, bits, size, op, equiv) \
+#define CREATE_TEST_SHIFT(sign, psign, csign, bits, size, op, equiv) \
 	static int test_arith_v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size a, vuint##bits##x##size b) \
 	{ \
-		sign##int##bits##_t orig_a[size], orig_c[size]; \
-		uint##bits##_t orig_b[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_a); \
+		VUINT##bits##x##size##_ALIGNED_ARRAY(orig_b); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_c); \
 	\
 		v##sign##int##bits##x##size c = v##sign##int##bits##x##size##_##op(a, b); \
 	\
-		v##sign##int##bits##x##size##_store(a, orig_a); \
-		vuint##bits##x##size##_store(b, orig_b); \
-		v##sign##int##bits##x##size##_store(c, orig_c); \
+		v##sign##int##bits##x##size##_store_aligned(a, orig_a); \
+		vuint##bits##x##size##_store_aligned(b, orig_b); \
+		v##sign##int##bits##x##size##_store_aligned(c, orig_c); \
 	\
 		for (int i = 0; i < size; i++) { \
 			if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \
@@ -49,22 +52,22 @@
 		return 0; \
 	}
 
-#define CREATE_TESTS(sign, psign, bits, size) \
-	CREATE_TEST(sign, psign, bits, size, add, orig_a[i] + orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, sub, orig_a[i] - orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, mul, orig_a[i] * orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, div, (orig_b[i]) ? (orig_a[i] / orig_b[i]) : 0) \
-	CREATE_TEST(sign, psign, bits, size, and, orig_a[i] & orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, or,  orig_a[i] | orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, xor, orig_a[i] ^ orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, avg, (orig_a[i] * orig_b[i]) / 2) \
-	CREATE_TEST_SHIFT(sign, psign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \
-	CREATE_TEST_SHIFT(sign, psign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \
-	CREATE_TEST_SHIFT(sign, psign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i]))
+#define CREATE_TESTS(sign, psign, csign, bits, size) \
+	CREATE_TEST(sign, psign, csign, bits, size, add, orig_a[i] + orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, sub, orig_a[i] - orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, mul, orig_a[i] * orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, div, (orig_b[i]) ? (orig_a[i] / orig_b[i]) : 0) \
+	CREATE_TEST(sign, psign, csign, bits, size, and, orig_a[i] & orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, or,  orig_a[i] | orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, xor, orig_a[i] ^ orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, avg, (orig_a[i] * orig_b[i]) / 2) \
+	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \
+	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \
+	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i]))
 
 #define CREATE_TESTS_2(bits, size) \
-	CREATE_TESTS(, d, bits, size) \
-	CREATE_TESTS(u, u, bits, size)
+	CREATE_TESTS(, d, , bits, size) \
+	CREATE_TESTS(u, u, U, bits, size)
 
 CREATE_TESTS_2(8, 16)
 CREATE_TESTS_2(16, 8)
@@ -96,8 +99,8 @@
 	\
 	for (size_t i = 0U; i < ARRAY_SIZE(testval##sign##bits); i++) { \
 		const v##sign##int##bits##x##size a = vtest##sign##bits##x##size(i); \
-		for (size_t j = 0U; j < ARRAY_SIZE(testvalu##bits); j++) { \
-			const vuint##bits##x##size b = vtestu##bits##x##size(j); \
+		for (uint32_t j = 0U; j < bits; j++) { \
+			const vuint##bits##x##size b = vuint##bits##x##size##_splat(j); \
 			ret |= test_arith_v##sign##int##bits##x##size##_rshift(a, b); \
 			ret |= test_arith_v##sign##int##bits##x##size##_lshift(a, b); \
 			ret |= test_arith_v##sign##int##bits##x##size##_lrshift(a, b); \
@@ -105,7 +108,7 @@
 	}
 
 #define RUN_TESTS_2(bits, size) \
-	RUN_TESTS(, bits, size) \
+	RUN_TESTS( , bits, size) \
 	RUN_TESTS(u, bits, size)
 
 	RUN_TESTS_2(8, 16)
--- a/test/test_compare.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/test/test_compare.h	Wed Oct 23 10:13:25 2024 -0400
@@ -58,7 +58,9 @@
 		} \
 	}
 
-#define RUN_TESTS_2(bits, size) RUN_TESTS(, bits, size) RUN_TESTS(u, bits, size)
+#define RUN_TESTS_2(bits, size) \
+	RUN_TESTS( , bits, size) \
+	RUN_TESTS(u, bits, size)
 
 	RUN_TESTS_2(8, 16)
 	RUN_TESTS_2(16, 8)