# HG changeset patch # User Paper # Date 1729654035 14400 # Node ID 3c5545b1568f367e9bb585b019ed196121347b86 # Parent f12b5dd4e18cc6f80220f3a3be074b6aa50b445e *: much better alignment support & tests diff -r f12b5dd4e18c -r 3c5545b1568f include/vec/impl/altivec.h --- a/include/vec/impl/altivec.h Tue Oct 22 22:39:05 2024 -0400 +++ b/include/vec/impl/altivec.h Tue Oct 22 23:27:15 2024 -0400 @@ -109,7 +109,7 @@ # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } VEC_DEFINE_OPERATIONS(u, 8, 16) -# define VUINT8x16_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT #endif /* VEC_VUINT8X16 */ #ifndef VEC_VINT8X16 @@ -118,7 +118,7 @@ # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ (vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } VEC_DEFINE_OPERATIONS(, 8, 16) -# define VINT8x16_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT #endif /* VEC_VINT8X16 */ #ifndef VEC_VUINT16X8 @@ -127,7 +127,7 @@ # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ (vuint16x8){ a, b, c, d, e, f, g, h } VEC_DEFINE_OPERATIONS(u, 16, 8) -# define VUINT16x8_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT #endif /* VEC_VUINT16X8 */ #ifndef VEC_VINT16X8 @@ -136,7 +136,7 @@ # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ (vint16x8){ a, b, c, d, e, f, g, h } VEC_DEFINE_OPERATIONS(, 16, 8) -# define VINT16x8_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT #endif /* VEC_VINT16X8 */ #ifndef VEC_VUINT32X4 @@ -145,7 +145,7 @@ # define VUINT32x4_CONSTANT(a, b, c, d) \ (vuint32x4){ a, b, c, d } VEC_DEFINE_OPERATIONS(u, 32, 4) -# define VUINT32x4_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT #endif /* VEC_VUINT32X4 */ #ifndef 
VEC_VINT32X4 @@ -154,7 +154,7 @@ # define VINT32x4_CONSTANT(a, b, c, d) \ (vint32x4){ a, b, c, d } VEC_DEFINE_OPERATIONS(, 32, 4) -# define VINT32x4_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT #endif /* VEC_VINT32X4 */ #if defined(__POWER8__) && defined(__VSX__) @@ -165,7 +165,7 @@ # define VUINT64x2_CONSTANT(a, b) \ (vuint64x2){ a, b } VEC_DEFINE_OPERATIONS(u, 64, 2) -# define VUINT64x2_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT # endif /* VEC_VUINT64X2 */ # ifndef VEC_VINT64X2 @@ -174,7 +174,7 @@ # define VINT64x2_CONSTANT(a, b) \ (vint64x2){ a, b } VEC_DEFINE_OPERATIONS(, 64, 2) -# define VINT64x2_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT # endif /* VEC_VINT64X2 */ #endif /* defined(__POWER8__) && defined(__VSX__) */ diff -r f12b5dd4e18c -r 3c5545b1568f include/vec/impl/gcc.h --- a/include/vec/impl/gcc.h Tue Oct 22 22:39:05 2024 -0400 +++ b/include/vec/impl/gcc.h Tue Oct 22 23:27:15 2024 -0400 @@ -28,16 +28,26 @@ #include #define VEC_DEFINE_OPERATIONS(sign, bits, size) \ - VEC_DECL_LOAD(sign, bits, size) \ + VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ { \ v##sign##int##bits##x##size vec; \ memcpy(&vec, in, sizeof(vec)); \ return vec; \ } \ \ + VEC_DECL_LOAD(sign, bits, size) \ + { \ + return v##sign##int##bits##x##size##_load_aligned(in); \ + } \ + \ + VEC_DECL_STORE_ALIGNED(sign, bits, size) \ + { \ + memcpy(out, &vec, sizeof(vec)); \ + } \ + \ VEC_DECL_STORE(sign, bits, size) \ { \ - memcpy(out, &vec, sizeof(vec)); \ + return v##sign##int##bits##x##size##_store_aligned(vec, out); \ } \ \ VEC_DECL_ADD(sign, bits, size) \ diff -r f12b5dd4e18c -r 3c5545b1568f include/vec/impl/generic.h --- a/include/vec/impl/generic.h Tue Oct 22 22:39:05 2024 -0400 +++ b/include/vec/impl/generic.h Tue Oct 22 23:27:15 2024 -0400 @@ -33,16 +33,26 @@ } v##sign##int##bits##x##size; #define VEC_DEFINE_OPERATIONS(sign, bits, 
size) \ - VEC_DECL_LOAD(sign, bits, size) \ + VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ { \ v##sign##int##bits##x##size vec; \ memcpy(vec.arr, in, sizeof(vec.arr)); \ return vec; \ } \ \ + VEC_DECL_LOAD(sign, bits, size) \ + { \ + return v##sign##int##bits##x##size##_load_aligned(in); \ + } \ + \ + VEC_DECL_STORE_ALIGNED(sign, bits, size) \ + { \ + memcpy(out, vec.arr, sizeof(vec.arr)); \ + } \ + \ VEC_DECL_STORE(sign, bits, size) \ { \ - memcpy(out, vec.arr, sizeof(vec.arr)); \ + return v##sign##int##bits##x##size##_store_aligned(vec, out); \ } \ \ VEC_DECL_ADD(sign, bits, size) \ @@ -93,7 +103,6 @@ ((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) VEC_DEFINE_OPERATIONS(u, 8, 16) VEC_GENERIC_COMPARISONS(u, 8, 16) -# define VINT8x16_ALIGNED 1 #endif #ifndef VEC_VUINT16X8 @@ -103,7 +112,6 @@ ((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } }) VEC_DEFINE_OPERATIONS(u, 16, 8) VEC_GENERIC_COMPARISONS(u, 16, 8) -# define VINT16x8_ALIGNED 1 #endif #ifndef VEC_VUINT32X4 @@ -113,7 +121,6 @@ ((vuint32x4){ .arr = { a, b, c, d } }) VEC_DEFINE_OPERATIONS(u, 32, 4) VEC_GENERIC_COMPARISONS(u, 32, 4) -# define VINT32x4_ALIGNED 1 #endif #ifndef VEC_VUINT64X2 @@ -123,7 +130,6 @@ ((vuint64x2){ .arr = { a, b } }) VEC_DEFINE_OPERATIONS(u, 64, 2) VEC_GENERIC_COMPARISONS(u, 64, 2) -# define VINT64x2_ALIGNED 1 #endif #ifndef VEC_VINT8X16 @@ -133,7 +139,6 @@ ((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) VEC_DEFINE_OPERATIONS(, 8, 16) VEC_GENERIC_COMPARISONS(, 8, 16) -# define VINT8x16_ALIGNED 1 #endif #ifndef VEC_VINT16X8 @@ -143,7 +148,6 @@ ((vint16x8){ .arr = { a, b, c, d, e, f, g, h } }) VEC_DEFINE_OPERATIONS(, 16, 8) VEC_GENERIC_COMPARISONS(, 16, 8) -# define VINT16x8_ALIGNED 1 #endif #ifndef VEC_VINT32X4 @@ -153,7 +157,6 @@ ((vint32x4){ .arr = { a, b, c, d } }) VEC_DEFINE_OPERATIONS(, 32, 4) VEC_GENERIC_COMPARISONS(, 32, 4) -# define VINT32x4_ALIGNED 1 #endif #ifndef VEC_VINT64X2 @@ -163,7 +166,6 @@ ((vint64x2){ .arr = { a, b } }) 
VEC_DEFINE_OPERATIONS(, 64, 2) VEC_GENERIC_COMPARISONS(, 64, 2) -# define VINT64x2_ALIGNED 1 #endif #undef VEC_DEFINE_STRUCT diff -r f12b5dd4e18c -r 3c5545b1568f include/vec/impl/sse2.h --- a/include/vec/impl/sse2.h Tue Oct 22 22:39:05 2024 -0400 +++ b/include/vec/impl/sse2.h Tue Oct 22 23:27:15 2024 -0400 @@ -26,12 +26,24 @@ #include /* memcpy */ +#define VEC_SSE2_ALIGNMENT 16 + #define VEC_DEFINE_OPERATIONS(sign, bits, size) \ + VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ + { \ + return _mm_load_si128((const __m128i *)in); \ + } \ + \ VEC_DECL_LOAD(sign, bits, size) \ { \ return _mm_loadu_si128((const __m128i *)in); \ } \ \ + VEC_DECL_STORE_ALIGNED(sign, bits, size) \ + { \ + _mm_store_si128((__m128i *)out, vec); \ + } \ + \ VEC_DECL_STORE(sign, bits, size) \ { \ _mm_storeu_si128((__m128i *)out, vec); \ @@ -90,7 +102,7 @@ (_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)) VEC_DEFINE_OPERATIONS(u, 8, 16) VEC_GENERIC_COMPARISONS(u, 8, 16) -# define VINT8x16_ALIGNED 1 +# define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(u, 8, 16) { // unpack and multiply @@ -109,7 +121,7 @@ (_mm_setr_epi16(h, g, f, e, d, c, b, a)) VEC_DEFINE_OPERATIONS(u, 16, 8) VEC_GENERIC_COMPARISONS(u, 16, 8) -# define VINT16x8_ALIGNED 1 +# define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(u, 16, 8) { return _mm_mullo_epi16(vec1, vec2); @@ -123,7 +135,7 @@ (_mm_setr_epi32(d, c, b, a)) VEC_DEFINE_OPERATIONS(u, 32, 4) VEC_GENERIC_COMPARISONS(u, 32, 4) -# define VUINT32x4_ALIGNED 1 +# define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(u, 32, 4) { /* this was stolen from... 
somewhere :) */ @@ -146,7 +158,7 @@ } VEC_DEFINE_OPERATIONS(u, 64, 2) VEC_GENERIC_COMPARISONS(u, 64, 2) -# define VUINT64x2_ALIGNED 1 +# define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(u, 64, 2) { /* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */ @@ -182,7 +194,7 @@ (_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)) VEC_DEFINE_OPERATIONS(, 8, 16) VEC_DEFINE_COMPARISONS_SIGNED(8, 16) -# define VINT8x16_ALIGNED 1 +# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(, 8, 16) { // unpack and multiply @@ -201,7 +213,7 @@ (_mm_setr_epi16(h, g, f, e, d, c, b, a)) VEC_DEFINE_OPERATIONS(, 16, 8) VEC_DEFINE_COMPARISONS_SIGNED(16, 8) -# define VINT16x8_ALIGNED 1 +# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(, 16, 8) { return _mm_mullo_epi16(vec1, vec2); @@ -215,7 +227,7 @@ (_mm_setr_epi32(d, c, b, a)) VEC_DEFINE_OPERATIONS(, 32, 4) VEC_DEFINE_COMPARISONS_SIGNED(32, 4) -# define VINT32x4_ALIGNED 1 +# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(, 32, 4) { __m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); // (-,a3,-,a1) @@ -237,7 +249,7 @@ } VEC_DEFINE_OPERATIONS(, 64, 2) VEC_GENERIC_COMPARISONS(, 64, 2) -# define VINT64x2_ALIGNED 1 +# define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT VEC_DECL_MUL(, 64, 2) { /* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */ diff -r f12b5dd4e18c -r 3c5545b1568f include/vec/vec.h --- a/include/vec/vec.h Tue Oct 22 22:39:05 2024 -0400 +++ b/include/vec/vec.h Tue Oct 22 23:27:15 2024 -0400 @@ -56,13 +56,13 @@ #endif #ifndef VEC_ALIGNED -# if VEC_GCC_ATLEAST(2, 7, 0) +# if VEC_GNUC_ATLEAST(2, 7, 0) # define VEC_ALIGNED(x) __attribute__((aligned(x))) # endif #endif #ifndef VEC_ALWAYS_INLINE -# if VEC_GCC_ATLEAST(3, 1, 0) +# if VEC_GNUC_ATLEAST(3, 1, 0) # define VEC_ALWAYS_INLINE(x) __attribute__((always_inline)) # endif #endif @@ -72,15 +72,22 @@ #endif #ifdef VEC_ALIGNED -# define VEC_ALIGNED_ARRAY(type, var, size, align) \ - VEC_ALIGNED(align) type var[size] +# define VEC_ALIGNED_ARRAY(type, 
var, length, align) \ + VEC_ALIGNED(align) type var[length] +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(var)) #else /* allocate more than necessary to align */ -# define VEC_ALIGNED_ARRAY(type, var, size, align) \ - type var##_unaligned_[size + align - 1]; \ - type *var = (type *)((((intptr_t)var##_unaligned_ + align - 1) / align) * align) +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + unsigned char vec_##var##_unaligned_[((length) * sizeof(type)) + (align) - 1]; \ + type *var = (type *)((((intptr_t)vec_##var##_unaligned_ + (align) - 1) / (align)) * (align)) +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(vec_##var##_unaligned_) - ((align) - 1)) #endif +#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \ + (VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var)) + /* --------------------------------------------------------------- */ /* bit shift */ @@ -178,7 +185,9 @@ #define VEC_DECL_SPLAT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, splat, (sign##int##bits##_t x)) #define VEC_DECL_LOAD(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, load, (const sign##int##bits##_t in[size])) +#define VEC_DECL_LOAD_ALIGNED(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, load_aligned, (const sign##int##bits##_t in[size])) #define VEC_DECL_STORE(sign, bits, size) VEC_OPERATION_DECL(sign, bits, size, void, store, (v##sign##int##bits##x##size vec, sign##int##bits##_t out[size])) +#define VEC_DECL_STORE_ALIGNED(sign, bits, size) VEC_OPERATION_DECL(sign, bits, size, void, store_aligned, (v##sign##int##bits##x##size vec, sign##int##bits##_t out[size])) #define VEC_DECL_ADD(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, add) #define VEC_DECL_SUB(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, sub) #define VEC_DECL_MUL(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, mul) @@ -274,18 +283,21 @@ VEC_GENERIC_COMPARISON(sign, bits, size, EQ, ==) \ VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size) +#ifndef VEC_SUPPRESS_HW 
/* POWER altivec */ -#ifdef __ALTIVEC__ -# include "impl/altivec.h" +# ifdef __ALTIVEC__ +# include "impl/altivec.h" +# endif +/* x86 SSE2 */ +# ifdef __SSE2__ +# include "impl/sse2.h" +# endif #endif -/* x86 SSE2 */ -#ifdef __SSE2__ -# include "impl/sse2.h" -#endif - -#ifdef VEC_HAVE_GNUC_VECTORS -# include "impl/gcc.h" +#ifndef VEC_SUPPRESS_GCC +# ifdef VEC_HAVE_GNUC_VECTORS +# include "impl/gcc.h" +# endif #endif #include "impl/generic.h" @@ -346,6 +358,90 @@ #undef VEC_GENERIC_COMPARISON #undef VEC_GENERIC_COMPARISONS +#undef VEC_VINT8X16 +#undef VEC_VINT16X8 +#undef VEC_VINT32X4 +#undef VEC_VINT64X2 +#undef VEC_VUINT8X16 +#undef VEC_VUINT16X8 +#undef VEC_VUINT32X4 +#undef VEC_VUINT64X2 + /* ---------------------------------------------------------------- */ +/* user-friendly alignment crap */ + +#ifndef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT 1 +#endif + +#ifndef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT 1 +#endif + +#ifndef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT 1 +#endif + +#ifndef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT 1 +#endif + +#ifndef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT 1 +#endif + +#ifndef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT 1 +#endif + +#ifndef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT 1 +#endif + +#ifndef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT 1 +#endif + +/* pointer alignment macros */ + +#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) +#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) + +#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) +#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) + +#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) +#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) + +#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) +#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) + +#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) + +#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) + +#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) + +#define
VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) #endif /* VEC_VEC_H_ */ diff -r f12b5dd4e18c -r 3c5545b1568f test/Makefile --- a/test/Makefile Tue Oct 22 22:39:05 2024 -0400 +++ b/test/Makefile Tue Oct 22 23:27:15 2024 -0400 @@ -1,12 +1,30 @@ -_CFLAGS = -g -O2 -I../include $(CFLAGS) +CFLAGS += -std=c99 -I../include -_LDFLAGS = $(LDFLAGS) +# binary files +BINS = test-gcc test-generic test-host +OBJS = $(BINS:=.o) + +.PHONY: all clean test + +all: $(BINS) -.c.o: - $(CC) -c $(_CFLAGS) $< -o $@ +# suppress the platform-dependent hardware stuff so we only have +# GCC vector extensions +test-gcc: CFLAGS += -DVEC_SUPPRESS_HW -main: main.o - $(CC) $(_CFLAGS) -o $@ $^ $(_LDFLAGS) +# also suppress GCC extensions, leaving only the defaults +test-generic: CFLAGS += -DVEC_SUPPRESS_HW -DVEC_SUPPRESS_GCC + +$(OBJS): main.c + $(CC) $(CFLAGS) -o $@ -c $^ + +$(BINS): %: %.o + $(CC) $(LDFLAGS) -o $@ $^ clean: - $(RM) main main.o \ No newline at end of file + $(RM) $(BINS) $(OBJS) + +test: clean $(BINS) + ./test-gcc + ./test-generic + ./test-host diff -r f12b5dd4e18c -r 3c5545b1568f test/main.c --- a/test/main.c Tue Oct 22 22:39:05 2024 -0400 +++ b/test/main.c Tue Oct 22 23:27:15 2024 -0400 @@ -88,6 +88,7 @@ DEFINE_PRINT_VECTOR_2(32, 4) DEFINE_PRINT_VECTOR_2(64, 2) +#include "test_align.h" #include "test_arith.h" #include "test_compare.h" @@ -95,6 +96,7 @@ { int ret = 0; + ret |= test_align(); ret |= test_arith(); ret |= test_compare(); diff -r f12b5dd4e18c -r 3c5545b1568f test/test_align.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_align.h Tue Oct 22 23:27:15 2024 -0400 @@ -0,0 +1,40 @@ +static int test_align(void) +{ + int ret = 0; + +#define RUN_TEST(sign, 
csign, bits, size) \ + do { \ + /* allocate the aligned array */ \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec_arr); \ + \ + /* fill the values */ \ + for (int i = 0; i < V##csign##INT##bits##x##size##_ALIGNED_ARRAY_LENGTH(vec_arr); i++) \ + vec_arr[i] = i; \ + \ + /* try to load it */ \ + v##sign##int##bits##x##size vec = v##sign##int##bits##x##size##_load_aligned(vec_arr); \ + \ + /* now allocate an output array */ \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec_arr_out); \ + \ + /* try storing it */ \ + v##sign##int##bits##x##size##_store_aligned(vec, vec_arr_out); \ + \ + /* mark success or failure; memcmp takes a byte count, not an element count */ \ + ret |= !!memcmp(vec_arr, vec_arr_out, V##csign##INT##bits##x##size##_ALIGNED_ARRAY_SIZEOF(vec_arr)); \ + } while (0); + +#define RUN_TESTS(bits, size) \ + RUN_TEST( , , bits, size) \ + RUN_TEST(u, U, bits, size) + + RUN_TESTS(8, 16) + RUN_TESTS(16, 8) + RUN_TESTS(32, 4) + RUN_TESTS(64, 2) + +#undef RUN_TESTS +#undef RUN_TEST + + return ret; +} diff -r f12b5dd4e18c -r 3c5545b1568f test/test_arith.h --- a/test/test_arith.h Tue Oct 22 22:39:05 2024 -0400 +++ b/test/test_arith.h Tue Oct 22 23:27:15 2024 -0400 @@ -117,4 +117,4 @@ #undef RUN_TESTS #undef RUN_TEST return ret; -} \ No newline at end of file +}