# HG changeset patch
# User Paper <paper@tflc.us>
# Date 1732131199 18000
# Node ID cf04071d2148ed0b647ab4f112495f68478443f1
# Parent  41dd962abdd1ad9a323fab3c10acdf8ffd414664
impl: initial NEON support; test: verify bit shifting functions

diff -r 41dd962abdd1 -r cf04071d2148 include/vec/impl/arm/neon.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/arm/neon.h	Wed Nov 20 14:33:19 2024 -0500
@@ -0,0 +1,489 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_ARM_NEON_H_
+#define VEC_IMPL_ARM_NEON_H_
+
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vld1_##sign##bits(in); \
+		return vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		vstore_lane_##bits(sign, vec.neon, out); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_add, \
+		v##sign##int##bits##x##size##_neon_sub, \
+		v##sign##int##bits##x##size##_neon_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_neon_and, \
+		v##sign##int##bits##x##size##_neon_or, \
+		v##sign##int##bits##x##size##_neon_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_neon_lshift, \
+		/* .rshift = */ NULL, \
+		/* .lrshift = */ NULL, \
+	};
+
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
+
+// Ok, we'll start out with the 64-bit types.
+
+#define vadd_8  vadd_s8
+#define vadd_16 vadd_s16
+#define vadd_32 vadd_s32
+#define vsub_8  vsub_s8
+#define vsub_16 vsub_s16
+#define vsub_32 vsub_s32
+#define vmul_8  vmul_s8
+#define vmul_16 vmul_s16
+#define vmul_32 vmul_s32
+#define vshl_8  vshl_s8
+#define vshl_16 vshl_s16
+#define vshl_32 vshl_s32
+#define veor_8  veor_s8
+#define veor_16 veor_s16
+#define veor_32 veor_s32
+#define vorr_8  vorr_s8
+#define vorr_16 vorr_s16
+#define vorr_32 vorr_s32
+#define vand_8  vand_s8
+#define vand_16 vand_s16
+#define vand_32 vand_s32
+#define vld1_8  vld1_s8
+#define vld1_16 vld1_s16
+#define vld1_32 vld1_s32
+#define vget_lane_8  vget_lane_s8
+#define vget_lane_16 vget_lane_s16
+#define vget_lane_32 vget_lane_s32
+#define vstore_lane_8(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##8(vec, 0); \
+		out[1] = vget_lane_##sign##8(vec, 1); \
+		out[2] = vget_lane_##sign##8(vec, 2); \
+		out[3] = vget_lane_##sign##8(vec, 3); \
+		out[4] = vget_lane_##sign##8(vec, 4); \
+		out[5] = vget_lane_##sign##8(vec, 5); \
+		out[6] = vget_lane_##sign##8(vec, 6); \
+		out[7] = vget_lane_##sign##8(vec, 7); \
+	} while (0)
+#define vstore_lane_16(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##16(vec, 0); \
+		out[1] = vget_lane_##sign##16(vec, 1); \
+		out[2] = vget_lane_##sign##16(vec, 2); \
+		out[3] = vget_lane_##sign##16(vec, 3); \
+	} while (0)
+#define vstore_lane_32(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##32(vec, 0); \
+		out[1] = vget_lane_##sign##32(vec, 1); \
+	} while (0)
+#define vreinterpret_8_u8(x) vreinterpret_s8_u8(x)
+#define vreinterpret_16_u16(x) vreinterpret_s16_u16(x)
+#define vreinterpret_32_u32(x) vreinterpret_s32_u32(x)
+
+VEC_DEFINE_OPERATIONS(8, 8)
+VEC_DEFINE_OPERATIONS(16, 4)
+VEC_DEFINE_OPERATIONS(32, 2)
+
+#undef vadd_8
+#undef vadd_16
+#undef vadd_32
+#undef vsub_8
+#undef vsub_16
+#undef vsub_32
+#undef vmul_8
+#undef vmul_16
+#undef vmul_32
+#undef vshl_8
+#undef vshl_16
+#undef vshl_32
+#undef veor_8
+#undef veor_16
+#undef veor_32
+#undef vorr_8
+#undef vorr_16
+#undef vorr_32
+#undef vand_8
+#undef vand_16
+#undef vand_32
+#undef vld1_8
+#undef vld1_16
+#undef vld1_32
+#undef vget_lane_8 
+#undef vget_lane_16
+#undef vget_lane_32
+#undef vstore_lane_8
+#undef vstore_lane_16
+#undef vstore_lane_32
+#undef vreinterpret_8_u8
+#undef vreinterpret_16_u16
+#undef vreinterpret_32_u32
+
+///////////////////////////////////////////////////////////////////////////////
+// 128-bit
+
+// Now we can go ahead and do the 128-bit ones.
+
+// NEON doesn't have native 64-bit multiplication, so we have
+// to do it ourselves
+static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b)
+{
+    const uint32x2_t ac = vreinterpret_u32_s32(vmovn_s64(a));
+    const uint32x2_t pr = vreinterpret_u32_s32(vmovn_s64(b));
+
+    const int32x4_t hi = vmulq_s32(vreinterpretq_s32_s64(b), vreinterpretq_s32_s64(a));
+
+    return vreinterpretq_s64_u64(vmlal_u32(vreinterpretq_u64_s64(vshlq_n_s64(vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s32(hi))), 32)), ac, pr));
+}
+
+static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b)
+{
+    const uint32x2_t ac = vmovn_u64(a);
+    const uint32x2_t pr = vmovn_u64(b);
+
+    const uint32x4_t hi = vmulq_u32(vreinterpretq_u32_u64(b), vreinterpretq_u32_u64(a));
+
+    return vmlal_u32(vshlq_n_u64(vpaddlq_u32(hi), 32), ac, pr);
+}
+
+#define vadd_8  vaddq_s8
+#define vadd_16 vaddq_s16
+#define vadd_32 vaddq_s32
+#define vadd_64 vaddq_s64
+#define vadd_u8  vaddq_u8
+#define vadd_u16 vaddq_u16
+#define vadd_u32 vaddq_u32
+#define vadd_u64 vaddq_u64
+#define vsub_8  vsubq_s8
+#define vsub_16 vsubq_s16
+#define vsub_32 vsubq_s32
+#define vsub_64 vsubq_s64
+#define vsub_u8  vsubq_u8
+#define vsub_u16 vsubq_u16
+#define vsub_u32 vsubq_u32
+#define vsub_u64 vsubq_u64
+#define vmul_8  vmulq_s8
+#define vmul_16 vmulq_s16
+#define vmul_32 vmulq_s32
+#define vmul_64 vmulq_s64
+#define vmul_u8  vmulq_u8
+#define vmul_u16 vmulq_u16
+#define vmul_u32 vmulq_u32
+#define vmul_u64 vmulq_u64
+#define vshl_8  vshlq_s8
+#define vshl_16 vshlq_s16
+#define vshl_32 vshlq_s32
+#define vshl_64 vshlq_s64
+#define vshl_u8  vshlq_u8
+#define vshl_u16 vshlq_u16
+#define vshl_u32 vshlq_u32
+#define vshl_u64 vshlq_u64
+#define veor_8  veorq_s8
+#define veor_16 veorq_s16
+#define veor_32 veorq_s32
+#define veor_64 veorq_s64
+#define veor_u8  veorq_u8
+#define veor_u16 veorq_u16
+#define veor_u32 veorq_u32
+#define veor_u64 veorq_u64
+#define vorr_8  vorrq_s8
+#define vorr_16 vorrq_s16
+#define vorr_32 vorrq_s32
+#define vorr_64 vorrq_s64
+#define vorr_u8  vorrq_u8
+#define vorr_u16 vorrq_u16
+#define vorr_u32 vorrq_u32
+#define vorr_u64 vorrq_u64
+#define vand_8  vandq_s8
+#define vand_16 vandq_s16
+#define vand_32 vandq_s32
+#define vand_64 vandq_s64
+#define vand_u8  vandq_u8
+#define vand_u16 vandq_u16
+#define vand_u32 vandq_u32
+#define vand_u64 vandq_u64
+#define vld1_8  vld1q_s8
+#define vld1_16 vld1q_s16
+#define vld1_32 vld1q_s32
+#define vld1_64 vld1q_s64
+#define vld1_u8  vld1q_u8
+#define vld1_u16 vld1q_u16
+#define vld1_u32 vld1q_u32
+#define vld1_u64 vld1q_u64
+#define vget_lane_8  vgetq_lane_s8
+#define vget_lane_16 vgetq_lane_s16
+#define vget_lane_32 vgetq_lane_s32
+#define vget_lane_64 vgetq_lane_s64
+#define vget_lane_u8  vgetq_lane_u8
+#define vget_lane_u16 vgetq_lane_u16
+#define vget_lane_u32 vgetq_lane_u32
+#define vget_lane_u64 vgetq_lane_u64
+#define vstore_lane_8(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##8(vec, 0); \
+		out[1] = vget_lane_##sign##8(vec, 1); \
+		out[2] = vget_lane_##sign##8(vec, 2); \
+		out[3] = vget_lane_##sign##8(vec, 3); \
+		out[4] = vget_lane_##sign##8(vec, 4); \
+		out[5] = vget_lane_##sign##8(vec, 5); \
+		out[6] = vget_lane_##sign##8(vec, 6); \
+		out[7] = vget_lane_##sign##8(vec, 7); \
+		out[8] = vget_lane_##sign##8(vec, 8); \
+		out[9] = vget_lane_##sign##8(vec, 9); \
+		out[10] = vget_lane_##sign##8(vec, 10); \
+		out[11] = vget_lane_##sign##8(vec, 11); \
+		out[12] = vget_lane_##sign##8(vec, 12); \
+		out[13] = vget_lane_##sign##8(vec, 13); \
+		out[14] = vget_lane_##sign##8(vec, 14); \
+		out[15] = vget_lane_##sign##8(vec, 15); \
+	} while (0)
+#define vstore_lane_16(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##16(vec, 0); \
+		out[1] = vget_lane_##sign##16(vec, 1); \
+		out[2] = vget_lane_##sign##16(vec, 2); \
+		out[3] = vget_lane_##sign##16(vec, 3); \
+		out[4] = vget_lane_##sign##16(vec, 4); \
+		out[5] = vget_lane_##sign##16(vec, 5); \
+		out[6] = vget_lane_##sign##16(vec, 6); \
+		out[7] = vget_lane_##sign##16(vec, 7); \
+	} while (0)
+#define vstore_lane_32(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##32(vec, 0); \
+		out[1] = vget_lane_##sign##32(vec, 1); \
+		out[2] = vget_lane_##sign##32(vec, 2); \
+		out[3] = vget_lane_##sign##32(vec, 3); \
+	} while (0)
+#define vstore_lane_64(sign, vec, out) \
+	do { \
+		out[0] = vget_lane_##sign##64(vec, 0); \
+		out[1] = vget_lane_##sign##64(vec, 1); \
+	} while (0)
+#define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x)
+#define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x)
+#define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x)
+#define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x)
+
+#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vld1_##sign##bits(in); \
+		return vec; \
+	} \
+	\
+	static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
+	{ \
+		vstore_lane_##bits(sign, vec.neon, out); \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \
+		return vec; \
+	} \
+	\
+	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \
+		/* .splat = */ NULL, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_load_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_store_aligned, \
+		v##sign##int##bits##x##size##_neon_add, \
+		v##sign##int##bits##x##size##_neon_sub, \
+		v##sign##int##bits##x##size##_neon_mul, \
+		/* .div = */ NULL, \
+		/* .avg = */ NULL, \
+		v##sign##int##bits##x##size##_neon_and, \
+		v##sign##int##bits##x##size##_neon_or, \
+		v##sign##int##bits##x##size##_neon_xor, \
+		/* .not = */ NULL, \
+		v##sign##int##bits##x##size##_neon_lshift, \
+		/* .rshift = */ NULL, \
+		/* .lrshift = */ NULL, \
+	};
+
+#define VEC_DEFINE_OPERATIONS(bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN( ,  , bits, size) \
+	VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size)
+
+VEC_DEFINE_OPERATIONS(8, 16)
+VEC_DEFINE_OPERATIONS(16, 8)
+VEC_DEFINE_OPERATIONS(32, 4)
+VEC_DEFINE_OPERATIONS(64, 2)
+
+#undef vadd_8
+#undef vadd_16
+#undef vadd_32
+#undef vadd_64
+#undef vsub_8
+#undef vsub_16
+#undef vsub_32
+#undef vsub_64
+#undef vmul_8
+#undef vmul_16
+#undef vmul_32
+#undef vmul_64
+#undef vshl_8
+#undef vshl_16
+#undef vshl_32
+#undef vshl_64
+#undef veor_8
+#undef veor_16
+#undef veor_32
+#undef veor_64
+#undef vorr_8
+#undef vorr_16
+#undef vorr_32
+#undef vorr_64
+#undef vand_8
+#undef vand_16
+#undef vand_32
+#undef vand_64
+#undef vld1_8
+#undef vld1_16
+#undef vld1_32
+#undef vld1_64
+#undef vget_lane_8 
+#undef vget_lane_16
+#undef vget_lane_32
+#undef vget_lane_64
+#undef vstore_lane_8
+#undef vstore_lane_16
+#undef vstore_lane_32
+#undef vstore_lane_64
+
+#undef VEC_DEFINE_OPERATIONS
+#undef VEC_DEFINE_OPERATIONS_SIGN
+
+#endif /* VEC_IMPL_ARM_NEON_H_ */
diff -r 41dd962abdd1 -r cf04071d2148 include/vec/impl/cpu.h
--- a/include/vec/impl/cpu.h	Wed Nov 20 12:02:15 2024 -0500
+++ b/include/vec/impl/cpu.h	Wed Nov 20 14:33:19 2024 -0500
@@ -47,23 +47,42 @@
  * 3. This notice may not be removed or altered from any source distribution.
 */
 
-# if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
-#  include <sys/sysctl.h> // For AltiVec check
-# elif defined(__OpenBSD__) && defined(__powerpc__)
-#  include <sys/types.h>
-#  include <sys/sysctl.h> // For AltiVec check
-#  include <machine/cpu.h>
-# elif defined(__FreeBSD__) && defined(__powerpc__)
-#  include <machine/cpu.h>
-#  include <sys/auxv.h>
-# elif defined(__ALTIVEC__)
-#  include <signal.h>
-#  include <setjmp.h>
+#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
+# include <sys/sysctl.h> // For AltiVec check
+#elif defined(__OpenBSD__) && defined(__powerpc__)
+# include <sys/types.h>
+# include <sys/sysctl.h> // For AltiVec check
+# include <machine/cpu.h>
+#elif defined(__FreeBSD__) && defined(__powerpc__)
+# include <machine/cpu.h>
+# include <sys/auxv.h>
+#elif defined(__ALTIVEC__)
+# include <signal.h>
+# include <setjmp.h>
+#endif
+
+#ifdef __FreeBSD__
+# include <sys/param.h>
+#endif
+
+#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
+# include <unistd.h>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+# include <elf.h>
+
+/*#include <asm/hwcap.h>*/
+# ifndef AT_HWCAP
+# define AT_HWCAP 16
 # endif
-
-# ifdef __FreeBSD__
-#  include <sys/param.h>
+# ifndef AT_PLATFORM
+#  define AT_PLATFORM 15
 # endif
+# ifndef HWCAP_NEON
+#  define HWCAP_NEON (1 << 12)
+# endif
+#endif
 
 static inline int vec_CPU_have_CPUID(void)
 {
@@ -348,6 +367,98 @@
 	return 0;
 }
 
+#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
+static int readProcAuxvForNeon(void)
+{
+	int neon = 0;
+	int fd;
+
+	fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+	if (fd >= 0) {
+		Elf32_auxv_t aux;
+		while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
+			if (aux.a_type == AT_HWCAP) {
+				neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
+				break;
+			}
+		}
+		close(fd);
+	}
+	return neon;
+}
+#endif
+
+static int vec_CPU_have_NEON(void)
+{
+/* The way you detect NEON is a privileged instruction on ARM, so you have
+   query the OS kernel in a platform-specific way. :/ */
+#if defined(SDL_CPUINFO_DISABLED)
+	return 0; /* disabled */
+#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
+/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
+/* Seems to have been removed */
+#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
+#endif
+	/* All WinRT ARM devices are required to support NEON, but just in case. */
+	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
+#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
+	return 1; /* ARMv8 always has non-optional NEON support. */
+#elif defined(__VITA__)
+	return 1;
+#elif defined(__3DS__)
+	return 0;
+#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
+	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
+	return 1; /* all Apple ARMv7 chips and later have NEON. */
+#elif defined(__APPLE__)
+	return 0; /* assume anything else from Apple doesn't have NEON. */
+#elif !defined(__arm__)
+	return 0; /* not an ARM CPU at all. */
+#elif defined(__OpenBSD__)
+	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
+#elif defined(HAVE_ELF_AUX_INFO)
+	unsigned long hasneon = 0;
+	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
+		return 0;
+
+	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
+#elif defined(__QNXNTO__)
+	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
+#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
+#elif defined(__linux__)
+	return readProcAuxvForNeon();
+#elif defined(__ANDROID__)
+	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
+	{
+		AndroidCpuFamily cpu_family = android_getCpuFamily();
+		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
+			uint64_t cpu_features = android_getCpuFeatures();
+			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
+				return 1;
+			}
+		}
+		return 0;
+	}
+#elif defined(__RISCOS__)
+	/* Use the VFPSupport_Features SWI to access the MVFR registers */
+	{
+		_kernel_swi_regs regs;
+		regs.r[0] = 0;
+		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
+			if ((regs.r[2] & 0xFFF000) == 0x111000) {
+				return 1;
+			}
+		}
+		return 0;
+	}
+#else
+#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
+	return 0;
+#endif
+}
+
 enum {
 	VEC_CPU_HAS_ALTIVEC = (1 << 0),
 	VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1),
@@ -360,6 +471,7 @@
 	VEC_CPU_HAS_AVX = (1 << 8),
 	VEC_CPU_HAS_AVX2 = (1 << 9),
 	VEC_CPU_HAS_AVX512F = (1 << 10),
+	VEC_CPU_HAS_NEON = (1 << 11),
 };
 
 #define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF)
@@ -392,6 +504,8 @@
 		vec_CPU_features |= VEC_CPU_HAS_AVX2;
 	if (vec_CPU_have_AVX512F())
 		vec_CPU_features |= VEC_CPU_HAS_AVX512F;
+	if (vec_CPU_have_NEON())
+		vec_CPU_features |= VEC_CPU_HAS_NEON;
 }
 
 #endif /* VEC_IMPL_CPU_H_ */
diff -r 41dd962abdd1 -r cf04071d2148 include/vec/impl/ppc/altivec.h
--- a/include/vec/impl/ppc/altivec.h	Wed Nov 20 12:02:15 2024 -0500
+++ b/include/vec/impl/ppc/altivec.h	Wed Nov 20 14:33:19 2024 -0500
@@ -27,12 +27,8 @@
 #ifndef VEC_IMPL_PPC_ALTIVEC_H_
 #define VEC_IMPL_PPC_ALTIVEC_H_
 
-#include <string.h>
-
 #include <altivec.h>
 
-#define VEC_ALTIVEC_ALIGNMENT 16
-
 /* GCC 4.2.1 on Mac OS X doesn't have these for some reason */
 #ifdef vec_mul
 # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
diff -r 41dd962abdd1 -r cf04071d2148 include/vec/vec.h
--- a/include/vec/vec.h	Wed Nov 20 12:02:15 2024 -0500
+++ b/include/vec/vec.h	Wed Nov 20 14:33:19 2024 -0500
@@ -111,6 +111,7 @@
 /* --------------------------------------------------------------- */
 /* Detect compiler SIMD support */
 
+#define VEC_NEON_ALIGNMENT    16
 #define VEC_ALTIVEC_ALIGNMENT 16
 #define VEC_SSE2_ALIGNMENT    16
 #define VEC_AVX2_ALIGNMENT    32
@@ -203,6 +204,67 @@
 # endif
 #endif
 
+#ifdef __ARM_NEON
+# include <arm_neon.h>
+# define VEC_COMPILER_HAS_NEON
+# if VINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VINT8x8_ALIGNMENT
+#  define VINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VINT16x4_ALIGNMENT
+#  define VINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VINT32x4_ALIGNMENT
+#  define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VUINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VUINT8x8_ALIGNMENT
+#  define VUINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VUINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VUINT16x4_ALIGNMENT
+#  define VUINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VUINT32x4_ALIGNMENT
+#  define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VINT8x16_ALIGNMENT
+#  define VINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VINT16x8_ALIGNMENT
+#  define VINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VINT32x4_ALIGNMENT
+#  define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VINT64x2_ALIGNMENT
+#  define VINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VUINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VUINT8x16_ALIGNMENT
+#  define VUINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VUINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VUINT16x8_ALIGNMENT
+#  define VUINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VUINT32x4_ALIGNMENT
+#  define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+# if VUINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
+#  undef VUINT64x2_ALIGNMENT
+#  define VUINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
+# endif
+#endif
+
 #ifdef __MMX__
 # include <mmintrin.h>
 # define VEC_COMPILER_HAS_MMX
@@ -410,7 +472,6 @@
 
 	xx.d = x;
 
-	// I have no idea what this does :)
 	xx.u += roffset;
 	xx.u >>= y;
 	xx.u -= roffset >> y;
@@ -485,6 +546,9 @@
 #ifdef VEC_COMPILER_HAS_MMX
 	__m64 mmx;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	uint8x8_t neon;
+#endif
 
 	vuint8x4 generic[2];
 } vuint8x8;
@@ -493,6 +557,9 @@
 #ifdef VEC_COMPILER_HAS_MMX
 	__m64 mmx;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	uint16x4_t neon;
+#endif
 
 	vuint16x2 generic[2];
 } vuint16x4;
@@ -501,6 +568,9 @@
 #ifdef VEC_COMPILER_HAS_MMX
 	__m64 mmx;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	uint32x2_t neon;
+#endif
 
 	vec_uint32 generic[2];
 } vuint32x2;
@@ -509,6 +579,9 @@
 #ifdef VEC_COMPILER_HAS_MMX
 	__m64 mmx;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	int8x8_t neon;
+#endif
 
 	vint8x4 generic[2];
 } vint8x8;
@@ -517,6 +590,9 @@
 #ifdef VEC_COMPILER_HAS_MMX
 	__m64 mmx;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	int16x4_t neon;
+#endif
 
 	vint16x2 generic[2];
 } vint16x4;
@@ -525,6 +601,9 @@
 #ifdef VEC_COMPILER_HAS_MMX
 	__m64 mmx;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	int32x2_t neon;
+#endif
 
 	vec_int32 generic[2];
 } vint32x2;
@@ -537,6 +616,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector unsigned char altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	uint8x16_t neon;
+#endif
 	vuint8x8 generic[2];
 } vuint8x16;
 
@@ -547,6 +629,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector unsigned short altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	uint16x8_t neon;
+#endif
 	vuint16x4 generic[2];
 } vuint16x8;
 
@@ -557,6 +642,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector unsigned int altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	uint32x4_t neon;
+#endif
 	vuint32x2 generic[2];
 } vuint32x4;
 
@@ -567,6 +655,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
 	vector unsigned long long altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	uint64x2_t neon;
+#endif
 	vec_uint64 generic[2];
 } vuint64x2;
 
@@ -577,6 +668,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector signed char altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	int8x16_t neon;
+#endif
 	vint8x8 generic[2];
 } vint8x16;
 
@@ -587,6 +681,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector signed short altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	int16x8_t neon;
+#endif
 	vint16x4 generic[2];
 } vint16x8;
 
@@ -597,6 +694,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC
 	vector signed int altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	int32x4_t neon;
+#endif
 	vint32x2 generic[2];
 } vint32x4;
 
@@ -607,6 +707,9 @@
 #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
 	vector signed long long altivec;
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	int64x2_t neon;
+#endif
 	vec_int64 generic[2];
 } vint64x2;
 
@@ -891,6 +994,10 @@
 # include "impl/x86/mmx.h"
 #endif
 
+#ifdef VEC_COMPILER_HAS_NEON
+# include "impl/arm/neon.h"
+#endif
+
 #include "impl/generic.h"
 
 /* ---------------------------------------------------------------- */
@@ -1029,6 +1136,27 @@
 		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
 	}
 #endif
+#ifdef VEC_COMPILER_HAS_NEON
+	if (vec_CPU_have_NEON()) {
+		// 64-bit
+		vint8x8_impl_cpu  = &vint8x8_impl_neon;
+		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
+		vint16x4_impl_cpu  = &vint16x4_impl_neon;
+		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
+		vint32x2_impl_cpu  = &vint32x2_impl_neon;
+		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
+
+		// 64-bit
+		vint8x16_impl_cpu  = &vint8x16_impl_neon;
+		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
+		vint16x8_impl_cpu  = &vint16x8_impl_neon;
+		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
+		vint32x4_impl_cpu  = &vint32x4_impl_neon;
+		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
+		vint64x2_impl_cpu  = &vint64x2_impl_neon;
+		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
+	}
+#endif
 	{
 		// do nothing, they're already set to generics
 	}
diff -r 41dd962abdd1 -r cf04071d2148 test/Makefile
--- a/test/Makefile	Wed Nov 20 12:02:15 2024 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,230 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.25
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/paper/Documents/src/hg/vec
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/paper/Documents/src/hg/vec/test
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles /home/paper/Documents/src/hg/vec/test//CMakeFiles/progress.marks
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-#=============================================================================
-# Target rules for targets named vec
-
-# Build rule for target.
-vec: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 vec
-.PHONY : vec
-
-# fast build rule for target.
-vec/fast:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/build
-.PHONY : vec/fast
-
-src/vec.o: src/vec.c.o
-.PHONY : src/vec.o
-
-# target to build an object file
-src/vec.c.o:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.o
-.PHONY : src/vec.c.o
-
-src/vec.i: src/vec.c.i
-.PHONY : src/vec.i
-
-# target to preprocess a source file
-src/vec.c.i:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.i
-.PHONY : src/vec.c.i
-
-src/vec.s: src/vec.c.s
-.PHONY : src/vec.s
-
-# target to generate assembly for a file
-src/vec.c.s:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.s
-.PHONY : src/vec.c.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... vec"
-	@echo "... src/vec.o"
-	@echo "... src/vec.i"
-	@echo "... src/vec.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff -r 41dd962abdd1 -r cf04071d2148 test/Makefile.template
--- a/test/Makefile.template	Wed Nov 20 12:02:15 2024 -0500
+++ b/test/Makefile.template	Wed Nov 20 14:33:19 2024 -0500
@@ -14,7 +14,8 @@
 	../include/vec/impl/generic.h \
 	test_align.h \
 	test_arith.h \
-	test_compare.h
+	test_compare.h \
+	test_shift.h
 BINS = test-generic test-host test-cxx
 OBJS = vec-generic.o vec-host.o test.o test-cxx.o
 
@@ -22,10 +23,10 @@
 
 all: $(BINS)
 
-vec-generic.o: ../src/vec.c
+vec-generic.o: ../src/vec.c $(HEADERS)
 	$(CC) $(CFLAGS) -DVEC_SUPPRESS_HW=1 -c -o $@ $<
 
-vec-host.o: ../src/vec.c
+vec-host.o: ../src/vec.c $(HEADERS)
 	$(CC) $(CFLAGS) -c -o $@ $<
 
 test.o: test.c
@@ -40,8 +41,8 @@
 test-host: vec-host.o test.o
 	$(CC) $(LDFLAGS) -o $@ $^
 
-test-cxx: test-cxx.o
-	$(CXX) $(LDFLAGS) -o $@ $^
+test-cxx: test-cxx.o $(HEADERS)
+	$(CXX) $(LDFLAGS) -o $@ $<
 
 clean:
 	$(RM) $(BINS) $(OBJS)
diff -r 41dd962abdd1 -r cf04071d2148 test/test.c
--- a/test/test.c	Wed Nov 20 12:02:15 2024 -0500
+++ b/test/test.c	Wed Nov 20 14:33:19 2024 -0500
@@ -112,6 +112,7 @@
 #include "test_align.h"
 #include "test_arith.h"
 #include "test_compare.h"
+#include "test_shift.h"
 
 // ------------------------------------------------------------
 
@@ -124,6 +125,7 @@
 	ret |= test_align();
 	ret |= test_arith();
 	ret |= test_compare();
+	ret |= test_shift();
 
 	return ret;
 }
diff -r 41dd962abdd1 -r cf04071d2148 test/test_shift.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_shift.h	Wed Nov 20 14:33:19 2024 -0500
@@ -0,0 +1,15 @@
+static int test_shift(void)
+{
+	int ret = 0;
+
+	ret |= (vec_ulrshift(0xFFFFFFFF, 16) != 0xFFFF);
+	ret |= (vec_ullshift(0xFFFF, 16) != 0xFFFF0000);
+	ret |= (vec_lrshift(0xFFFFFFFF, 16) != 0xFFFF);
+	ret |= (vec_llshift(0xFFFF, 16) != 0xFFFF0000);
+	ret |= (vec_urshift(0xFFFFFFFF, 16) != 0xFFFF);
+	ret |= (vec_ulshift(0xFFFF, 16) != 0xFFFF0000);
+	ret |= (vec_rshift(-0xFFFF, 8) != -0x100);
+	ret |= (vec_lshift(-0xFFFF, 8) != -0xFFFF00);
+
+	return ret;
+}
diff -r 41dd962abdd1 -r cf04071d2148 test/vec.pc
--- a/test/vec.pc	Wed Nov 20 12:02:15 2024 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-prefix=/usr/local
-exec_prefix=/usr/local
-libdir=${exec_prefix}/lib
-includedir=${prefix}/include
-
-Name: vec
-Description: a tiny C99 SIMD vector library
-Version: 2.0.0
-
-Requires:
-Libs: -L${libdir} -lvec
-Cflags: -I${includedir}