changeset 0:02a517e4c492

*: initial commit
author Paper <paper@paper.us.eu.org>
date Tue, 22 Oct 2024 01:22:41 -0400
parents
children 1d9d2308c1d2
files LICENSE README include/vec/impl/altivec.h include/vec/impl/gcc.h include/vec/impl/generic.h include/vec/impl/sse2.h include/vec/vec.h
diffstat 7 files changed, 850 insertions(+), 0 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Tue Oct 22 01:22:41 2024 -0400
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Paper
+ 
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Tue Oct 22 01:22:41 2024 -0400
@@ -0,0 +1,46 @@
+vec - a tiny SIMD vector header-only library written in C99
+
+it comes with an extremely basic (and somewhat lacking) API,
+where there are eight supported vector types, all 128-bit:
+
+	vint8x16  - 16 signed 8-bit integers
+	vint16x8  - 8 signed 16-bit integers
+	vint32x4  - 4 signed 32-bit integers
+	vint64x2  - 2 signed 64-bit integers
+	vuint8x16 - 16 unsigned 8-bit integers
+	vuint16x8 - 8 unsigned 16-bit integers
+	vuint32x4 - 4 unsigned 32-bit integers
+	vuint64x2 - 2 unsigned 64-bit integers
+
+each of these types has a set of operations prefixed with the
+name of the type and an underscore, for example:
+
+	vint8x16 vint8x16_splat(int8_t x)
+	- creates a vint8x16 where all of the values are filled
+	  with the value of `x'
+
+the currently supported operations are:
+
+	v[u]intAxB splat([u]intA_t x)
+		creates a vector where all of the values are filled with
+		the value of `x'
+
+	v[u]intAxB load(const [u]intA_t x[B])
+		copies the values at the memory address `x' into a new vector;
+		the address is NOT required to be aligned
+
+	void store(v[u]intAxB vec, [u]intA_t x[B])
+		copies the values from the vector to the memory
+		address `x'
+
+		like with load(), this does not require address alignment
+
+	v[u]intAxB add(v[u]intAxB vec1, v[u]intAxB vec2)
+		adds the values of `vec1' and `vec2' and returns the result
+
+	v[u]intAxB sub(v[u]intAxB vec1, v[u]intAxB vec2)
+		subtracts `vec2' from `vec1' and returns the result
+
+	v[u]intAxB mul(v[u]intAxB vec1, v[u]intAxB vec2)
+		multiplies the values of `vec1' and `vec2' together and
+		returns the result
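+
+a minimal usage sketch (this assumes the `include' directory is on the
+compiler's include path; vec.h picks an implementation automatically
+from the compiler and target at compile time):
+
+	#include "vec/vec.h"
+	#include <stdio.h>
+
+	int main(void)
+	{
+		int32_t in[4] = {1, 2, 3, 4};
+		int32_t out[4];
+
+		vint32x4 a = vint32x4_load(in);
+		vint32x4 b = vint32x4_splat(10);
+
+		vint32x4_store(vint32x4_mul(vint32x4_add(a, b), b), out);
+
+		/* prints 110 120 130 140 */
+		printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
+		return 0;
+	}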
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/altivec.h	Tue Oct 22 01:22:41 2024 -0400
@@ -0,0 +1,145 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+/* Altivec vector support. */
+
+#include <stdint.h>
+#include <string.h>
+
+#include <altivec.h>
+
+#define VEC_ALTIVEC_ALIGNMENT 16
+
+/* Since altivec conveniently made their API super user friendly, we can just use
+ * one giant macro to define literally everything */
+#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t i) \
+	{ \
+		return vec_splats(i); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \
+	{ \
+		return vec_perm(vec_ld(0, in), vec_ld(VEC_ALTIVEC_ALIGNMENT, in), vec_lvsl(0, in)); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		VEC_ALIGNED_ARRAY(sign##int##bits##_t, aligned_out, size, VEC_ALTIVEC_ALIGNMENT); \
+		vec_st(vec, 0, aligned_out); \
+		memcpy(out, aligned_out, size * sizeof(*aligned_out)); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return vec_add(vec1, vec2); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return vec_sub(vec1, vec2); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return vec_mul(vec1, vec2); \
+	}
+
+#ifndef VEC_VUINT8X16
+# define VEC_VUINT8X16
+typedef vector unsigned char vuint8x16;
+# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+VEC_DEFINE_OPERATIONS(u, 8, 16)
+# define VUINT8x16_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+#endif /* VEC_VUINT8X16 */
+
+#ifndef VEC_VINT8X16
+# define VEC_VINT8X16
+typedef vector signed char vint8x16;
+# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+VEC_DEFINE_OPERATIONS(, 8, 16)
+# define VINT8x16_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+#endif /* VEC_VINT8X16 */
+
+#ifndef VEC_VUINT16X8
+# define VEC_VUINT16X8
+typedef vector unsigned short vuint16x8;
+# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vuint16x8){ a, b, c, d, e, f, g, h }
+VEC_DEFINE_OPERATIONS(u, 16, 8)
+# define VUINT16x8_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+#endif /* VEC_VUINT16X8 */
+
+#ifndef VEC_VINT16X8
+# define VEC_VINT16X8
+typedef vector signed short vint16x8;
+# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vint16x8){ a, b, c, d, e, f, g, h }
+VEC_DEFINE_OPERATIONS(, 16, 8)
+# define VINT16x8_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+#endif /* VEC_VINT16X8 */
+
+#ifndef VEC_VUINT32X4
+# define VEC_VUINT32X4
+typedef vector unsigned int vuint32x4;
+# define VUINT32x4_CONSTANT(a, b, c, d) \
+	(vuint32x4){ a, b, c, d }
+VEC_DEFINE_OPERATIONS(u, 32, 4)
+# define VUINT32x4_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+#endif /* VEC_VUINT32X4 */
+
+#ifndef VEC_VINT32X4
+# define VEC_VINT32X4
+typedef vector signed int vint32x4;
+# define VINT32x4_CONSTANT(a, b, c, d) \
+	(vint32x4){ a, b, c, d }
+VEC_DEFINE_OPERATIONS(, 32, 4)
+# define VINT32x4_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+#endif /* VEC_VINT32X4 */
+
+#if defined(__POWER8__) && defined(__VSX__)
+
+# ifndef VEC_VUINT64X2
+#  define VEC_VUINT64X2
+typedef vector unsigned long long vuint64x2;
+#  define VUINT64x2_CONSTANT(a, b) \
+	(vuint64x2){ a, b }
+VEC_DEFINE_OPERATIONS(u, 64, 2)
+#  define VUINT64x2_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+# endif /* VEC_VUINT64X2 */
+
+# ifndef VEC_VINT64X2
+#  define VEC_VINT64X2
+typedef vector signed long long vint64x2;
+#  define VINT64x2_CONSTANT(a, b) \
+	(vint64x2){ a, b }
+VEC_DEFINE_OPERATIONS(, 64, 2)
+#  define VINT64x2_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0)
+# endif /* VEC_VINT64X2 */
+
+#endif /* defined(__POWER8__) && defined(__VSX__) */
+
+#undef VEC_DEFINE_OPERATIONS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/gcc.h	Tue Oct 22 01:22:41 2024 -0400
@@ -0,0 +1,137 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+/* GCC built in vectors */
+
+#include <stdint.h>
+#include <string.h>
+
+#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \
+	{ \
+        v##sign##int##bits##x##size vec; \
+        for (int i = 0; i < size; i++) vec[i] = x; \
+		return vec; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		memcpy(&vec, in, sizeof(vec)); \
+		return vec; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		memcpy(out, &vec, sizeof(vec)); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return vec1 + vec2; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return vec1 - vec2; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return vec1 * vec2; \
+	}
+
+#ifndef VEC_VINT8X16
+# define VEC_VINT8X16
+typedef int8_t vint8x16 __attribute__((__vector_size__(16)));
+# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+VEC_DEFINE_OPERATIONS(, 8, 16)
+# define VINT8x16_ALIGNED 1
+#endif
+
+#ifndef VEC_VINT16X8
+# define VEC_VINT16X8
+typedef int16_t vint16x8 __attribute__((__vector_size__(16)));
+# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vint16x8){ a, b, c, d, e, f, g, h }
+VEC_DEFINE_OPERATIONS(, 16, 8)
+# define VINT16x8_ALIGNED 1
+#endif
+
+#ifndef VEC_VINT32X4
+# define VEC_VINT32X4
+typedef int32_t vint32x4 __attribute__((__vector_size__(16)));
+# define VINT32x4_CONSTANT(a, b, c, d) \
+	(vint32x4){ a, b, c, d }
+VEC_DEFINE_OPERATIONS(, 32, 4)
+# define VINT32x4_ALIGNED 1
+#endif
+
+#ifndef VEC_VINT64X2
+# define VEC_VINT64X2
+typedef int64_t vint64x2 __attribute__((__vector_size__(16)));
+# define VINT64x2_CONSTANT(a, b) \
+	(vint64x2){ a, b }
+VEC_DEFINE_OPERATIONS(, 64, 2)
+# define VINT64x2_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT8X16
+# define VEC_VUINT8X16
+typedef uint8_t vuint8x16 __attribute__((__vector_size__(16)));
+# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+VEC_DEFINE_OPERATIONS(u, 8, 16)
+# define VUINT8x16_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT16X8
+# define VEC_VUINT16X8
+typedef uint16_t vuint16x8 __attribute__((__vector_size__(16)));
+# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vuint16x8){ a, b, c, d, e, f, g, h }
+VEC_DEFINE_OPERATIONS(u, 16, 8)
+# define VUINT16x8_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT32X4
+# define VEC_VUINT32X4
+typedef uint32_t vuint32x4 __attribute__((__vector_size__(16)));
+# define VUINT32x4_CONSTANT(a, b, c, d) \
+	(vuint32x4){ a, b, c, d }
+VEC_DEFINE_OPERATIONS(u, 32, 4)
+# define VUINT32x4_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT64X2
+# define VEC_VUINT64X2
+typedef uint64_t vuint64x2 __attribute__((__vector_size__(16)));
+# define VUINT64x2_CONSTANT(a, b) \
+	(vuint64x2){ a, b }
+VEC_DEFINE_OPERATIONS(u, 64, 2)
+# define VUINT64x2_ALIGNED 1
+#endif
+
+#undef VEC_DEFINE_OPERATIONS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/generic.h	Tue Oct 22 01:22:41 2024 -0400
@@ -0,0 +1,149 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+/* Generic array-based implementation. */
+
+#include <stdint.h>
+#include <string.h>
+
+#define VEC_DEFINE_STRUCT(sign, bits, size) \
+    typedef struct { \
+        sign##int##bits##_t arr[size]; \
+    } v##sign##int##bits##x##size;
+
+#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v ## sign ## int ## bits ## x ## size ## _splat(sign ## int ## bits ## _t x) \
+	{ \
+        v##sign##int##bits##x##size vec; \
+        for (int i = 0; i < size; i++) vec.arr[i] = x; \
+		return vec; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _load(const sign ## int ## bits ## _t in[size]) \
+	{ \
+        v##sign##int##bits##x##size vec; \
+        memcpy(vec.arr, in, sizeof(vec.arr)); \
+		return vec; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE void v ## sign ## int ## bits ## x ## size ## _store(v ## sign ## int ## bits ## x ## size vec, sign ## int ## bits ## _t out[size]) \
+	{ \
+        memcpy(out, vec.arr, sizeof(vec.arr)); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _add(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \
+	{ \
+        v##sign##int##bits##x##size vec; \
+        for (int i = 0; i < size; i++) vec.arr[i] = vec1.arr[i] + vec2.arr[i]; \
+		return vec; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _sub(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \
+	{ \
+        v##sign##int##bits##x##size vec; \
+        for (int i = 0; i < size; i++) vec.arr[i] = vec1.arr[i] - vec2.arr[i]; \
+		return vec; \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _mul(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \
+	{ \
+        v##sign##int##bits##x##size vec; \
+        for (int i = 0; i < size; i++) vec.arr[i] = vec1.arr[i] * vec2.arr[i]; \
+		return vec; \
+	}
+
+#ifndef VEC_VINT8X16
+# define VEC_VINT8X16
+VEC_DEFINE_STRUCT(, 8, 16)
+# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
+VEC_DEFINE_OPERATIONS(, 8, 16)
+# define VINT8x16_ALIGNED 1
+#endif
+
+#ifndef VEC_VINT16X8
+# define VEC_VINT16X8
+VEC_DEFINE_STRUCT(, 16, 8)
+# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	((vint16x8){ .arr = { a, b, c, d, e, f, g, h } })
+VEC_DEFINE_OPERATIONS(, 16, 8)
+# define VINT16x8_ALIGNED 1
+#endif
+
+#ifndef VEC_VINT32X4
+# define VEC_VINT32X4
+VEC_DEFINE_STRUCT(, 32, 4)
+# define VINT32x4_CONSTANT(a, b, c, d) \
+	((vint32x4){ .arr = { a, b, c, d } })
+VEC_DEFINE_OPERATIONS(, 32, 4)
+# define VINT32x4_ALIGNED 1
+#endif
+
+#ifndef VEC_VINT64X2
+# define VEC_VINT64X2
+VEC_DEFINE_STRUCT(, 64, 2)
+# define VINT64x2_CONSTANT(a, b) \
+	((vint64x2){ .arr = { a, b } })
+VEC_DEFINE_OPERATIONS(, 64, 2)
+# define VINT64x2_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT8X16
+# define VEC_VUINT8X16
+VEC_DEFINE_STRUCT(u, 8, 16)
+# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
+VEC_DEFINE_OPERATIONS(u, 8, 16)
+# define VUINT8x16_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT16X8
+# define VEC_VUINT16X8
+VEC_DEFINE_STRUCT(u, 16, 8)
+# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } })
+VEC_DEFINE_OPERATIONS(u, 16, 8)
+# define VUINT16x8_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT32X4
+# define VEC_VUINT32X4
+VEC_DEFINE_STRUCT(u, 32, 4)
+# define VUINT32x4_CONSTANT(a, b, c, d) \
+	((vuint32x4){ .arr = { a, b, c, d } })
+VEC_DEFINE_OPERATIONS(u, 32, 4)
+# define VUINT32x4_ALIGNED 1
+#endif
+
+#ifndef VEC_VUINT64X2
+# define VEC_VUINT64X2
+VEC_DEFINE_STRUCT(u, 64, 2)
+# define VUINT64x2_CONSTANT(a, b) \
+	((vuint64x2){ .arr = { a, b } })
+VEC_DEFINE_OPERATIONS(u, 64, 2)
+# define VUINT64x2_ALIGNED 1
+#endif
+
+#undef VEC_DEFINE_STRUCT
+#undef VEC_DEFINE_OPERATIONS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/sse2.h	Tue Oct 22 01:22:41 2024 -0400
@@ -0,0 +1,255 @@
+/**
+ * vec - a tiny SIMD vector library in plain C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+/* x86 SSE2 vector support. */
+
+#include <stdint.h>
+#include <string.h>
+
+#include <emmintrin.h>
+
+#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \
+	{ \
+		return _mm_loadu_si128((const __m128i *)in); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
+	{ \
+		memcpy(out, &vec, sizeof(vec)); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return _mm_add_epi##bits(vec1, vec2); \
+	} \
+	\
+	static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		return _mm_sub_epi##bits(vec1, vec2); \
+	}
+
+#ifndef VEC_VINT8X16
+# define VEC_VINT8X16
+typedef __m128i vint8x16;
+# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(_mm_setr_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p))
+VEC_DEFINE_OPERATIONS(, 8, 16)
+# define VINT8x16_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vint8x16 vint8x16_mul(vint8x16 vec1, vint8x16 vec2)
+{
+	// SSE2 has no 8-bit multiply; multiply the even and odd bytes
+	// separately as 16-bit lanes (the low byte of each 16-bit product
+	// is the wrapped 8-bit product we want)
+	__m128i dst_even = _mm_mullo_epi16(vec1, vec2);
+	__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8));
+
+	// repack: odd products go into the high byte of each 16-bit lane,
+	// even products are masked down to the low byte
+	return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8));
+}
+static inline VEC_ALWAYS_INLINE vint8x16 vint8x16_splat(int8_t c)
+{
+	return VINT8x16_CONSTANT(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
+}
+#endif
+
+#ifndef VEC_VINT16X8
+# define VEC_VINT16X8
+typedef __m128i vint16x8;
+# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(_mm_setr_epi16(a, b, c, d, e, f, g, h))
+VEC_DEFINE_OPERATIONS(, 16, 8)
+# define VINT16x8_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vint16x8 vint16x8_mul(vint16x8 vec1, vint16x8 vec2)
+{
+	return _mm_mullo_epi16(vec1, vec2);
+}
+static inline VEC_ALWAYS_INLINE vint16x8 vint16x8_splat(int16_t c)
+{
+	return VINT16x8_CONSTANT(c, c, c, c, c, c, c, c);
+}
+#endif
+
+#ifndef VEC_VINT32X4
+# define VEC_VINT32X4
+typedef __m128i vint32x4;
+# define VINT32x4_CONSTANT(a, b, c, d) \
+	(_mm_setr_epi32(a, b, c, d))
+VEC_DEFINE_OPERATIONS(, 32, 4)
+# define VINT32x4_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vint32x4 vint32x4_mul(vint32x4 a, vint32x4 b)
+{
+	__m128i a13    = _mm_shuffle_epi32(a, 0xF5);        // (-,a3,-,a1)
+	__m128i b13    = _mm_shuffle_epi32(b, 0xF5);        // (-,b3,-,b1)
+	__m128i prod02 = _mm_mul_epu32(a, b);               // (-,a2*b2,-,a0*b0)
+	__m128i prod13 = _mm_mul_epu32(a13, b13);           // (-,a3*b3,-,a1*b1)
+	__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) 
+	__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) 
+	return _mm_unpacklo_epi64(prod01, prod23);          // (ab3,ab2,ab1,ab0)
+}
+static inline VEC_ALWAYS_INLINE vint32x4 vint32x4_splat(int32_t c)
+{
+	return VINT32x4_CONSTANT(c, c, c, c);
+}
+#endif
+
+#ifndef VEC_VINT64X2
+# define VEC_VINT64X2
+typedef __m128i vint64x2;
+static inline VEC_ALWAYS_INLINE vint64x2 VINT64x2_CONSTANT(int64_t a, int64_t b)
+{
+	/* _mm_setr_* takes the lowest element first, so `a' goes in lane 0
+	 * to match the element order of the other implementations */
+	return _mm_setr_epi32(a, a >> 32, b, b >> 32);
+}
+VEC_DEFINE_OPERATIONS(, 64, 2)
+# define VINT64x2_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vint64x2 vint64x2_mul(vint64x2 ab, vint64x2 cd)
+{
+	/* ac = (ab & 0xFFFFFFFF) * (cd & 0xFFFFFFFF); */
+	__m128i ac = _mm_mul_epu32(ab, cd);
+
+	/* b = ab >> 32; */
+	__m128i b = _mm_srli_epi64(ab, 32);
+
+	/* bc = b * (cd & 0xFFFFFFFF); */
+	__m128i bc = _mm_mul_epu32(b, cd);
+
+	/* d = cd >> 32; */
+	__m128i d = _mm_srli_epi64(cd, 32);
+
+	/* ad = (ab & 0xFFFFFFFF) * d; */
+	__m128i ad = _mm_mul_epu32(ab, d);
+
+	/* high = bc + ad; */
+	__m128i high = _mm_add_epi64(bc, ad);
+
+	/* high <<= 32; */
+	high = _mm_slli_epi64(high, 32);
+
+	/* return ac + high; */
+	return _mm_add_epi64(high, ac);
+}
+static inline VEC_ALWAYS_INLINE vint64x2 vint64x2_splat(int64_t c)
+{
+	return VINT64x2_CONSTANT(c, c);
+}
+#endif
+
+#ifndef VEC_VUINT8X16
+# define VEC_VUINT8X16
+typedef __m128i vuint8x16;
+# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(_mm_setr_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p))
+VEC_DEFINE_OPERATIONS(u, 8, 16)
+# define VUINT8x16_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vuint8x16 vuint8x16_mul(vuint8x16 vec1, vuint8x16 vec2)
+{
+	// same even/odd 16-bit-lane trick as vint8x16_mul above
+	__m128i dst_even = _mm_mullo_epi16(vec1, vec2);
+	__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8));
+
+	// repack
+	return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8));
+}
+static inline VEC_ALWAYS_INLINE vuint8x16 vuint8x16_splat(uint8_t c)
+{
+	return VUINT8x16_CONSTANT(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
+}
+#endif
+
+#ifndef VEC_VUINT16X8
+# define VEC_VUINT16X8
+typedef __m128i vuint16x8;
+# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(_mm_setr_epi16(a, b, c, d, e, f, g, h))
+VEC_DEFINE_OPERATIONS(u, 16, 8)
+# define VUINT16x8_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vuint16x8 vuint16x8_mul(vuint16x8 vec1, vuint16x8 vec2)
+{
+	return _mm_mullo_epi16(vec1, vec2);
+}
+static inline VEC_ALWAYS_INLINE vuint16x8 vuint16x8_splat(uint16_t c)
+{
+	return VUINT16x8_CONSTANT(c, c, c, c, c, c, c, c);
+}
+#endif
+
+#ifndef VEC_VUINT32X4
+# define VEC_VUINT32X4
+typedef __m128i vuint32x4;
+# define VUINT32x4_CONSTANT(a, b, c, d) \
+	(_mm_setr_epi32(a, b, c, d))
+VEC_DEFINE_OPERATIONS(u, 32, 4)
+# define VUINT32x4_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vuint32x4 vuint32x4_mul(vuint32x4 a, vuint32x4 b)
+{
+	/* this was stolen from... somewhere :) */
+	__m128i a13    = _mm_shuffle_epi32(a, 0xF5);        // (-,a3,-,a1)
+	__m128i b13    = _mm_shuffle_epi32(b, 0xF5);        // (-,b3,-,b1)
+	__m128i prod02 = _mm_mul_epu32(a, b);               // (-,a2*b2,-,a0*b0)
+	__m128i prod13 = _mm_mul_epu32(a13, b13);           // (-,a3*b3,-,a1*b1)
+	__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) 
+	__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2)
+	return _mm_unpacklo_epi64(prod01, prod23);          // (ab3,ab2,ab1,ab0)
+}
+static inline VEC_ALWAYS_INLINE vuint32x4 vuint32x4_splat(uint32_t c)
+{
+	return VUINT32x4_CONSTANT(c, c, c, c);
+}
+#endif
+
+#ifndef VEC_VUINT64X2
+# define VEC_VUINT64X2
+typedef __m128i vuint64x2;
+static inline VEC_ALWAYS_INLINE vuint64x2 VUINT64x2_CONSTANT(uint64_t a, uint64_t b)
+{
+	return _mm_setr_epi32(a, a >> 32, b, b >> 32);
+}
+VEC_DEFINE_OPERATIONS(u, 64, 2)
+# define VUINT64x2_ALIGNED 1
+static inline VEC_ALWAYS_INLINE vuint64x2 vuint64x2_mul(vuint64x2 ab, vuint64x2 cd)
+{
+	/* ac = (ab & 0xFFFFFFFF) * (cd & 0xFFFFFFFF); */
+	__m128i ac = _mm_mul_epu32(ab, cd);
+
+	/* b = ab >> 32; */
+	__m128i b = _mm_srli_epi64(ab, 32);
+
+	/* bc = b * (cd & 0xFFFFFFFF); */
+	__m128i bc = _mm_mul_epu32(b, cd);
+
+	/* d = cd >> 32; */
+	__m128i d = _mm_srli_epi64(cd, 32);
+
+	/* ad = (ab & 0xFFFFFFFF) * d; */
+	__m128i ad = _mm_mul_epu32(ab, d);
+
+	/* high = bc + ad; */
+	__m128i high = _mm_add_epi64(bc, ad);
+
+	/* high <<= 32; */
+	high = _mm_slli_epi64(high, 32);
+
+	/* return ac + high; */
+	return _mm_add_epi64(high, ac);
+}
+static inline VEC_ALWAYS_INLINE vuint64x2 vuint64x2_splat(uint64_t c)
+{
+	return VUINT64x2_CONSTANT(c, c);
+}
+#endif
+
+#undef VEC_DEFINE_OPERATIONS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/vec.h	Tue Oct 22 01:22:41 2024 -0400
@@ -0,0 +1,97 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ * 
+ * Copyright (c) 2024 Paper
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_VEC_H_
+#define VEC_VEC_H_
+
+#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
+	(((a) >= (x)) && \
+	 ((a) > (x) || (b) >= (y)) && \
+	 ((a) > (x) || (b) > (y) || (c) >= (z)))
+
+#define VEC_GCC_ATLEAST(x, y, z) \
+	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)
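+/* e.g. VEC_GCC_ATLEAST(4, 6, 0) is nonzero when compiling with GCC 4.6.0 or newer */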
+
+/* GCC/clang attributes */
+#if defined(__has_attribute)
+# if __has_attribute(__always_inline__)
+#  define VEC_ALWAYS_INLINE __attribute__((__always_inline__))
+# endif
+# if __has_attribute(__aligned__)
+#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
+# endif
+# if __has_attribute(__vector_size__)
+#  define VEC_HAVE_GCC_VECTORS
+# endif
+#endif
+
+#ifndef VEC_HAVE_GCC_VECTORS
+# if __GNUC__ >= 4 || __clang_major__ >= 3
+#  define VEC_HAVE_GCC_VECTORS
+# endif
+#endif
+
+#ifndef VEC_ALIGNED
+# if VEC_GCC_ATLEAST(2, 7, 0)
+#  define VEC_ALIGNED(x) __attribute__((aligned(x)))
+# endif
+#endif
+
+#ifndef VEC_ALWAYS_INLINE
+# if VEC_GCC_ATLEAST(3, 1, 0)
+#  define VEC_ALWAYS_INLINE __attribute__((always_inline))
+# endif
+#endif
+
+#ifndef VEC_ALWAYS_INLINE
+# define VEC_ALWAYS_INLINE
+#endif
+
+#ifdef VEC_ALIGNED
+# define VEC_ALIGNED_ARRAY(type, var, size, align) \
+	VEC_ALIGNED(align) type var[size]
+#else
+/* allocate more than necessary to align */
+# define VEC_ALIGNED_ARRAY(type, var, size, align) \
+	type var##_unaligned_[size + align - 1]; \
+	type *var = (type *)((((uintptr_t)var##_unaligned_ + align - 1) / align) * align)
+#endif
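+
+/* illustrative example, not part of the API: VEC_ALIGNED_ARRAY(int16_t, buf, 8, 16)
+ * declares `buf', an array of 8 int16_t usable as a 16-byte-aligned buffer, either
+ * via the aligned attribute or by over-allocating and rounding the pointer up */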
+
+/* POWER altivec */
+#ifdef __ALTIVEC__
+# include "impl/altivec.h"
+#endif
+
+/* x86_64 SSE2+ */
+#ifdef __SSE2__
+# include "impl/sse2.h"
+#endif
+
+#ifdef VEC_HAVE_GCC_VECTORS
+# include "impl/gcc.h"
+#endif
+
+#include "impl/generic.h"
+
+#endif /* VEC_VEC_H_ */