changeset 11:13575ba795d3

impl/gcc: add native 256-bit and 512-bit intrinsics. These are simple to implement. At some point I'd like to refactor vec to use a union and to be able to detect AVX512 and friends at compile time, so that the processors that *can* use them have them enabled at runtime. This would mean adding a vec_init function, which isn't that big of a deal: it can just be run at startup and will grab the CPU flags we need.
author Paper <paper@tflc.us>
date Mon, 18 Nov 2024 16:12:24 -0500 (2 months ago)
parents d1d5d767004c
children c93928877234 53197dbf4e8e
files include/vec/impl/gcc.h
diffstat 1 files changed, 223 insertions(+), 32 deletions(-) [+]
line wrap: on
line diff
--- a/include/vec/impl/gcc.h	Mon Nov 18 15:44:09 2024 -0500
+++ b/include/vec/impl/gcc.h	Mon Nov 18 16:12:24 2024 -0500
@@ -105,12 +105,15 @@
 	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
 	VEC_GENERIC_AVG(sign, bits, size)
 
+// -----------------------------------------------------------------------------------
+// 128-bit vector types
+
 #ifndef VEC_VUINT8X16
 # define VEC_VUINT8X16
 typedef uint8_t vuint8x16 __attribute__((__vector_size__(16)));
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-# define VUINT8x16_ALIGNMENT 1
+# define VUINT8x16_ALIGNMENT 16
 VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 #endif
 
@@ -119,7 +122,7 @@
 typedef uint16_t vuint16x8 __attribute__((__vector_size__(16)));
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vuint16x8){ a, b, c, d, e, f, g, h }
-# define VUINT16x8_ALIGNMENT 1
+# define VUINT16x8_ALIGNMENT 16
 VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 #endif
 
@@ -128,7 +131,79 @@
 typedef uint32_t vuint32x4 __attribute__((__vector_size__(16)));
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	(vuint32x4){ a, b, c, d }
-# define VUINT32x4_ALIGNMENT 1
+# define VUINT32x4_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
+#endif
+
+#ifndef VEC_VUINT64X2
+# define VEC_VUINT64X2
+typedef uint64_t vuint64x2 __attribute__((__vector_size__(16)));
+# define VUINT64x2_CONSTANT(a, b) \
+	(vuint64x2){ a, b }
+# define VUINT64x2_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
+#endif
+
+#ifndef VEC_VINT8X16
+# define VEC_VINT8X16
+typedef int8_t vint8x16 __attribute__((__vector_size__(16)));
+# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+# define VINT8x16_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(, , 8, 16)
+#endif
+
+#ifndef VEC_VINT16X8
+# define VEC_VINT16X8
+typedef int16_t vint16x8 __attribute__((__vector_size__(16)));
+# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vint16x8){ a, b, c, d, e, f, g, h }
+# define VINT16x8_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(, , 16, 8)
+#endif
+
+#ifndef VEC_VINT32X4
+# define VEC_VINT32X4
+typedef int32_t vint32x4 __attribute__((__vector_size__(16)));
+# define VINT32x4_CONSTANT(a, b, c, d) \
+	(vint32x4){ a, b, c, d }
+# define VINT32x4_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(, , 32, 4)
+#endif
+
+#ifndef VEC_VINT64X2
+# define VEC_VINT64X2
+typedef int64_t vint64x2 __attribute__((__vector_size__(16)));
+# define VINT64x2_CONSTANT(a, b) \
+	(vint64x2){ a, b }
+# define VINT64x2_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(, , 64, 2)
+#endif
+
+#ifndef VEC_VUINT8X16
+# define VEC_VUINT8X16
+typedef uint8_t vuint8x16 __attribute__((__vector_size__(16)));
+# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+# define VUINT8x16_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
+#endif
+
+#ifndef VEC_VUINT16X8
+# define VEC_VUINT16X8
+typedef uint16_t vuint16x8 __attribute__((__vector_size__(16)));
+# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vuint16x8){ a, b, c, d, e, f, g, h }
+# define VUINT16x8_ALIGNMENT 16
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
+#endif
+
+#ifndef VEC_VUINT32X4
+# define VEC_VUINT32X4
+typedef uint32_t vuint32x4 __attribute__((__vector_size__(16)));
+# define VUINT32x4_CONSTANT(a, b, c, d) \
+	(vuint32x4){ a, b, c, d }
+# define VUINT32x4_ALIGNMENT 16
 VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 #endif
 
@@ -137,44 +212,160 @@
 typedef uint64_t vuint64x2 __attribute__((__vector_size__(16)));
 # define VUINT64x2_CONSTANT(a, b) \
 	(vuint64x2){ a, b }
-# define VUINT64x2_ALIGNMENT 1
+# define VUINT64x2_ALIGNMENT 16
 VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 #endif
 
-#ifndef VEC_VINT8X16
-# define VEC_VINT8X16
-typedef int8_t vint8x16 __attribute__((__vector_size__(16)));
-# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
-	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-# define VINT8x16_ALIGNMENT 1
-VEC_DEFINE_OPERATIONS(, , 8, 16)
+// --------------------------------------------------------------------------
+// 256-bit vector types
+
+#ifndef VEC_VUINT8X32
+# define VEC_VUINT8X32
+typedef uint8_t vuint8x32 __attribute__((__vector_size__(32)));
+# define VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vuint8x32){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af })
+# define VUINT8x32_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(u, U, 8, 32)
+#endif
+
+#ifndef VEC_VUINT16X16
+# define VEC_VUINT16X16
+typedef uint16_t vuint16x16 __attribute__((__vector_size__(32)));
+# define VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vuint16x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+# define VUINT16x16_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(u, U, 16, 16)
+#endif
+
+#ifndef VEC_VUINT32X8
+# define VEC_VUINT32X8
+typedef uint32_t vuint32x8 __attribute__((__vector_size__(32)));
+# define VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vuint32x8){ a, b, c, d, e, f, g, h }
+# define VUINT32x8_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(u, U, 32, 8)
 #endif
 
-#ifndef VEC_VINT16X8
-# define VEC_VINT16X8
-typedef int16_t vint16x8 __attribute__((__vector_size__(16)));
-# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
-	(vint16x8){ a, b, c, d, e, f, g, h }
-# define VINT16x8_ALIGNMENT 1
-VEC_DEFINE_OPERATIONS(, , 16, 8)
+#ifndef VEC_VUINT64X4
+# define VEC_VUINT64X4
+typedef uint64_t vuint64x4 __attribute__((__vector_size__(32)));
+# define VUINT64x4_CONSTANT(a, b, c, d) \
+	(vuint64x4){ a, b, c, d }
+# define VUINT64x4_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(u, U, 64, 4)
+#endif
+
+#ifndef VEC_VINT8X32
+# define VEC_VINT8X32
+typedef int8_t vint8x32 __attribute__((__vector_size__(32)));
+# define VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vint8x32){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af })
+# define VINT8x32_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(, , 8, 32)
+#endif
+
+#ifndef VEC_VINT16X16
+# define VEC_VINT16X16
+typedef int16_t vint16x16 __attribute__((__vector_size__(32)));
+# define VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vint16x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+# define VINT16x16_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(, , 16, 16)
+#endif
+
+#ifndef VEC_VINT32X8
+# define VEC_VINT32X8
+typedef int32_t vint32x8 __attribute__((__vector_size__(32)));
+# define VINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vint32x8){ a, b, c, d, e, f, g, h }
+# define VINT32x8_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(, , 32, 8)
+#endif
+
+#ifndef VEC_VINT64X4
+# define VEC_VINT64X4
+typedef int64_t vint64x4 __attribute__((__vector_size__(32)));
+# define VINT64x4_CONSTANT(a, b, c, d) \
+	(vint64x4){ a, b, c, d }
+# define VINT64x4_ALIGNMENT 32
+VEC_DEFINE_OPERATIONS(, , 64, 4)
 #endif
 
-#ifndef VEC_VINT32X4
-# define VEC_VINT32X4
-typedef int32_t vint32x4 __attribute__((__vector_size__(16)));
-# define VINT32x4_CONSTANT(a, b, c, d) \
-	(vint32x4){ a, b, c, d }
-# define VINT32x4_ALIGNMENT 1
-VEC_DEFINE_OPERATIONS(, , 32, 4)
+// --------------------------------------------------------------------------
+// 512-bit vector types
+
+#ifndef VEC_VUINT8X64
+# define VEC_VUINT8X64
+typedef uint8_t vuint8x64 __attribute__((__vector_size__(64)));
+# define VUINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
+	((vuint8x64){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl })
+# define VUINT8x64_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(u, U, 8, 64)
+#endif
+
+#ifndef VEC_VUINT16X32
+# define VEC_VUINT16X32
+typedef uint16_t vuint16x32 __attribute__((__vector_size__(64)));
+# define VUINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vuint16x32){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af })
+# define VUINT16x32_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(u, U, 16, 32)
+#endif
+
+#ifndef VEC_VUINT32X16
+# define VEC_VUINT32X16
+typedef uint32_t vuint32x16 __attribute__((__vector_size__(64)));
+# define VUINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vuint32x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+# define VUINT32x16_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(u, U, 32, 16)
+#endif
+
+#ifndef VEC_VUINT64X8
+# define VEC_VUINT64X8
+typedef uint64_t vuint64x8 __attribute__((__vector_size__(64)));
+# define VUINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vuint64x8){ a, b, c, d, e, f, g, h }
+# define VUINT64x8_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(u, U, 64, 8)
 #endif
 
-#ifndef VEC_VINT64X2
-# define VEC_VINT64X2
-typedef int64_t vint64x2 __attribute__((__vector_size__(16)));
-# define VINT64x2_CONSTANT(a, b) \
-	(vint64x2){ a, b }
-# define VINT64x2_ALIGNMENT 1
-VEC_DEFINE_OPERATIONS(, , 64, 2)
+#ifndef VEC_VINT8X64
+# define VEC_VINT8X64
+typedef int8_t vint8x64 __attribute__((__vector_size__(64)));
+# define VINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
+	((vint8x64){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl })
+# define VINT8x64_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(, , 8, 64)
+#endif
+
+#ifndef VEC_VINT16X32
+# define VEC_VINT16X32
+typedef int16_t vint16x32 __attribute__((__vector_size__(64)));
+# define VINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vint16x32){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af })
+# define VINT16x32_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(, , 16, 32)
 #endif
 
+#ifndef VEC_VINT32X16
+# define VEC_VINT32X16
+typedef int32_t vint32x16 __attribute__((__vector_size__(64)));
+# define VINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	(vint32x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
+# define VINT32x16_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(, , 32, 16)
+#endif
+
+#ifndef VEC_VINT64X8
+# define VEC_VINT64X8
+typedef int64_t vint64x8 __attribute__((__vector_size__(64)));
+# define VINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	(vint64x8){ a, b, c, d, e, f, g, h }
+# define VINT64x8_ALIGNMENT 64
+VEC_DEFINE_OPERATIONS(, , 64, 8)
+#endif
+
+// ----------------------------------------------------------
+
 #undef VEC_DEFINE_OPERATIONS