Mercurial > vec
changeset 14:981cf0bc7f3a
chore: merge diverging branches
author | Paper <paper@tflc.us> |
---|---|
date | Tue, 19 Nov 2024 15:55:01 -0500 |
parents | 53197dbf4e8e (diff) c93928877234 (current diff) |
children | e05c257c6a23 |
files | include/vec/vec.h |
diffstat | 8 files changed, 1324 insertions(+), 461 deletions(-) [+] |
line wrap: on
line diff
--- a/include/vec/impl/gcc.h Tue Nov 19 01:00:09 2024 -0500 +++ b/include/vec/impl/gcc.h Tue Nov 19 15:55:01 2024 -0500 @@ -216,156 +216,6 @@ VEC_DEFINE_OPERATIONS(u, U, 64, 2) #endif -// -------------------------------------------------------------------------- -// 256-bit vector types - -#ifndef VEC_VUINT8X32 -# define VEC_VUINT8X32 -typedef uint8_t vuint8x32 __attribute__((__vector_size__(32))); -# define VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af }) -# define VUINT8x32_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(u, U, 8, 32) -#endif - -#ifndef VEC_VUINT16X16 -# define VEC_VUINT16X16 -typedef uint16_t vuint16x16 __attribute__((__vector_size__(32))); -# define VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vuint16x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VUINT16x16_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(u, U, 16, 16) -#endif - -#ifndef VEC_VUINT32X8 -# define VEC_VUINT32X8 -typedef uint32_t vuint32x8 __attribute__((__vector_size__(32))); -# define VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vuint32x8){ a, b, c, d, e, f, g, h } -# define VUINT32x8_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(u, U, 32, 8) -#endif - -#ifndef VEC_VUINT64X4 -# define VEC_VUINT64X4 -typedef uint64_t vuint64x4 __attribute__((__vector_size__(32))); -# define VUINT64x4_CONSTANT(a, b, c, d) \ - (vuint64x4){ a, b, c, d } -# define VUINT64x4_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(u, U, 64, 4) -#endif - -#ifndef VEC_VINT8X32 -# define VEC_VINT8X32 -typedef int8_t vint8x32 __attribute__((__vector_size__(32))); -# define VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af }) -# define VINT8x32_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(, , 8, 32) -#endif - -#ifndef VEC_VINT16X16 -# define VEC_VINT16X16 -typedef int16_t vint16x16 __attribute__((__vector_size__(32))); -# define VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vint16x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VINT16x16_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(, , 16, 16) -#endif - -#ifndef VEC_VINT32X8 -# define VEC_VINT32X8 -typedef int32_t vint32x8 __attribute__((__vector_size__(32))); -# define VINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vint32x8){ a, b, c, d, e, f, g, h } -# define VINT32x8_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(, , 32, 8) -#endif - -#ifndef VEC_VINT64X4 -# define VEC_VINT64X4 -typedef int64_t vint64x4 __attribute__((__vector_size__(32))); -# define VINT64x4_CONSTANT(a, b, c, d) \ - (vint64x4){ a, b, c, d } -# define VINT64x4_ALIGNMENT 32 -VEC_DEFINE_OPERATIONS(, , 64, 4) -#endif - -// -------------------------------------------------------------------------- -// 512-bit vector types - -#ifndef VEC_VUINT8X64 -# define VEC_VUINT8X64 -typedef uint8_t vuint8x64 __attribute__((__vector_size__(64))); -# define VUINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \ - ((vuint8x64){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl }) -# define VUINT8x64_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(u, U, 8, 64) -#endif - -#ifndef VEC_VUINT16X32 -# define VEC_VUINT16X32 -typedef uint16_t vuint16x32 __attribute__((__vector_size__(64))); -# define VUINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vuint16x32){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af }) -# define VUINT16x32_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(u, U, 16, 32) -#endif - -#ifndef VEC_VUINT32X16 -# define VEC_VUINT32X16 -typedef uint32_t vuint32x16 __attribute__((__vector_size__(64))); -# define VUINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vuint32x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VUINT32x16_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(u, U, 32, 16) -#endif - -#ifndef VEC_VUINT64X8 -# define VEC_VUINT64X8 -typedef uint64_t vuint64x8 __attribute__((__vector_size__(64))); -# define VUINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vuint64x8){ a, b, c, d, e, f, g, h } -# define VUINT64x8_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(u, U, 64, 8) -#endif - -#ifndef VEC_VINT8X64 -# define VEC_VINT8X64 -typedef int8_t vint8x64 __attribute__((__vector_size__(64))); -# define VINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \ - ((vint8x64){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl }) -# define VINT8x64_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(, , 8, 64) -#endif - -#ifndef VEC_VINT16X32 -# define VEC_VINT16X32 -typedef int16_t vint16x32 __attribute__((__vector_size__(64))); -# define VINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vint16x32){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af }) -# define VINT16x32_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(, , 16, 32) -#endif - -#ifndef VEC_VINT32X16 -# define VEC_VINT32X16 -typedef int32_t vint32x16 __attribute__((__vector_size__(64))); -# define VINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vint32x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VINT32x16_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(, , 32, 16) -#endif - -#ifndef VEC_VINT64X8 -# define VEC_VINT64X8 -typedef int64_t vint64x8 __attribute__((__vector_size__(64))); -# define VINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vint64x8){ a, b, c, d, e, f, g, h } -# define VINT64x8_ALIGNMENT 64 -VEC_DEFINE_OPERATIONS(, , 64, 8) -#endif - // ---------------------------------------------------------- #undef VEC_DEFINE_OPERATIONS
--- a/include/vec/impl/generic.h Tue Nov 19 01:00:09 2024 -0500 +++ b/include/vec/impl/generic.h Tue Nov 19 15:55:01 2024 -0500 @@ -111,7 +111,7 @@ VEC_DEFINE_STRUCT(u, 16, 8) # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ ((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } }) -# define VUINT16x8_ALIGNMENT 1 +# define VUINT16x8_ALIGNMENT 2 VEC_DEFINE_OPERATIONS(u, U, 16, 8) #endif @@ -120,7 +120,7 @@ VEC_DEFINE_STRUCT(u, 32, 4) # define VUINT32x4_CONSTANT(a, b, c, d) \ ((vuint32x4){ .arr = { a, b, c, d } }) -# define VUINT32x4_ALIGNMENT 1 +# define VUINT32x4_ALIGNMENT 4 VEC_DEFINE_OPERATIONS(u, U, 32, 4) #endif @@ -129,25 +129,16 @@ VEC_DEFINE_STRUCT(u, 64, 2) # define VUINT64x2_CONSTANT(a, b) \ ((vuint64x2){ .arr = { a, b } }) -# define VUINT64x2_ALIGNMENT 1 +# define VUINT64x2_ALIGNMENT 8 VEC_DEFINE_OPERATIONS(u, U, 64, 2) #endif -#ifndef VEC_VINT8X16 -# define VEC_VINT8X16 -VEC_DEFINE_STRUCT(, 8, 16) -# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - ((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) -# define VINT8x16_ALIGNMENT 1 -VEC_DEFINE_OPERATIONS(, , 8, 16) -#endif - #ifndef VEC_VINT16X8 # define VEC_VINT16X8 VEC_DEFINE_STRUCT(, 16, 8) # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ ((vint16x8){ .arr = { a, b, c, d, e, f, g, h } }) -# define VINT16x8_ALIGNMENT 1 +# define VINT16x8_ALIGNMENT 2 VEC_DEFINE_OPERATIONS(, , 16, 8) #endif @@ -156,7 +147,7 @@ VEC_DEFINE_STRUCT(, 32, 4) # define VINT32x4_CONSTANT(a, b, c, d) \ ((vint32x4){ .arr = { a, b, c, d } }) -# define VINT32x4_ALIGNMENT 1 +# define VINT32x4_ALIGNMENT 4 VEC_DEFINE_OPERATIONS(, , 32, 4) #endif @@ -165,7 +156,7 @@ VEC_DEFINE_STRUCT(, 64, 2) # define VINT64x2_CONSTANT(a, b) \ ((vint64x2){ .arr = { a, b } }) -# define VINT64x2_ALIGNMENT 1 +# define VINT64x2_ALIGNMENT 8 VEC_DEFINE_OPERATIONS(, , 64, 2) #endif
--- a/include/vec/impl/sse2.h Tue Nov 19 01:00:09 2024 -0500 +++ b/include/vec/impl/sse2.h Tue Nov 19 15:55:01 2024 -0500 @@ -141,16 +141,6 @@ } \ VEC_GENERIC_THAN_OR_EQUAL(, bits, size) -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -typedef __m128i vuint8x16; -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)) -# define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 8, 16) -VEC_GENERIC_COMPARISONS(u, U, 8, 16) -#endif - #ifndef VEC_VUINT16X8 # define VEC_VUINT16X8 typedef __m128i vuint16x8;
--- a/include/vec/vec.h Tue Nov 19 01:00:09 2024 -0500 +++ b/include/vec/vec.h Tue Nov 19 15:55:01 2024 -0500 @@ -26,6 +26,7 @@ #define VEC_VEC_H_ #include <stdint.h> +#include <string.h> #include <limits.h> #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ @@ -75,22 +76,137 @@ # define VEC_FUNC_KEYWORDS static inline VEC_ALWAYS_INLINE #endif -#ifdef VEC_ALIGNED -# define VEC_ALIGNED_ARRAY(type, var, length, align) \ - VEC_ALIGNED(align) type var[length] -# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(var)) +#if (__STDC_VERSION__ >= 201112L) +# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg) #else -/* allocate more than necessary to align */ -# define VEC_ALIGNED_ARRAY(type, var, length, align) \ - unsigned char vec_##var##_unaligned_[((length) * sizeof(type)) + (align) - 1]; \ - type *var = (type *)(((intptr_t)vec_##var##_unaligned_ + (align - 1)) & ~(align - 1)) -# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(vec_##var##_unaligned_) - ((align) - 1)) +// C99 static assertion +# define VEC_STATIC_ASSERT(x, msg) \ + extern int (*vec_impl_Static_assert_function_(void)) \ + [!!sizeof (struct { int __error_if_negative: (x) ? 2 : -1; })] +#endif + +#ifndef VEC_ASSERT +# ifndef VEC_DISABLE_ASSERTIONS +# include <assert.h> +# define VEC_ASSERT(x, msg) assert(msg && x) +# else +# define VEC_ASSERT(x, msg) +# endif +#endif + +/* --------------------------------------------------------------- */ +/* Detect compiler SIMD support */ + +// IIRC `__VEC__' is also defined, but I don't know for sure. +// IBM says that `__ALTIVEC__' is standard though. +#ifdef __ALTIVEC__ +# include <altivec.h> +# define VEC_COMPILER_HAS_ALTIVEC + +# define VINT8x16_ALIGNMENT 16 +# define VINT16x8_ALIGNMENT 16 +# define VINT32x4_ALIGNMENT 16 +# define VINT64x2_ALIGNMENT 16 +#endif + +#ifdef __SSE2__ +# include <immintrin.h> +# define VEC_COMPILER_HAS_SSE2 +# ifdef __SSE42__ +# define VEC_COMPILER_HAS_SSE42 +# endif + +# define VINT8x16_ALIGNMENT 16 +# define VINT16x8_ALIGNMENT 16 +# define VINT32x4_ALIGNMENT 16 +# define VINT64x2_ALIGNMENT 16 #endif -#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \ - (VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var)) +#ifndef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT 1 +#endif +#ifndef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT 1 +#endif +#ifndef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT 1 +#endif +#ifndef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT 1 +#endif +#ifndef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT 1 +#endif +#ifndef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT 1 +#endif +#ifndef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT 1 +#endif +#ifndef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT 1 +#endif + +// generic 256-bit is just doubled 128-bit +#ifndef VINT8x32_ALIGNMENT +# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT +#endif +#ifndef VINT16x16_ALIGNMENT +# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT +#endif +#ifndef VINT32x8_ALIGNMENT +# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT +#endif +#ifndef VINT64x4_ALIGNMENT +# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT +#endif +#ifndef VUINT8x32_ALIGNMENT +# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT +#endif +#ifndef VUINT16x16_ALIGNMENT +# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT +#endif +#ifndef VUINT32x8_ALIGNMENT +# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT +#endif +#ifndef VUINT64x4_ALIGNMENT +# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT +#endif + +// generic 512-bit is just doubled 256-bit +#ifndef VINT8x64_ALIGNMENT +# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT +#endif +#ifndef VINT16x32_ALIGNMENT +# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT +#endif +#ifndef VINT32x16_ALIGNMENT +# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT +#endif +#ifndef VINT64x8_ALIGNMENT +# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT +#endif +#ifndef VUINT8x64_ALIGNMENT +# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT +#endif +#ifndef VUINT16x32_ALIGNMENT +# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT +#endif +#ifndef VUINT32x16_ALIGNMENT +# define VUINT32x16_ALIGNMENT VUINT32x16_ALIGNMENT +#endif +#ifndef VUINT64x8_ALIGNMENT +# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT +#endif + +/* --------------------------------------------------------------- */ +/* Detect CPU SIMD support */ + +// stubs for now... will be implemented sometime +#define VEC_CPU_have_SSE2() (0) +#define VEC_CPU_have_SSE42() (0) +#define VEC_CPU_have_ALTIVEC() (0) +#define VEC_CPU_have_ALTIVEC_VSX() (0) /* --------------------------------------------------------------- */ /* bit shift */ @@ -107,12 +223,28 @@ VEC_FUNC_KEYWORDS intmax_t vec_lrshift(intmax_t x, unsigned int y) { - return (intmax_t)(((uintmax_t)x) >> y); + // reinterpret as unsigned integer and then shift + union { + intmax_t d; + uintmax_t u; + } xx; + + xx.d = x; + xx.u >> y; + return xx.d; } VEC_FUNC_KEYWORDS intmax_t vec_llshift(intmax_t x, unsigned int y) { - return (intmax_t)(((uintmax_t)x) << y); + // reinterpret as unsigned integer and then shift + union { + intmax_t d; + uintmax_t u; + } xx; + + xx.d = x; + xx.u << y; + return xx.d; } VEC_FUNC_KEYWORDS uintmax_t vec_urshift(uintmax_t x, unsigned int y) @@ -153,326 +285,1299 @@ **/ VEC_FUNC_KEYWORDS intmax_t vec_rshift(intmax_t x, unsigned int y) { - static const uintmax_t roffset = UINTMAX_C(1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + + union { + intmax_t d; + uintmax_t u; + } xx; - uintmax_t urx = (uintmax_t)x; - urx += roffset; - urx >>= y; - urx -= roffset >> y; + xx.d = x; - return (intmax_t)urx; + // I have no idea what this does :) + xx.u += roffset; + xx.u >>= y; + xx.u -= roffset >> y; + + return xx.d; } VEC_FUNC_KEYWORDS intmax_t vec_lshift(intmax_t x, unsigned int y) { - static const uintmax_t roffset = UINTMAX_C(1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + + union { + intmax_t d; + uintmax_t u; + } xx; - uintmax_t urx = (uintmax_t)x; - urx += roffset; - urx <<= y; - urx -= roffset << y; + xx.d = x; - return (intmax_t)urx; + xx.u += roffset; + xx.u <<= y; + xx.u -= roffset << y; + + return xx.d; } /* --------------------------------------------------------------- */ /* Array alignment macros */ +#include <stdio.h> + +#ifdef VEC_ALIGNED +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_ALIGNED(align) type var[length] +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(var)) +#else +/* the alignment must be specified in bytes and must be a multiple of the + * type size. it is always assumed that the type will be on a boundary of + * its size, which may or may not be true */ +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_STATIC_ASSERT(align % sizeof(type) == 0 && align != 0, "vec: alignment needs to be a multiple of the type size and non-zero"); \ + type vec_##var##_unaligned_[(length) + (align / sizeof(type)) - 1]; \ + type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + (align - 1)) & ~(align - 1)); \ + VEC_ASSERT(((uintptr_t)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(vec_##var##_unaligned_) - ((align) - 1)) +#endif + +#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \ + (VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var)) + +// ------------------------------------------------------------ +// predefined variants for each vector type + #define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) #define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) #define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) #define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) #define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) #define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) #define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT) -#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) -#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) #define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) #define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) #define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) #define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) #define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) #define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) #define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) #define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) #define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) #define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) #define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) #define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) #define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) #define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT) -#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) -#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) #define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) #define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) #define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) #define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) #define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) #define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) #define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) #define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) #define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) #define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) #define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) #define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) #define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) #define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x16_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) #define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) #define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) #define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) #define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) #define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) #define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 64, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) #define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) #define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x16_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) #define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) #define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) #define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) #define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) #define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) /* --------------------------------------------------------------- */ -/* Implementation defines to keep everything relatively consistent */ +/* Defines the structures for each vector type */ + +// 128-bit +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector unsigned char altivec; +#endif + uint8_t generic[16]; +} vuint8x16; + +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector unsigned short altivec; +#endif + uint16_t generic[8]; +} vuint16x8; -#define VEC_OPERATION_DECL(sign, bits, size, ret, op, params) \ - VEC_FUNC_KEYWORDS ret v##sign##int##bits##x##size##_##op params +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector unsigned int altivec; +#endif + uint32_t generic[4]; +} vuint32x4; + +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + vector unsigned long long altivec; +#endif + uint64_t generic[2]; +} vuint64x2; + +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector signed char altivec; +#endif + int8_t generic[16]; +} vint8x16; + +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector signed short altivec; +#endif + int16_t generic[8]; +} vint16x8; -#define VEC_OPERATION_THIS_DECL(sign, bits, size, op, params) \ - VEC_OPERATION_DECL(sign, bits, size, v##sign##int##bits##x##size, op, params) +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector signed int altivec; +#endif + int32_t generic[4]; +} vint32x4; + +typedef union { +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + vector signed long long altivec; +#endif + int64_t generic[2]; +} vint64x2; + +// 256-bit +typedef union { + vuint8x16 generic[2]; +} vuint8x32; + +typedef union { + vuint16x8 generic[2]; +} vuint16x16; + +typedef union { + vuint32x4 generic[2]; +} vuint32x8; -#define VEC_TWOWAY_DECL(sign, bits, size, op) \ - VEC_OPERATION_THIS_DECL(sign, bits, size, op, (v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2)) +typedef union { + vuint64x2 generic[2]; +} vuint64x4; + +typedef union { + vint8x16 generic[2]; +} vint8x32; + +typedef union { + vint16x8 generic[2]; +} vint16x16; + +typedef union { + vint32x4 generic[2]; +} vint32x8; + +typedef union { + vint64x2 generic[2]; +} vint64x4; + +// 512-bit +typedef union { + vuint8x32 generic[2]; +} vuint8x64; + +typedef union { + vuint16x16 generic[2]; +} vuint16x32; + +typedef union { + vuint32x8 generic[2]; +} vuint32x16; + +typedef union { + vuint64x4 generic[2]; +} vuint64x8; -#define VEC_DECL_SPLAT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, splat, (sign##int##bits##_t x)) -#define VEC_DECL_LOAD(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, load, (const sign##int##bits##_t in[size])) -#define VEC_DECL_LOAD_ALIGNED(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, load_aligned, (const sign##int##bits##_t in[size])) -#define VEC_DECL_STORE(sign, bits, size) VEC_OPERATION_DECL(sign, bits, size, void, store, (v##sign##int##bits##x##size vec, sign##int##bits##_t out[size])) -#define VEC_DECL_STORE_ALIGNED(sign, bits, size) VEC_OPERATION_DECL(sign, bits, size, void, store_aligned, (v##sign##int##bits##x##size vec, sign##int##bits##_t out[size])) -#define VEC_DECL_ADD(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, add) -#define VEC_DECL_SUB(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, sub) -#define VEC_DECL_MUL(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, mul) -#define VEC_DECL_DIV(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, div) -#define VEC_DECL_AND(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, and) -#define VEC_DECL_OR(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, or) -#define VEC_DECL_XOR(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, xor) -#define VEC_DECL_AVG(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, avg) -#define VEC_DECL_SHIFT(sign, bits, size, vectype, way) VEC_OPERATION_THIS_DECL(sign, bits, size, vectype##way##shift, (v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2)) -#define VEC_DECL_LSHIFT(sign, bits, size) VEC_DECL_SHIFT(sign, bits, size, , l) -#define VEC_DECL_RSHIFT(sign, bits, size) VEC_DECL_SHIFT(sign, bits, size, , r) -#define VEC_DECL_LRSHIFT(sign, bits, size) VEC_DECL_SHIFT(sign, bits, size, l, r) -#define VEC_DECL_NOT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, not, (v##sign##int##bits##x##size vec)) +typedef union { + vint8x32 generic[2]; +} vint8x64; + +typedef union { + vint16x16 generic[2]; +} vint16x32; + +typedef union { + vint32x8 generic[2]; +} vint32x16; + +typedef union { + vint64x4 generic[2]; +} vint64x8; + +// -------------------------------------------------------------------------------- +// okay, now onto the actual functions: +// +// we have generic variations of every major operation EXCEPT aligned load and +// aligned store. this means that a vector implementation can be created with +// only aligned load and aligned store implemented, which sucks, but it werks + +#define VEC_GENERIC_OPERATION(op, sign, csign, bits, size) \ + do { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \ + \ + v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ + v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \ + \ + for (int i = 0; i < size; i++) varr1[i] = (op); \ + \ + return v##sign##int##bits##x##size##_load_aligned(varr1); \ + } while (0) + +#define VEC_GENERIC_ADD(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size) +#define VEC_GENERIC_SUB(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size) +#define VEC_GENERIC_MUL(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size) +#define VEC_GENERIC_DIV(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size) +#define VEC_GENERIC_AND(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size) +#define VEC_GENERIC_OR(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size) +#define VEC_GENERIC_XOR(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size) + +#define VEC_GENERIC_CMP(op, sign, csign, bits, size) \ + VEC_GENERIC_OPERATION((varr1[i] op varr1[i]) ? csign##INT##bits##_MAX : 0, sign, csign, bits, size) + +#define VEC_GENERIC_CMPLT(sign, csign, bits, size) VEC_GENERIC_CMP(<, sign, csign, bits, size) +#define VEC_GENERIC_CMPLE(sign, csign, bits, size) VEC_GENERIC_CMP(<=, sign, csign, bits, size) +#define VEC_GENERIC_CMPEQ(sign, csign, bits, size) VEC_GENERIC_CMP(==, sign, csign, bits, size) +#define VEC_GENERIC_CMPGE(sign, csign, bits, size) VEC_GENERIC_CMP(>=, sign, csign, bits, size) +#define VEC_GENERIC_CMPGT(sign, csign, bits, size) VEC_GENERIC_CMP(>, sign, csign, bits, size) + +#define VEC_GENERIC_SHIFT(op, sign, csign, bits, size) \ + do { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ + VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \ + \ + v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ + vuint##bits##x##size##_store_aligned(vec2, varr2); \ + \ + for (int i = 0; i < size; i++) varr1[i] = (op); \ + \ + return v##sign##int##bits##x##size##_load_aligned(varr1); \ + } while (0) -/* comparisons */ -#define VEC_DECL_CMPLT(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmplt) -#define VEC_DECL_CMPGT(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpgt) -#define VEC_DECL_CMPEQ(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpeq) -#define VEC_DECL_CMPLE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmple) -#define VEC_DECL_CMPGE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpge) +#define VEC_GENERIC_LSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size) +#define VEC_GENERIC_RSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size) +#define VEC_GENERIC_LRSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size) + +#ifdef VEC_COMPILER_HAS_SSE2 +// these are shared between SSE2 variations +# define VEC_SSE2_MUL_8x16(sign) \ + do { \ + /* unpack and multiply */ \ + __m128i dst_even = _mm_mullo_epi16(vec1.sse, vec2.sse); \ + __m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \ + \ + /* repack */ \ + return (v##sign##int8x16){ .sse = _mm_or_si128( \ + _mm_slli_epi16(dst_odd, 8), \ + _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ + )}; \ + } while (0) + +# define VEC_SSE2_MUL_16x8(sign) \ + do { \ + /* we have a real instruction for this */ \ + return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \ + } while (0) -/* Generic variations. */ -#define VEC_GENERIC_SPLAT(sign, csign, bits, size) \ - VEC_DECL_SPLAT(sign, bits, size) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(va); \ - for (int i = 0; i < size; i++) va[i] = x; \ - return v##sign##int##bits##x##size##_load_aligned(va); \ +# define VEC_SSE2_MUL_32x4(sign) \ + do { \ + /* this was stolen from... somewhere :) */ \ + __m128i a13 = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \ + __m128i b13 = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \ + __m128i prod02 = _mm_mul_epu32(vec1, vec2); /* (-,a2*b2,-,a0*b0) */ \ + __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ + __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ + __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ + return (v##sign##int32x4) {.sse = _mm_unpacklo_epi64(prod01, prod23)}; /* (ab3,ab2,ab1,ab0) */ \ + } while (0) + +# define VEC_SSE2_MUL_64x2(sign) \ + do { \ + __m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \ + __m128i b = _mm_srli_epi64(vec1.sse, 32); /* b = vec1 >> 32; */ \ + __m128i bc = _mm_mul_epu32(b, vec2.sse); /* bc = b * (vec2 & UINT32_MAX); */ \ + __m128i d = _mm_srli_epi64(vec2.sse, 32); /* d = vec2 >> 32; */ \ + __m128i ad = _mm_mul_epu32(vec1.sse, d); /* ad = (vec1 & UINT32_MAX) * d; */ \ + __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \ + hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \ + return (v##sign##int64x2) {.sse = _mm_add_epi64(hi, ac); } /* return ac + hi; */ \ + } while (0) +#endif + +// -------------------------------------------------------------------------------- +// vuint8x16 implementation + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load_aligned(const uint8_t in[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vuint8x16) { .sse = _mm_load_si128((__m128i *)in) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_ld(0, in); + } else +#endif + { + vuint8x16 vec; + memcpy(vec.generic, in, sizeof(vec.generic)); + return vec; } -#define VEC_GENERIC_MULTIPLY(sign, csign, bits, size) \ - VEC_DECL_MUL(sign, bits, size) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \ - v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \ - \ - for (int i = 0; i < size; i++) vec1a[i] *= vec2a[i]; \ - \ - return v##sign##int##bits##x##size##_load_aligned(vec1a); \ + VEC_ASSERT(0, "No suitable load_aligned variant found"); + + return (vuint8x16){ 0 }; +} + +VEC_FUNC_KEYWORDS void vuint8x16_store_aligned(vuint8x16 vec, uint8_t out[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + _mm_store_si128((__m128i *)out, vec.sse); + return; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + vec_st(vec.altivec, 0, out); + return; + } else +#endif + { + memcpy(out, vec.generic, sizeof(vec.generic)); + return; + } + + VEC_ASSERT(0, "No suitable aligned store variant found"); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_splat(uint8_t x) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return (vuint8x16){ .altivec = vec_splat_u8(x) }; + } else +#endif + { + return (vuint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} }; + } + + // okay, we don't have a regular thing. call the load function with a splatted array + VUINT8x16_ALIGNED_ARRAY(arr); + for (int i = 0; i < 16; i++) arr[i] = x; + return vuint8x16_load_aligned(arr); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load(const uint8_t in[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vuint8x16) { .sse = _mm_loadu_si128((__m128i *)in) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)); + } else +#endif + { + vuint8x16 vec; + memcpy(vec.generic, in, sizeof(vec.generic)); + return vec; + } + + // ok, we don't have unaligned load, copy the array + // and call the aligned load function + VUINT8x16_ALIGNED_ARRAY(aligned_in); + memcpy(aligned_in, in, 16); + return vuint8x16_load_aligned(aligned_in); +} + +VEC_FUNC_KEYWORDS void vuint8x16_store(vuint8x16 vec, uint8_t out[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + _mm_storeu_si128((__m128i *)out, vec.sse); + return; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // noop + } else +#endif + { + memcpy(out, vec.generic, sizeof(vec.generic)); + return; + } + + // no unaligned store? use the aligned version + VUINT8x16_ALIGNED_ARRAY(aligned_out); + vuint8x16_store_aligned(vec, aligned_out); + + // then copy to the output buffer + memcpy(out, aligned_out, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_add(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vuint8x16) { .sse = _mm_add_epi8(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_add(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] += vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_ADD(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_sub(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vuint8x16) { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_sub(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] -= vec2.generic[i]; + return vec1; } -#define VEC_GENERIC_DIVIDE(sign, csign, bits, size) \ - VEC_DECL_DIV(sign, bits, size) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \ - v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \ - \ - /* FIXME FIXME FIXME; the reason this zero thing is here is because */ \ - /* the tests are too stupid to not include zero for divides. remove this ASAP */ \ - for (int i = 0; i < size; i++) vec1a[i] = (vec2a[i]) ? (vec1a[i] / vec2a[i]) : 0; \ - \ - return v##sign##int##bits##x##size##_load_aligned(vec1a); \ + VEC_GENERIC_SUB(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_mul(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + VEC_SSE2_MUL_8x16(u); + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { +# ifdef vec_mul // this isn't available on older compilers + return vec_mul(vec1.altivec, vec2.altivec); +# endif + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_MUL(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_div(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + if (VEC_CPU_have_ALTIVEC_VSX()) { + return vec_div(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0; + return vec1; + } + + VEC_GENERIC_DIV(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_and(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vuint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_and(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_AND(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_or(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vuint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_or(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_OR(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_xor(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vuint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_xor(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_XOR(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lshift(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + //noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_sl(vec1, vec2); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulshift(vec1.generic[i], vec2.generic[i]); + return vec1; + } + + VEC_GENERIC_LSHIFT(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_rshift(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + //noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_sl(vec1, vec2); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec_urshift(vec1.generic[i], vec2.generic[i]); + return vec1; } -#define VEC_GENERIC_SHIFT(sign, csign, bits, size, vectype, way) \ - VEC_DECL_SHIFT(sign, bits, size, vectype, way) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \ - VUINT##bits##x##size##_ALIGNED_ARRAY(vec2a); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \ - vuint##bits##x##size##_store_aligned(vec2, vec2a); \ - \ - for (int i = 0; i < size; i++) vec1a[i] = vec_##sign##vectype##way##shift(vec1a[i], vec2a[i]); \ - \ - return v##sign##int##bits##x##size##_load_aligned(vec1a); \ + VEC_GENERIC_RSHIFT(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lrshift(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + //noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_sl(vec1, vec2); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulrshift(vec1.generic[i], vec2.generic[i]); + return vec1; + } + + VEC_GENERIC_LRSHIFT(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_avg(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_avg(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (uint8_t)(vec1.generic[i] + vec2.generic[i]) / 2; + return vec1; + } + + return vuint8x16_div(vuint8x16_add(vec1, vec2), vuint8x16_splat(2)); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmplt(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] < vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } + + VEC_GENERIC_CMPLT(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmple(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] <= vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } + + VEC_GENERIC_CMPLE(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpeq(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] == vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } + + VEC_GENERIC_CMPEQ(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpgt(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] > vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } + + VEC_GENERIC_CMPGT(u, U, 8, 16); +} + +VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpge(vuint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] >= vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; } -#define VEC_GENERIC_LSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(sign, csign, bits, size, , l) -#define VEC_GENERIC_RSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(sign, csign, bits, size, , r) -#define VEC_GENERIC_LRSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r) + VEC_GENERIC_CMPGE(u, U, 8, 16); +} + +// -------------------------------------------------------------------------------- +// vint8x16 implementation + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_load_aligned(const int8_t in[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vint8x16) { .sse = _mm_load_si128((__m128i *)in) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return (vint8x16) { .altivec = vec_ld(0, in) }; + } else +#endif + { + vint8x16 vec; + memcpy(vec.generic, in, sizeof(vec.generic)); + return vec; + } + + VEC_ASSERT(0, "No suitable load_aligned variant found"); + + return (vint8x16){ 0 }; +} + +VEC_FUNC_KEYWORDS void vint8x16_store_aligned(vint8x16 vec, int8_t out[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + _mm_store_si128((__m128i *)out, vec.sse); + return; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + vec_st(vec.altivec, 0, out); + return; + } else +#endif + { + memcpy(out, vec.generic, sizeof(vec.generic)); + return; + } + + VEC_ASSERT(0, "No suitable aligned store variant found"); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_splat(int8_t x) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return (vint8x16){ .altivec = vec_splat_s8(x) }; + } else +#endif + { + return (vint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} }; + } -#define VEC_GENERIC_SHIFTS(sign, csign, bits, size) \ - VEC_GENERIC_LSHIFT(sign, csign, bits, size) \ - VEC_GENERIC_RSHIFT(sign, csign, bits, size) \ - VEC_GENERIC_LRSHIFT(sign, csign, bits, size) + // okay, we don't have a regular thing. call the load function with a splatted array + VINT8x16_ALIGNED_ARRAY(arr); + for (int i = 0; i < 16; i++) arr[i] = x; + return vint8x16_load_aligned(arr); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_load(const int8_t in[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vint8x16) { .sse = _mm_loadu_si128((__m128i *)in) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return (vint8x16) { .altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)) }; + } else +#endif + { + vint8x16 vec; + memcpy(vec.generic, in, sizeof(vec.generic)); + return vec; + } + + // ok, we don't have unaligned load, copy the array + // and call the aligned load function + VINT8x16_ALIGNED_ARRAY(aligned_in); + memcpy(aligned_in, in, 16); + return vint8x16_load_aligned(aligned_in); +} -#define VEC_GENERIC_AVG(sign, bits, size) \ - VEC_DECL_AVG(sign, bits, size) \ - { \ - return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \ +VEC_FUNC_KEYWORDS void vint8x16_store(vint8x16 vec, int8_t out[16]) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + _mm_storeu_si128((__m128i *)out, vec.sse); + return; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // noop + } else +#endif + { + memcpy(out, vec.generic, sizeof(vec.generic)); + return; + } + + // no unaligned store? use the aligned version + VINT8x16_ALIGNED_ARRAY(aligned_out); + vint8x16_store_aligned(vec, aligned_out); + + // then copy to the output buffer + memcpy(out, aligned_out, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_add(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vint8x16) { .sse = _mm_add_epi8(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return (vint8x16) { .altivec = vec_add(vec1.altivec, vec2.altivec) }; + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] += vec2.generic[i]; + return vec1; } -#define VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size) \ - VEC_DECL_NOT(sign, bits, size); \ - \ - VEC_DECL_CMPLE(sign, bits, size) \ - { \ - return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmpgt(vec1, vec2)); \ - } \ - VEC_DECL_CMPGE(sign, bits, size) \ - { \ - return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmplt(vec1, vec2)); \ + VEC_GENERIC_ADD(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_sub(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vint8x16) { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return (vint8x16) { .altivec = vec_sub(vec1.altivec, vec2.altivec) }; + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] -= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_SUB(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_mul(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + VEC_SSE2_MUL_8x16(); + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { +# ifdef vec_mul // this isn't available on older compilers + return (vint8x16) { .altivec = vec_mul(vec1.altivec, vec2.altivec) }; +# endif + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_MUL(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_div(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + if (VEC_CPU_have_ALTIVEC_VSX()) { + return (vint8x16) { .altivec = vec_div(vec1.altivec, vec2.altivec) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // noop + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0; + return vec1; } -#define VEC_GENERIC_COMPARISON(sign, csign, bits, size, name, op) \ - VEC_DECL_CMP##name(sign, bits, size) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \ - v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \ - \ - for (int i = 0; i < size; i++) vec1a[i] = (vec1a[i] op vec2a[i]) ? (sign##int##bits##_t)(UINT##bits##_MAX) : 0; \ - \ - return v##sign##int##bits##x##size##_load_aligned(vec1a); \ + VEC_GENERIC_DIV(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_and(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return (vint8x16) {.altivec = vec_and(vec1.altivec, vec2.altivec) }; + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_ADD(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_or(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_or(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_OR(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_xor(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + return (vint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_xor(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i]; + return vec1; + } + + VEC_GENERIC_XOR(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_lshift(vint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + //noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_sl(vec1, vec2); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lshift(vec1.generic[i], vec2.generic[i]); + return vec1; } -#define VEC_GENERIC_COMPARISONS(sign, csign, bits, size) \ - VEC_GENERIC_COMPARISON(sign, csign, bits, size, LT, <) \ - VEC_GENERIC_COMPARISON(sign, csign, bits, size, GT, >) \ - VEC_GENERIC_COMPARISON(sign, csign, bits, size, EQ, ==) \ - VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size) + VEC_GENERIC_LSHIFT(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_rshift(vint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + //noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_sl(vec1, vec2); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec_rshift(vec1.generic[i], vec2.generic[i]); + return vec1; + } + + VEC_GENERIC_RSHIFT(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_lrshift(vint8x16 vec1, vuint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + //noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_sl(vec1, vec2); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lrshift(vec1.generic[i], vec2.generic[i]); + return vec1; + } + + VEC_GENERIC_LRSHIFT(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_avg(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + return vec_avg(vec1.altivec, vec2.altivec); + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (int8_t)(vec1.generic[i] + vec2.generic[i]) / 2; + return vec1; + } + + return vint8x16_div(vint8x16_add(vec1, vec2), vint8x16_splat(2)); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmplt(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] < vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } -#ifndef VEC_SUPPRESS_HW -/* POWER altivec */ -# ifdef __ALTIVEC__ -# include "impl/altivec.h" -# endif -/* x86 SSE2; gcc intrinsics are probably more efficient than - * vec's implementation, but whatever. */ -# ifdef __SSE2__ -# include "impl/sse2.h" -# endif + VEC_GENERIC_CMPLT(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmple(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] <= vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } + + VEC_GENERIC_CMPLE(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpeq(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho :) + } else #endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] == vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } -#ifndef VEC_SUPPRESS_GCC -# ifdef VEC_HAVE_GNUC_VECTORS -# include "impl/gcc.h" -# endif + VEC_GENERIC_CMPEQ(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpgt(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else #endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] > vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } -#include "impl/generic.h" + VEC_GENERIC_CMPGT(, , 8, 16); +} + +VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpge(vint8x16 vec1, vint8x16 vec2) +{ +#ifdef VEC_COMPILER_HAS_SSE2 + if (VEC_CPU_have_SSE2()) { + // noop + } else +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (VEC_CPU_have_ALTIVEC()) { + // these functions exist, no internet rn tho + } else +#endif + { + for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] >= vec2.generic[i]) ? UINT8_MAX : 0; + return vec1; + } + + VEC_GENERIC_CMPGE(, , 8, 16); +} /* ----------------------------------------------------------------- */ /* bitwise NOT is just an XOR with UINT[BITS]_MAX */ #define DEFINE_NOT_OPERATION(sign, bits, size) \ - VEC_DECL_NOT(sign, bits, size) \ + VEC_FUNC_KEYWORDS v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ { \ return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((sign##int##bits##_t)UINT##bits##_MAX)); \ } DEFINE_NOT_OPERATION(, 8, 16) -DEFINE_NOT_OPERATION(, 16, 8) -DEFINE_NOT_OPERATION(, 32, 4) -DEFINE_NOT_OPERATION(, 64, 2) DEFINE_NOT_OPERATION(u, 8, 16) -DEFINE_NOT_OPERATION(u, 16, 8) -DEFINE_NOT_OPERATION(u, 32, 4) -DEFINE_NOT_OPERATION(u, 64, 2) DEFINE_NOT_OPERATION(, 8, 32) DEFINE_NOT_OPERATION(, 16, 16)
--- a/test/main.c Tue Nov 19 01:00:09 2024 -0500 +++ b/test/main.c Tue Nov 19 15:55:01 2024 -0500 @@ -81,19 +81,6 @@ VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size) DEF_VEC_TEST_FUNCS(8, 16) -DEF_VEC_TEST_FUNCS(16, 8) -DEF_VEC_TEST_FUNCS(32, 4) -DEF_VEC_TEST_FUNCS(64, 2) - -DEF_VEC_TEST_FUNCS(8, 32) -DEF_VEC_TEST_FUNCS(16, 16) -DEF_VEC_TEST_FUNCS(32, 8) -DEF_VEC_TEST_FUNCS(64, 4) - -DEF_VEC_TEST_FUNCS(8, 64) -DEF_VEC_TEST_FUNCS(16, 32) -DEF_VEC_TEST_FUNCS(32, 16) -DEF_VEC_TEST_FUNCS(64, 8) #undef DEF_VEC_TEST_FUNCS #undef VPRINT
--- a/test/test_align.h Tue Nov 19 01:00:09 2024 -0500 +++ b/test/test_align.h Tue Nov 19 15:55:01 2024 -0500 @@ -8,7 +8,7 @@ V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec_arr); \ \ /* fill the values */ \ - for (int i = 0; i < V##csign##INT##bits##x##size##_ALIGNED_ARRAY_LENGTH(vec_arr); i++) \ + for (int i = 0; i < size; i++) \ vec_arr[i] = i; \ \ /* try to load it */ \ @@ -21,7 +21,7 @@ v##sign##int##bits##x##size##_store_aligned(vec, vec_arr_out); \ \ /* mark success or failure */ \ - ret |= !!memcmp(vec_arr, vec_arr_out, V##csign##INT##bits##x##size##_ALIGNED_ARRAY_LENGTH(vec_arr)); \ + ret |= !!memcmp(vec_arr, vec_arr_out, size * sizeof(*vec_arr)); \ \ ret |= !V##csign##INT##bits##x##size##_PTR_ALIGNED(vec_arr); \ ret |= !V##csign##INT##bits##x##size##_PTR_ALIGNED(vec_arr_out); \ @@ -32,14 +32,6 @@ RUN_TEST(u, U, bits, size) RUN_TESTS(8, 16) - RUN_TESTS(16, 8) - RUN_TESTS(32, 4) - RUN_TESTS(64, 2) - - RUN_TESTS(8, 32) - RUN_TESTS(16, 16) - RUN_TESTS(32, 8) - RUN_TESTS(64, 4) #undef RUN_TESTS #undef RUN_TEST
--- a/test/test_arith.h Tue Nov 19 01:00:09 2024 -0500 +++ b/test/test_arith.h Tue Nov 19 15:55:01 2024 -0500 @@ -70,19 +70,6 @@ CREATE_TESTS_SIGN(u, u, U, bits, size) CREATE_TESTS(8, 16) -CREATE_TESTS(16, 8) -CREATE_TESTS(32, 4) -CREATE_TESTS(64, 2) - -CREATE_TESTS(8, 32) -CREATE_TESTS(16, 16) -CREATE_TESTS(32, 8) -CREATE_TESTS(64, 4) - -CREATE_TESTS(8, 64) -CREATE_TESTS(16, 32) -CREATE_TESTS(32, 16) -CREATE_TESTS(64, 8) #undef CREATE_TESTS_SIGN #undef CREATE_TESTS @@ -123,19 +110,6 @@ RUN_TESTS_SIGN(u, bits, size) RUN_TESTS(8, 16) - RUN_TESTS(16, 8) - RUN_TESTS(32, 4) - RUN_TESTS(64, 2) - - RUN_TESTS(8, 32) - RUN_TESTS(16, 16) - RUN_TESTS(32, 8) - RUN_TESTS(64, 4) - - RUN_TESTS(8, 64) - RUN_TESTS(16, 32) - RUN_TESTS(32, 16) - RUN_TESTS(64, 8) #undef RUN_TESTS_SIGN #undef RUN_TESTS
--- a/test/test_compare.h Tue Nov 19 01:00:09 2024 -0500 +++ b/test/test_compare.h Tue Nov 19 15:55:01 2024 -0500 @@ -33,19 +33,6 @@ #define CREATE_TESTS(bits, size) CREATE_TESTS_SIGN(, d, bits, size) CREATE_TESTS_SIGN(u, u, bits, size) CREATE_TESTS(8, 16) -CREATE_TESTS(16, 8) -CREATE_TESTS(32, 4) -CREATE_TESTS(64, 2) - -CREATE_TESTS(8, 32) -CREATE_TESTS(16, 16) -CREATE_TESTS(32, 8) -CREATE_TESTS(64, 4) - -CREATE_TESTS(8, 64) -CREATE_TESTS(16, 32) -CREATE_TESTS(32, 16) -CREATE_TESTS(64, 8) #undef CREATE_TESTS_SIGN #undef CREATE_TESTS @@ -73,19 +60,6 @@ RUN_TESTS_SIGN(u, bits, size) RUN_TESTS(8, 16) - RUN_TESTS(16, 8) - RUN_TESTS(32, 4) - RUN_TESTS(64, 2) - - RUN_TESTS(8, 32) - RUN_TESTS(16, 16) - RUN_TESTS(32, 8) - RUN_TESTS(64, 4) - - RUN_TESTS(8, 64) - RUN_TESTS(16, 32) - RUN_TESTS(32, 16) - RUN_TESTS(64, 8) #undef RUN_TESTS_SIGN #undef RUN_TESTS