# HG changeset patch # User Paper # Date 1729651145 14400 # Node ID f12b5dd4e18cc6f80220f3a3be074b6aa50b445e # Parent 1d9d2308c1d24f90d9d1197f31c808c9d31ed7f4 *: many new operations and a real test suite diff -r 1d9d2308c1d2 -r f12b5dd4e18c README --- a/README Tue Oct 22 01:28:48 2024 -0400 +++ b/README Tue Oct 22 22:39:05 2024 -0400 @@ -44,3 +44,65 @@ v[u]intAxB mul(v[u]intAxB vec1, v[u]intAxB vec2) multiplies the values of `vec1' and `vec2' together and returns it + + v[u]intAxB div(v[u]intAxB vec1, v[u]intAxB vec2) + divides vec1 by the values in vec2. dividing by zero is + considered defined behavior and should result in a zero; + if this doesn't happen it's considered a bug + + v[u]intAxB and(v[u]intAxB vec1, v[u]intAxB vec2) + bitwise AND (&) of the values in both vectors + + v[u]intAxB or(v[u]intAxB vec1, v[u]intAxB vec2) + bitwise OR (|) of the values in both vectors + + v[u]intAxB xor(v[u]intAxB vec1, v[u]intAxB vec2) + bitwise XOR (^) of the values in both vectors + + v[u]intAxB rshift(v[u]intAxB vec1, vuintAxB vec2) + arithmetic right shift of the values in vec1 by + the corresponding values in vec2 + + v[u]intAxB lshift(v[u]intAxB vec1, vuintAxB vec2) + arithmetic left shift of the values in vec1 by + the corresponding values in vec2 + + v[u]intAxB lrshift(v[u]intAxB vec1, vuintAxB vec2) + logical right shift of the values in vec1 by + the corresponding values in vec2 + + v[u]intAxB avg(v[u]intAxB vec1, v[u]intAxB vec2) + returns the average of the values in both vectors + i.e., div(add(vec1, vec2), splat(2)) + +there are also a number of comparisons possible: + + v[u]intAxB cmplt(v[u]intAxB vec1, v[u]intAxB vec2) + turns on all bits of the corresponding value in + the result vector if the value in `vec1' is less + than the corresponding value in `vec2', else all + of the bits are turned off. 
+ + v[u]intAxB cmpgt(v[u]intAxB vec1, v[u]intAxB vec2) + turns on all bits of the corresponding value in + the result vector if the value in `vec1' is greater + than the corresponding value in `vec2', else all + of the bits are turned off. + + v[u]intAxB cmpeq(v[u]intAxB vec1, v[u]intAxB vec2) + turns on all bits of the corresponding value in + the result vector if the value in `vec1' is equal + to the corresponding value in `vec2', else all + of the bits are turned off. + + v[u]intAxB cmple(v[u]intAxB vec1, v[u]intAxB vec2) + turns on all bits of the corresponding value in + the result vector if the value in `vec1' is less + than or equal to the corresponding value in `vec2', + else all of the bits are turned off. + + v[u]intAxB cmpge(v[u]intAxB vec1, v[u]intAxB vec2) + turns on all bits of the corresponding value in + the result vector if the value in `vec1' is greater + than or equal to the corresponding value in `vec2', + else all of the bits are turned off. diff -r 1d9d2308c1d2 -r f12b5dd4e18c include/vec/impl/altivec.h --- a/include/vec/impl/altivec.h Tue Oct 22 01:28:48 2024 -0400 +++ b/include/vec/impl/altivec.h Tue Oct 22 22:39:05 2024 -0400 @@ -34,37 +34,74 @@ /* Since altivec conveniently made their API super user friendly, we can just use * one giant macro to define literally everything */ #define VEC_DEFINE_OPERATIONS(sign, bits, size) \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t i) \ + VEC_DECL_SPLAT(sign, bits, size) \ { \ - return vec_splats(i); \ + return vec_splats(x); \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + VEC_DECL_LOAD(sign, bits, size) \ { \ return vec_perm(vec_ld(0, in), vec_ld(VEC_ALTIVEC_ALIGNMENT, in), vec_lvsl(0, in)); \ } \ \ - static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t 
out[size]) \ + VEC_DECL_STORE(sign, bits, size) \ { \ VEC_ALIGNED_ARRAY(sign##int##bits##_t, aligned_out, size, VEC_ALTIVEC_ALIGNMENT); \ vec_st(vec, 0, aligned_out); \ memcpy(out, aligned_out, size * sizeof(*aligned_out)); \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_ADD(sign, bits, size) \ { \ return vec_add(vec1, vec2); \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_SUB(sign, bits, size) \ { \ return vec_sub(vec1, vec2); \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_MUL(sign, bits, size) \ { \ return vec_mul(vec1, vec2); \ - } + } \ + \ + VEC_DECL_SHIFT(sign, bits, size, , l) \ + { \ + return vec_sl(vec1, vec2); \ + } \ + \ + VEC_DECL_SHIFT(sign, bits, size, , r) \ + { \ + return vec_sra(vec1, vec2); \ + } \ + \ + VEC_DECL_SHIFT(sign, bits, size, l, r) \ + { \ + return vec_sr(vec1, vec2); \ + } \ + \ + VEC_DECL_AVG(sign, bits, size) \ + { \ + return vec_avg(vec1, vec2); \ + } \ + \ + VEC_DECL_AND(sign, bits, size) \ + { \ + return vec_and(vec1, vec2); \ + } \ + \ + VEC_DECL_OR(sign, bits, size) \ + { \ + return vec_or(vec1, vec2); \ + } \ + \ + VEC_DECL_XOR(sign, bits, size) \ + { \ + return vec_xor(vec1, vec2); \ + } \ + \ + VEC_GENERIC_DIVIDE(sign, bits, size) #ifndef VEC_VUINT8X16 # define VEC_VUINT8X16 diff -r 1d9d2308c1d2 -r f12b5dd4e18c include/vec/impl/gcc.h --- a/include/vec/impl/gcc.h Tue Oct 22 01:28:48 2024 -0400 +++ b/include/vec/impl/gcc.h Tue Oct 22 22:39:05 2024 -0400 @@ -28,39 +28,108 @@ #include #define VEC_DEFINE_OPERATIONS(sign, bits, size) \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \ - { \ - v##sign##int##bits##x##size vec; \ - for (int i = 0; i < size; i++) vec[i] = x; \ - return vec; \ - } \ - \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + VEC_DECL_LOAD(sign, bits, size) \ { \ v##sign##int##bits##x##size vec; \ memcpy(&vec, in, sizeof(vec)); \ return vec; \ } \ \ - static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + VEC_DECL_STORE(sign, bits, size) \ { \ memcpy(out, &vec, sizeof(vec)); \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_ADD(sign, bits, size) \ { \ return vec1 + vec2; \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_SUB(sign, bits, size) \ { \ return vec1 - vec2; \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_MUL(sign, bits, size) \ { \ return vec1 * vec2; \ - } + } \ + \ + VEC_DECL_AND(sign, bits, size) \ + { \ + return vec1 & vec2; \ + } \ + \ + VEC_DECL_OR(sign, bits, size) \ + { \ + return vec1 | vec2; \ + } \ + \ + VEC_DECL_XOR(sign, bits, size) \ + { \ + return vec1 ^ vec2; \ + } \ + VEC_DECL_CMPLT(sign, bits, size) \ + { \ + return vec1 < vec2; \ + } \ + VEC_DECL_CMPGT(sign, bits, size) \ + { \ + return vec1 > vec2; \ + } \ + VEC_DECL_CMPEQ(sign, bits, size) \ + { \ + return vec1 == vec2; \ + } \ + VEC_DECL_CMPLE(sign, bits, size) \ + { \ + return vec1 <= vec2; \ + } \ + VEC_DECL_CMPGE(sign, bits, size) \ + { \ + return vec1 >= vec2; \ + } \ + \ + VEC_GENERIC_DIVIDE(sign, bits, size) 
\ + VEC_GENERIC_SPLAT(sign, bits, size) \ + VEC_GENERIC_SHIFTS(sign, bits, size) \ + VEC_GENERIC_AVG(sign, bits, size) + +#ifndef VEC_VUINT8X16 +# define VEC_VUINT8X16 +typedef uint8_t vuint8x16 __attribute__((__vector_size__(16))); +# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } +VEC_DEFINE_OPERATIONS(u, 8, 16) +# define VINT8x16_ALIGNED 1 +#endif + +#ifndef VEC_VUINT16X8 +# define VEC_VUINT16X8 +typedef uint16_t vuint16x8 __attribute__((__vector_size__(16))); +# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (vuint16x8){ a, b, c, d, e, f, g, h } +VEC_DEFINE_OPERATIONS(u, 16, 8) +# define VINT16x8_ALIGNED 1 +#endif + +#ifndef VEC_VUINT32X4 +# define VEC_VUINT32X4 +typedef uint32_t vuint32x4 __attribute__((__vector_size__(16))); +# define VUINT32x4_CONSTANT(a, b, c, d) \ + (vuint32x4){ a, b, c, d } +VEC_DEFINE_OPERATIONS(u, 32, 4) +# define VINT32x4_ALIGNED 1 +#endif + +#ifndef VEC_VUINT64X2 +# define VEC_VUINT64X2 +typedef uint64_t vuint64x2 __attribute__((__vector_size__(16))); +# define VUINT64x2_CONSTANT(a, b) \ + (vuint64x2){ a, b } +VEC_DEFINE_OPERATIONS(u, 64, 2) +# define VINT64x2_ALIGNED 1 +#endif #ifndef VEC_VINT8X16 # define VEC_VINT8X16 @@ -98,40 +167,4 @@ # define VINT64x2_ALIGNED 1 #endif -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -typedef uint8_t vuint8x16 __attribute__((__vector_size__(16))); -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -VEC_DEFINE_OPERATIONS(u, 8, 16) -# define VINT8x16_ALIGNED 1 -#endif - -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -typedef uint16_t vuint16x8 __attribute__((__vector_size__(16))); -# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vuint16x8){ a, b, c, d, e, f, g, h } -VEC_DEFINE_OPERATIONS(u, 16, 8) -# define VINT16x8_ALIGNED 1 -#endif - -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -typedef uint32_t 
vuint32x4 __attribute__((__vector_size__(16))); -# define VUINT32x4_CONSTANT(a, b, c, d) \ - (vuint32x4){ a, b, c, d } -VEC_DEFINE_OPERATIONS(u, 32, 4) -# define VINT32x4_ALIGNED 1 -#endif - -#ifndef VEC_VUINT64X2 -# define VEC_VUINT64X4 -typedef uint64_t vuint64x2 __attribute__((__vector_size__(16))); -# define VUINT64x2_CONSTANT(a, b) \ - (vuint64x2){ a, b } -VEC_DEFINE_OPERATIONS(u, 64, 2) -# define VINT64x2_ALIGNED 1 -#endif - #undef VEC_DEFINE_OPERATIONS diff -r 1d9d2308c1d2 -r f12b5dd4e18c include/vec/impl/generic.h --- a/include/vec/impl/generic.h Tue Oct 22 01:28:48 2024 -0400 +++ b/include/vec/impl/generic.h Tue Oct 22 22:39:05 2024 -0400 @@ -33,45 +33,98 @@ } v##sign##int##bits##x##size; #define VEC_DEFINE_OPERATIONS(sign, bits, size) \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v ## sign ## int ## bits ## x ## size ## _splat(sign ## int ## bits ## _t x) \ - { \ - v##sign##int##bits##x##size vec; \ - for (int i = 0; i < size; i++) vec.arr[i] = x; \ - return vec; \ - } \ - \ - static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _load(const sign ## int ## bits ## _t in[size]) \ + VEC_DECL_LOAD(sign, bits, size) \ { \ v##sign##int##bits##x##size vec; \ memcpy(vec.arr, in, sizeof(vec.arr)); \ return vec; \ } \ \ - static inline VEC_ALWAYS_INLINE void v ## sign ## int ## bits ## x ## size ## _store(v ## sign ## int ## bits ## x ## size vec, sign ## int ## bits ## _t out[size]) \ + VEC_DECL_STORE(sign, bits, size) \ { \ memcpy(out, vec.arr, sizeof(vec.arr)); \ } \ \ - static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _add(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \ + VEC_DECL_ADD(sign, bits, size) \ + { \ + for (int i = 0; i < size; i++) vec1.arr[i] += vec2.arr[i]; \ + return vec1; \ + } \ + \ + VEC_DECL_SUB(sign, bits, size) \ + { \ + for (int i = 0; i < size; i++) vec1.arr[i] 
-= vec2.arr[i]; \ + return vec1; \ + } \ + \ + VEC_DECL_MUL(sign, bits, size) \ { \ - v##sign##int##bits##x##size vec; \ - for (int i = 0; i < size; i++) vec.arr[i] = vec1.arr[i] + vec2.arr[i]; \ - return vec; \ + for (int i = 0; i < size; i++) vec1.arr[i] *= vec2.arr[i]; \ + return vec1; \ + } \ + \ + VEC_DECL_AND(sign, bits, size) \ + { \ + for (int i = 0; i < size; i++) vec1.arr[i] &= vec2.arr[i]; \ + return vec1; \ + } \ + \ + VEC_DECL_OR(sign, bits, size) \ + { \ + for (int i = 0; i < size; i++) vec1.arr[i] |= vec2.arr[i]; \ + return vec1; \ + } \ + \ + VEC_DECL_XOR(sign, bits, size) \ + { \ + for (int i = 0; i < size; i++) vec1.arr[i] ^= vec2.arr[i]; \ + return vec1; \ } \ \ - static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _sub(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - for (int i = 0; i < size; i++) vec.arr[i] = vec1.arr[i] - vec2.arr[i]; \ - return vec; \ - } \ - \ - static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _mul(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - for (int i = 0; i < size; i++) vec.arr[i] = vec1.arr[i] * vec2.arr[i]; \ - return vec; \ - } + VEC_GENERIC_SPLAT(sign, bits, size) \ + VEC_GENERIC_SHIFTS(sign, bits, size) \ + VEC_GENERIC_DIVIDE(sign, bits, size) \ + VEC_GENERIC_AVG(sign, bits, size) + +#ifndef VEC_VUINT8X16 +# define VEC_VUINT8X16 +VEC_DEFINE_STRUCT(u, 8, 16) +# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + ((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) +VEC_DEFINE_OPERATIONS(u, 8, 16) +VEC_GENERIC_COMPARISONS(u, 8, 16) +# define VINT8x16_ALIGNED 1 +#endif + +#ifndef VEC_VUINT16X8 +# define VEC_VUINT16X8 +VEC_DEFINE_STRUCT(u, 16, 8) +# define VUINT16x8_CONSTANT(a, b, 
c, d, e, f, g, h) \ + ((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } }) +VEC_DEFINE_OPERATIONS(u, 16, 8) +VEC_GENERIC_COMPARISONS(u, 16, 8) +# define VINT16x8_ALIGNED 1 +#endif + +#ifndef VEC_VUINT32X4 +# define VEC_VUINT32X4 +VEC_DEFINE_STRUCT(u, 32, 4) +# define VUINT32x4_CONSTANT(a, b, c, d) \ + ((vuint32x4){ .arr = { a, b, c, d } }) +VEC_DEFINE_OPERATIONS(u, 32, 4) +VEC_GENERIC_COMPARISONS(u, 32, 4) +# define VINT32x4_ALIGNED 1 +#endif + +#ifndef VEC_VUINT64X2 +# define VEC_VUINT64X2 +VEC_DEFINE_STRUCT(u, 64, 2) +# define VUINT64x2_CONSTANT(a, b) \ + ((vuint64x2){ .arr = { a, b } }) +VEC_DEFINE_OPERATIONS(u, 64, 2) +VEC_GENERIC_COMPARISONS(u, 64, 2) +# define VINT64x2_ALIGNED 1 +#endif #ifndef VEC_VINT8X16 # define VEC_VINT8X16 @@ -79,6 +132,7 @@ # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ ((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) VEC_DEFINE_OPERATIONS(, 8, 16) +VEC_GENERIC_COMPARISONS(, 8, 16) # define VINT8x16_ALIGNED 1 #endif @@ -88,6 +142,7 @@ # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ ((vint16x8){ .arr = { a, b, c, d, e, f, g, h } }) VEC_DEFINE_OPERATIONS(, 16, 8) +VEC_GENERIC_COMPARISONS(, 16, 8) # define VINT16x8_ALIGNED 1 #endif @@ -97,6 +152,7 @@ # define VINT32x4_CONSTANT(a, b, c, d) \ ((vint32x4){ .arr = { a, b, c, d } }) VEC_DEFINE_OPERATIONS(, 32, 4) +VEC_GENERIC_COMPARISONS(, 32, 4) # define VINT32x4_ALIGNED 1 #endif @@ -106,42 +162,7 @@ # define VINT64x2_CONSTANT(a, b) \ ((vint64x2){ .arr = { a, b } }) VEC_DEFINE_OPERATIONS(, 64, 2) -# define VINT64x2_ALIGNED 1 -#endif - -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -VEC_DEFINE_STRUCT(u, 8, 16) -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - ((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) -VEC_DEFINE_OPERATIONS(u, 8, 16) -# define VINT8x16_ALIGNED 1 -#endif - -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -VEC_DEFINE_STRUCT(u, 16, 8) -# define 
VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - ((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } }) -VEC_DEFINE_OPERATIONS(u, 16, 8) -# define VINT16x8_ALIGNED 1 -#endif - -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -VEC_DEFINE_STRUCT(u, 32, 4) -# define VUINT32x4_CONSTANT(a, b, c, d) \ - ((vuint32x4){ .arr = { a, b, c, d } }) -VEC_DEFINE_OPERATIONS(u, 32, 4) -# define VINT32x4_ALIGNED 1 -#endif - -#ifndef VEC_VUINT64X2 -# define VEC_VUINT64X2 -VEC_DEFINE_STRUCT(u, 64, 2) -# define VUINT64x2_CONSTANT(a, b) \ - ((vuint64x2){ .arr = { a, b } }) -VEC_DEFINE_OPERATIONS(u, 64, 2) +VEC_GENERIC_COMPARISONS(, 64, 2) # define VINT64x2_ALIGNED 1 #endif diff -r 1d9d2308c1d2 -r f12b5dd4e18c include/vec/impl/sse2.h --- a/include/vec/impl/sse2.h Tue Oct 22 01:28:48 2024 -0400 +++ b/include/vec/impl/sse2.h Tue Oct 22 22:39:05 2024 -0400 @@ -24,35 +24,74 @@ #include +#include /* memcpy */ + #define VEC_DEFINE_OPERATIONS(sign, bits, size) \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + VEC_DECL_LOAD(sign, bits, size) \ { \ return _mm_loadu_si128((const __m128i *)in); \ } \ \ - static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + VEC_DECL_STORE(sign, bits, size) \ { \ - memcpy(out, &vec, sizeof(vec)); \ + _mm_storeu_si128((__m128i *)out, vec); \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_ADD(sign, bits, size) \ { \ return _mm_add_epi##bits(vec1, vec2); \ } \ \ - static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + VEC_DECL_SUB(sign, bits, size) \ { \ return _mm_sub_epi##bits(vec1, vec2); \ - } + } \ + \ + VEC_DECL_AND(sign, bits, size) \ + { \ + return 
_mm_and_si128(vec1, vec2); \ + } \ + \ + VEC_DECL_OR(sign, bits, size) \ + { \ + return _mm_or_si128(vec1, vec2); \ + } \ + \ + VEC_DECL_XOR(sign, bits, size) \ + { \ + return _mm_xor_si128(vec1, vec2); \ + } \ + \ + VEC_GENERIC_SPLAT(sign, bits, size) \ + VEC_GENERIC_DIVIDE(sign, bits, size) \ + VEC_GENERIC_SHIFTS(sign, bits, size) \ + VEC_DECL_MUL(sign, bits, size); \ + VEC_GENERIC_AVG(sign, bits, size) -#ifndef VEC_VINT8X16 -# define VEC_VINT8X16 -typedef __m128i vint8x16; -# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (_mm_setr_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p)) -VEC_DEFINE_OPERATIONS(, 8, 16) +#define VEC_DEFINE_COMPARISONS_SIGNED(bits, size) \ + VEC_DECL_CMPEQ(, bits, size) \ + { \ + return _mm_cmpeq_epi##bits(vec1, vec2); \ + } \ + VEC_DECL_CMPLT(, bits, size) \ + { \ + return _mm_cmplt_epi##bits(vec1, vec2); \ + } \ + VEC_DECL_CMPGT(, bits, size) \ + { \ + return _mm_cmpgt_epi##bits(vec1, vec2); \ + } \ + VEC_GENERIC_THAN_OR_EQUAL(, bits, size) + +#ifndef VEC_VUINT8X16 +# define VEC_VUINT8X16 +typedef __m128i vuint8x16; +# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)) +VEC_DEFINE_OPERATIONS(u, 8, 16) +VEC_GENERIC_COMPARISONS(u, 8, 16) # define VINT8x16_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vint8x16 vint8x16_mul(vint8x16 vec1, vint8x16 vec2) +VEC_DECL_MUL(u, 8, 16) { // unpack and multiply __m128i dst_even = _mm_mullo_epi16(vec1, vec2); @@ -61,77 +100,69 @@ // repack return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8)); } -static inline VEC_ALWAYS_INLINE vint8x16 vint8x16_splat(int8_t c) -{ - return VINT8x16_CONSTANT(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c); -} #endif -#ifndef VEC_VINT16X8 -# define VEC_VINT16X8 -typedef __m128i vint16x8; -# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (_mm_setr_epi16(a, b, c, d, e, f, g, h)) -VEC_DEFINE_OPERATIONS(, 
16, 8) +#ifndef VEC_VUINT16X8 +# define VEC_VUINT16X8 +typedef __m128i vuint16x8; +# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (_mm_setr_epi16(h, g, f, e, d, c, b, a)) +VEC_DEFINE_OPERATIONS(u, 16, 8) +VEC_GENERIC_COMPARISONS(u, 16, 8) # define VINT16x8_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vint16x8 vint16x8_mul(vint16x8 vec1, vint16x8 vec2) +VEC_DECL_MUL(u, 16, 8) { return _mm_mullo_epi16(vec1, vec2); } -static inline VEC_ALWAYS_INLINE vint16x8 vint16x8_splat(int16_t c) -{ - return VINT16x8_CONSTANT(c, c, c, c, c, c, c, c); -} #endif -#ifndef VEC_VINT32X4 -# define VEC_VINT32X4 -typedef __m128i vint32x4; -# define VINT32x4_CONSTANT(a, b, c, d) \ - (_mm_setr_epi32(a, b, c, d)) -VEC_DEFINE_OPERATIONS(, 32, 4) -# define VINT32x4_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vint32x4 vint32x4_mul(vint32x4 a, vint32x4 b) +#ifndef VEC_VUINT32X4 +# define VEC_VUINT32X4 +typedef __m128i vuint32x4; +# define VUINT32x4_CONSTANT(a, b, c, d) \ + (_mm_setr_epi32(d, c, b, a)) +VEC_DEFINE_OPERATIONS(u, 32, 4) +VEC_GENERIC_COMPARISONS(u, 32, 4) +# define VUINT32x4_ALIGNED 1 +VEC_DECL_MUL(u, 32, 4) { - __m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1) - __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1) - __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0) + /* this was stolen from... 
somewhere :) */ + __m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); // (-,a3,-,a1) + __m128i b13 = _mm_shuffle_epi32(vec2, 0xF5); // (-,b3,-,b1) + __m128i prod02 = _mm_mul_epu32(vec1, vec2); // (-,a2*b2,-,a0*b0) __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) - __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) + __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) return _mm_unpacklo_epi64(prod01, prod23); // (ab3,ab2,ab1,ab0) } -static inline VEC_ALWAYS_INLINE vint32x4 vint32x4_splat(int32_t c) -{ - return VINT32x4_CONSTANT(c, c, c, c); -} #endif -#ifndef VEC_VINT64X2 -# define VEC_VINT64X2 -typedef __m128i vint64x2; -static inline VEC_ALWAYS_INLINE vint64x2 VINT64x2_CONSTANT(int64_t a, int64_t b) +#ifndef VEC_VUINT64X2 +# define VEC_VUINT64X2 +typedef __m128i vuint64x2; +static inline VEC_ALWAYS_INLINE vuint64x2 VUINT64x2_CONSTANT(uint64_t a, uint64_t b) { return _mm_setr_epi32(b, b >> 32, a, a >> 32); } -VEC_DEFINE_OPERATIONS(, 64, 2) -# define VINT64x2_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vint64x2 vint64x2_mul(vint64x2 ab, vint64x2 cd) +VEC_DEFINE_OPERATIONS(u, 64, 2) +VEC_GENERIC_COMPARISONS(u, 64, 2) +# define VUINT64x2_ALIGNED 1 +VEC_DECL_MUL(u, 64, 2) { - /* ac = (ab & 0xFFFFFFFF) * (cd & 0xFFFFFFFF); */ - __m128i ac = _mm_mul_epu32(ab, cd); + /* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */ + __m128i ac = _mm_mul_epu32(vec1, vec2); - /* b = ab >> 32; */ - __m128i b = _mm_srli_epi64(ab, 32); + /* b = vec1 >> 32; */ + __m128i b = _mm_srli_epi64(vec1, 32); - /* bc = b * (cd & 0xFFFFFFFF); */ - __m128i bc = _mm_mul_epu32(b, cd); + /* bc = b * (vec2 & 0xFFFFFFFF); */ + __m128i bc = _mm_mul_epu32(b, vec2); - /* d = cd >> 32; */ - __m128i d = _mm_srli_epi64(cd, 32); + /* d = vec2 >> 32; */ + __m128i d = _mm_srli_epi64(vec2, 32); - /* ad = (ab & 0xFFFFFFFF) * d; */ - __m128i ad = _mm_mul_epu32(ab, d); + /* ad = (vec1 & 
0xFFFFFFFF) * d; */ + __m128i ad = _mm_mul_epu32(vec1, d); /* high = bc + ad; */ __m128i high = _mm_add_epi64(bc, ad); @@ -142,20 +173,17 @@ /* return ac + high; */ return _mm_add_epi64(high, ac); } -static inline VEC_ALWAYS_INLINE vint64x2 vint64x2_splat(int64_t c) -{ - return VINT64x2_CONSTANT(c, c); -} #endif -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -typedef __m128i vuint8x16; -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (_mm_setr_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p)) -VEC_DEFINE_OPERATIONS(u, 8, 16) +#ifndef VEC_VINT8X16 +# define VEC_VINT8X16 +typedef __m128i vint8x16; +# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)) +VEC_DEFINE_OPERATIONS(, 8, 16) +VEC_DEFINE_COMPARISONS_SIGNED(8, 16) # define VINT8x16_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vint8x16 vuint8x16_mul(vuint8x16 vec1, vuint8x16 vec2) +VEC_DECL_MUL(, 8, 16) { // unpack and multiply __m128i dst_even = _mm_mullo_epi16(vec1, vec2); @@ -164,78 +192,68 @@ // repack return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8)); } -static inline VEC_ALWAYS_INLINE vuint8x16 vuint8x16_splat(uint8_t c) -{ - return VUINT8x16_CONSTANT(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c); -} #endif -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -typedef __m128i vuint16x8; -# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (_mm_setr_epi16(a, b, c, d, e, f, g, h)) -VEC_DEFINE_OPERATIONS(u, 16, 8) +#ifndef VEC_VINT16X8 +# define VEC_VINT16X8 +typedef __m128i vint16x8; +# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (_mm_setr_epi16(h, g, f, e, d, c, b, a)) +VEC_DEFINE_OPERATIONS(, 16, 8) +VEC_DEFINE_COMPARISONS_SIGNED(16, 8) # define VINT16x8_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vuint16x8 vuint16x8_mul(vuint16x8 vec1, vuint16x8 vec2) +VEC_DECL_MUL(, 16, 8) { return _mm_mullo_epi16(vec1, vec2); } -static inline 
VEC_ALWAYS_INLINE vuint16x8 vuint16x8_splat(uint16_t c) -{ - return VUINT16x8_CONSTANT(c, c, c, c, c, c, c, c); -} #endif -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -typedef __m128i vuint32x4; -# define VUINT32x4_CONSTANT(a, b, c, d) \ - (_mm_setr_epi32(a, b, c, d)) -VEC_DEFINE_OPERATIONS(u, 32, 4) -# define VUINT32x4_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vuint32x4 vuint32x4_mul(vuint32x4 a, vuint32x4 b) +#ifndef VEC_VINT32X4 +# define VEC_VINT32X4 +typedef __m128i vint32x4; +# define VINT32x4_CONSTANT(a, b, c, d) \ + (_mm_setr_epi32(d, c, b, a)) +VEC_DEFINE_OPERATIONS(, 32, 4) +VEC_DEFINE_COMPARISONS_SIGNED(32, 4) +# define VINT32x4_ALIGNED 1 +VEC_DECL_MUL(, 32, 4) { - /* this was stolen from... somewhere :) */ - __m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1) - __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1) - __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0) + __m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); // (-,a3,-,a1) + __m128i b13 = _mm_shuffle_epi32(vec2, 0xF5); // (-,b3,-,b1) + __m128i prod02 = _mm_mul_epu32(vec1, vec2); // (-,a2*b2,-,a0*b0) __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) - __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) + __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) return _mm_unpacklo_epi64(prod01, prod23); // (ab3,ab2,ab1,ab0) } -static inline VEC_ALWAYS_INLINE vuint32x4 vuint32x4_splat(int32_t c) -{ - return VUINT32x4_CONSTANT(c, c, c, c); -} #endif -#ifndef VEC_VUINT64X2 -# define VEC_VUINT64X2 -typedef __m128i vuint64x2; -static inline VEC_ALWAYS_INLINE vint64x2 VUINT64x2_CONSTANT(int64_t a, int64_t b) +#ifndef VEC_VINT64X2 +# define VEC_VINT64X2 +typedef __m128i vint64x2; +static inline VEC_ALWAYS_INLINE vint64x2 VINT64x2_CONSTANT(int64_t a, int64_t b) { - return _mm_setr_epi32(b, b >> 32, a, a >> 32); + return _mm_setr_epi32(b, vec_rshift(b, 32), a, 
vec_rshift(a, 32)); } -VEC_DEFINE_OPERATIONS(u, 64, 2) -# define VUINT64x2_ALIGNED 1 -static inline VEC_ALWAYS_INLINE vuint64x2 vuint64x2_mul(vuint64x2 ab, vuint64x2 cd) +VEC_DEFINE_OPERATIONS(, 64, 2) +VEC_GENERIC_COMPARISONS(, 64, 2) +# define VINT64x2_ALIGNED 1 +VEC_DECL_MUL(, 64, 2) { - /* ac = (ab & 0xFFFFFFFF) * (cd & 0xFFFFFFFF); */ - __m128i ac = _mm_mul_epu32(ab, cd); + /* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */ + __m128i ac = _mm_mul_epu32(vec1, vec2); - /* b = ab >> 32; */ - __m128i b = _mm_srli_epi64(ab, 32); + /* b = vec1 >> 32; */ + __m128i b = _mm_srli_epi64(vec1, 32); - /* bc = b * (cd & 0xFFFFFFFF); */ - __m128i bc = _mm_mul_epu32(b, cd); + /* bc = b * (vec2 & 0xFFFFFFFF); */ + __m128i bc = _mm_mul_epu32(b, vec2); - /* d = cd >> 32; */ - __m128i d = _mm_srli_epi64(cd, 32); + /* d = vec2 >> 32; */ + __m128i d = _mm_srli_epi64(vec2, 32); - /* ad = (ab & 0xFFFFFFFF) * d; */ - __m128i ad = _mm_mul_epu32(ab, d); + /* ad = (vec1 & 0xFFFFFFFF) * d; */ + __m128i ad = _mm_mul_epu32(vec1, d); /* high = bc + ad; */ __m128i high = _mm_add_epi64(bc, ad); @@ -246,10 +264,8 @@ /* return ac + high; */ return _mm_add_epi64(high, ac); } -static inline VEC_ALWAYS_INLINE vuint64x2 vuint64x2_splat(uint64_t c) -{ - return VUINT64x2_CONSTANT(c, c); -} #endif #undef VEC_DEFINE_OPERATIONS +#undef VEC_SSE2_8x16_SHIFT +#undef VEC_SSE2_16x8_SHIFT diff -r 1d9d2308c1d2 -r f12b5dd4e18c include/vec/vec.h --- a/include/vec/vec.h Tue Oct 22 01:28:48 2024 -0400 +++ b/include/vec/vec.h Tue Oct 22 22:39:05 2024 -0400 @@ -25,12 +25,15 @@ #ifndef VEC_VEC_H_ #define VEC_VEC_H_ +#include +#include + #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ (((a) >= (x)) && \ ((a) > x || (b) >= (y)) && \ ((a) > x || (b) > (y) || (c) >= (z))) -#define VEC_GCC_ATLEAST(x, y, z) \ +#define VEC_GNUC_ATLEAST(x, y, z) \ VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z) /* GCC/clang attributes */ @@ -42,13 +45,13 @@ # define VEC_ALIGNED(x) __attribute__((__aligned__(x))) # 
endif # if __has_attribute(__vector_size__) -# define VEC_HAVE_GCC_VECTORS +# define VEC_HAVE_GNUC_VECTORS # endif #endif -#ifndef VEC_HAVE_GCC_VECTORS -# if __GNUC__ >= 4 || __clang_major__ >= 3 -# define VEC_HAVE_GCC_VECTORS +#ifndef VEC_HAVE_GNUC_VECTORS +# if VEC_GNUC_ATLEAST(4, 0, 0) +# define VEC_HAVE_GNUC_VECTORS # endif #endif @@ -78,20 +81,271 @@ type *var = (type *)((((intptr_t)var##_unaligned_ + align - 1) / align) * align) #endif +/* --------------------------------------------------------------- */ +/* bit shift */ + +static inline VEC_ALWAYS_INLINE uintmax_t vec_ulrshift(uintmax_t x, unsigned int y) +{ + return x >> y; +} + +static inline VEC_ALWAYS_INLINE uintmax_t vec_ullshift(uintmax_t x, unsigned int y) +{ + return x << y; +} + +static inline VEC_ALWAYS_INLINE intmax_t vec_lrshift(intmax_t x, unsigned int y) +{ + return (intmax_t)(((uintmax_t)x) >> y); +} + +static inline VEC_ALWAYS_INLINE intmax_t vec_llshift(intmax_t x, unsigned int y) +{ + return (intmax_t)(((uintmax_t)x) << y); +} + +static inline VEC_ALWAYS_INLINE uintmax_t vec_urshift(uintmax_t x, unsigned int y) +{ + return x >> y; +} + +static inline VEC_ALWAYS_INLINE uintmax_t vec_ulshift(uintmax_t x, unsigned int y) +{ + return x << y; +} + +/** + * Arithmetic shifts; based off code from OpenMPT, which is under + * the Boost Software License: + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, + * execute, and transmit the Software, and to prepare derivative works of the + * Software, and to permit third-parties to whom the Software is furnished to + * do so, all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, + * must be included in all copies of the Software, in whole 
or in part, and + * all derivative works of the Software, unless such copies or derivative + * works are solely in the form of machine-executable object code generated by + * a source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. +**/ +static inline VEC_ALWAYS_INLINE intmax_t vec_rshift(intmax_t x, unsigned int y) +{ + static const uintmax_t roffset = UINTMAX_C(1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + + uintmax_t urx = (uintmax_t)x; + urx += roffset; + urx >>= y; + urx -= roffset >> y; + + return (intmax_t)urx; +} + +static inline VEC_ALWAYS_INLINE intmax_t vec_lshift(intmax_t x, unsigned int y) +{ + static const uintmax_t roffset = UINTMAX_C(1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + + uintmax_t urx = (uintmax_t)x; + urx += roffset; + urx <<= y; + urx -= roffset << y; + + return (intmax_t)urx; +} + +/* --------------------------------------------------------------- */ +/* Implementation includes */ + +#define VEC_OPERATION_DECL(sign, bits, size, ret, op, params) \ + static inline VEC_ALWAYS_INLINE ret v##sign##int##bits##x##size##_##op params + +#define VEC_OPERATION_THIS_DECL(sign, bits, size, op, params) \ + VEC_OPERATION_DECL(sign, bits, size, v##sign##int##bits##x##size, op, params) + +#define VEC_TWOWAY_DECL(sign, bits, size, op) \ + VEC_OPERATION_THIS_DECL(sign, bits, size, op, (v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2)) + +#define VEC_DECL_SPLAT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, splat, (sign##int##bits##_t x)) +#define 
VEC_DECL_LOAD(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, load, (const sign##int##bits##_t in[size])) +#define VEC_DECL_STORE(sign, bits, size) VEC_OPERATION_DECL(sign, bits, size, void, store, (v##sign##int##bits##x##size vec, sign##int##bits##_t out[size])) +#define VEC_DECL_ADD(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, add) +#define VEC_DECL_SUB(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, sub) +#define VEC_DECL_MUL(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, mul) +#define VEC_DECL_DIV(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, div) +#define VEC_DECL_AND(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, and) +#define VEC_DECL_OR(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, or) +#define VEC_DECL_XOR(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, xor) +#define VEC_DECL_AVG(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, avg) +#define VEC_DECL_SHIFT(sign, bits, size, vectype, way) VEC_OPERATION_THIS_DECL(sign, bits, size, vectype##way##shift, (v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2)) +#define VEC_DECL_NOT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, not, (v##sign##int##bits##x##size vec)) + +/* comparisons */ +#define VEC_DECL_CMPLT(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmplt) +#define VEC_DECL_CMPGT(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpgt) +#define VEC_DECL_CMPEQ(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpeq) +#define VEC_DECL_CMPLE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmple) +#define VEC_DECL_CMPGE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpge) + +/* Generic variations. 
*/ +#define VEC_GENERIC_SPLAT(sign, bits, size) \ + VEC_DECL_SPLAT(sign, bits, size) \ + { \ + sign##int##bits##_t va[size]; \ + for (int i = 0; i < size; i++) va[i] = x; \ + return v##sign##int##bits##x##size##_load(va); \ + } + +#define VEC_GENERIC_DIVIDE(sign, bits, size) \ + VEC_DECL_DIV(sign, bits, size) \ + { \ + sign##int##bits##_t vec1a[size], vec2a[size]; \ + \ + v##sign##int##bits##x##size##_store(vec1, vec1a); \ + v##sign##int##bits##x##size##_store(vec2, vec2a); \ + \ + for (int i = 0; i < size; i++) vec1a[i] = (vec2a[i]) ? (vec1a[i] / vec2a[i]) : 0; \ + \ + return v##sign##int##bits##x##size##_load(vec1a); \ + } + +#define VEC_GENERIC_SHIFT(sign, bits, size, vectype, way) \ + VEC_DECL_SHIFT(sign, bits, size, vectype, way) \ + { \ + sign##int##bits##_t vec1a[size]; uint##bits##_t vec2a[size]; /* vec2 is always a vuint vector, so its scratch array must be unsigned */ \ + \ + v##sign##int##bits##x##size##_store(vec1, vec1a); \ + vuint##bits##x##size##_store(vec2, vec2a); \ + \ + for (int i = 0; i < size; i++) vec1a[i] = vec_##sign##vectype##way##shift(vec1a[i], vec2a[i]); \ + \ + return v##sign##int##bits##x##size##_load(vec1a); \ + } + +#define VEC_GENERIC_SHIFTS(sign, bits, size) \ + VEC_GENERIC_SHIFT(sign, bits, size, , l) /* left shift */ \ + VEC_GENERIC_SHIFT(sign, bits, size, , r) /* arithmetic right shift */ \ + VEC_GENERIC_SHIFT(sign, bits, size, l, r) /* logical right shift */ + +#define VEC_GENERIC_AVG(sign, bits, size) \ + VEC_DECL_AVG(sign, bits, size) \ + { /* average = (vec1 + vec2) / 2, built from the lane-wise add and div */ \ + return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \ + } + +#define VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size) \ + VEC_DECL_NOT(sign, bits, size); \ + \ + VEC_DECL_CMPLE(sign, bits, size) \ + { \ + return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmpgt(vec1, vec2)); \ + } \ + VEC_DECL_CMPGE(sign, bits, size) \ + { \ + return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmplt(vec1, vec2)); \ + } + +#define VEC_GENERIC_COMPARISON(sign, bits, size, name, op)
\ + VEC_DECL_CMP##name(sign, bits, size) \ + { \ + sign##int##bits##_t vec1a[size], vec2a[size]; \ + \ + v##sign##int##bits##x##size##_store(vec1, vec1a); \ + v##sign##int##bits##x##size##_store(vec2, vec2a); \ + \ + for (int i = 0; i < size; i++) vec1a[i] = (vec1a[i] op vec2a[i]) ? UINT##bits##_MAX : 0; \ + \ + return v##sign##int##bits##x##size##_load(vec1a); \ + } + +#define VEC_GENERIC_COMPARISONS(sign, bits, size) \ + VEC_GENERIC_COMPARISON(sign, bits, size, LT, <) \ + VEC_GENERIC_COMPARISON(sign, bits, size, GT, >) \ + VEC_GENERIC_COMPARISON(sign, bits, size, EQ, ==) \ + VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size) + /* POWER altivec */ #ifdef __ALTIVEC__ # include "impl/altivec.h" #endif -/* x86_64 SSE2+ */ +/* x86 SSE2 */ #ifdef __SSE2__ # include "impl/sse2.h" #endif -#ifdef VEC_HAVE_GCC_VECTORS +#ifdef VEC_HAVE_GNUC_VECTORS # include "impl/gcc.h" #endif #include "impl/generic.h" +/* ----------------------------------------------------------------- */ +/* bitwise NOT is just an XOR with UINT[BITS]_MAX */ + +#define DEFINE_NOT_OPERATION(sign, bits, size) \ + VEC_DECL_NOT(sign, bits, size) \ + { \ + return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat(UINT##bits##_MAX)); \ + } + +DEFINE_NOT_OPERATION(, 8, 16) +DEFINE_NOT_OPERATION(, 16, 8) +DEFINE_NOT_OPERATION(, 32, 4) +DEFINE_NOT_OPERATION(, 64, 2) +DEFINE_NOT_OPERATION(u, 8, 16) +DEFINE_NOT_OPERATION(u, 16, 8) +DEFINE_NOT_OPERATION(u, 32, 4) +DEFINE_NOT_OPERATION(u, 64, 2) + +#undef DEFINE_NOT_OPERATION + +/* ---------------------------------------------------------------- */ + +/* cleanup */ +#undef VEC_OPERATION_DECL +#undef VEC_OPERATION_THIS_DECL +#undef VEC_TWOWAY_DECL + +#undef VEC_DECL_SPLAT +#undef VEC_DECL_LOAD +#undef VEC_DECL_STORE +#undef VEC_DECL_ADD +#undef VEC_DECL_SUB +#undef VEC_DECL_MUL +#undef VEC_DECL_DIV +#undef VEC_DECL_AND +#undef VEC_DECL_OR +#undef VEC_DECL_XOR +#undef VEC_DECL_AVG +#undef VEC_DECL_SHIFT +#undef VEC_DECL_NOT + +#undef VEC_DECL_CMPLT 
+#undef VEC_DECL_CMPGT +#undef VEC_DECL_CMPEQ +#undef VEC_DECL_CMPLE +#undef VEC_DECL_CMPGE + +#undef VEC_GENERIC_SPLAT +#undef VEC_GENERIC_DIVIDE +#undef VEC_GENERIC_SHIFT +#undef VEC_GENERIC_SHIFTS +#undef VEC_GENERIC_AVG +#undef VEC_GENERIC_THAN_OR_EQUAL +#undef VEC_GENERIC_COMPARISON +#undef VEC_GENERIC_COMPARISONS + +/* ---------------------------------------------------------------- */ + #endif /* VEC_VEC_H_ */ diff -r 1d9d2308c1d2 -r f12b5dd4e18c test/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile Tue Oct 22 22:39:05 2024 -0400 @@ -0,0 +1,12 @@ +_CFLAGS = -g -O2 -I../include $(CFLAGS) + +_LDFLAGS = $(LDFLAGS) + +.c.o: + $(CC) -c $(_CFLAGS) $< -o $@ + +main: main.o + $(CC) $(_CFLAGS) -o $@ $^ $(_LDFLAGS) + +clean: + $(RM) main main.o \ No newline at end of file diff -r 1d9d2308c1d2 -r f12b5dd4e18c test/main.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/main.c Tue Oct 22 22:39:05 2024 -0400 @@ -0,0 +1,102 @@ +#include "vec/vec.h" + +#include <stdio.h> /* FILE, fprintf, fputs */ +#include <inttypes.h> /* PRI* format macros */ + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) + +static const int8_t testval8[] = { + INT8_C(-80), INT8_C(-3), INT8_C(25), INT8_C(0x7F), + INT8_C(-42), INT8_C(27), INT8_C(24), INT8_C(0x40), +}; + +static const uint8_t testvalu8[] = { + UINT8_C(0x00), UINT8_C(0xFF), UINT8_C(0xFE), UINT8_C(0x7F), + UINT8_C(0xC0), UINT8_C(0x80), UINT8_C(0x20), UINT8_C(0x50), +}; + +static const int16_t testval16[] = { + INT16_C(-8000), INT16_C(-30), INT16_MAX, INT16_C(0x4000), + INT16_C(-42), INT16_C(250), INT16_MIN, INT16_C(0x500), +}; + +static const uint16_t testvalu16[] = { + UINT16_C(0x0000), UINT16_C(0xFFFF), UINT16_C(0xFEA), UINT16_C(0x7FF), + UINT16_C(0x7FFF), UINT16_C(0x8000), UINT16_C(0x20B), UINT16_C(0x50C), +}; + +static const int32_t testval32[] = { + INT32_C(-1000000), INT32_C(-3), INT32_C(0x00000000), INT32_C(0xFFFFFFFF), + INT32_C( -42), INT32_C(27), INT32_C(0xABCDEF03), INT32_C(0x00000FFF), + INT32_C(0xFFFFFFFF), INT32_C( 0), INT32_C(0xFFFFFFFE), INT32_C( 1), +}; +
+static const uint32_t testvalu32[] = { + UINT32_C(0x00000000), UINT32_C(0xDEADBEEF), UINT32_C(42), UINT32_C(0x12340000), + UINT32_C(0xFFFFFFFF), UINT32_C(0xFEDCBA98), UINT32_C(17), UINT32_C(0x00012345), + UINT32_C(0xFFFFFFFF), UINT32_C(0xFFFFFFFE), UINT32_C( 0), UINT32_C( 1), +}; + +static const int64_t testval64[] = { + INT64_MAX, INT64_C(-3), INT64_C(0x00000000), INT64_C(0xFFFFFFFFF), + INT64_MIN, INT64_C(645366), INT64_C(0x12345ABCDE), INT64_C(0xF00000FFF), +}; + +static const uint64_t testvalu64[] = { + UINT64_MAX, UINT64_C(0x44354365), UINT64_C(0x00000000), UINT64_C(0xFFFFFFFFF), + UINT64_C(0xff), UINT64_C(645366), UINT64_C(0x12345ABCDE), UINT64_C(0xF00000FFF), +}; + +#define VTEST(sign, bits, size) \ + static inline v##sign##int##bits##x##size vtest##sign##bits##x##size(const size_t start) \ + { \ + sign##int##bits##_t x[size]; \ + for (size_t i = 0; i < size; i++) \ + x[i] = testval##sign##bits[(start + i) % ARRAY_SIZE(testval##sign##bits)]; \ + return v##sign##int##bits##x##size##_load(x); \ + } + +#define VTEST_SIGN(bits, size) VTEST(, bits, size) VTEST(u, bits, size) + +VTEST_SIGN(8, 16) +VTEST_SIGN(16, 8) +VTEST_SIGN(32, 4) +VTEST_SIGN(64, 2) + +#define DEFINE_PRINT_VECTOR(sign, psign, bits, size) \ + static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \ + { \ + fputs("vector: ", file); \ + \ + sign##int##bits##_t v[size]; /* lane type must match the vector's signedness and the PRI macro */ \ + \ + v##sign##int##bits##x##size##_store(vec, v); \ + \ + fprintf(file, "%" PRI ## psign ## bits, v[0]); \ + \ + for (int i = 1; i < size; i++) \ + fprintf(file, ", %" PRI ## psign ## bits, v[i]); \ + \ + fputs("\n", file); \ + \ + } + +#define DEFINE_PRINT_VECTOR_2(bits, size) DEFINE_PRINT_VECTOR(, d, bits, size) DEFINE_PRINT_VECTOR(u, u, bits, size) + +DEFINE_PRINT_VECTOR_2(8, 16) +DEFINE_PRINT_VECTOR_2(16, 8) +DEFINE_PRINT_VECTOR_2(32, 4) +DEFINE_PRINT_VECTOR_2(64, 2) + +#include "test_arith.h" +#include "test_compare.h" + +int main(void) +{ + int ret = 0; + + ret |= test_arith(); + ret |=
test_compare(); + + return ret; +} diff -r 1d9d2308c1d2 -r f12b5dd4e18c test/test_arith.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_arith.h Tue Oct 22 22:39:05 2024 -0400 @@ -0,0 +1,120 @@ +#define CREATE_TEST(sign, psign, bits, size, op, equiv) \ + static int test_arith_v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size a, v##sign##int##bits##x##size b) \ + { \ + sign##int##bits##_t orig_a[size], orig_b[size], orig_c[size]; \ + \ + v##sign##int##bits##x##size c = v##sign##int##bits##x##size##_##op(a, b); \ + \ + v##sign##int##bits##x##size##_store(a, orig_a); \ + v##sign##int##bits##x##size##_store(b, orig_b); \ + v##sign##int##bits##x##size##_store(c, orig_c); \ + \ + for (int i = 0; i < size; i++) { \ + if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \ + fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%" PRI ## psign ## bits "] does not equal result [%" PRI ## psign ## bits "]!\n", i, equiv, orig_c[i]); \ + print_v##sign##int##bits##x##size(stderr,a); \ + print_v##sign##int##bits##x##size(stderr,b); \ + print_v##sign##int##bits##x##size(stderr,c); \ + fprintf(stderr, "\n"); \ + return 1; \ + } \ + } \ + \ + return 0; \ + } + +#define CREATE_TEST_SHIFT(sign, psign, bits, size, op, equiv) \ + static int test_arith_v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size a, vuint##bits##x##size b) \ + { \ + sign##int##bits##_t orig_a[size], orig_c[size]; \ + uint##bits##_t orig_b[size]; \ + \ + v##sign##int##bits##x##size c = v##sign##int##bits##x##size##_##op(a, b); \ + \ + v##sign##int##bits##x##size##_store(a, orig_a); \ + vuint##bits##x##size##_store(b, orig_b); \ + v##sign##int##bits##x##size##_store(c, orig_c); \ + \ + for (int i = 0; i < size; i++) { \ + if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \ + fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%" PRI ## psign ## bits "] does not equal result [%" PRI ## psign ## 
bits "]!\n", i, (sign##int##bits##_t)(equiv), orig_c[i]); /* equiv is (u)intmax_t here; cast to the lane type the PRI macro expects */ \ + print_v##sign##int##bits##x##size(stderr,a); \ + print_vuint##bits##x##size(stderr,b); \ + print_v##sign##int##bits##x##size(stderr,c); \ + fprintf(stderr, "\n"); \ + return 1; \ + } \ + } \ + \ + return 0; \ + } + +#define CREATE_TESTS(sign, psign, bits, size) \ + CREATE_TEST(sign, psign, bits, size, add, orig_a[i] + orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, sub, orig_a[i] - orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, mul, orig_a[i] * orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, div, (orig_b[i]) ? (orig_a[i] / orig_b[i]) : 0) \ + CREATE_TEST(sign, psign, bits, size, and, orig_a[i] & orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, or, orig_a[i] | orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, xor, orig_a[i] ^ orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, avg, (orig_a[i] * orig_b[i]) / 2) \ + CREATE_TEST_SHIFT(sign, psign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \ + CREATE_TEST_SHIFT(sign, psign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \ + CREATE_TEST_SHIFT(sign, psign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i])) + +#define CREATE_TESTS_2(bits, size) \ + CREATE_TESTS(, d, bits, size) \ + CREATE_TESTS(u, u, bits, size) + +CREATE_TESTS_2(8, 16) +CREATE_TESTS_2(16, 8) +CREATE_TESTS_2(32, 4) +CREATE_TESTS_2(64, 2) + +#undef CREATE_TESTS_2 +#undef CREATE_TESTS +#undef CREATE_TEST + +static int test_arith(void) +{ + int ret = 0; + +#define RUN_TESTS(sign, bits, size) \ + for (size_t i = 0U; i < ARRAY_SIZE(testval##sign##bits); i++) { \ + const v##sign##int##bits##x##size a = vtest##sign##bits##x##size(i); \ + for (size_t j = 0U; j < ARRAY_SIZE(testval##sign##bits); j++) { \ + const v##sign##int##bits##x##size b = vtest##sign##bits##x##size(j); \ + ret |= test_arith_v##sign##int##bits##x##size##_add(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_sub(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_mul(a, b); \
+ ret |= test_arith_v##sign##int##bits##x##size##_div(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_and(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_or(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_xor(a, b); \ + } \ + } \ + \ + for (size_t i = 0U; i < ARRAY_SIZE(testval##sign##bits); i++) { \ + const v##sign##int##bits##x##size a = vtest##sign##bits##x##size(i); \ + for (size_t j = 0U; j < ARRAY_SIZE(testvalu##bits); j++) { \ + const vuint##bits##x##size b = vtestu##bits##x##size(j); \ + ret |= test_arith_v##sign##int##bits##x##size##_rshift(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_lshift(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_lrshift(a, b); \ + } \ + } + +#define RUN_TESTS_2(bits, size) \ + RUN_TESTS(, bits, size) \ + RUN_TESTS(u, bits, size) + + RUN_TESTS_2(8, 16) + RUN_TESTS_2(16, 8) + RUN_TESTS_2(32, 4) + RUN_TESTS_2(64, 2) + +#undef RUN_TESTS_2 +#undef RUN_TESTS + + return ret; +} \ No newline at end of file diff -r 1d9d2308c1d2 -r f12b5dd4e18c test/test_compare.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_compare.h Tue Oct 22 22:39:05 2024 -0400 @@ -0,0 +1,72 @@ +#define CREATE_TEST(sign, psign, bits, size, op, equiv) \ + static int test_compare_v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size a, v##sign##int##bits##x##size b) \ + { \ + sign##int##bits##_t orig_a[size], orig_b[size], orig_c[size]; \ + \ + v##sign##int##bits##x##size c = v##sign##int##bits##x##size##_##op(a, b); \ + \ + v##sign##int##bits##x##size##_store(a, orig_a); \ + v##sign##int##bits##x##size##_store(b, orig_b); \ + v##sign##int##bits##x##size##_store(c, orig_c); \ + \ + for (int i = 0; i < size; i++) { \ + if ((sign##int##bits##_t)(((equiv) ? 
UINT##bits##_MAX : 0)) != orig_c[i]) { /* equiv is an int (relational result); cast it so the argument matches the PRI conversion */ \ + fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%" PRI ## psign ## bits "] does not equal result [%" PRI ## psign ## bits "]!\n", i, (sign##int##bits##_t)(equiv), orig_c[i]); \ + print_v##sign##int##bits##x##size(stderr,a); \ + print_v##sign##int##bits##x##size(stderr,b); \ + print_v##sign##int##bits##x##size(stderr,c); \ + fprintf(stderr, "\n"); \ + return 1; \ + } \ + } \ + \ + return 0; \ + } + +#define CREATE_TESTS(sign, psign, bits, size) \ + CREATE_TEST(sign, psign, bits, size, cmplt, orig_a[i] < orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, cmpgt, orig_a[i] > orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, cmpeq, orig_a[i] == orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, cmple, orig_a[i] <= orig_b[i]) \ + CREATE_TEST(sign, psign, bits, size, cmpge, orig_a[i] >= orig_b[i]) + +#define CREATE_TESTS_2(bits, size) CREATE_TESTS(, d, bits, size) CREATE_TESTS(u, u, bits, size) + +CREATE_TESTS_2(8, 16) +CREATE_TESTS_2(16, 8) +CREATE_TESTS_2(32, 4) +CREATE_TESTS_2(64, 2) + +#undef CREATE_TESTS_2 +#undef CREATE_TESTS +#undef CREATE_TEST + +static int test_compare(void) +{ + int ret = 0; + +#define RUN_TESTS(sign, bits, size) \ + for (size_t i = 0U; i < ARRAY_SIZE(testval##sign##bits); i++) { \ + const v##sign##int##bits##x##size a = vtest##sign##bits##x##size(i); \ + for (size_t j = 0U; j < ARRAY_SIZE(testval##sign##bits); j++) { \ + const v##sign##int##bits##x##size b = vtest##sign##bits##x##size(j); \ + ret |= test_compare_v##sign##int##bits##x##size##_cmplt(a, b); \ + ret |= test_compare_v##sign##int##bits##x##size##_cmpgt(a, b); \ + ret |= test_compare_v##sign##int##bits##x##size##_cmpeq(a, b); \ + ret |= test_compare_v##sign##int##bits##x##size##_cmple(a, b); \ + ret |= test_compare_v##sign##int##bits##x##size##_cmpge(a, b); \ + } \ + } + +#define RUN_TESTS_2(bits, size) RUN_TESTS(, bits, size) RUN_TESTS(u, bits, size) + + RUN_TESTS_2(8, 16) + RUN_TESTS_2(16, 8) + RUN_TESTS_2(32,
4) + RUN_TESTS_2(64, 2) + +#undef RUN_TESTS_2 +#undef RUN_TESTS + + return ret; +} \ No newline at end of file