Mercurial > vec
changeset 4:75ab77f874e2
*: aligned generics, fixed altivec, aligned tests...
author | Paper <paper@tflc.us> |
---|---|
date | Wed, 23 Oct 2024 10:13:25 -0400 (2 months ago) |
parents | 3c5545b1568f |
children | 1f070512497f |
files | include/vec/impl/altivec.h include/vec/impl/gcc.h include/vec/impl/generic.h include/vec/impl/sse2.h include/vec/vec.h test/main.c test/test_align.h test/test_arith.h test/test_compare.h |
diffstat | 9 files changed, 258 insertions(+), 239 deletions(-) [+] |
line wrap: on
line diff
--- a/include/vec/impl/altivec.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/altivec.h	Wed Oct 23 10:13:25 2024 -0400
@@ -31,12 +31,46 @@

 #define VEC_ALTIVEC_ALIGNMENT 16

-/* Since altivec conveniently made their API super user friendly, we can just use
- * one giant macro to define literally everything */
-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */
+#ifdef vec_mul
+# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \
+	VEC_DECL_MUL(sign, bits, size) \
+	{ \
+		return vec_mul(vec1, vec2); \
+	}
+#else
+# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \
+	VEC_GENERIC_MULTIPLY(sign, csign, bits, size)
+#endif
+
+#ifdef vec_splats
+# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \
 	VEC_DECL_SPLAT(sign, bits, size) \
 	{ \
 		return vec_splats(x); \
+	}
+#else
+# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_SPLAT(sign, csign, bits, size)
+#endif
+
+#define VEC_ALTIVEC_uRSHIFT vec_sr
+#define VEC_ALTIVEC_RSHIFT vec_sra
+
+#define VEC_ALTIVEC_uLRSHIFT(sign, csign, bits, size) \
+	VEC_DECL_SHIFT(sign, bits, size, l, r) \
+	{ \
+		return vec_sr(vec1, vec2); \
+	}
+#define VEC_ALTIVEC_LRSHIFT(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r)
+
+/* Since altivec conveniently made their API super user friendly, we can just use
+ * one giant macro to define literally everything */
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
+	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
+	{ \
+		return vec_ld(0, in); \
 	} \
 	\
 	VEC_DECL_LOAD(sign, bits, size) \
@@ -44,6 +78,11 @@
 		return vec_perm(vec_ld(0, in), vec_ld(VEC_ALTIVEC_ALIGNMENT, in), vec_lvsl(0, in)); \
 	} \
 	\
+	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
+	{ \
+		vec_st(vec, 0, out); \
+	} \
+	\
 	VEC_DECL_STORE(sign, bits, size) \
 	{ \
 		VEC_ALIGNED_ARRAY(sign##int##bits##_t, aligned_out, size, VEC_ALTIVEC_ALIGNMENT); \
@@ -61,10 +100,7 @@
 		return vec_sub(vec1, vec2); \
 	} \
 	\
-	VEC_DECL_MUL(sign, bits, size) \
-	{ \
-		return vec_mul(vec1, vec2); \
-	} \
+	VEC_ALTIVEC_MUL(sign, csign, bits, size) \
 	\
 	VEC_DECL_SHIFT(sign, bits, size, , l) \
 	{ \
@@ -73,13 +109,10 @@
 	\ \
 	VEC_DECL_SHIFT(sign, bits, size, , r) \
 	{ \
-		return vec_sra(vec1, vec2); \
+		return VEC_ALTIVEC_##sign##RSHIFT(vec1, vec2); \
 	} \
 	\
-	VEC_DECL_SHIFT(sign, bits, size, l, r) \
-	{ \
-		return vec_sr(vec1, vec2); \
-	} \
+	VEC_ALTIVEC_##sign##LRSHIFT(sign, csign, bits, size) \
 	\
 	VEC_DECL_AVG(sign, bits, size) \
 	{ \
@@ -101,15 +134,17 @@
 		return vec_xor(vec1, vec2); \
 	} \
 	\
-	VEC_GENERIC_DIVIDE(sign, bits, size)
+	VEC_GENERIC_COMPARISONS(sign, csign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_ALTIVEC_SPLAT(sign, csign, bits, size)

 #ifndef VEC_VUINT8X16
 # define VEC_VUINT8X16
 typedef vector unsigned char vuint8x16;
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(u, 8, 16)
 # define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 #endif /* VEC_VUINT8X16 */

 #ifndef VEC_VINT8X16
@@ -117,8 +152,8 @@
 typedef vector signed char vint8x16;
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(, 8, 16)
 # define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 #endif /* VEC_VINT8X16 */

 #ifndef VEC_VUINT16X8
@@ -126,8 +161,8 @@
 typedef vector unsigned short vuint16x8;
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vuint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(u, 16, 8)
 # define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 #endif /* VEC_VUINT16X8 */

 #ifndef VEC_VINT16X8
@@ -135,8 +170,8 @@
 typedef vector signed short vint16x8;
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(, 16, 8)
 # define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 #endif /* VEC_VINT16X8 */

 #ifndef VEC_VUINT32X4
@@ -144,8 +179,8 @@
 typedef vector unsigned int vuint32x4;
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	(vuint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(u, 32, 4)
 # define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 #endif /* VEC_VUINT32X4 */

 #ifndef VEC_VINT32X4
@@ -153,8 +188,8 @@
 typedef vector signed int vint32x4;
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	(vint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(, 32, 4)
 # define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 #endif /* VEC_VINT32X4 */

 #if defined(__POWER8__) && defined(__VSX__)
@@ -164,8 +199,8 @@
 typedef vector unsigned long long vuint64x2;
 # define VUINT64x2_CONSTANT(a, b) \
 	(vuint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(u, 64, 2)
 # define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 # endif /* VEC_VUINT64X2 */

 # ifndef VEC_VINT64X2
@@ -173,10 +208,12 @@
 typedef vector signed long long vint64x2;
 # define VINT64x2_CONSTANT(a, b) \
 	(vint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(, 64, 2)
 # define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 64, 2)
 # endif /* VEC_VINT64X2 */

 #endif /* defined(__POWER8__) && defined(__VSX__) */

 #undef VEC_DEFINE_OPERATIONS
+#undef VEC_ALTIVEC_MUL
+#undef VEC_ALTIVEC_SPLAT
--- a/include/vec/impl/gcc.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/gcc.h	Wed Oct 23 10:13:25 2024 -0400
@@ -27,7 +27,7 @@
 #include <stdint.h>
 #include <string.h>

-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
 	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
 	{ \
 		v##sign##int##bits##x##size vec; \
@@ -100,9 +100,9 @@
 		return vec1 >= vec2; \
 	} \
 	\
-	VEC_GENERIC_DIVIDE(sign, bits, size) \
-	VEC_GENERIC_SPLAT(sign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
 	VEC_GENERIC_AVG(sign, bits, size)

 #ifndef VEC_VUINT8X16
@@ -110,8 +110,8 @@
 typedef uint8_t vuint8x16 __attribute__((__vector_size__(16)));
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(u, 8, 16)
-# define VINT8x16_ALIGNED 1
+# define VUINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 #endif

 #ifndef VEC_VUINT16X8
@@ -119,8 +119,8 @@
 typedef uint16_t vuint16x8 __attribute__((__vector_size__(16)));
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vuint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(u, 16, 8)
-# define VINT16x8_ALIGNED 1
+# define VUINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 #endif

 #ifndef VEC_VUINT32X4
@@ -128,8 +128,8 @@
 typedef uint32_t vuint32x4 __attribute__((__vector_size__(16)));
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	(vuint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(u, 32, 4)
-# define VINT32x4_ALIGNED 1
+# define VUINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 #endif

 #ifndef VEC_VUINT64X2
@@ -137,8 +137,8 @@
 typedef uint64_t vuint64x2 __attribute__((__vector_size__(16)));
 # define VUINT64x2_CONSTANT(a, b) \
 	(vuint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(u, 64, 2)
-# define VINT64x2_ALIGNED 1
+# define VUINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 #endif

 #ifndef VEC_VINT8X16
@@ -146,8 +146,8 @@
 typedef int8_t vint8x16 __attribute__((__vector_size__(16)));
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p }
-VEC_DEFINE_OPERATIONS(, 8, 16)
-# define VINT8x16_ALIGNED 1
+# define VINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 #endif

 #ifndef VEC_VINT16X8
@@ -155,8 +155,8 @@
 typedef int16_t vint16x8 __attribute__((__vector_size__(16)));
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(vint16x8){ a, b, c, d, e, f, g, h }
-VEC_DEFINE_OPERATIONS(, 16, 8)
-# define VINT16x8_ALIGNED 1
+# define VINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 #endif

 #ifndef VEC_VINT32X4
@@ -164,8 +164,8 @@
 typedef int32_t vint32x4 __attribute__((__vector_size__(16)));
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	(vint32x4){ a, b, c, d }
-VEC_DEFINE_OPERATIONS(, 32, 4)
-# define VINT32x4_ALIGNED 1
+# define VINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 #endif

 #ifndef VEC_VINT64X2
@@ -173,8 +173,8 @@
 typedef int64_t vint64x2 __attribute__((__vector_size__(16)));
 # define VINT64x2_CONSTANT(a, b) \
 	(vint64x2){ a, b }
-VEC_DEFINE_OPERATIONS(, 64, 2)
-# define VINT64x2_ALIGNED 1
+# define VINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 64, 2)
 #endif

 #undef VEC_DEFINE_OPERATIONS
--- a/include/vec/impl/generic.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/generic.h	Wed Oct 23 10:13:25 2024 -0400
@@ -32,7 +32,7 @@
 		sign##int##bits##_t arr[size]; \
 	} v##sign##int##bits##x##size;

-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
 	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
 	{ \
 		v##sign##int##bits##x##size vec; \
@@ -91,18 +91,19 @@
 		return vec1; \
 	} \
 	\
-	VEC_GENERIC_SPLAT(sign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, bits, size) \
-	VEC_GENERIC_DIVIDE(sign, bits, size) \
-	VEC_GENERIC_AVG(sign, bits, size)
+	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_GENERIC_AVG(sign, bits, size) \
+	VEC_GENERIC_COMPARISONS(sign, csign, bits, size)

 #ifndef VEC_VUINT8X16
 # define VEC_VUINT8X16
 VEC_DEFINE_STRUCT(u, 8, 16)
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
-VEC_DEFINE_OPERATIONS(u, 8, 16)
-VEC_GENERIC_COMPARISONS(u, 8, 16)
+# define VUINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 #endif

 #ifndef VEC_VUINT16X8
@@ -110,8 +111,8 @@
 VEC_DEFINE_STRUCT(u, 16, 8)
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } })
-VEC_DEFINE_OPERATIONS(u, 16, 8)
-VEC_GENERIC_COMPARISONS(u, 16, 8)
+# define VUINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 #endif

 #ifndef VEC_VUINT32X4
@@ -119,8 +120,8 @@
 VEC_DEFINE_STRUCT(u, 32, 4)
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	((vuint32x4){ .arr = { a, b, c, d } })
-VEC_DEFINE_OPERATIONS(u, 32, 4)
-VEC_GENERIC_COMPARISONS(u, 32, 4)
+# define VUINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 #endif

 #ifndef VEC_VUINT64X2
@@ -128,8 +129,8 @@
 VEC_DEFINE_STRUCT(u, 64, 2)
 # define VUINT64x2_CONSTANT(a, b) \
 	((vuint64x2){ .arr = { a, b } })
-VEC_DEFINE_OPERATIONS(u, 64, 2)
-VEC_GENERIC_COMPARISONS(u, 64, 2)
+# define VUINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 #endif

 #ifndef VEC_VINT8X16
@@ -137,8 +138,8 @@
 VEC_DEFINE_STRUCT(, 8, 16)
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
-VEC_DEFINE_OPERATIONS(, 8, 16)
-VEC_GENERIC_COMPARISONS(, 8, 16)
+# define VINT8x16_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 #endif

 #ifndef VEC_VINT16X8
@@ -146,8 +147,8 @@
 VEC_DEFINE_STRUCT(, 16, 8)
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	((vint16x8){ .arr = { a, b, c, d, e, f, g, h } })
-VEC_DEFINE_OPERATIONS(, 16, 8)
-VEC_GENERIC_COMPARISONS(, 16, 8)
+# define VINT16x8_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 #endif

 #ifndef VEC_VINT32X4
@@ -155,8 +156,8 @@
 VEC_DEFINE_STRUCT(, 32, 4)
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	((vint32x4){ .arr = { a, b, c, d } })
-VEC_DEFINE_OPERATIONS(, 32, 4)
-VEC_GENERIC_COMPARISONS(, 32, 4)
+# define VINT32x4_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 #endif

 #ifndef VEC_VINT64X2
@@ -164,8 +165,8 @@
 VEC_DEFINE_STRUCT(, 64, 2)
 # define VINT64x2_CONSTANT(a, b) \
 	((vint64x2){ .arr = { a, b } })
-VEC_DEFINE_OPERATIONS(, 64, 2)
-VEC_GENERIC_COMPARISONS(, 64, 2)
+# define VINT64x2_ALIGNMENT 1
+VEC_DEFINE_OPERATIONS(, , 64, 2)
 #endif

 #undef VEC_DEFINE_STRUCT
--- a/include/vec/impl/sse2.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/impl/sse2.h	Wed Oct 23 10:13:25 2024 -0400
@@ -24,11 +24,9 @@

 #include <emmintrin.h>

-#include <string.h> /* memcpy */
-
 #define VEC_SSE2_ALIGNMENT 16

-#define VEC_DEFINE_OPERATIONS(sign, bits, size) \
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
 	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
 	{ \
 		return _mm_load_si128((const __m128i *)in); \
@@ -74,9 +72,9 @@
 		return _mm_xor_si128(vec1, vec2); \
 	} \
 	\
-	VEC_GENERIC_SPLAT(sign, bits, size) \
-	VEC_GENERIC_DIVIDE(sign, bits, size) \
-	VEC_GENERIC_SHIFTS(sign, bits, size) \
+	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
+	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
 	VEC_DECL_MUL(sign, bits, size); \
 	VEC_GENERIC_AVG(sign, bits, size)

@@ -100,9 +98,9 @@
 typedef __m128i vuint8x16;
 # define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(u, 8, 16)
-VEC_GENERIC_COMPARISONS(u, 8, 16)
 # define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 8, 16)
+VEC_GENERIC_COMPARISONS(u, U, 8, 16)
 VEC_DECL_MUL(u, 8, 16)
 {
 	// unpack and multiply
@@ -119,9 +117,9 @@
 typedef __m128i vuint16x8;
 # define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(_mm_setr_epi16(h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(u, 16, 8)
-VEC_GENERIC_COMPARISONS(u, 16, 8)
 # define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 16, 8)
+VEC_GENERIC_COMPARISONS(u, U, 16, 8)
 VEC_DECL_MUL(u, 16, 8)
 {
 	return _mm_mullo_epi16(vec1, vec2);
@@ -133,9 +131,9 @@
 typedef __m128i vuint32x4;
 # define VUINT32x4_CONSTANT(a, b, c, d) \
 	(_mm_setr_epi32(d, c, b, a))
-VEC_DEFINE_OPERATIONS(u, 32, 4)
-VEC_GENERIC_COMPARISONS(u, 32, 4)
 # define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 32, 4)
+VEC_GENERIC_COMPARISONS(u, U, 32, 4)
 VEC_DECL_MUL(u, 32, 4)
 {
 	/* this was stolen from... somewhere :) */
@@ -156,9 +154,9 @@
 {
 	return _mm_setr_epi32(b, b >> 32, a, a >> 32);
 }
-VEC_DEFINE_OPERATIONS(u, 64, 2)
-VEC_GENERIC_COMPARISONS(u, 64, 2)
 # define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 64, 2)
+VEC_GENERIC_COMPARISONS(u, U, 64, 2)
 VEC_DECL_MUL(u, 64, 2)
 {
 	/* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */
@@ -192,9 +190,9 @@
 typedef __m128i vint8x16;
 # define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
 	(_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(, 8, 16)
+# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 8, 16)
 VEC_DEFINE_COMPARISONS_SIGNED(8, 16)
-# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DECL_MUL(, 8, 16)
 {
 	// unpack and multiply
@@ -211,9 +209,9 @@
 typedef __m128i vint16x8;
 # define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
 	(_mm_setr_epi16(h, g, f, e, d, c, b, a))
-VEC_DEFINE_OPERATIONS(, 16, 8)
+# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 16, 8)
 VEC_DEFINE_COMPARISONS_SIGNED(16, 8)
-# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DECL_MUL(, 16, 8)
 {
 	return _mm_mullo_epi16(vec1, vec2);
@@ -225,9 +223,9 @@
 typedef __m128i vint32x4;
 # define VINT32x4_CONSTANT(a, b, c, d) \
 	(_mm_setr_epi32(d, c, b, a))
-VEC_DEFINE_OPERATIONS(, 32, 4)
+# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 32, 4)
 VEC_DEFINE_COMPARISONS_SIGNED(32, 4)
-# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DECL_MUL(, 32, 4)
 {
 	__m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); // (-,a3,-,a1)
@@ -247,9 +245,9 @@
 {
 	return _mm_setr_epi32(b, vec_rshift(b, 32), a, vec_rshift(a, 32));
 }
-VEC_DEFINE_OPERATIONS(, 64, 2)
-VEC_GENERIC_COMPARISONS(, 64, 2)
 # define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 64, 2)
+VEC_GENERIC_COMPARISONS(, , 64, 2)
 VEC_DECL_MUL(, 64, 2)
 {
 	/* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */
--- a/include/vec/vec.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/include/vec/vec.h	Wed Oct 23 10:13:25 2024 -0400
@@ -49,23 +49,15 @@
 # endif
 #endif

-#ifndef VEC_HAVE_GNUC_VECTORS
-# if VEC_GNUC_ATLEAST(4, 0, 0)
-# define VEC_HAVE_GNUC_VECTORS
+#ifndef VEC_ALIGNED
+# if VEC_GNUC_ATLEAST(2, 7, 0)
+# define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
 # endif
 #endif

-#ifndef VEC_ALIGNED
-# if VEC_GNUC_ATLEAST(2, 7, 0)
-# define VEC_ALIGNED(x) __attribute__((aligned(x)))
-# endif
-#endif
-
-#ifndef VEC_ALWAYS_INLINE
-# if VEC_GNUC_ATLEAST(3, 1, 0)
-# define VEC_ALWAYS_INLINE(x) __attribute__((always_inline))
-# endif
-#endif
+/* FIXME: gcc 4.2 on Mac OS X doesn't have always_inline,
+ * even though docs and many online sources say that it
+ * should have it. */

 #ifndef VEC_ALWAYS_INLINE
 # define VEC_ALWAYS_INLINE
@@ -80,7 +72,7 @@
 /* allocate more than necessary to align */
 # define VEC_ALIGNED_ARRAY(type, var, length, align) \
 	unsigned char vec_##var##_unaligned_[((length) * sizeof(type)) + (align) - 1]; \
-	type *var = (type *)((((intptr_t)vec_##var##_unaligned_ + (align) - 1) / (align)) * (align))
+	type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + ((align) - 1)) & ~((uintptr_t)(align) - 1))
 # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
 	(sizeof(vec_##var##_unaligned_) - ((align) - 1))
 #endif
@@ -172,6 +164,49 @@
 }

 /* --------------------------------------------------------------- */
+/* Array alignment macros */
+
+#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
+#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x16_ALIGNMENT == 0)
+
+#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
+#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x8_ALIGNMENT == 0)
+
+#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
+#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x4_ALIGNMENT == 0)
+
+#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
+#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x2_ALIGNMENT == 0)
+
+#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
+#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x16_ALIGNMENT == 0)
+
+#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
+#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x8_ALIGNMENT == 0)
+
+#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
+#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x4_ALIGNMENT == 0)
+
+#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
+#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x2_ALIGNMENT == 0)
+
+/* --------------------------------------------------------------- */
 /* Implementation includes */

 #define VEC_OPERATION_DECL(sign, bits, size, ret, op, params) \
@@ -207,44 +242,60 @@
 #define VEC_DECL_CMPGE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpge)

 /* Generic variations. */
-#define VEC_GENERIC_SPLAT(sign, bits, size) \
+#define VEC_GENERIC_SPLAT(sign, csign, bits, size) \
 	VEC_DECL_SPLAT(sign, bits, size) \
 	{ \
-		sign##int##bits##_t va[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(va); \
 		for (int i = 0; i < size; i++) va[i] = x; \
-		return v##sign##int##bits##x##size##_load(va); \
+		return v##sign##int##bits##x##size##_load_aligned(va); \
 	}

-#define VEC_GENERIC_DIVIDE(sign, bits, size) \
+#define VEC_GENERIC_MULTIPLY(sign, csign, bits, size) \
+	VEC_DECL_MUL(sign, bits, size) \
+	{ \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
+		\
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \
+		\
+		for (int i = 0; i < size; i++) vec1a[i] *= vec2a[i]; \
+		\
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
+	}
+
+#define VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
 	VEC_DECL_DIV(sign, bits, size) \
 	{ \
-		sign##int##bits##_t vec1a[size], vec2a[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
 		\
-		v##sign##int##bits##x##size##_store(vec1, vec1a); \
-		v##sign##int##bits##x##size##_store(vec2, vec2a); \
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \
 		\
 		for (int i = 0; i < size; i++) vec1a[i] = (vec2a[i]) ? (vec1a[i] / vec2a[i]) : 0; \
 		\
-		return v##sign##int##bits##x##size##_load(vec1a); \
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
 	}

-#define VEC_GENERIC_SHIFT(sign, bits, size, vectype, way) \
+#define VEC_GENERIC_SHIFT(sign, csign, bits, size, vectype, way) \
 	VEC_DECL_SHIFT(sign, bits, size, vectype, way) \
 	{ \
-		sign##int##bits##_t vec1a[size], vec2a[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		VUINT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
 		\
-		v##sign##int##bits##x##size##_store(vec1, vec1a); \
-		vuint##bits##x##size##_store(vec2, vec2a); \
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		vuint##bits##x##size##_store_aligned(vec2, vec2a); \
 		\
 		for (int i = 0; i < size; i++) vec1a[i] = vec_##sign##vectype##way##shift(vec1a[i], vec2a[i]); \
 		\
-		return v##sign##int##bits##x##size##_load(vec1a); \
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
 	}

-#define VEC_GENERIC_SHIFTS(sign, bits, size) \
-	VEC_GENERIC_SHIFT(sign, bits, size, , l) /* left shift */ \
-	VEC_GENERIC_SHIFT(sign, bits, size, , r) /* arithmetic right shift */ \
-	VEC_GENERIC_SHIFT(sign, bits, size, l, r) /* logical right shift */
+#define VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size, , l) /* left shift */ \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size, , r) /* arithmetic right shift */ \
+	VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r) /* logical right shift */

 #define VEC_GENERIC_AVG(sign, bits, size) \
 	VEC_DECL_AVG(sign, bits, size) \
@@ -264,23 +315,24 @@
 		return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmplt(vec1, vec2)); \
 	}

-#define VEC_GENERIC_COMPARISON(sign, bits, size, name, op) \
+#define VEC_GENERIC_COMPARISON(sign, csign, bits, size, name, op) \
 	VEC_DECL_CMP##name(sign, bits, size) \
 	{ \
-		sign##int##bits##_t vec1a[size], vec2a[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec1a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(vec2a); \
 		\
-		v##sign##int##bits##x##size##_store(vec1, vec1a); \
-		v##sign##int##bits##x##size##_store(vec2, vec2a); \
+		v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
+		v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \
 		\
 		for (int i = 0; i < size; i++) vec1a[i] = (vec1a[i] op vec2a[i]) ? UINT##bits##_MAX : 0; \
 		\
-		return v##sign##int##bits##x##size##_load(vec1a); \
+		return v##sign##int##bits##x##size##_load_aligned(vec1a); \
 	}

-#define VEC_GENERIC_COMPARISONS(sign, bits, size) \
-	VEC_GENERIC_COMPARISON(sign, bits, size, LT, <) \
-	VEC_GENERIC_COMPARISON(sign, bits, size, GT, >) \
-	VEC_GENERIC_COMPARISON(sign, bits, size, EQ, ==) \
+#define VEC_GENERIC_COMPARISONS(sign, csign, bits, size) \
+	VEC_GENERIC_COMPARISON(sign, csign, bits, size, LT, <) \
+	VEC_GENERIC_COMPARISON(sign, csign, bits, size, GT, >) \
+	VEC_GENERIC_COMPARISON(sign, csign, bits, size, EQ, ==) \
 	VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size)

 #ifndef VEC_SUPPRESS_HW
@@ -367,81 +419,4 @@
 #undef VEC_VUINT32X4
 #undef VEC_VUINT64X2

-/* ---------------------------------------------------------------- */
-/* user-friendly alignment crap */
-
-#ifndef VINT8x16_ALIGNMENT
-# define VINT8x16_ALIGNMENT 1
-#endif
-
-#ifndef VINT16x8_ALIGNMENT
-# define VINT16x8_ALIGNMENT 1
-#endif
-
-#ifndef VINT32x4_ALIGNMENT
-# define VINT32x4_ALIGNMENT 1
-#endif
-
-#ifndef VINT64x2_ALIGNMENT
-# define VINT64x2_ALIGNMENT 1
-#endif
-
-#ifndef VUINT8x16_ALIGNMENT
-# define VUINT8x16_ALIGNMENT 1
-#endif
-
-#ifndef VUINT16x8_ALIGNMENT
-# define VUINT16x8_ALIGNMENT 1
-#endif
-
-#ifndef VUINT32x4_ALIGNMENT
-# define VUINT32x4_ALIGNMENT 1
-#endif
-
-#ifndef VUINT64x2_ALIGNMENT
-# define VUINT64x2_ALIGNMENT 1
-#endif
-
-/* pointer alignment macros */
-
-#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
-#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
-
-#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
-#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
-#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0)
-
-#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
-#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0)
-
-#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
-#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0)
-
-#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
-#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0)
-
-#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
-#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0)
-
-#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
-#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0)
-
-#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
-#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
-
 #endif /* VEC_VEC_H_ */
--- a/test/main.c	Tue Oct 22 23:27:15 2024 -0400
+++ b/test/main.c	Wed Oct 23 10:13:25 2024 -0400
@@ -47,30 +47,30 @@
 	UINT64_C(0xff), UINT64_C(645366), UINT64_C(0x12345ABCDE), UINT64_C(0xF00000FFF),
 };

-#define VTEST(sign, bits, size) \
+#define VTEST(sign, csign, bits, size) \
 	static inline v##sign##int##bits##x##size vtest##sign##bits##x##size(const size_t start) \
 	{ \
-		sign##int##bits##_t x[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(x); \
 		for (size_t i = 0; i < size; i++) \
 			x[i] = testval##sign##bits[(start + i) % ARRAY_SIZE(testval##sign##bits)]; \
-		return v##sign##int##bits##x##size##_load(x); \
+		return v##sign##int##bits##x##size##_load_aligned(x); \
 	}

-#define VTEST_SIGN(bits, size) VTEST(, bits, size) VTEST(u, bits, size)
+#define VTEST_SIGN(bits, size) VTEST(, , bits, size) VTEST(u, U, bits, size)

 VTEST_SIGN(8, 16)
 VTEST_SIGN(16, 8)
 VTEST_SIGN(32, 4)
 VTEST_SIGN(64, 2)

-#define DEFINE_PRINT_VECTOR(sign, psign, bits, size) \
+#define DEFINE_PRINT_VECTOR(sign, csign, psign, bits, size) \
 	static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \
 	{ \
 		fputs("vector: ", file); \
 		\
-		int##bits##_t v[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(v); \
 		\
-		v##sign##int##bits##x##size##_store(vec, v); \
+		v##sign##int##bits##x##size##_store_aligned(vec, v); \
 		\
 		fprintf(file, "%" PRI ## psign ## bits, v[0]); \
 		\
@@ -81,7 +81,7 @@
 		\
 	}

-#define DEFINE_PRINT_VECTOR_2(bits, size) DEFINE_PRINT_VECTOR(, d, bits, size) DEFINE_PRINT_VECTOR(u, u, bits, size)
+#define DEFINE_PRINT_VECTOR_2(bits, size) DEFINE_PRINT_VECTOR(, , d, bits, size) DEFINE_PRINT_VECTOR(u, U, u, bits, size)

 DEFINE_PRINT_VECTOR_2(8, 16)
 DEFINE_PRINT_VECTOR_2(16, 8)
--- a/test/test_align.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/test/test_align.h	Wed Oct 23 10:13:25 2024 -0400
@@ -22,6 +22,9 @@
 		\
 		/* mark success or failure */ \
 		ret |= !!memcmp(vec_arr, vec_arr_out, V##csign##INT##bits##x##size##_ALIGNED_ARRAY_LENGTH(vec_arr)); \
+		\
+		ret |= !V##csign##INT##bits##x##size##_PTR_ALIGNED(vec_arr); \
+		ret |= !V##csign##INT##bits##x##size##_PTR_ALIGNED(vec_arr_out); \
 	} while (0);

 #define RUN_TESTS(bits, size) \
--- a/test/test_arith.h	Tue Oct 22 23:27:15 2024 -0400
+++ b/test/test_arith.h	Wed Oct 23 10:13:25 2024 -0400
@@ -1,13 +1,15 @@
-#define CREATE_TEST(sign, psign, bits, size, op, equiv) \
+#define CREATE_TEST(sign, psign, csign, bits, size, op, equiv) \
 	static int test_arith_v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size a, v##sign##int##bits##x##size b) \
 	{ \
-		sign##int##bits##_t orig_a[size], orig_b[size], orig_c[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_a); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_b); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_c); \
 		\
 		v##sign##int##bits##x##size c = v##sign##int##bits##x##size##_##op(a, b); \
 		\
-		v##sign##int##bits##x##size##_store(a, orig_a); \
-		v##sign##int##bits##x##size##_store(b, orig_b); \
-		v##sign##int##bits##x##size##_store(c, orig_c); \
+		v##sign##int##bits##x##size##_store_aligned(a, orig_a); \
+		v##sign##int##bits##x##size##_store_aligned(b, orig_b); \
+		v##sign##int##bits##x##size##_store_aligned(c, orig_c); \
 		\
 		for (int i = 0; i < size; i++) { \
 			if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \
@@ -23,17 +25,18 @@
 		return 0; \
 	}

-#define CREATE_TEST_SHIFT(sign, psign, bits, size, op, equiv) \
+#define CREATE_TEST_SHIFT(sign, psign, csign, bits, size, op, equiv) \
 	static int test_arith_v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size a, vuint##bits##x##size b) \
 	{ \
-		sign##int##bits##_t orig_a[size], orig_c[size]; \
-		uint##bits##_t orig_b[size]; \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_a); \
+		VUINT##bits##x##size##_ALIGNED_ARRAY(orig_b); \
+		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(orig_c); \
 		\
 		v##sign##int##bits##x##size c = v##sign##int##bits##x##size##_##op(a, b); \
 		\
-		v##sign##int##bits##x##size##_store(a, orig_a); \
-		vuint##bits##x##size##_store(b, orig_b); \
-		v##sign##int##bits##x##size##_store(c, orig_c); \
+		v##sign##int##bits##x##size##_store_aligned(a, orig_a); \
+		vuint##bits##x##size##_store_aligned(b, orig_b); \
+		v##sign##int##bits##x##size##_store_aligned(c, orig_c); \
 		\
 		for (int i = 0; i < size; i++) { \
 			if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \
@@ -49,22 +52,22 @@
 		return 0; \
 	}

-#define CREATE_TESTS(sign, psign, bits, size) \
-	CREATE_TEST(sign, psign, bits, size, add, orig_a[i] + orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, sub, orig_a[i] - orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, mul, orig_a[i] * orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, div, (orig_b[i]) ? (orig_a[i] / orig_b[i]) : 0) \
-	CREATE_TEST(sign, psign, bits, size, and, orig_a[i] & orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, or, orig_a[i] | orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, xor, orig_a[i] ^ orig_b[i]) \
-	CREATE_TEST(sign, psign, bits, size, avg, (orig_a[i] * orig_b[i]) / 2) \
-	CREATE_TEST_SHIFT(sign, psign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \
-	CREATE_TEST_SHIFT(sign, psign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \
-	CREATE_TEST_SHIFT(sign, psign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i]))
+#define CREATE_TESTS(sign, psign, csign, bits, size) \
+	CREATE_TEST(sign, psign, csign, bits, size, add, orig_a[i] + orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, sub, orig_a[i] - orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, mul, orig_a[i] * orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, div, (orig_b[i]) ? (orig_a[i] / orig_b[i]) : 0) \
+	CREATE_TEST(sign, psign, csign, bits, size, and, orig_a[i] & orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, or, orig_a[i] | orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, xor, orig_a[i] ^ orig_b[i]) \
+	CREATE_TEST(sign, psign, csign, bits, size, avg, (orig_a[i] * orig_b[i]) / 2) \
+	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \
+	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \
+	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i]))

 #define CREATE_TESTS_2(bits, size) \
-	CREATE_TESTS(, d, bits, size) \
-	CREATE_TESTS(u, u, bits, size)
+	CREATE_TESTS(, d, , bits, size) \
+	CREATE_TESTS(u, u, U, bits, size)

 CREATE_TESTS_2(8, 16)
 CREATE_TESTS_2(16, 8)
@@ -96,8 +99,8 @@
 	\
 	for (size_t i = 0U; i < ARRAY_SIZE(testval##sign##bits); i++) { \
 		const v##sign##int##bits##x##size a = vtest##sign##bits##x##size(i); \
-		for (size_t j = 0U; j < ARRAY_SIZE(testvalu##bits); j++) { \
-			const vuint##bits##x##size b = vtestu##bits##x##size(j); \
+		for (uint32_t j = 0U; j < bits; j++) { \
+			const vuint##bits##x##size b = vuint##bits##x##size##_splat(j); \
 			ret |= test_arith_v##sign##int##bits##x##size##_rshift(a, b); \
 			ret |= test_arith_v##sign##int##bits##x##size##_lshift(a, b); \
 			ret |= test_arith_v##sign##int##bits##x##size##_lrshift(a, b); \
@@ -105,7 +108,7 @@
 	}

 #define RUN_TESTS_2(bits, size) \
-	RUN_TESTS(, bits, size) \
+	RUN_TESTS( , bits, size) \
 	RUN_TESTS(u, bits, size)

 RUN_TESTS_2(8, 16)
--- a/test/test_compare.h Tue Oct 22 23:27:15 2024 -0400 +++ b/test/test_compare.h Wed Oct 23 10:13:25 2024 -0400 @@ -58,7 +58,9 @@ } \ } -#define RUN_TESTS_2(bits, size) RUN_TESTS(, bits, size) RUN_TESTS(u, bits, size) +#define RUN_TESTS_2(bits, size) \ + RUN_TESTS( , bits, size) \ + RUN_TESTS(u, bits, size) RUN_TESTS_2(8, 16) RUN_TESTS_2(16, 8)