changeset 9:6ff0b7a44bb6
generic: add initial support for 256-bit and 512-bit types

Eventually this could be expanded to cover floating-point types as well.
author      Paper <paper@tflc.us>
date        Mon, 18 Nov 2024 13:52:09 -0500
parents     978c167dcceb
children    d1d5d767004c
files       include/vec/impl/generic.h include/vec/vec.h test/main.c
            test/test_align.h test/test_arith.h test/test_compare.h
diffstat    6 files changed, 442 insertions(+), 52 deletions(-)
--- a/include/vec/impl/generic.h	Wed Oct 23 19:05:34 2024 -0400
+++ b/include/vec/impl/generic.h	Mon Nov 18 13:52:09 2024 -0500
@@ -171,3 +171,224 @@
 #undef VEC_DEFINE_STRUCT
 #undef VEC_DEFINE_OPERATIONS
+
+// -----------------------------------------------------------------
+// Okay, now we can implement our "double" structures.
+// These use existing structures that are 128 bits in
+// size to provide 256-bit or even 512-bit data types.
+
+#define VEC_DEFINE_STRUCT(sign, bits, size, halfsize) \
+	typedef struct { \
+		v##sign##int##bits##x##halfsize vecs[2]; \
+	} v##sign##int##bits##x##size;
+
+#define VEC_DEFINE_OP(opcap, op, sign, bits, size, halfsize) \
+	VEC_DECL_##opcap(sign, bits, size) \
+	{ \
+		vec1.vecs[0] = v##sign##int##bits##x##halfsize##_##op(vec1.vecs[0], vec2.vecs[0]); \
+		vec1.vecs[1] = v##sign##int##bits##x##halfsize##_##op(vec1.vecs[1], vec2.vecs[1]); \
+		return vec1; \
+	}
+
+// This could be in way fewer lines, but whatever
+#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size, halfsize) \
+	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.vecs[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \
+		vec.vecs[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \
+		return vec; \
+	} \
+	\
+	VEC_DECL_LOAD(sign, bits, size) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.vecs[0] = v##sign##int##bits##x##halfsize##_load(in); \
+		vec.vecs[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \
+		return vec; \
+	} \
+	\
+	VEC_DECL_SPLAT(sign, bits, size) \
+	{ \
+		v##sign##int##bits##x##size vec; \
+		vec.vecs[0] = v##sign##int##bits##x##halfsize##_splat(x); \
+		vec.vecs[1] = v##sign##int##bits##x##halfsize##_splat(x); \
+		return vec; \
+	} \
+	\
+	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
+	{ \
+		v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[0], out); \
+		v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[1], out + halfsize); \
+	} \
+	\
+	VEC_DECL_STORE(sign, bits, size) \
+	{ \
+		v##sign##int##bits##x##halfsize##_store(vec.vecs[0], out); \
+		v##sign##int##bits##x##halfsize##_store(vec.vecs[1], out + halfsize); \
+	} \
+	\
+	VEC_DEFINE_OP(ADD, add, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(SUB, sub, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(MUL, mul, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(AND, and, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(OR, or, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(XOR, xor, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(LSHIFT, lshift, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(RSHIFT, rshift, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(LRSHIFT, lrshift, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(DIV, div, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(AVG, avg, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(CMPLT, cmplt, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(CMPGT, cmpgt, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(CMPEQ, cmpeq, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(CMPGE, cmpge, sign, bits, size, halfsize) \
+	VEC_DEFINE_OP(CMPLE, cmple, sign, bits, size, halfsize)
+
+#ifndef VEC_VUINT8X32
+# define VEC_VUINT8X32
+VEC_DEFINE_STRUCT(u, 8, 32, 16)
+# define VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vuint8x32){ .vecs = { VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
+# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 8, 32, 16)
+#endif
+
+#ifndef VEC_VUINT16X16
+# define VEC_VUINT16X16
+VEC_DEFINE_STRUCT(u, 16, 16, 8)
+# define VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	((vuint16x16){ .vecs = { VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } })
+# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 16, 16, 8)
+#endif
+
+#ifndef VEC_VUINT32X8
+# define VEC_VUINT32X8
+VEC_DEFINE_STRUCT(u, 32, 8, 4)
+# define VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	((vuint32x8){ .vecs = { VUINT32x4_CONSTANT(a, b, c, d), VUINT32x4_CONSTANT(e, f, g, h) } })
+# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 32, 8, 4)
+#endif
+
+#ifndef VEC_VUINT64X4
+# define VEC_VUINT64X4
+VEC_DEFINE_STRUCT(u, 64, 4, 2)
+# define VUINT64x4_CONSTANT(a, b, c, d) \
+	((vuint64x4){ .vecs = { VUINT64x2_CONSTANT(a, b), VUINT64x2_CONSTANT(c, d) } })
+# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 64, 4, 2)
+#endif
+
+#ifndef VEC_VINT8X32
+# define VEC_VINT8X32
+VEC_DEFINE_STRUCT(, 8, 32, 16)
+# define VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vint8x32){ .vecs = { VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
+# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 8, 32, 16)
+#endif
+
+#ifndef VEC_VINT16X16
+# define VEC_VINT16X16
+VEC_DEFINE_STRUCT(, 16, 16, 8)
+# define VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	((vint16x16){ .vecs = { VINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } })
+# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 16, 16, 8)
+#endif
+
+#ifndef VEC_VINT32X8
+# define VEC_VINT32X8
+VEC_DEFINE_STRUCT(, 32, 8, 4)
+# define VINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	((vint32x8){ .vecs = { VINT32x4_CONSTANT(a, b, c, d), VINT32x4_CONSTANT(e, f, g, h) } })
+# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 32, 8, 4)
+#endif
+
+#ifndef VEC_VINT64X4
+# define VEC_VINT64X4
+VEC_DEFINE_STRUCT(, 64, 4, 2)
+# define VINT64x4_CONSTANT(a, b, c, d) \
+	((vint64x4){ .vecs = { VINT64x2_CONSTANT(a, b), VINT64x2_CONSTANT(c, d) } })
+# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 64, 4, 2)
+#endif
+
+#ifndef VEC_VUINT8X64
+# define VEC_VUINT8X64
+VEC_DEFINE_STRUCT(u, 8, 64, 32)
+# define VUINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
+	((vuint8x64){ .vecs = { VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VUINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } })
+# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 8, 64, 32)
+#endif
+
+#ifndef VEC_VUINT16X32
+# define VEC_VUINT16X32
+VEC_DEFINE_STRUCT(u, 16, 32, 16)
+# define VUINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vuint16x32){ .vecs = { VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
+# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 16, 32, 16)
+#endif
+
+#ifndef VEC_VUINT32X16
+# define VEC_VUINT32X16
+VEC_DEFINE_STRUCT(u, 32, 16, 8)
+# define VUINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	((vuint32x16){ .vecs = { VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } })
+# define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 32, 16, 8)
+#endif
+
+#ifndef VEC_VUINT64X8
+# define VEC_VUINT64X8
+VEC_DEFINE_STRUCT(u, 64, 8, 4)
+# define VUINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	((vuint64x8){ .vecs = { VUINT64x4_CONSTANT(a, b, c, d), VUINT64x4_CONSTANT(e, f, g, h) } })
+# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT
+VEC_DEFINE_OPERATIONS(u, U, 64, 8, 4)
+#endif
+
+#ifndef VEC_VINT8X64
+# define VEC_VINT8X64
+VEC_DEFINE_STRUCT(, 8, 64, 32)
+# define VINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
+	((vint8x64){ .vecs = { VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } })
+# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 8, 64, 32)
+#endif
+
+#ifndef VEC_VINT16X32
+# define VEC_VINT16X32
+VEC_DEFINE_STRUCT(, 16, 32, 16)
+# define VINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
+	((vint16x32){ .vecs = { VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
+# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 16, 32, 16)
+#endif
+
+#ifndef VEC_VINT32X16
+# define VEC_VINT32X16
+VEC_DEFINE_STRUCT(, 32, 16, 8)
+# define VINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
+	((vint32x16){ .vecs = { VINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } })
+# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 32, 16, 8)
+#endif
+
+#ifndef VEC_VINT64X8
+# define VEC_VINT64X8
VEC_DEFINE_STRUCT(, 64, 8, 4)
+# define VINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
+	((vint64x8){ .vecs = { VINT64x4_CONSTANT(a, b, c, d), VINT64x4_CONSTANT(e, f, g, h) } })
+# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
+VEC_DEFINE_OPERATIONS(, , 64, 8, 4)
+#endif
+
+#undef VEC_DEFINE_STRUCT
+#undef VEC_DEFINE_OPERATIONS
+#undef VEC_DEFINE_OP
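
The doubling trick above is easiest to see after the preprocessor runs. Here is a hand-expanded sketch of VEC_DEFINE_STRUCT(u, 32, 8, 4) plus VEC_DEFINE_OP(ADD, add, u, 32, 8, 4), with the function signature written out the way VEC_DECL_ADD in vec.h produces it:

/* hand expansion (illustrative): a 256-bit vector is just two 128-bit
 * halves, and every operation forwards to the existing 128-bit
 * implementation once per half */
typedef struct {
	vuint32x4 vecs[2];
} vuint32x8;

static inline VEC_ALWAYS_INLINE vuint32x8 vuint32x8_add(vuint32x8 vec1, vuint32x8 vec2)
{
	vec1.vecs[0] = vuint32x4_add(vec1.vecs[0], vec2.vecs[0]);
	vec1.vecs[1] = vuint32x4_add(vec1.vecs[1], vec2.vecs[1]);
	return vec1;
}

Because each wide operation is two independent narrow ones, any backend that supplies the 128-bit types (generic, SSE2, AltiVec) gets the 256-bit types for free, and the 512-bit types recurse through the 256-bit ones in the same way.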
--- a/include/vec/vec.h	Wed Oct 23 19:05:34 2024 -0400
+++ b/include/vec/vec.h	Mon Nov 18 13:52:09 2024 -0500
@@ -206,8 +206,88 @@
 #define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
 #define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0)
 
+#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
+#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT)
+#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0)
+
+#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
+#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT)
+#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0)
+
+#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
+#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT)
+#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0)
+
+#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
+#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT)
+#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0)
+
+#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT)
+#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0)
+
+#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT)
+#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0)
+
+#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT)
+#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0)
+
+#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT)
+#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0)
+
+#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
+#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT)
+#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0)
+
+#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
+#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT)
+#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x32_ALIGNMENT == 0)
+
+#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
+#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT)
+#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0)
+
+#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
+#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT)
+#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0)
+
+#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 64, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT)
+#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0)
+
+#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT)
+#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x32_ALIGNMENT == 0)
+
+#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT)
+#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0)
+
+#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT)
+#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0)
+
 /* --------------------------------------------------------------- */
-/* Implementation includes */
+/* Implementation defines to keep everything relatively consistent */
 
 #define VEC_OPERATION_DECL(sign, bits, size, ret, op, params) \
 	static inline VEC_ALWAYS_INLINE ret v##sign##int##bits##x##size##_##op params
@@ -232,6 +312,9 @@
 #define VEC_DECL_XOR(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, xor)
 #define VEC_DECL_AVG(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, avg)
 #define VEC_DECL_SHIFT(sign, bits, size, vectype, way) VEC_OPERATION_THIS_DECL(sign, bits, size, vectype##way##shift, (v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2))
+#define VEC_DECL_LSHIFT(sign, bits, size) VEC_DECL_SHIFT(sign, bits, size, , l)
+#define VEC_DECL_RSHIFT(sign, bits, size) VEC_DECL_SHIFT(sign, bits, size, , r)
+#define VEC_DECL_LRSHIFT(sign, bits, size) VEC_DECL_SHIFT(sign, bits, size, l, r)
 #define VEC_DECL_NOT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, not, (v##sign##int##bits##x##size vec))
 
 /* comparisons */
@@ -273,6 +356,8 @@
 	v##sign##int##bits##x##size##_store_aligned(vec1, vec1a); \
 	v##sign##int##bits##x##size##_store_aligned(vec2, vec2a); \
 	\
+	/* FIXME FIXME FIXME; the reason this zero thing is here is because */ \
+	/* the tests are too stupid to not include zero for divides. remove this ASAP */ \
 	for (int i = 0; i < size; i++) vec1a[i] = (vec2a[i]) ? (vec1a[i] / vec2a[i]) : 0; \
 	\
 	return v##sign##int##bits##x##size##_load_aligned(vec1a); \
@@ -292,10 +377,14 @@
 	return v##sign##int##bits##x##size##_load_aligned(vec1a); \
 }
 
+#define VEC_GENERIC_LSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(sign, csign, bits, size, , l)
+#define VEC_GENERIC_RSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(sign, csign, bits, size, , r)
+#define VEC_GENERIC_LRSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r)
+
 #define VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
-	VEC_GENERIC_SHIFT(sign, csign, bits, size, , l) /* left shift */ \
-	VEC_GENERIC_SHIFT(sign, csign, bits, size, , r) /* arithmetic right shift */ \
-	VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r) /* logical right shift */
+	VEC_GENERIC_LSHIFT(sign, csign, bits, size) \
+	VEC_GENERIC_RSHIFT(sign, csign, bits, size) \
+	VEC_GENERIC_LRSHIFT(sign, csign, bits, size)
 
 #define VEC_GENERIC_AVG(sign, bits, size) \
 	VEC_DECL_AVG(sign, bits, size) \
@@ -340,7 +429,8 @@
 # ifdef __ALTIVEC__
 #  include "impl/altivec.h"
 # endif
-/* x86 SSE2 */
+/* x86 SSE2; gcc intrinsics are probably more efficient than
+ * vec's implementation, but whatever. */
 # ifdef __SSE2__
 #  include "impl/sse2.h"
 # endif
@@ -405,6 +495,9 @@
 #undef VEC_GENERIC_DIVIDE
 #undef VEC_GENERIC_SHIFT
 #undef VEC_GENERIC_SHIFTS
+#undef VEC_GENERIC_LSHIFT
+#undef VEC_GENERIC_RSHIFT
+#undef VEC_GENERIC_LRSHIFT
 #undef VEC_GENERIC_AVG
 #undef VEC_GENERIC_THAN_OR_EQUAL
 #undef VEC_GENERIC_COMPARISON
@@ -419,4 +512,22 @@
 #undef VEC_VUINT32X4
 #undef VEC_VUINT64X2
 
+#undef VEC_VINT8X32
+#undef VEC_VINT16X16
+#undef VEC_VINT32X8
+#undef VEC_VINT64X4
+#undef VEC_VUINT8X32
+#undef VEC_VUINT16X16
+#undef VEC_VUINT32X8
+#undef VEC_VUINT64X4
+
+#undef VEC_VINT8X64
+#undef VEC_VINT16X32
+#undef VEC_VINT32X16
+#undef VEC_VINT64X8
+#undef VEC_VUINT8X64
+#undef VEC_VUINT16X32
+#undef VEC_VUINT32X16
+#undef VEC_VUINT64X8
+
 #endif /* VEC_VEC_H_ */
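
A minimal usage sketch of one of the new widths follows. It assumes the X_ALIGNED_ARRAY macros behave like their existing 128-bit counterparts, i.e. VEC_ALIGNED_ARRAY declares a suitably aligned array named by its second argument; the file and variable names are illustrative only:

/* hypothetical example.c: lane-wise add of two 256-bit unsigned vectors */
#include <inttypes.h>
#include <stdio.h>

#include "vec/vec.h"

int main(void)
{
	VUINT32x8_ALIGNED_ARRAY(a); /* 8 uint32_t, aligned to VUINT32x8_ALIGNMENT */
	VUINT32x8_ALIGNED_ARRAY(b);

	for (int i = 0; i < 8; i++) {
		a[i] = (uint32_t)(i + 1);
		b[i] = (uint32_t)(100 * (i + 1));
	}

	vuint32x8 va = vuint32x8_load_aligned(a);
	vuint32x8 vb = vuint32x8_load_aligned(b);

	/* under the generic implementation this fans out to two vuint32x4 adds */
	vuint32x8_store_aligned(vuint32x8_add(va, vb), a);

	for (int i = 0; i < 8; i++)
		printf("%" PRIu32 " ", a[i]); /* 101 202 303 404 505 606 707 808 */
	putchar('\n');

	return 0;
}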
--- a/test/main.c	Wed Oct 23 19:05:34 2024 -0400
+++ b/test/main.c	Mon Nov 18 13:52:09 2024 -0500
@@ -56,14 +56,7 @@
 	return v##sign##int##bits##x##size##_load_aligned(x); \
 }
 
-#define VTEST_SIGN(bits, size) VTEST(, , bits, size) VTEST(u, U, bits, size)
-
-VTEST_SIGN(8, 16)
-VTEST_SIGN(16, 8)
-VTEST_SIGN(32, 4)
-VTEST_SIGN(64, 2)
-
-#define DEFINE_PRINT_VECTOR(sign, csign, psign, bits, size) \
+#define VPRINT(sign, csign, psign, bits, size) \
 static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \
 { \
 	fputs("vector: ", file); \
@@ -81,17 +74,37 @@
 \
 \
 }
 
-#define DEFINE_PRINT_VECTOR_2(bits, size) DEFINE_PRINT_VECTOR(, , d, bits, size) DEFINE_PRINT_VECTOR(u, U, u, bits, size)
+#define DEF_VEC_TEST_FUNCS(bits, size) \
+	VTEST(, , bits, size) VTEST(u, U, bits, size) \
+	VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size)
+
+DEF_VEC_TEST_FUNCS(8, 16)
+DEF_VEC_TEST_FUNCS(16, 8)
+DEF_VEC_TEST_FUNCS(32, 4)
+DEF_VEC_TEST_FUNCS(64, 2)
 
-DEFINE_PRINT_VECTOR_2(8, 16)
-DEFINE_PRINT_VECTOR_2(16, 8)
-DEFINE_PRINT_VECTOR_2(32, 4)
-DEFINE_PRINT_VECTOR_2(64, 2)
+DEF_VEC_TEST_FUNCS(8, 32)
+DEF_VEC_TEST_FUNCS(16, 16)
+DEF_VEC_TEST_FUNCS(32, 8)
+DEF_VEC_TEST_FUNCS(64, 4)
+
+DEF_VEC_TEST_FUNCS(8, 64)
+DEF_VEC_TEST_FUNCS(16, 32)
+DEF_VEC_TEST_FUNCS(32, 16)
+DEF_VEC_TEST_FUNCS(64, 8)
+
+#undef DEF_VEC_TEST_FUNCS
+#undef VPRINT
+#undef VTEST
+
+// ------------------------------------------------------------
 
 #include "test_align.h"
 #include "test_arith.h"
 #include "test_compare.h"
 
+// ------------------------------------------------------------
+
 int main(void)
 {
 	int ret = 0;
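
For orientation, this is roughly what one DEF_VEC_TEST_FUNCS(32, 8) invocation now generates. The VPRINT prototype is visible in the hunk above; VTEST's parameter list and storage class are inferred from its call sites (vtest…(i) with a size_t index), so treat those as assumptions:

/* sketch of the declarations produced by DEF_VEC_TEST_FUNCS(32, 8) */
static vint32x8 vtest32x8(const size_t start);     /* from VTEST(, , 32, 8): loads test values */
static vuint32x8 vtestu32x8(const size_t start);   /* from VTEST(u, U, 32, 8)                  */
static inline void print_vint32x8(FILE *file, vint32x8 vec);   /* from VPRINT(, , d, 32, 8)  */
static inline void print_vuint32x8(FILE *file, vuint32x8 vec); /* from VPRINT(u, U, u, 32, 8) */

The consolidation matters because the 256-bit and 512-bit widths triple the number of instantiations: one macro now emits both fixture constructors and both printers per (bits, size) pair.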
--- a/test/test_align.h	Wed Oct 23 19:05:34 2024 -0400
+++ b/test/test_align.h	Mon Nov 18 13:52:09 2024 -0500
@@ -36,6 +36,11 @@
 	RUN_TESTS(32, 4)
 	RUN_TESTS(64, 2)
 
+	RUN_TESTS(8, 32)
+	RUN_TESTS(16, 16)
+	RUN_TESTS(32, 8)
+	RUN_TESTS(64, 4)
+
 #undef RUN_TESTS
 #undef RUN_TEST
--- a/test/test_arith.h	Wed Oct 23 19:05:34 2024 -0400
+++ b/test/test_arith.h	Mon Nov 18 13:52:09 2024 -0500
@@ -52,7 +52,7 @@
 	return 0; \
 }
 
-#define CREATE_TESTS(sign, psign, csign, bits, size) \
+#define CREATE_TESTS_SIGN(sign, psign, csign, bits, size) \
 	CREATE_TEST(sign, psign, csign, bits, size, add, orig_a[i] + orig_b[i]) \
 	CREATE_TEST(sign, psign, csign, bits, size, sub, orig_a[i] - orig_b[i]) \
 	CREATE_TEST(sign, psign, csign, bits, size, mul, orig_a[i] * orig_b[i]) \
@@ -65,16 +65,26 @@
 	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \
 	CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i]))
 
-#define CREATE_TESTS_2(bits, size) \
-	CREATE_TESTS(, d, , bits, size) \
-	CREATE_TESTS(u, u, U, bits, size)
+#define CREATE_TESTS(bits, size) \
+	CREATE_TESTS_SIGN(, d, , bits, size) \
+	CREATE_TESTS_SIGN(u, u, U, bits, size)
+
+CREATE_TESTS(8, 16)
+CREATE_TESTS(16, 8)
+CREATE_TESTS(32, 4)
+CREATE_TESTS(64, 2)
 
-CREATE_TESTS_2(8, 16)
-CREATE_TESTS_2(16, 8)
-CREATE_TESTS_2(32, 4)
-CREATE_TESTS_2(64, 2)
+CREATE_TESTS(8, 32)
+CREATE_TESTS(16, 16)
+CREATE_TESTS(32, 8)
+CREATE_TESTS(64, 4)
 
-#undef CREATE_TESTS_2
+CREATE_TESTS(8, 64)
+CREATE_TESTS(16, 32)
+CREATE_TESTS(32, 16)
+CREATE_TESTS(64, 8)
+
+#undef CREATE_TESTS_SIGN
 #undef CREATE_TESTS
 #undef CREATE_TEST
 
@@ -82,7 +92,7 @@
 {
 	int ret = 0;
 
-#define RUN_TESTS(sign, bits, size) \
+#define RUN_TESTS_SIGN(sign, bits, size) \
 	for (size_t i = 0U; i < ARRAY_SIZE(testval##sign##bits); i++) { \
 		const v##sign##int##bits##x##size a = vtest##sign##bits##x##size(i); \
 		for (size_t j = 0U; j < ARRAY_SIZE(testval##sign##bits); j++) { \
@@ -107,16 +117,26 @@
 	} \
 	}
 
-#define RUN_TESTS_2(bits, size) \
-	RUN_TESTS( , bits, size) \
-	RUN_TESTS(u, bits, size)
+#define RUN_TESTS(bits, size) \
+	RUN_TESTS_SIGN( , bits, size) \
+	RUN_TESTS_SIGN(u, bits, size)
+
+	RUN_TESTS(8, 16)
+	RUN_TESTS(16, 8)
+	RUN_TESTS(32, 4)
+	RUN_TESTS(64, 2)
 
-	RUN_TESTS_2(8, 16)
-	RUN_TESTS_2(16, 8)
-	RUN_TESTS_2(32, 4)
-	RUN_TESTS_2(64, 2)
+	RUN_TESTS(8, 32)
+	RUN_TESTS(16, 16)
+	RUN_TESTS(32, 8)
+	RUN_TESTS(64, 4)
 
-#undef RUN_TESTS_2
+	RUN_TESTS(8, 64)
+	RUN_TESTS(16, 32)
+	RUN_TESTS(32, 16)
+	RUN_TESTS(64, 8)
+
+#undef RUN_TESTS_SIGN
 #undef RUN_TESTS
 
 	return ret;
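
The CREATE_TEST body itself sits outside this hunk, so the sketch below is an assumption about its shape, reconstructed from the visible scalar reference expressions (orig_a[i] + orig_b[i] and friends) and the aligned-array macros; the function name is hypothetical:

/* sketch: lane-by-lane check in the spirit of CREATE_TEST(u, u, U, 32, 8, add, ...) */
static int test_add_vuint32x8(vuint32x8 a, vuint32x8 b)
{
	VUINT32x8_ALIGNED_ARRAY(orig_a);
	VUINT32x8_ALIGNED_ARRAY(orig_b);
	VUINT32x8_ALIGNED_ARRAY(got);

	vuint32x8_store_aligned(a, orig_a);
	vuint32x8_store_aligned(b, orig_b);
	vuint32x8_store_aligned(vuint32x8_add(a, b), got);

	for (int i = 0; i < 8; i++)
		if (got[i] != (uint32_t)(orig_a[i] + orig_b[i])) /* unsigned wrap-around */
			return 1; /* lane i disagrees with the scalar reference */

	return 0;
}

This pattern also explains the divide-by-zero guard added to VEC_GENERIC_DIVIDE above: the shared test value tables include zero, so until the tests skip zero divisors, both the scalar reference and the generic implementation have to map x / 0 to 0.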
--- a/test/test_compare.h	Wed Oct 23 19:05:34 2024 -0400
+++ b/test/test_compare.h	Mon Nov 18 13:52:09 2024 -0500
@@ -23,21 +23,31 @@
 	return 0; \
 }
 
-#define CREATE_TESTS(sign, psign, bits, size) \
+#define CREATE_TESTS_SIGN(sign, psign, bits, size) \
 	CREATE_TEST(sign, psign, bits, size, cmplt, orig_a[i] < orig_b[i]) \
 	CREATE_TEST(sign, psign, bits, size, cmpgt, orig_a[i] > orig_b[i]) \
 	CREATE_TEST(sign, psign, bits, size, cmpeq, orig_a[i] == orig_b[i]) \
 	CREATE_TEST(sign, psign, bits, size, cmple, orig_a[i] <= orig_b[i]) \
 	CREATE_TEST(sign, psign, bits, size, cmpge, orig_a[i] >= orig_b[i])
 
-#define CREATE_TESTS_2(bits, size) CREATE_TESTS(, d, bits, size) CREATE_TESTS(u, u, bits, size)
+#define CREATE_TESTS(bits, size) CREATE_TESTS_SIGN(, d, bits, size) CREATE_TESTS_SIGN(u, u, bits, size)
+
+CREATE_TESTS(8, 16)
+CREATE_TESTS(16, 8)
+CREATE_TESTS(32, 4)
+CREATE_TESTS(64, 2)
 
-CREATE_TESTS_2(8, 16)
-CREATE_TESTS_2(16, 8)
-CREATE_TESTS_2(32, 4)
-CREATE_TESTS_2(64, 2)
+CREATE_TESTS(8, 32)
+CREATE_TESTS(16, 16)
+CREATE_TESTS(32, 8)
+CREATE_TESTS(64, 4)
 
-#undef CREATE_TESTS_2
+CREATE_TESTS(8, 64)
+CREATE_TESTS(16, 32)
+CREATE_TESTS(32, 16)
+CREATE_TESTS(64, 8)
+
+#undef CREATE_TESTS_SIGN
 #undef CREATE_TESTS
 #undef CREATE_TEST
 
@@ -45,7 +55,7 @@
 {
 	int ret = 0;
 
-#define RUN_TESTS(sign, bits, size) \
+#define RUN_TESTS_SIGN(sign, bits, size) \
 	for (size_t i = 0U; i < ARRAY_SIZE(testval##sign##bits); i++) { \
 		const v##sign##int##bits##x##size a = vtest##sign##bits##x##size(i); \
 		for (size_t j = 0U; j < ARRAY_SIZE(testval##sign##bits); j++) { \
@@ -58,16 +68,26 @@
 	} \
 	}
 
-#define RUN_TESTS_2(bits, size) \
-	RUN_TESTS( , bits, size) \
-	RUN_TESTS(u, bits, size)
+#define RUN_TESTS(bits, size) \
+	RUN_TESTS_SIGN( , bits, size) \
+	RUN_TESTS_SIGN(u, bits, size)
+
+	RUN_TESTS(8, 16)
+	RUN_TESTS(16, 8)
+	RUN_TESTS(32, 4)
+	RUN_TESTS(64, 2)
 
-	RUN_TESTS_2(8, 16)
-	RUN_TESTS_2(16, 8)
-	RUN_TESTS_2(32, 4)
-	RUN_TESTS_2(64, 2)
+	RUN_TESTS(8, 32)
+	RUN_TESTS(16, 16)
+	RUN_TESTS(32, 8)
+	RUN_TESTS(64, 4)
 
-#undef RUN_TESTS_2
+	RUN_TESTS(8, 64)
+	RUN_TESTS(16, 32)
+	RUN_TESTS(32, 16)
+	RUN_TESTS(64, 8)
+
+#undef RUN_TESTS_SIGN
 #undef RUN_TESTS
 
 	return ret;