/**
 * vec - a tiny SIMD vector library in plain C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

/* Generic array-based implementation. */

#include <stdint.h>
#include <string.h>

/*
 * VEC_DEFINE_STRUCT(sign, bits, size)
 *
 * Declares the vector type v<sign>int<bits>x<size>: a struct whose only
 * member, `arr`, is an array of <size> elements of type <sign>int<bits>_t.
 * Pass `u` as `sign` for an unsigned element type and leave it empty for a
 * signed one. The expansion already ends in `;`, so invocations take none.
 */
#define VEC_DEFINE_STRUCT(sign, bits, size) \
	typedef struct { sign##int##bits##_t arr[size]; } v##sign##int##bits##x##size;

/*
 * VEC_DEFINE_OPERATIONS(sign, csign, bits, size)
 *
 * Emits the array-based implementation of the operations on
 * v<sign>int<bits>x<size>. Each function header comes from the matching
 * VEC_DECL_* macro (defined elsewhere), which also supplies the parameter
 * names used in the bodies below: `in`, `out`, `vec`, `vec1` and `vec2`.
 *
 *  - load_aligned / load:   memcpy sizeof(arr) bytes from `in` into a new
 *                           vector; `load` simply forwards to `load_aligned`.
 *  - store_aligned / store: memcpy the vector's array out to `out`; `store`
 *                           simply forwards to `store_aligned`.
 *  - add, sub, mul, and, or, xor: apply the C compound-assignment operator
 *                           element by element to `vec1` (a by-value copy)
 *                           and return it.
 *
 * Splat, shifts, division, average and comparisons are produced by the
 * VEC_GENERIC_* macros (defined elsewhere); `csign` is only forwarded to
 * them.
 */
#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
	{ \
		v##sign##int##bits##x##size vec; \
		memcpy(vec.arr, in, sizeof(vec.arr)); \
		return vec; \
	} \
	\
	VEC_DECL_LOAD(sign, bits, size) \
	{ \
		return v##sign##int##bits##x##size##_load_aligned(in); \
	} \
	\
	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
	{ \
		memcpy(out, vec.arr, sizeof(vec.arr)); \
	} \
	\
	VEC_DECL_STORE(sign, bits, size) \
	{ \
		/* Plain call, no `return`: the store functions yield no value, and */ \
		/* ISO C forbids `return expr;` in a function returning void.       */ \
		v##sign##int##bits##x##size##_store_aligned(vec, out); \
	} \
	\
	VEC_DECL_ADD(sign, bits, size) \
	{ \
		for (int i = 0; i < size; i++) vec1.arr[i] += vec2.arr[i]; \
		return vec1; \
	} \
	\
	VEC_DECL_SUB(sign, bits, size) \
	{ \
		for (int i = 0; i < size; i++) vec1.arr[i] -= vec2.arr[i]; \
		return vec1; \
	} \
	\
	VEC_DECL_MUL(sign, bits, size) \
	{ \
		for (int i = 0; i < size; i++) vec1.arr[i] *= vec2.arr[i]; \
		return vec1; \
	} \
	\
	VEC_DECL_AND(sign, bits, size) \
	{ \
		for (int i = 0; i < size; i++) vec1.arr[i] &= vec2.arr[i]; \
		return vec1; \
	} \
	\
	VEC_DECL_OR(sign, bits, size) \
	{ \
		for (int i = 0; i < size; i++) vec1.arr[i] |= vec2.arr[i]; \
		return vec1; \
	} \
	\
	VEC_DECL_XOR(sign, bits, size) \
	{ \
		for (int i = 0; i < size; i++) vec1.arr[i] ^= vec2.arr[i]; \
		return vec1; \
	} \
	\
	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
	VEC_GENERIC_AVG(sign, bits, size) \
	VEC_GENERIC_COMPARISONS(sign, csign, bits, size)

/*
 * Unsigned 128-bit vector types.
 *
 * Each section is compiled only when its VEC_V* guard is not yet defined, so
 * an earlier definition of the same type takes precedence (presumably a
 * platform-specific implementation included before this file -- confirm).
 * A section provides the struct, a *_CONSTANT compound-literal initializer
 * taking one argument per element, an *_ALIGNMENT value equal to the element
 * size in bytes, and the operations.
 */

/* vuint8x16: 16 elements of uint8_t. */
#ifndef VEC_VUINT8X16
# define VEC_VUINT8X16
VEC_DEFINE_STRUCT(u, 8, 16)
# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
# define VUINT8x16_ALIGNMENT 1
VEC_DEFINE_OPERATIONS(u, U, 8, 16)
#endif

/* vuint16x8: 8 elements of uint16_t. */
#ifndef VEC_VUINT16X8
# define VEC_VUINT16X8
VEC_DEFINE_STRUCT(u, 16, 8)
# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
	((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } })
# define VUINT16x8_ALIGNMENT 2
VEC_DEFINE_OPERATIONS(u, U, 16, 8)
#endif

/* vuint32x4: 4 elements of uint32_t. */
#ifndef VEC_VUINT32X4
# define VEC_VUINT32X4
VEC_DEFINE_STRUCT(u, 32, 4)
# define VUINT32x4_CONSTANT(a, b, c, d) \
	((vuint32x4){ .arr = { a, b, c, d } })
# define VUINT32x4_ALIGNMENT 4
VEC_DEFINE_OPERATIONS(u, U, 32, 4)
#endif

/* vuint64x2: 2 elements of uint64_t. */
#ifndef VEC_VUINT64X2
# define VEC_VUINT64X2
VEC_DEFINE_STRUCT(u, 64, 2)
# define VUINT64x2_CONSTANT(a, b) \
	((vuint64x2){ .arr = { a, b } })
# define VUINT64x2_ALIGNMENT 8
VEC_DEFINE_OPERATIONS(u, U, 64, 2)
#endif

/*
 * Signed 128-bit vector types.
 *
 * Each section is compiled only when its VEC_V* guard is not yet defined.
 * A section provides the struct, a *_CONSTANT compound-literal initializer
 * taking one argument per element, an *_ALIGNMENT value equal to the element
 * size in bytes, and the operations. `sign` and `csign` are left empty for
 * signed types.
 */

/*
 * vint8x16: 16 elements of int8_t.
 * The doubled vint8x32 type defined later in this file is built from
 * vint8x16, VINT8x16_CONSTANT and VINT8x16_ALIGNMENT, so the generic
 * fallback has to provide them like every other 128-bit type.
 */
#ifndef VEC_VINT8X16
# define VEC_VINT8X16
VEC_DEFINE_STRUCT(, 8, 16)
# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } })
# define VINT8x16_ALIGNMENT 1
VEC_DEFINE_OPERATIONS(, , 8, 16)
#endif

/* vint16x8: 8 elements of int16_t. */
#ifndef VEC_VINT16X8
# define VEC_VINT16X8
VEC_DEFINE_STRUCT(, 16, 8)
# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \
	((vint16x8){ .arr = { a, b, c, d, e, f, g, h } })
# define VINT16x8_ALIGNMENT 2
VEC_DEFINE_OPERATIONS(, , 16, 8)
#endif

/* vint32x4: 4 elements of int32_t. */
#ifndef VEC_VINT32X4
# define VEC_VINT32X4
VEC_DEFINE_STRUCT(, 32, 4)
# define VINT32x4_CONSTANT(a, b, c, d) \
	((vint32x4){ .arr = { a, b, c, d } })
# define VINT32x4_ALIGNMENT 4
VEC_DEFINE_OPERATIONS(, , 32, 4)
#endif

/* vint64x2: 2 elements of int64_t. */
#ifndef VEC_VINT64X2
# define VEC_VINT64X2
VEC_DEFINE_STRUCT(, 64, 2)
# define VINT64x2_CONSTANT(a, b) \
	((vint64x2){ .arr = { a, b } })
# define VINT64x2_ALIGNMENT 8
VEC_DEFINE_OPERATIONS(, , 64, 2)
#endif

#undef VEC_DEFINE_STRUCT
#undef VEC_DEFINE_OPERATIONS

// -----------------------------------------------------------------
// Now we can implement the "double" structures. Each one is built
// from two vectors of half its width: the 256-bit types wrap a pair
// of 128-bit vectors, and the 512-bit types wrap a pair of 256-bit
// vectors.

/*
 * VEC_DEFINE_STRUCT(sign, bits, size, halfsize)
 *
 * Declares the vector type v<sign>int<bits>x<size> as a struct whose only
 * member, `vecs`, is an array of two v<sign>int<bits>x<halfsize> vectors.
 * The expansion already ends in `;`, so invocations take none.
 */
#define VEC_DEFINE_STRUCT(sign, bits, size, halfsize) \
	typedef struct { v##sign##int##bits##x##halfsize vecs[2]; } v##sign##int##bits##x##size;

/*
 * VEC_DEFINE_OP(opcap, op, sign, bits, size, halfsize)
 *
 * Emits a two-operand operation on v<sign>int<bits>x<size> by applying the
 * half-width function v<sign>int<bits>x<halfsize>_<op> to each pair of
 * matching halves of `vec1` and `vec2`. The function header comes from
 * VEC_DECL_<opcap> (defined elsewhere), which supplies `vec1` and `vec2`.
 * Results are written into `vec1` (a by-value copy), which is returned.
 */
#define VEC_DEFINE_OP(opcap, op, sign, bits, size, halfsize) \
	VEC_DECL_##opcap(sign, bits, size) \
	{ \
		/* half 0 first, then half 1 */ \
		for (int half = 0; half < 2; half++) { \
			vec1.vecs[half] = v##sign##int##bits##x##halfsize##_##op(vec1.vecs[half], vec2.vecs[half]); \
		} \
		return vec1; \
	}

/*
 * VEC_DEFINE_OPERATIONS(sign, csign, bits, size, halfsize)
 *
 * Emits every operation on the doubled type v<sign>int<bits>x<size> by
 * delegating to the v<sign>int<bits>x<halfsize> functions, once per half:
 *
 *  - load_aligned / load:   half 0 is read from `in`, half 1 from
 *                           `in + halfsize`.
 *  - splat:                 both halves are the half-width splat of `x`.
 *  - store_aligned / store: half 0 is written to `out`, half 1 to
 *                           `out + halfsize`.
 *  - everything else:       generated through VEC_DEFINE_OP.
 *
 * Function headers and the parameter names `in`, `out`, `x` and `vec` come
 * from the VEC_DECL_* macros (defined elsewhere). `csign` is accepted but
 * not used by this macro.
 *
 * This is deliberately spelled out in full rather than compressed.
 */
#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size, halfsize) \
	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
	{ \
		v##sign##int##bits##x##size result; \
		result.vecs[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \
		result.vecs[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \
		return result; \
	} \
	\
	VEC_DECL_LOAD(sign, bits, size) \
	{ \
		v##sign##int##bits##x##size result; \
		result.vecs[0] = v##sign##int##bits##x##halfsize##_load(in); \
		result.vecs[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \
		return result; \
	} \
	\
	VEC_DECL_SPLAT(sign, bits, size) \
	{ \
		v##sign##int##bits##x##size result; \
		result.vecs[0] = v##sign##int##bits##x##halfsize##_splat(x); \
		result.vecs[1] = v##sign##int##bits##x##halfsize##_splat(x); \
		return result; \
	} \
	\
	VEC_DECL_STORE_ALIGNED(sign, bits, size) \
	{ \
		v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[0], out); \
		v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[1], out + halfsize); \
	} \
	\
	VEC_DECL_STORE(sign, bits, size) \
	{ \
		v##sign##int##bits##x##halfsize##_store(vec.vecs[0], out); \
		v##sign##int##bits##x##halfsize##_store(vec.vecs[1], out + halfsize); \
	} \
	\
	/* arithmetic */ \
	VEC_DEFINE_OP(ADD, add, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(SUB, sub, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(MUL, mul, sign, bits, size, halfsize) \
	/* bitwise */ \
	VEC_DEFINE_OP(AND, and, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(OR, or, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(XOR, xor, sign, bits, size, halfsize) \
	/* shifts */ \
	VEC_DEFINE_OP(LSHIFT, lshift, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(RSHIFT, rshift, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(LRSHIFT, lrshift, sign, bits, size, halfsize) \
	/* division and average */ \
	VEC_DEFINE_OP(DIV, div, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(AVG, avg, sign, bits, size, halfsize) \
	/* comparisons */ \
	VEC_DEFINE_OP(CMPLT, cmplt, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(CMPGT, cmpgt, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(CMPEQ, cmpeq, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(CMPGE, cmpge, sign, bits, size, halfsize) \
	VEC_DEFINE_OP(CMPLE, cmple, sign, bits, size, halfsize)

/*
 * Unsigned 256-bit vector types, each built from two 128-bit halves.
 *
 * Each section is compiled only when its VEC_V* guard is not yet defined.
 * The *_CONSTANT macro takes one argument per element and passes the first
 * half of them to the half-width *_CONSTANT for vecs[0] and the rest to the
 * one for vecs[1]. *_ALIGNMENT is inherited from the half-width type.
 */

/* vuint8x32: two vuint8x16 halves. */
#ifndef VEC_VUINT8X32
# define VEC_VUINT8X32
VEC_DEFINE_STRUCT(u, 8, 32, 16)
# define VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
	((vuint8x32){ .vecs = { VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 8, 32, 16)
#endif

/* vuint16x16: two vuint16x8 halves. */
#ifndef VEC_VUINT16X16
# define VEC_VUINT16X16
VEC_DEFINE_STRUCT(u, 16, 16, 8)
# define VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	((vuint16x16){ .vecs = { VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } })
# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 16, 16, 8)
#endif

/* vuint32x8: two vuint32x4 halves. */
#ifndef VEC_VUINT32X8
# define VEC_VUINT32X8
VEC_DEFINE_STRUCT(u, 32, 8, 4)
# define VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
	((vuint32x8){ .vecs = { VUINT32x4_CONSTANT(a, b, c, d), VUINT32x4_CONSTANT(e, f, g, h) } })
# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 32, 8, 4)
#endif

/* vuint64x4: two vuint64x2 halves. */
#ifndef VEC_VUINT64X4
# define VEC_VUINT64X4
VEC_DEFINE_STRUCT(u, 64, 4, 2)
# define VUINT64x4_CONSTANT(a, b, c, d) \
	((vuint64x4){ .vecs = { VUINT64x2_CONSTANT(a, b), VUINT64x2_CONSTANT(c, d) } })
# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 64, 4, 2)
#endif

/*
 * Signed 256-bit vector types with 8- and 16-bit elements, each built from
 * two 128-bit halves. Each section is compiled only when its VEC_V* guard
 * is not yet defined; *_ALIGNMENT is inherited from the half-width type.
 */

/* vint8x32: two vint8x16 halves (vint8x16 must already be defined). */
#ifndef VEC_VINT8X32
# define VEC_VINT8X32
VEC_DEFINE_STRUCT(, 8, 32, 16)
# define VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
	((vint8x32){ .vecs = { VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 8, 32, 16)
#endif

/* vint16x16: two vint16x8 halves. */
#ifndef VEC_VINT16X16
# define VEC_VINT16X16
VEC_DEFINE_STRUCT(, 16, 16, 8)
# define VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	((vint16x16){ .vecs = { VINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } })
# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 16, 16, 8)
#endif

/*
 * vint32x8: signed 256-bit vector built from two vint32x4 halves.
 * Compiled only when VEC_VINT32X8 is not yet defined.
 *
 * VINT32x8_CONSTANT takes eight element values and yields a vint32x8
 * compound literal (the first four go to vecs[0], the last four to
 * vecs[1]). The literal must be typed vint32x8 -- not vuint32x8 -- so that
 * it can be used wherever a vint32x8 is expected, including inside
 * VINT32x16_CONSTANT.
 */
#ifndef VEC_VINT32X8
# define VEC_VINT32X8
VEC_DEFINE_STRUCT(, 32, 8, 4)
# define VINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \
	((vint32x8){ .vecs = { VINT32x4_CONSTANT(a, b, c, d), VINT32x4_CONSTANT(e, f, g, h) } })
# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 32, 8, 4)
#endif

/*
 * vint64x4: signed 256-bit vector built from two vint64x2 halves.
 * Compiled only when VEC_VINT64X4 is not yet defined. The constant macro
 * sends (a, b) to vecs[0] and (c, d) to vecs[1]; the alignment is inherited
 * from vint64x2.
 */
#ifndef VEC_VINT64X4
# define VEC_VINT64X4
VEC_DEFINE_STRUCT(, 64, 4, 2)
# define VINT64x4_CONSTANT(a, b, c, d) \
	((vint64x4){ .vecs = { VINT64x2_CONSTANT(a, b), VINT64x2_CONSTANT(c, d) } })
# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 64, 4, 2)
#endif

/*
 * Unsigned 512-bit vector types, each built from two 256-bit halves.
 *
 * Each section is compiled only when its VEC_V* guard is not yet defined.
 * The *_CONSTANT macro takes one argument per element and passes the first
 * half of them to the 256-bit *_CONSTANT for vecs[0] and the rest to the
 * one for vecs[1]. *_ALIGNMENT is inherited from the 256-bit type.
 */

/* vuint8x64: two vuint8x32 halves. */
#ifndef VEC_VUINT8X64
# define VEC_VUINT8X64
VEC_DEFINE_STRUCT(u, 8, 64, 32)
# define VUINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
	((vuint8x64){ .vecs = { VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VUINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } })
# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 8, 64, 32)
#endif

/* vuint16x32: two vuint16x16 halves. */
#ifndef VEC_VUINT16X32
# define VEC_VUINT16X32
VEC_DEFINE_STRUCT(u, 16, 32, 16)
# define VUINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
	((vuint16x32){ .vecs = { VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 16, 32, 16)
#endif

/* vuint32x16: two vuint32x8 halves. */
#ifndef VEC_VUINT32X16
# define VEC_VUINT32X16
VEC_DEFINE_STRUCT(u, 32, 16, 8)
# define VUINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	((vuint32x16){ .vecs = { VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } })
# define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 32, 16, 8)
#endif

/* vuint64x8: two vuint64x4 halves. */
#ifndef VEC_VUINT64X8
# define VEC_VUINT64X8
VEC_DEFINE_STRUCT(u, 64, 8, 4)
# define VUINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
	((vuint64x8){ .vecs = { VUINT64x4_CONSTANT(a, b, c, d), VUINT64x4_CONSTANT(e, f, g, h) } })
# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT
VEC_DEFINE_OPERATIONS(u, U, 64, 8, 4)
#endif

/*
 * Signed 512-bit vector types, each built from two 256-bit halves.
 *
 * Each section is compiled only when its VEC_V* guard is not yet defined.
 * The *_CONSTANT macro takes one argument per element and passes the first
 * half of them to the 256-bit *_CONSTANT for vecs[0] and the rest to the
 * one for vecs[1]. *_ALIGNMENT is inherited from the 256-bit type.
 */

/* vint8x64: two vint8x32 halves. */
#ifndef VEC_VINT8X64
# define VEC_VINT8X64
VEC_DEFINE_STRUCT(, 8, 64, 32)
# define VINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \
	((vint8x64){ .vecs = { VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } })
# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 8, 64, 32)
#endif

/* vint16x32: two vint16x16 halves. */
#ifndef VEC_VINT16X32
# define VEC_VINT16X32
VEC_DEFINE_STRUCT(, 16, 32, 16)
# define VINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \
	((vint16x32){ .vecs = { VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } })
# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 16, 32, 16)
#endif

/* vint32x16: two vint32x8 halves. */
#ifndef VEC_VINT32X16
# define VEC_VINT32X16
VEC_DEFINE_STRUCT(, 32, 16, 8)
# define VINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	((vint32x16){ .vecs = { VINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } })
# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 32, 16, 8)
#endif

/* vint64x8: two vint64x4 halves. */
#ifndef VEC_VINT64X8
# define VEC_VINT64X8
VEC_DEFINE_STRUCT(, 64, 8, 4)
# define VINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \
	((vint64x8){ .vecs = { VINT64x4_CONSTANT(a, b, c, d), VINT64x4_CONSTANT(e, f, g, h) } })
# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
VEC_DEFINE_OPERATIONS(, , 64, 8, 4)
#endif

#undef VEC_DEFINE_STRUCT
#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OP
