/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_VEC_H_
#define VEC_VEC_H_

#include <stdint.h>
#include <limits.h>

/*
 * Evaluates to nonzero when version a.b.c is at least version x.y.z.
 * The comparison is lexicographic: major first, then minor, then patch.
 * Every argument is parenthesized, so arbitrary constant expressions may be
 * passed. Arguments are evaluated more than once.
 */
#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
	(((a) > (x)) || \
	 (((a) == (x)) && \
	  (((b) > (y)) || \
	   (((b) == (y)) && ((c) >= (z))))))

/*
 * Nonzero when the compiler reports a GNU C version of at least x.y.z
 * through __GNUC__, __GNUC_MINOR__ and __GNUC_PATCHLEVEL__. Intended for
 * #if, where any of those identifiers that is not defined evaluates to 0.
 */
#define VEC_GNUC_ATLEAST(x, y, z) \
	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)

/*
 * GCC/clang attributes: when the compiler provides __has_attribute, ask it
 * directly which of the attributes used by this library are available.
 * Anything left undefined here is handled by the version-based fallbacks.
 */
#ifdef __has_attribute
# if __has_attribute(__vector_size__)
#  define VEC_HAVE_GNUC_VECTORS
# endif
# if __has_attribute(__aligned__)
#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
# endif
# if __has_attribute(__always_inline__)
#  define VEC_ALWAYS_INLINE __attribute__((__always_inline__))
# endif
#endif

/*
 * Version-based fallback: if the attribute probe did not already enable
 * them, treat GNU C 4.0.0 and later as having vector extensions.
 */
#if !defined(VEC_HAVE_GNUC_VECTORS) && VEC_GNUC_ATLEAST(4, 0, 0)
# define VEC_HAVE_GNUC_VECTORS
#endif

/*
 * Version-based fallback: if VEC_ALIGNED is still undefined, use the
 * `aligned` attribute on GNU C 2.7.0 and later. On other compilers
 * VEC_ALIGNED stays undefined.
 */
#if !defined(VEC_ALIGNED) && VEC_GNUC_ATLEAST(2, 7, 0)
# define VEC_ALIGNED(x) __attribute__((aligned(x)))
#endif

/*
 * Version-based fallback for VEC_ALWAYS_INLINE. It must be an object-like
 * macro: it is written without an argument list in every declaration
 * (`static inline VEC_ALWAYS_INLINE ...`), and a function-like macro is not
 * expanded unless its name is followed by parentheses. When no attribute is
 * available it expands to nothing.
 */
#ifndef VEC_ALWAYS_INLINE
# if VEC_GNUC_ATLEAST(3, 1, 0)
#  define VEC_ALWAYS_INLINE __attribute__((always_inline))
# else
#  define VEC_ALWAYS_INLINE
# endif
#endif

/*
 * VEC_ALIGNED_ARRAY(type, var, length, align)
 *   Declares `var` as storage for `length` elements of `type`, aligned to
 *   `align` bytes.
 * VEC_ALIGNED_ARRAY_SIZEOF(var, align)
 *   Size in bytes of the usable storage behind `var`.
 * VEC_ALIGNED_ARRAY_LENGTH(var, align)
 *   Number of elements behind `var`.
 *
 * With VEC_ALIGNED available, `var` is a real array carrying the alignment
 * attribute. Otherwise `var` is a pointer into an over-sized byte buffer, so
 * plain sizeof(var) would give the pointer size; always use the SIZEOF and
 * LENGTH macros. The fallback expands to two declarations, so it cannot be
 * the sole body of an unbraced if/for.
 */
#ifdef VEC_ALIGNED
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_ALIGNED(align) type var[length]
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(var))
#else
/* allocate (align - 1) extra bytes, then round the start address up to a
 * multiple of align; the arithmetic is done in uintptr_t so that addresses
 * with the top bit set round correctly and cannot overflow a signed type */
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	unsigned char vec_##var##_unaligned_[((length) * sizeof(type)) + (align) - 1]; \
	type *var = (type *)((((uintptr_t)vec_##var##_unaligned_ + (align) - 1) / (align)) * (align))
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(vec_##var##_unaligned_) - ((align) - 1))
#endif

#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*(var)))

/* --------------------------------------------------------------- */
/* bit shift */

/*
 * Logical right shift of an unsigned value: returns x shifted right by y
 * bits. y must be smaller than the bit width of uintmax_t.
 */
static inline VEC_ALWAYS_INLINE uintmax_t vec_ulrshift(uintmax_t x, unsigned int y)
{
	const uintmax_t shifted = x >> y;

	return shifted;
}

/*
 * Logical left shift of an unsigned value: returns x shifted left by y
 * bits. y must be smaller than the bit width of uintmax_t.
 */
static inline VEC_ALWAYS_INLINE uintmax_t vec_ullshift(uintmax_t x, unsigned int y)
{
	const uintmax_t shifted = x << y;

	return shifted;
}

/*
 * Logical right shift of a signed value: the bit pattern of x is shifted
 * right by y as an unsigned quantity, then converted back to intmax_t.
 * y must be smaller than the bit width of uintmax_t.
 */
static inline VEC_ALWAYS_INLINE intmax_t vec_lrshift(intmax_t x, unsigned int y)
{
	uintmax_t pattern = (uintmax_t)x;

	pattern >>= y;
	return (intmax_t)pattern;
}

/*
 * Logical left shift of a signed value: the bit pattern of x is shifted
 * left by y as an unsigned quantity, then converted back to intmax_t.
 * y must be smaller than the bit width of uintmax_t.
 */
static inline VEC_ALWAYS_INLINE intmax_t vec_llshift(intmax_t x, unsigned int y)
{
	uintmax_t pattern = (uintmax_t)x;

	pattern <<= y;
	return (intmax_t)pattern;
}

/*
 * Right shift of an unsigned value: returns x shifted right by y bits.
 * y must be smaller than the bit width of uintmax_t.
 */
static inline VEC_ALWAYS_INLINE uintmax_t vec_urshift(uintmax_t x, unsigned int y)
{
	x >>= y;
	return x;
}

/*
 * Left shift of an unsigned value: returns x shifted left by y bits.
 * y must be smaller than the bit width of uintmax_t.
 */
static inline VEC_ALWAYS_INLINE uintmax_t vec_ulshift(uintmax_t x, unsigned int y)
{
	x <<= y;
	return x;
}

/**
 * Arithmetic shifts; based off code from OpenMPT, which is under
 * the Boost Software License:
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 * 
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
**/
/*
 * Arithmetic right shift of a signed value, done entirely in unsigned
 * arithmetic: x is biased by the sign-bit weight, shifted, and the shifted
 * bias is subtracted again. y must be smaller than the bit width of
 * uintmax_t.
 */
static inline VEC_ALWAYS_INLINE intmax_t vec_rshift(intmax_t x, unsigned int y)
{
	/* value of the most significant bit of intmax_t */
	const uintmax_t bias = UINTMAX_C(1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);
	const uintmax_t biased = (uintmax_t)x + bias;

	return (intmax_t)((biased >> y) - (bias >> y));
}

/*
 * Left shift of a signed value, done entirely in unsigned arithmetic:
 * x is biased by the sign-bit weight, shifted, and the shifted bias is
 * subtracted again. y must be smaller than the bit width of uintmax_t.
 */
static inline VEC_ALWAYS_INLINE intmax_t vec_lshift(intmax_t x, unsigned int y)
{
	/* value of the most significant bit of intmax_t */
	const uintmax_t bias = UINTMAX_C(1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);
	const uintmax_t biased = (uintmax_t)x + bias;

	return (intmax_t)((biased << y) - (bias << y));
}

/* --------------------------------------------------------------- */
/* Implementation includes */

/*
 * Emits the declaration head of one vector operation: a static,
 * always-inline function named v<sign>int<bits>x<size>_<op> that returns
 * `ret` and takes `params` (a parenthesized parameter list). `sign` is
 * either empty (signed) or `u` (unsigned). Follow the expansion with a
 * function body, or with `;` for a forward declaration.
 */
#define VEC_OPERATION_DECL(sign, bits, size, ret, op, params) \
	static inline VEC_ALWAYS_INLINE ret v##sign##int##bits##x##size##_##op params

/* Same as VEC_OPERATION_DECL, but the return type is the vector type itself. */
#define VEC_OPERATION_THIS_DECL(sign, bits, size, op, params) \
	VEC_OPERATION_DECL(sign, bits, size, v##sign##int##bits##x##size, op, params)

/* Binary operation: takes two vectors of the same type, named vec1 and
 * vec2, and returns that type. */
#define VEC_TWOWAY_DECL(sign, bits, size, op) \
	VEC_OPERATION_THIS_DECL(sign, bits, size, op, (v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2))

/*
 * Per-operation declaration heads for v<sign>int<bits>x<size>_<op>.
 * The parameter names introduced here (x, in, out, vec, vec1, vec2) are the
 * names the function bodies must use.
 */

/* splat takes one scalar `x`; load and load_aligned take a `size`-element
 * input array `in`. All three return the vector type. */
#define VEC_DECL_SPLAT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, splat, (sign##int##bits##_t x))
#define VEC_DECL_LOAD(sign, bits, size)  VEC_OPERATION_THIS_DECL(sign, bits, size, load, (const sign##int##bits##_t in[size]))
#define VEC_DECL_LOAD_ALIGNED(sign, bits, size)  VEC_OPERATION_THIS_DECL(sign, bits, size, load_aligned, (const sign##int##bits##_t in[size]))
/* store and store_aligned return void; they take the vector `vec` and a
 * `size`-element output array `out`. */
#define VEC_DECL_STORE(sign, bits, size) VEC_OPERATION_DECL(sign, bits, size, void, store, (v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]))
#define VEC_DECL_STORE_ALIGNED(sign, bits, size) VEC_OPERATION_DECL(sign, bits, size, void, store_aligned, (v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]))
/* binary operations on (vec1, vec2) */
#define VEC_DECL_ADD(sign, bits, size)   VEC_TWOWAY_DECL(sign, bits, size, add)
#define VEC_DECL_SUB(sign, bits, size)   VEC_TWOWAY_DECL(sign, bits, size, sub)
#define VEC_DECL_MUL(sign, bits, size)   VEC_TWOWAY_DECL(sign, bits, size, mul)
#define VEC_DECL_DIV(sign, bits, size)   VEC_TWOWAY_DECL(sign, bits, size, div)
#define VEC_DECL_AND(sign, bits, size)   VEC_TWOWAY_DECL(sign, bits, size, and)
#define VEC_DECL_OR(sign, bits, size)    VEC_TWOWAY_DECL(sign, bits, size, or)
#define VEC_DECL_XOR(sign, bits, size)   VEC_TWOWAY_DECL(sign, bits, size, xor)
#define VEC_DECL_AVG(sign, bits, size)   VEC_TWOWAY_DECL(sign, bits, size, avg)
/* shifts: the operation name is <vectype><way>shift; vec1 is the value
 * vector and vec2, always the unsigned vector of the same shape, holds the
 * per-lane shift counts */
#define VEC_DECL_SHIFT(sign, bits, size, vectype, way) VEC_OPERATION_THIS_DECL(sign, bits, size, vectype##way##shift, (v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2))
/* unary operation on `vec` */
#define VEC_DECL_NOT(sign, bits, size) VEC_OPERATION_THIS_DECL(sign, bits, size, not, (v##sign##int##bits##x##size vec))

/* comparisons: binary operations on (vec1, vec2) that return a vector of
 * the same type as their operands */
#define VEC_DECL_CMPLT(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmplt)
#define VEC_DECL_CMPGT(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpgt)
#define VEC_DECL_CMPEQ(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpeq)
#define VEC_DECL_CMPLE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmple)
#define VEC_DECL_CMPGE(sign, bits, size) VEC_TWOWAY_DECL(sign, bits, size, cmpge)

/* Generic variations. */
/*
 * Generic splat: fills a temporary `size`-element array with the scalar `x`
 * and returns the vector produced by the type's own _load.
 */
#define VEC_GENERIC_SPLAT(sign, bits, size) \
	VEC_DECL_SPLAT(sign, bits, size) \
	{ \
		sign##int##bits##_t lanes[size]; \
		int lane; \
	\
		for (lane = 0; lane < size; lane++) { \
			lanes[lane] = x; \
		} \
	\
		return v##sign##int##bits##x##size##_load(lanes); \
	}

/*
 * Generic lane-wise division: both vectors are stored to temporary arrays,
 * each lane of vec1 is divided by the matching lane of vec2, and the result
 * is loaded back into a vector.
 *
 * Two lane values never reach the `/` operator, because they are undefined
 * behaviour in C and trap on common hardware:
 *   - a zero divisor yields 0 for that lane;
 *   - for signed lanes, a divisor of -1 yields the wrapped negation of the
 *     dividend, so (minimum value) / -1 gives the minimum value again
 *     instead of overflowing.
 */
#define VEC_GENERIC_DIVIDE(sign, bits, size) \
	VEC_DECL_DIV(sign, bits, size) \
	{ \
		sign##int##bits##_t vec1a[size], vec2a[size]; \
	\
		v##sign##int##bits##x##size##_store(vec1, vec1a); \
		v##sign##int##bits##x##size##_store(vec2, vec2a); \
	\
		for (int i = 0; i < size; i++) { \
			if (!vec2a[i]) { \
				vec1a[i] = 0; \
			} else if (((sign##int##bits##_t)-1 < (sign##int##bits##_t)1) /* lane type is signed */ \
			           && vec2a[i] == (sign##int##bits##_t)-1) { \
				/* negate in unsigned arithmetic so the minimum value wraps */ \
				vec1a[i] = (sign##int##bits##_t)((uint##bits##_t)0 - (uint##bits##_t)vec1a[i]); \
			} else { \
				vec1a[i] = (sign##int##bits##_t)(vec1a[i] / vec2a[i]); \
			} \
		} \
	\
		return v##sign##int##bits##x##size##_load(vec1a); \
	}

/*
 * Generic lane-wise shift. vec1 is stored into an array of the lane type
 * and vec2 (always an unsigned vector) into an array of the unsigned lane
 * type, so the buffer matches what vuint<bits>x<size>_store writes. Each
 * lane is then shifted by the scalar helper
 * vec_<sign><vectype><way>shift and the result loaded back.
 *
 * NOTE(review): for signed lanes narrower than intmax_t, the logical right
 * shift goes through vec_lrshift on the sign-extended value, so the bits
 * shifted into the lane come from the sign extension rather than being
 * zero. Confirm whether lane-width logical semantics are intended.
 */
#define VEC_GENERIC_SHIFT(sign, bits, size, vectype, way) \
	VEC_DECL_SHIFT(sign, bits, size, vectype, way) \
	{ \
		sign##int##bits##_t vec1a[size]; \
		uint##bits##_t vec2a[size]; /* shift counts are unsigned lanes */ \
	\
		v##sign##int##bits##x##size##_store(vec1, vec1a); \
		vuint##bits##x##size##_store(vec2, vec2a); \
	\
		for (int i = 0; i < size; i++) { \
			vec1a[i] = (sign##int##bits##_t)vec_##sign##vectype##way##shift(vec1a[i], vec2a[i]); \
		} \
	\
		return v##sign##int##bits##x##size##_load(vec1a); \
	}

/* Emits the three generic shift operations for one vector type. */
#define VEC_GENERIC_SHIFTS(sign, bits, size) \
	VEC_GENERIC_SHIFT(sign, bits, size,  , l) /* left shift */ \
	VEC_GENERIC_SHIFT(sign, bits, size,  , r) /* arithmetic right shift */ \
	VEC_GENERIC_SHIFT(sign, bits, size, l, r) /* logical right shift */

/*
 * Generic lane-wise average: (vec1 + vec2) / 2, composed from the vector
 * type's own _add, _div and _splat operations.
 *
 * NOTE(review): the sum is formed by _add before it is halved, so lanes
 * whose sum does not fit the lane type presumably do not produce the true
 * mean. Confirm against the _add implementations.
 */
#define VEC_GENERIC_AVG(sign, bits, size) \
	VEC_DECL_AVG(sign, bits, size) \
	{ \
		return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \
	}

/*
 * Generic <= and >= comparisons, derived from the strict ones:
 * cmple is the bitwise NOT of cmpgt, and cmpge is the bitwise NOT of cmplt.
 * _not is forward-declared first because it is only defined later.
 */
#define VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size) \
	VEC_DECL_NOT(sign, bits, size); \
	\
	VEC_DECL_CMPLE(sign, bits, size) \
	{ \
		v##sign##int##bits##x##size greater = v##sign##int##bits##x##size##_cmpgt(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_not(greater); \
	} \
	VEC_DECL_CMPGE(sign, bits, size) \
	{ \
		v##sign##int##bits##x##size less = v##sign##int##bits##x##size##_cmplt(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_not(less); \
	}

/*
 * Generic lane-wise comparison cmp<name>. Both vectors are stored to
 * temporary arrays; each result lane is set to UINT<bits>_MAX where
 * `lhs op rhs` holds and to 0 where it does not. The result is then loaded
 * back into a vector.
 */
#define VEC_GENERIC_COMPARISON(sign, bits, size, name, op) \
	VEC_DECL_CMP##name(sign, bits, size) \
	{ \
		sign##int##bits##_t lhs[size], rhs[size]; \
		int lane; \
	\
		v##sign##int##bits##x##size##_store(vec1, lhs); \
		v##sign##int##bits##x##size##_store(vec2, rhs); \
	\
		for (lane = 0; lane < size; lane++) { \
			if (lhs[lane] op rhs[lane]) { \
				lhs[lane] = UINT##bits##_MAX; \
			} else { \
				lhs[lane] = 0; \
			} \
		} \
	\
		return v##sign##int##bits##x##size##_load(lhs); \
	}

/*
 * Emits all five generic comparisons for one vector type. The strict ones
 * come first because cmple and cmpge are written in terms of cmpgt and cmplt.
 */
#define VEC_GENERIC_COMPARISONS(sign, bits, size) \
	VEC_GENERIC_COMPARISON(sign, bits, size, LT, <) \
	VEC_GENERIC_COMPARISON(sign, bits, size, GT, >) \
	VEC_GENERIC_COMPARISON(sign, bits, size, EQ, ==) \
	VEC_GENERIC_THAN_OR_EQUAL(sign, bits, size)

#ifndef VEC_SUPPRESS_HW
/* POWER altivec */
# ifdef __ALTIVEC__
#  include "impl/altivec.h"
# endif
/* x86 SSE2 */
# ifdef __SSE2__
#  include "impl/sse2.h"
# endif
#endif

#ifndef VEC_SUPPRESS_GCC
# ifdef VEC_HAVE_GNUC_VECTORS
#  include "impl/gcc.h"
# endif
#endif

#include "impl/generic.h"

/* ----------------------------------------------------------------- */
/* bitwise NOT is just an XOR with UINT[BITS]_MAX */

/*
 * Defines v<sign>int<bits>x<size>_not: the operand is XORed with a vector
 * whose every lane is UINT<bits>_MAX, built with the type's own _splat.
 */
#define DEFINE_NOT_OPERATION(sign, bits, size) \
	VEC_DECL_NOT(sign, bits, size) \
	{ \
		const v##sign##int##bits##x##size all_ones = v##sign##int##bits##x##size##_splat(UINT##bits##_MAX); \
	\
		return v##sign##int##bits##x##size##_xor(vec, all_ones); \
	}

/* signed vector types */
DEFINE_NOT_OPERATION(, 8, 16)
DEFINE_NOT_OPERATION(, 16, 8)
DEFINE_NOT_OPERATION(, 32, 4)
DEFINE_NOT_OPERATION(, 64, 2)
/* unsigned vector types */
DEFINE_NOT_OPERATION(u, 8, 16)
DEFINE_NOT_OPERATION(u, 16, 8)
DEFINE_NOT_OPERATION(u, 32, 4)
DEFINE_NOT_OPERATION(u, 64, 2)

#undef DEFINE_NOT_OPERATION

/* ---------------------------------------------------------------- */

/* cleanup: remove every internal helper macro so that none of them leaks
 * into code that includes this header */
#undef VEC_OPERATION_DECL
#undef VEC_OPERATION_THIS_DECL
#undef VEC_TWOWAY_DECL

#undef VEC_DECL_SPLAT
#undef VEC_DECL_LOAD
#undef VEC_DECL_LOAD_ALIGNED
#undef VEC_DECL_STORE
#undef VEC_DECL_STORE_ALIGNED
#undef VEC_DECL_ADD
#undef VEC_DECL_SUB
#undef VEC_DECL_MUL
#undef VEC_DECL_DIV
#undef VEC_DECL_AND
#undef VEC_DECL_OR
#undef VEC_DECL_XOR
#undef VEC_DECL_AVG
#undef VEC_DECL_SHIFT
#undef VEC_DECL_NOT

#undef VEC_DECL_CMPLT
#undef VEC_DECL_CMPGT
#undef VEC_DECL_CMPEQ
#undef VEC_DECL_CMPLE
#undef VEC_DECL_CMPGE

#undef VEC_GENERIC_SPLAT
#undef VEC_GENERIC_DIVIDE
#undef VEC_GENERIC_SHIFT
#undef VEC_GENERIC_SHIFTS
#undef VEC_GENERIC_AVG
#undef VEC_GENERIC_THAN_OR_EQUAL
#undef VEC_GENERIC_COMPARISON
#undef VEC_GENERIC_COMPARISONS

#undef VEC_VINT8X16
#undef VEC_VINT16X8
#undef VEC_VINT32X4
#undef VEC_VINT64X2
#undef VEC_VUINT8X16
#undef VEC_VUINT16X8
#undef VEC_VUINT32X4
#undef VEC_VUINT64X2

/* ---------------------------------------------------------------- */
/* user-facing alignment helpers */

/*
 * Default alignment requirements. Any V[U]INT<bits>x<size>_ALIGNMENT that
 * has not been defined by this point falls back to 1, i.e. no alignment
 * constraint.
 */

/* signed vector types */
#if !defined(VINT8x16_ALIGNMENT)
# define VINT8x16_ALIGNMENT 1
#endif

#if !defined(VINT16x8_ALIGNMENT)
# define VINT16x8_ALIGNMENT 1
#endif

#if !defined(VINT32x4_ALIGNMENT)
# define VINT32x4_ALIGNMENT 1
#endif

#if !defined(VINT64x2_ALIGNMENT)
# define VINT64x2_ALIGNMENT 1
#endif

/* unsigned vector types */
#if !defined(VUINT8x16_ALIGNMENT)
# define VUINT8x16_ALIGNMENT 1
#endif

#if !defined(VUINT16x8_ALIGNMENT)
# define VUINT16x8_ALIGNMENT 1
#endif

#if !defined(VUINT32x4_ALIGNMENT)
# define VUINT32x4_ALIGNMENT 1
#endif

#if !defined(VUINT64x2_ALIGNMENT)
# define VUINT64x2_ALIGNMENT 1
#endif

/*
 * pointer alignment macros
 *
 * For every vector type <T> there are four helpers:
 *   <T>_ALIGNED_ARRAY(var)         declare `var` as aligned storage for one vector
 *   <T>_ALIGNED_ARRAY_SIZEOF(var)  size in bytes of that storage
 *   <T>_ALIGNED_ARRAY_LENGTH(var)  number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)           nonzero when `ptr` is a multiple of <T>_ALIGNMENT
 *
 * `ptr` is parenthesized before the cast, so pointer expressions such as
 * `p + 1` are evaluated as pointers first.
 */

#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT8x16_ALIGNMENT) == 0)

#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT16x8_ALIGNMENT) == 0)

#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT32x4_ALIGNMENT) == 0)

#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT64x2_ALIGNMENT) == 0)

#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT8x16_ALIGNMENT) == 0)

#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT16x8_ALIGNMENT) == 0)

#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT32x4_ALIGNMENT) == 0)

#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT64x2_ALIGNMENT) == 0)

#endif /* VEC_VEC_H_ */
