/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_VEC_H_
#define VEC_VEC_H_

#include <stdint.h>
#include <string.h>
#include <limits.h>

/* Evaluates to non-zero when version a.b.c is at least version x.y.z.
 * The comparison is lexicographic: major first, then minor, then patch.
 * Every argument is parenthesized so arbitrary expressions can be passed. */
#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
	(((a) >= (x)) && \
	 ((a) > (x) || (b) >= (y)) && \
	 ((a) > (x) || (b) > (y) || (c) >= (z)))

/* Non-zero when the compiler reports a GNU C version of at least x.y.z.
 * Meant for use in #if, where undefined identifiers evaluate to 0. */
#define VEC_GNUC_ATLEAST(x, y, z) \
	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)

/* GCC/clang attributes
 *
 * When the compiler provides __has_attribute, each attribute is probed
 * individually and the matching VEC_* macro is defined only if the
 * attribute is reported as available. */
#if defined(__has_attribute)
# if __has_attribute(__always_inline__)
#  define VEC_ALWAYS_INLINE __attribute__((__always_inline__))
# endif
# if __has_attribute(__aligned__)
#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
# endif
# if __has_attribute(__vector_size__)
#  define VEC_HAVE_GNUC_VECTORS
# endif
#endif

/* Fallback for compilers without __has_attribute: assume the aligned
 * attribute exists when the reported GNU C version is at least 2.7.0. */
#ifndef VEC_ALIGNED
# if VEC_GNUC_ATLEAST(2, 7, 0)
#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
# endif
#endif

/* FIXME: gcc 4.2 on Mac OS X doesn't have always_inline,
 * even though docs and many online sources say that it
 * should have it. */

/* No version-based fallback is attempted for always_inline (see the
 * FIXME above); when it was not detected the macro expands to nothing. */
#ifndef VEC_ALWAYS_INLINE
# define VEC_ALWAYS_INLINE
#endif

/* Allow users to define all of the symbols externally in
 * one translation unit, or as a shared library.
 *
 * VEC_FUNC_KEYWORDS is the storage-class prefix placed in front of every
 * function defined in this header:
 *   - VEC_EXTERN and VEC_EXTERN_DEFINE both defined: `extern inline`
 *   - only VEC_EXTERN defined:                       `inline`
 *   - neither (default):                             `static inline`
 *     plus VEC_ALWAYS_INLINE */
#ifdef VEC_EXTERN
# ifdef VEC_EXTERN_DEFINE
#  define VEC_FUNC_KEYWORDS extern inline
# else
#  define VEC_FUNC_KEYWORDS inline
# endif
#else
# define VEC_FUNC_KEYWORDS static inline VEC_ALWAYS_INLINE
#endif

/* VEC_STATIC_ASSERT(x, msg): compile-time assertion that `x` is non-zero.
 * C11 and later use the native _Static_assert. Older dialects fall back to
 * declaring a function whose return type mentions a struct with a bit-field
 * of width -1 when `x` is false, which the compiler must reject. `msg` is
 * only reported by the C11 form. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
#else
/* C99 static assertion; the bit-field name avoids the reserved `__` prefix */
# define VEC_STATIC_ASSERT(x, msg) \
	extern int (*vec_impl_Static_assert_function_(void)) \
		[!!sizeof (struct { int vec_impl_error_if_negative_: (x) ? 2 : -1; })]
#endif

/* VEC_ASSERT(x, msg): runtime assertion used by the library.
 * Users may pre-define VEC_ASSERT to supply their own handler. Otherwise it
 * maps onto assert(), with `msg` folded into the expression so it shows up
 * in the failure output. Defining VEC_DISABLE_ASSERTIONS makes it expand to
 * nothing. Both arguments are parenthesized so operator precedence inside
 * `x` cannot change the asserted condition. */
#ifndef VEC_ASSERT
# ifndef VEC_DISABLE_ASSERTIONS
#  include <assert.h>
#  define VEC_ASSERT(x, msg) assert((msg) && (x))
# else
#  define VEC_ASSERT(x, msg)
# endif
#endif

/* --------------------------------------------------------------- */
/* Detect compiler SIMD support */

// IIRC `__VEC__' is also defined, but I don't know for sure.
// IBM says that `__ALTIVEC__' is standard though.
//
// Both the signed and the unsigned 128-bit types are given the 16-byte
// alignment that the aligned load/store instructions need.
// NOTE(review): VEC_COMPILER_HAS_ALTIVEC_VSX is tested further down in this
// header but is not defined in this section; confirm where it should be set.
#ifdef __ALTIVEC__
# include <altivec.h>
# define VEC_COMPILER_HAS_ALTIVEC

# define VINT8x16_ALIGNMENT 16
# define VINT16x8_ALIGNMENT 16
# define VINT32x4_ALIGNMENT 16
# define VINT64x2_ALIGNMENT 16
# define VUINT8x16_ALIGNMENT 16
# define VUINT16x8_ALIGNMENT 16
# define VUINT32x4_ALIGNMENT 16
# define VUINT64x2_ALIGNMENT 16
#endif

#ifdef __SSE2__
# include <immintrin.h>
# define VEC_COMPILER_HAS_SSE2
// GCC and clang spell the SSE4.2 feature macro `__SSE4_2__'
# ifdef __SSE4_2__
#  define VEC_COMPILER_HAS_SSE42
# endif

# define VINT8x16_ALIGNMENT 16
# define VINT16x8_ALIGNMENT 16
# define VINT32x4_ALIGNMENT 16
# define VINT64x2_ALIGNMENT 16
# define VUINT8x16_ALIGNMENT 16
# define VUINT16x8_ALIGNMENT 16
# define VUINT32x4_ALIGNMENT 16
# define VUINT64x2_ALIGNMENT 16
#endif

// without a SIMD backend there is no alignment requirement
#ifndef VINT8x16_ALIGNMENT
# define VINT8x16_ALIGNMENT 1
#endif
#ifndef VINT16x8_ALIGNMENT
# define VINT16x8_ALIGNMENT 1
#endif
#ifndef VINT32x4_ALIGNMENT
# define VINT32x4_ALIGNMENT 1
#endif
#ifndef VINT64x2_ALIGNMENT
# define VINT64x2_ALIGNMENT 1
#endif
#ifndef VUINT8x16_ALIGNMENT
# define VUINT8x16_ALIGNMENT 1
#endif
#ifndef VUINT16x8_ALIGNMENT
# define VUINT16x8_ALIGNMENT 1
#endif
#ifndef VUINT32x4_ALIGNMENT
# define VUINT32x4_ALIGNMENT 1
#endif
#ifndef VUINT64x2_ALIGNMENT
# define VUINT64x2_ALIGNMENT 1
#endif

// 256-bit vectors are built from two 128-bit halves, so unless a value was
// already provided they inherit the alignment of the matching 128-bit type
#if !defined(VINT8x32_ALIGNMENT)
# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT
#endif
#if !defined(VINT16x16_ALIGNMENT)
# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT
#endif
#if !defined(VINT32x8_ALIGNMENT)
# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT
#endif
#if !defined(VINT64x4_ALIGNMENT)
# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT
#endif
#if !defined(VUINT8x32_ALIGNMENT)
# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT
#endif
#if !defined(VUINT16x16_ALIGNMENT)
# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
#endif
#if !defined(VUINT32x8_ALIGNMENT)
# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT
#endif
#if !defined(VUINT64x4_ALIGNMENT)
# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT
#endif

// generic 512-bit is just doubled 256-bit: each 512-bit type defaults to
// the alignment of the 256-bit type with the same element width
#ifndef VINT8x64_ALIGNMENT
# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
#endif
#ifndef VINT16x32_ALIGNMENT
# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
#endif
#ifndef VINT32x16_ALIGNMENT
# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
#endif
#ifndef VINT64x8_ALIGNMENT
# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
#endif
#ifndef VUINT8x64_ALIGNMENT
# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
#endif
#ifndef VUINT16x32_ALIGNMENT
# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
#endif
// must refer to the 256-bit macro; referring to itself would leave the
// name unexpanded wherever it is used
#ifndef VUINT32x16_ALIGNMENT
# define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
#endif
#ifndef VUINT64x8_ALIGNMENT
# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT
#endif

/* --------------------------------------------------------------- */
/* Detect CPU SIMD support */

// stubs for now... will be implemented sometime
//
// Every runtime CPU feature query currently evaluates to the constant 0,
// so any `if (VEC_CPU_have_*())` branch in this header is never taken and
// the plain C path is the one that actually executes.
#define VEC_CPU_have_SSE2() (0)
#define VEC_CPU_have_SSE42() (0)
#define VEC_CPU_have_ALTIVEC() (0)
#define VEC_CPU_have_ALTIVEC_VSX() (0)

/* --------------------------------------------------------------- */
/* bit shift */

/* Logical right shift of an unsigned value: returns x shifted right by
 * y bits with zeros shifted in from the top. */
VEC_FUNC_KEYWORDS uintmax_t vec_ulrshift(uintmax_t x, unsigned int y)
{
	x >>= y;
	return x;
}

/* Logical left shift of an unsigned value: returns x shifted left by
 * y bits; bits moved past the top of uintmax_t are discarded. */
VEC_FUNC_KEYWORDS uintmax_t vec_ullshift(uintmax_t x, unsigned int y)
{
	x <<= y;
	return x;
}

/* Logical right shift of a signed value.
 *
 * The bits of x are reinterpreted as unsigned through a union, shifted
 * right by y with zero fill, and reinterpreted back as signed. The shift
 * result is stored back into the union; previously it was computed and
 * thrown away, so x was returned unchanged. */
VEC_FUNC_KEYWORDS intmax_t vec_lrshift(intmax_t x, unsigned int y)
{
	// reinterpret as unsigned integer and then shift
	union {
		intmax_t d;
		uintmax_t u;
	} xx;

	xx.d = x;
	xx.u >>= y;
	return xx.d;
}

/* Logical left shift of a signed value.
 *
 * The bits of x are reinterpreted as unsigned through a union, shifted
 * left by y, and reinterpreted back as signed, which avoids left-shifting
 * a negative signed value directly. The shift result is stored back into
 * the union; previously it was computed and thrown away, so x was returned
 * unchanged. */
VEC_FUNC_KEYWORDS intmax_t vec_llshift(intmax_t x, unsigned int y)
{
	// reinterpret as unsigned integer and then shift
	union {
		intmax_t d;
		uintmax_t u;
	} xx;

	xx.d = x;
	xx.u <<= y;
	return xx.d;
}

/* Right shift of an unsigned value by y bits. For unsigned operands this
 * performs the same operation as vec_ulrshift. */
VEC_FUNC_KEYWORDS uintmax_t vec_urshift(uintmax_t x, unsigned int y)
{
	x >>= y;
	return x;
}

/* Left shift of an unsigned value by y bits. For unsigned operands this
 * performs the same operation as vec_ullshift. */
VEC_FUNC_KEYWORDS uintmax_t vec_ulshift(uintmax_t x, unsigned int y)
{
	x <<= y;
	return x;
}

/**
 * Arithmetic shifts; based off code from OpenMPT, which is under
 * the Boost Software License:
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 * 
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
**/
/* Arithmetic (sign-propagating) right shift of x by y bits, computed with
 * unsigned arithmetic only so it does not rely on how the compiler shifts
 * negative signed values. */
VEC_FUNC_KEYWORDS intmax_t vec_rshift(intmax_t x, unsigned int y)
{
	/* only the top bit set, i.e. 2^(N-1) for an N-bit intmax_t */
	static const uintmax_t sign_bias = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);

	union {
		intmax_t d;
		uintmax_t u;
	} pun;

	pun.d = x;

	/* Adding the bias flips the top bit, which maps the signed range onto
	 * the unsigned range while keeping the ordering. A logical shift of
	 * that value followed by subtracting the equally shifted bias maps the
	 * result back, leaving the top bits filled with copies of the original
	 * sign bit. All steps wrap modulo 2^N. */
	pun.u = ((pun.u + sign_bias) >> y) - (sign_bias >> y);

	return pun.d;
}

/* Left shift of a signed value by y bits, carried out entirely in unsigned
 * arithmetic with the same bias/unbias scheme as vec_rshift. */
VEC_FUNC_KEYWORDS intmax_t vec_lshift(intmax_t x, unsigned int y)
{
	/* only the top bit set, i.e. 2^(N-1) for an N-bit intmax_t */
	static const uintmax_t sign_bias = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);

	union {
		intmax_t d;
		uintmax_t u;
	} pun;

	pun.d = x;

	/* bias, shift, then remove the shifted bias; every step wraps modulo
	 * 2^N because it is done on the unsigned view */
	pun.u = ((pun.u + sign_bias) << y) - (sign_bias << y);

	return pun.d;
}

/* --------------------------------------------------------------- */
/* Array alignment macros */

#include <stdio.h>

/* VEC_ALIGNED_ARRAY(type, var, length, align)
 *   Declares `var`, usable as an array of `length` elements of `type` whose
 *   first element sits on an `align`-byte boundary.
 * VEC_ALIGNED_ARRAY_SIZEOF(var, align)
 *   Size in bytes of the usable (aligned) part of such an array.
 * VEC_ALIGNED_ARRAY_LENGTH(var, align)
 *   Number of usable elements of such an array.
 *
 * With a compiler alignment attribute, `var` is a real array. Otherwise an
 * over-sized backing array is declared and `var` is a pointer to the first
 * suitably aligned address inside it. */
#ifdef VEC_ALIGNED
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_ALIGNED(align) type var[length]
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(var))
#else
/* the alignment must be specified in bytes and must be a non-zero power of
 * two, because the pointer is rounded up with a bit mask. the backing array
 * is padded by enough whole elements to cover `align` bytes, so the rounded
 * pointer always leaves room for `length` elements; the SIZEOF macro
 * subtracts exactly that padding again. */
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_STATIC_ASSERT((align) != 0 && ((align) & ((align) - 1)) == 0, "vec: alignment needs to be a non-zero power of two"); \
	type vec_##var##_unaligned_[(length) + (((align) + sizeof(type) - 1) / sizeof(type))]; \
	type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + ((uintptr_t)(align) - 1)) & ~((uintptr_t)(align) - 1)); \
	VEC_ASSERT(((uintptr_t)var) % (align) == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(vec_##var##_unaligned_) - ((((align) + sizeof(*var) - 1) / sizeof(*var)) * sizeof(*var)))
#endif

#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var))

// ------------------------------------------------------------
// predefined variants for each vector type

/* 128-bit types. For each vector type:
 *   <TYPE>_ALIGNED_ARRAY(var) declares an element array sized for one vector
 *                             with that type's alignment;
 *   <TYPE>_PTR_ALIGNED(ptr)   is non-zero when `ptr` satisfies that alignment.
 * `ptr` is parenthesized before the cast so pointer expressions such as
 * `p + n` are converted as a whole. */
#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x16_ALIGNMENT == 0)

#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x8_ALIGNMENT == 0)

#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x4_ALIGNMENT == 0)

#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x2_ALIGNMENT == 0)

#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x16_ALIGNMENT == 0)

#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x8_ALIGNMENT == 0)

#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x4_ALIGNMENT == 0)

#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x2_ALIGNMENT == 0)

/* 256-bit types: same pair of helpers as for the 128-bit types, with the
 * element count doubled. `ptr` is parenthesized before the cast so pointer
 * expressions such as `p + n` are converted as a whole. */
#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT)
#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x32_ALIGNMENT == 0)

#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT)
#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x16_ALIGNMENT == 0)

#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT)
#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x8_ALIGNMENT == 0)

#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT)
#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x4_ALIGNMENT == 0)

#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT)
#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x32_ALIGNMENT == 0)

#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT)
#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x16_ALIGNMENT == 0)

#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT)
#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x8_ALIGNMENT == 0)

#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT)
#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x4_ALIGNMENT == 0)

/* 512-bit types: same pair of helpers as for the narrower types. Each macro
 * uses the alignment constant of its own type (the 16x32 variants used to
 * borrow the 16x16 constant). `ptr` is parenthesized before the cast so
 * pointer expressions such as `p + n` are converted as a whole. */
#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT)
#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x64_ALIGNMENT == 0)

#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x32_ALIGNMENT)
#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x32_ALIGNMENT == 0)

#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT)
#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x16_ALIGNMENT == 0)

#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT)
#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x8_ALIGNMENT == 0)

#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 64, VUINT8x64_ALIGNMENT)
#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x64_ALIGNMENT == 0)

#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x32_ALIGNMENT)
#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x32_ALIGNMENT == 0)

#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT)
#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x16_ALIGNMENT == 0)

#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT)
#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x8_ALIGNMENT == 0)

/* --------------------------------------------------------------- */
/* Defines the structures for each vector type */

// 128-bit
//
// Each 128-bit vector type is a union that overlays the native register
// type of every backend enabled at compile time (`sse`, `altivec`) with a
// plain element array (`generic`) that is always present.
// NOTE(review): the 64-bit element types guard their `altivec` member with
// VEC_COMPILER_HAS_ALTIVEC_VSX, which is not defined anywhere in the visible
// part of this header; confirm where it is meant to be set.

// 16 unsigned 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned char altivec;
#endif
	uint8_t generic[16];
} vuint8x16;

// 8 unsigned 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned short altivec;
#endif
	uint16_t generic[8];
} vuint16x8;

// 4 unsigned 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned int altivec;
#endif
	uint32_t generic[4];
} vuint32x4;

// 2 unsigned 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
	vector unsigned long long altivec;
#endif
	uint64_t generic[2];
} vuint64x2;

// 16 signed 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed char altivec;
#endif
	int8_t generic[16];
} vint8x16;

// 8 signed 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed short altivec;
#endif
	int16_t generic[8];
} vint16x8;

// 4 signed 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed int altivec;
#endif
	int32_t generic[4];
} vint32x4;

// 2 signed 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
	vector signed long long altivec;
#endif
	int64_t generic[2];
} vint64x2;

// 256-bit
//
// No native 256-bit backend is declared here: every 256-bit vector is a
// pair of the corresponding 128-bit vectors, stored in `generic[2]`.

// 32 unsigned 8-bit lanes
typedef union {
	vuint8x16 generic[2];
} vuint8x32;

// 16 unsigned 16-bit lanes
typedef union {
	vuint16x8 generic[2];
} vuint16x16;

// 8 unsigned 32-bit lanes
typedef union {
	vuint32x4 generic[2];
} vuint32x8;

// 4 unsigned 64-bit lanes
typedef union {
	vuint64x2 generic[2];
} vuint64x4;

// 32 signed 8-bit lanes
typedef union {
	vint8x16 generic[2];
} vint8x32;

// 16 signed 16-bit lanes
typedef union {
	vint16x8 generic[2];
} vint16x16;

// 8 signed 32-bit lanes
typedef union {
	vint32x4 generic[2];
} vint32x8;

// 4 signed 64-bit lanes
typedef union {
	vint64x2 generic[2];
} vint64x4;

// 512-bit
//
// No native 512-bit backend is declared here: every 512-bit vector is a
// pair of the corresponding 256-bit vectors, stored in `generic[2]`.

// 64 unsigned 8-bit lanes
typedef union {
	vuint8x32 generic[2];
} vuint8x64;

// 32 unsigned 16-bit lanes
typedef union {
	vuint16x16 generic[2];
} vuint16x32;

// 16 unsigned 32-bit lanes
typedef union {
	vuint32x8 generic[2];
} vuint32x16;

// 8 unsigned 64-bit lanes
typedef union {
	vuint64x4 generic[2];
} vuint64x8;

// 64 signed 8-bit lanes
typedef union {
	vint8x32 generic[2];
} vint8x64;

// 32 signed 16-bit lanes
typedef union {
	vint16x16 generic[2];
} vint16x32;

// 16 signed 32-bit lanes
typedef union {
	vint32x8 generic[2];
} vint32x16;

// 8 signed 64-bit lanes
typedef union {
	vint64x4 generic[2];
} vint64x8;

// --------------------------------------------------------------------------------
// okay, now onto the actual functions:
//
// we have generic variations of every major operation EXCEPT aligned load and
// aligned store. this means that a vector implementation can be created with
// only aligned load and aligned store implemented; that is slow, but it works

/* VEC_GENERIC_OPERATION(op, sign, csign, bits, size)
 *
 * Function-body template for a lane-wise binary operation implemented on
 * top of aligned store/load only. It expects the enclosing function to have
 * parameters named `vec1` and `vec2`, and it returns from that function.
 *
 *   op    - expression evaluated per lane; it may refer to `varr1[i]`,
 *           `varr2[i]` and the loop index `i`
 *   sign  - pasted into the lowercase type name (`u` in the unsigned uses
 *           visible in this file; presumably empty for signed types)
 *   csign - pasted into the uppercase macro name (`U` in the unsigned uses
 *           visible in this file; presumably empty for signed types)
 *   bits  - element width in bits
 *   size  - number of lanes
 *
 * Both inputs are spilled to aligned arrays, the result of `op` overwrites
 * `varr1` lane by lane, and `varr1` is loaded back as the return value. */
#define VEC_GENERIC_OPERATION(op, sign, csign, bits, size) \
	do { \
		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \
	\
		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
		v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \
	\
		for (int i = 0; i < size; i++) varr1[i] = (op); \
	\
		return v##sign##int##bits##x##size##_load_aligned(varr1); \
	} while (0)

/* Lane-wise arithmetic and bitwise operations built on the template above.
 * Division yields 0 in lanes where the divisor is 0. */
#define VEC_GENERIC_ADD(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size)
#define VEC_GENERIC_SUB(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size)
#define VEC_GENERIC_MUL(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size)
#define VEC_GENERIC_DIV(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size)
#define VEC_GENERIC_AND(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size)
#define VEC_GENERIC_OR(sign, csign, bits, size)  VEC_GENERIC_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size)
#define VEC_GENERIC_XOR(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size)

/* VEC_GENERIC_CMP(op, sign, csign, bits, size)
 *
 * Lane-wise comparison of `vec1` against `vec2` using the relational
 * operator `op`. A lane becomes csign##INT##bits##_MAX (for example
 * UINT8_MAX for unsigned 8-bit lanes) where the comparison holds and 0
 * where it does not. The right-hand operand is `varr2[i]`; it used to be
 * `varr1[i]`, which compared every lane with itself.
 *
 * NOTE(review): for signed types the pasted constant is INT<bits>_MAX,
 * which is not an all-ones bit pattern; confirm that this is the intended
 * "true" value for signed comparisons. */
#define VEC_GENERIC_CMP(op, sign, csign, bits, size) \
	VEC_GENERIC_OPERATION((varr1[i] op varr2[i]) ? csign##INT##bits##_MAX : 0, sign, csign, bits, size)

#define VEC_GENERIC_CMPLT(sign, csign, bits, size) VEC_GENERIC_CMP(<,  sign, csign, bits, size)
#define VEC_GENERIC_CMPLE(sign, csign, bits, size) VEC_GENERIC_CMP(<=, sign, csign, bits, size)
#define VEC_GENERIC_CMPEQ(sign, csign, bits, size) VEC_GENERIC_CMP(==, sign, csign, bits, size)
#define VEC_GENERIC_CMPGE(sign, csign, bits, size) VEC_GENERIC_CMP(>=, sign, csign, bits, size)
#define VEC_GENERIC_CMPGT(sign, csign, bits, size) VEC_GENERIC_CMP(>,  sign, csign, bits, size)

/* VEC_GENERIC_SHIFT(op, sign, csign, bits, size)
 *
 * Same function-body template as VEC_GENERIC_OPERATION, except that the
 * second operand (`vec2`, the per-lane shift counts) is always treated as
 * the unsigned vector type of the same shape, regardless of `sign`. */
#define VEC_GENERIC_SHIFT(op, sign, csign, bits, size) \
	do { \
		V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \
		VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \
	\
		v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \
		vuint##bits##x##size##_store_aligned(vec2, varr2); \
	\
		for (int i = 0; i < size; i++) varr1[i] = (op); \
	\
		return v##sign##int##bits##x##size##_load_aligned(varr1); \
	} while (0)

/* Lane-wise shifts. `sign` selects the scalar helper by token pasting:
 * vec_lshift / vec_rshift / vec_lrshift when it is empty, and
 * vec_ulshift / vec_urshift / vec_ulrshift when it is `u`. */
#define VEC_GENERIC_LSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size)
#define VEC_GENERIC_RSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size)
#define VEC_GENERIC_LRSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size)

#ifdef VEC_COMPILER_HAS_SSE2
// these are shared between SSE2 variations
//
// Each macro is a function-body fragment: it expects parameters named
// `vec1` and `vec2` in the enclosing function and returns the lane-wise
// product as a v<sign>int<bits>x<size>. `sign` is pasted into the type name.

/* 8-bit lanes: SSE2 has no 8-bit multiply, so even and odd bytes are
 * multiplied as 16-bit lanes and the low bytes of the products recombined */
# define VEC_SSE2_MUL_8x16(sign) \
	do { \
		/* unpack and multiply */ \
		__m128i dst_even = _mm_mullo_epi16(vec1.sse, vec2.sse); \
		__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \
	\
		/* repack */ \
		return (v##sign##int8x16){ .sse = _mm_or_si128( \
			_mm_slli_epi16(dst_odd, 8), \
			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
		)}; \
	} while (0)

# define VEC_SSE2_MUL_16x8(sign) \
	do { \
		/* we have a real instruction for this */ \
		return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \
	} while (0)

/* 32-bit lanes: multiply the even and the odd lanes separately with the
 * 32x32->64 multiply and interleave the low halves of the products.
 * The intrinsics take the `.sse` members, not the unions themselves. */
# define VEC_SSE2_MUL_32x4(sign) \
	do { \
		__m128i a13    = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \
		__m128i b13    = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \
		__m128i prod02 = _mm_mul_epu32(vec1.sse, vec2.sse); /* (-,a2*b2,-,a0*b0) */ \
		__m128i prod13 = _mm_mul_epu32(a13, b13);           /* (-,a3*b3,-,a1*b1) */ \
		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
		return (v##sign##int32x4) {.sse = _mm_unpacklo_epi64(prod01, prod23)}; /* (ab3,ab2,ab1,ab0) */ \
	} while (0)

/* 64-bit lanes: schoolbook multiplication from 32-bit halves; the
 * high*high term is dropped because it does not fit in 64 bits */
# define VEC_SSE2_MUL_64x2(sign) \
	do { \
		__m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
		__m128i b  = _mm_srli_epi64(vec1.sse, 32);      /* b = vec1 >> 32; */ \
		__m128i bc = _mm_mul_epu32(b, vec2.sse);        /* bc = b * (vec2 & UINT32_MAX); */ \
		__m128i d  = _mm_srli_epi64(vec2.sse, 32);      /* d = vec2 >> 32; */ \
		__m128i ad = _mm_mul_epu32(vec1.sse, d);        /* ad = (vec1 & UINT32_MAX) * d; */ \
		__m128i hi = _mm_add_epi64(bc, ad);             /* hi = bc + ad; */ \
		hi = _mm_slli_epi64(hi, 32);                    /* hi <<= 32; */ \
		return (v##sign##int64x2) { .sse = _mm_add_epi64(hi, ac) }; /* return ac + hi; */ \
	} while (0)
#endif

// --------------------------------------------------------------------------------
// vuint8x16 implementation

/* Loads 16 bytes from `in` into a vuint8x16.
 *
 * The SSE2 and AltiVec paths use aligned load instructions, so `in` is
 * expected to satisfy VUINT8x16_ALIGNMENT; the portable path copies the
 * bytes with memcpy. Every path returns a vuint8x16 (the AltiVec path used
 * to return the raw vector), and `in` keeps its const qualifier. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load_aligned(const uint8_t in[16])
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vuint8x16) { .sse = _mm_load_si128((const __m128i *)in) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_ld(0, in) };
	}
#endif

	/* portable path */
	vuint8x16 vec;
	memcpy(vec.generic, in, sizeof(vec.generic));
	return vec;
}

/* Stores the 16 lanes of `vec` to `out`.
 *
 * The SSE2 and AltiVec paths use aligned store instructions, so `out` is
 * expected to satisfy VUINT8x16_ALIGNMENT; the portable path copies the
 * bytes with memcpy. The unreachable assertion that followed the final
 * return has been removed. */
VEC_FUNC_KEYWORDS void vuint8x16_store_aligned(vuint8x16 vec, uint8_t out[16])
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		_mm_store_si128((__m128i *)out, vec.sse);
		return;
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		vec_st(vec.altivec, 0, out);
		return;
	}
#endif

	/* portable path */
	memcpy(out, vec.generic, sizeof(vec.generic));
}

/* Returns a vector with all 16 lanes set to `x`.
 *
 * Neither SIMD path has a direct broadcast here: both build an aligned
 * array filled with `x` and load it. The AltiVec path no longer calls
 * vec_splat_u8, which only accepts a small compile-time literal and so
 * cannot broadcast a runtime value. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_splat(uint8_t x)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// noop: handled by the array path below
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// noop: handled by the array path below
	} else
#endif
	{
		return (vuint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} };
	}

	// okay, we don't have a regular thing. call the load function with a splatted array
	VUINT8x16_ALIGNED_ARRAY(arr);
	for (int i = 0; i < 16; i++) arr[i] = x;
	return vuint8x16_load_aligned(arr);
}

/* Loads 16 bytes from `in`, which does not need to be aligned.
 *
 * SSE2 uses the unaligned load instruction. AltiVec loads the two aligned
 * blocks that contain the data and merges them with a permute. The second
 * block is addressed at offset 15, the last byte actually needed, so an
 * already aligned `in` does not touch the following 16 bytes. Every path
 * returns a vuint8x16 (the AltiVec path used to return the raw vector). */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load(const uint8_t in[16])
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vuint8x16) { .sse = _mm_loadu_si128((const __m128i *)in) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_perm(vec_ld(0, in), vec_ld(15, in), vec_lvsl(0, in)) };
	}
#endif

	/* portable path */
	vuint8x16 vec;
	memcpy(vec.generic, in, sizeof(vec.generic));
	return vec;
}

/* Writes the 16 lanes of `vec` to `out`, which does not need to be aligned.
 *
 * SSE2 stores directly with the unaligned store instruction. AltiVec goes
 * through an aligned scratch array and copies it to `out`. Without either
 * backend the lanes are copied straight out of the union. */
VEC_FUNC_KEYWORDS void vuint8x16_store(vuint8x16 vec, uint8_t out[16])
{
#if defined(VEC_COMPILER_HAS_SSE2)
	if (VEC_CPU_have_SSE2()) {
		_mm_storeu_si128((__m128i *)out, vec.sse);
		return;
	}
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (VEC_CPU_have_ALTIVEC()) {
		/* no unaligned store here: stage through an aligned buffer */
		VUINT8x16_ALIGNED_ARRAY(scratch);
		vuint8x16_store_aligned(vec, scratch);
		memcpy(out, scratch, 16);
		return;
	}
#endif

	memcpy(out, vec.generic, sizeof(vec.generic));
}

/* Lane-wise addition of two vuint8x16 values; each lane wraps modulo 256.
 * The AltiVec result is wrapped in a vuint8x16 (it used to be returned as
 * a raw vector), and the unreachable trailing fallback has been removed. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_add(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vuint8x16) { .sse = _mm_add_epi8(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_add(vec1.altivec, vec2.altivec) };
	}
#endif

	/* portable path */
	for (int i = 0; i < 16; i++) vec1.generic[i] += vec2.generic[i];
	return vec1;
}

/* Lane-wise subtraction (vec1 - vec2); each lane wraps modulo 256.
 * The AltiVec result is wrapped in a vuint8x16 (it used to be returned as
 * a raw vector), and the unreachable trailing fallback has been removed. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_sub(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vuint8x16) { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_sub(vec1.altivec, vec2.altivec) };
	}
#endif

	/* portable path */
	for (int i = 0; i < 16; i++) vec1.generic[i] -= vec2.generic[i];
	return vec1;
}

/* Lane-wise multiplication; each lane keeps the low 8 bits of the product.
 *
 * SSE2 uses VEC_SSE2_MUL_8x16, which returns from this function. AltiVec
 * uses vec_mul when the compiler exposes it as a macro and otherwise falls
 * through to the array-based VEC_GENERIC_MUL at the end. The AltiVec result
 * is wrapped in a vuint8x16 (it used to be returned as a raw vector). */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_mul(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		VEC_SSE2_MUL_8x16(u);
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
# ifdef vec_mul // this isn't available on older compilers
		return (vuint8x16) { .altivec = vec_mul(vec1.altivec, vec2.altivec) };
# endif
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i];
		return vec1;
	}

	// only reached from a SIMD branch that had no native multiply
	VEC_GENERIC_MUL(u, U, 8, 16);
}

/* Lane-wise division (vec1 / vec2); lanes whose divisor is 0 yield 0 on
 * the portable and array-based paths.
 *
 * The VSX check now comes first: it used to sit behind the plain AltiVec
 * no-op branch and could only run when AltiVec was reported absent. Its
 * result is wrapped in a vuint8x16 instead of being returned as a raw
 * vector.
 * NOTE(review): confirm that vec_div accepts 8-bit element vectors on the
 * targeted compilers, and what it produces for zero divisors. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_div(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
	if (VEC_CPU_have_ALTIVEC_VSX()) {
		return (vuint8x16) { .altivec = vec_div(vec1.altivec, vec2.altivec) };
	} else
#endif
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// noop: no native integer division, use the array path below
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// noop: no native integer division, use the array path below
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0;
		return vec1;
	}

	VEC_GENERIC_DIV(u, U, 8, 16);
}

/* Bitwise AND of two vuint8x16 values.
 * The AltiVec result is wrapped in a vuint8x16 (it used to be returned as
 * a raw vector), and the unreachable trailing fallback has been removed. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_and(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vuint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_and(vec1.altivec, vec2.altivec) };
	}
#endif

	/* portable path */
	for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i];
	return vec1;
}

/* Bitwise OR of two vuint8x16 values.
 * The AltiVec result is wrapped in a vuint8x16 (it used to be returned as
 * a raw vector), and the unreachable trailing fallback has been removed. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_or(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vuint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_or(vec1.altivec, vec2.altivec) };
	}
#endif

	/* portable path */
	for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i];
	return vec1;
}

/* Bitwise XOR of two vuint8x16 values.
 * The AltiVec result is wrapped in a vuint8x16 (it used to be returned as
 * a raw vector), and the unreachable trailing fallback has been removed. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_xor(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vuint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_xor(vec1.altivec, vec2.altivec) };
	}
#endif

	/* portable path */
	for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i];
	return vec1;
}

/* Shifts each lane of vec1 left by the count in the matching lane of vec2.
 *
 * SSE2 has no per-lane 8-bit shift here and uses the array-based
 * VEC_GENERIC_LSHIFT at the end. The AltiVec path passes the `.altivec`
 * members to vec_sl and wraps the result in a vuint8x16; it used to pass
 * the unions themselves and return the raw vector. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lshift(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		//noop
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_sl(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulshift(vec1.generic[i], vec2.generic[i]);
		return vec1;
	}

	VEC_GENERIC_LSHIFT(u, U, 8, 16);
}

/* Shifts each lane of vec1 right by the count in the matching lane of vec2.
 *
 * SSE2 uses the array-based VEC_GENERIC_RSHIFT at the end. The AltiVec
 * path now uses vec_sr on the `.altivec` members and wraps the result in a
 * vuint8x16; it used to call vec_sl, which shifts left, on the unions
 * themselves. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_rshift(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		//noop
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_sr(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_urshift(vec1.generic[i], vec2.generic[i]);
		return vec1;
	}

	VEC_GENERIC_RSHIFT(u, U, 8, 16);
}

/* Logical (zero-filling) right shift of each lane of vec1 by the count in
 * the matching lane of vec2.
 *
 * SSE2 uses the array-based VEC_GENERIC_LRSHIFT at the end. The AltiVec
 * path now uses vec_sr on the `.altivec` members and wraps the result in a
 * vuint8x16; it used to call vec_sl, which shifts left, on the unions
 * themselves. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lrshift(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		//noop
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_sr(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulrshift(vec1.generic[i], vec2.generic[i]);
		return vec1;
	}

	VEC_GENERIC_LRSHIFT(u, U, 8, 16);
}

/* Lane-wise average of two vuint8x16 values.
 *
 * The scalar loop adds in `int` and narrows only after halving, so sums
 * above 255 no longer wrap before the division (200 and 100 now average to
 * 150). It also replaces the add-then-divide fallback, which wrapped in
 * the same way. The AltiVec result is wrapped in a vuint8x16 instead of
 * being returned as a raw vector.
 * NOTE(review): vec_avg rounds halves up while the scalar loop truncates;
 * confirm which rounding the library wants. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_avg(vuint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// noop: handled by the scalar loop below
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vuint8x16) { .altivec = vec_avg(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		// nothing backend-specific to do
	}

	for (int i = 0; i < 16; i++) vec1.generic[i] = (uint8_t)((vec1.generic[i] + vec2.generic[i]) / 2);
	return vec1;
}

/* Lane-wise "less than" comparison of vec1 against vec2.
 *
 * When neither SIMD backend is reported at runtime, each lane becomes
 * UINT8_MAX where vec1 < vec2 and 0 otherwise. When one is reported, the
 * work is handed to VEC_GENERIC_CMPLT, which goes through aligned store
 * and load. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmplt(vuint8x16 vec1, vuint8x16 vec2)
{
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	if (!simd_reported && VEC_CPU_have_SSE2())
		simd_reported = 1;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (!simd_reported && VEC_CPU_have_ALTIVEC())
		simd_reported = 1; /* these functions exist, no internet rn tho */
#endif

	if (!simd_reported) {
		int lane = 0;
		while (lane < 16) {
			vec1.generic[lane] = (vec1.generic[lane] < vec2.generic[lane]) ? UINT8_MAX : 0;
			lane++;
		}
		return vec1;
	}

	VEC_GENERIC_CMPLT(u, U, 8, 16);
}

/* Lane-wise "less than or equal" comparison of vec1 against vec2.
 *
 * When neither SIMD backend is reported at runtime, each lane becomes
 * UINT8_MAX where vec1 <= vec2 and 0 otherwise. When one is reported, the
 * work is handed to VEC_GENERIC_CMPLE, which goes through aligned store
 * and load. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmple(vuint8x16 vec1, vuint8x16 vec2)
{
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	if (!simd_reported && VEC_CPU_have_SSE2())
		simd_reported = 1;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (!simd_reported && VEC_CPU_have_ALTIVEC())
		simd_reported = 1; /* these functions exist, no internet rn tho */
#endif

	if (!simd_reported) {
		int lane = 0;
		while (lane < 16) {
			vec1.generic[lane] = (vec1.generic[lane] <= vec2.generic[lane]) ? UINT8_MAX : 0;
			lane++;
		}
		return vec1;
	}

	VEC_GENERIC_CMPLE(u, U, 8, 16);
}

/* Lane-wise equality comparison of vec1 against vec2.
 *
 * When neither SIMD backend is reported at runtime, each lane becomes
 * UINT8_MAX where vec1 == vec2 and 0 otherwise. When one is reported, the
 * work is handed to VEC_GENERIC_CMPEQ, which goes through aligned store
 * and load. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpeq(vuint8x16 vec1, vuint8x16 vec2)
{
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	if (!simd_reported && VEC_CPU_have_SSE2())
		simd_reported = 1;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (!simd_reported && VEC_CPU_have_ALTIVEC())
		simd_reported = 1; /* these functions exist, no internet rn tho */
#endif

	if (!simd_reported) {
		int lane = 0;
		while (lane < 16) {
			vec1.generic[lane] = (vec1.generic[lane] == vec2.generic[lane]) ? UINT8_MAX : 0;
			lane++;
		}
		return vec1;
	}

	VEC_GENERIC_CMPEQ(u, U, 8, 16);
}

/* Lane-wise "greater than" comparison of vec1 against vec2.
 *
 * When neither SIMD backend is reported at runtime, each lane becomes
 * UINT8_MAX where vec1 > vec2 and 0 otherwise. When one is reported, the
 * work is handed to VEC_GENERIC_CMPGT, which goes through aligned store
 * and load. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpgt(vuint8x16 vec1, vuint8x16 vec2)
{
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	if (!simd_reported && VEC_CPU_have_SSE2())
		simd_reported = 1;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (!simd_reported && VEC_CPU_have_ALTIVEC())
		simd_reported = 1; /* these functions exist, no internet rn tho */
#endif

	if (!simd_reported) {
		int lane = 0;
		while (lane < 16) {
			vec1.generic[lane] = (vec1.generic[lane] > vec2.generic[lane]) ? UINT8_MAX : 0;
			lane++;
		}
		return vec1;
	}

	VEC_GENERIC_CMPGT(u, U, 8, 16);
}

/* Lane-wise "greater than or equal" comparison of vec1 against vec2.
 *
 * When neither SIMD backend is reported at runtime, each lane becomes
 * UINT8_MAX where vec1 >= vec2 and 0 otherwise. When one is reported, the
 * work is handed to VEC_GENERIC_CMPGE, which goes through aligned store
 * and load. */
VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpge(vuint8x16 vec1, vuint8x16 vec2)
{
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	if (!simd_reported && VEC_CPU_have_SSE2())
		simd_reported = 1;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (!simd_reported && VEC_CPU_have_ALTIVEC())
		simd_reported = 1; /* these functions exist, no internet rn tho */
#endif

	if (!simd_reported) {
		int lane = 0;
		while (lane < 16) {
			vec1.generic[lane] = (vec1.generic[lane] >= vec2.generic[lane]) ? UINT8_MAX : 0;
			lane++;
		}
		return vec1;
	}

	VEC_GENERIC_CMPGE(u, U, 8, 16);
}

// --------------------------------------------------------------------------------
// vint8x16 implementation

/*
 * Load 16 signed bytes from `in` into a vint8x16 using the aligned-load
 * primitive of whichever backend is reported at run time.
 *
 * SSE2 uses _mm_load_si128, AltiVec uses vec_ld with offset 0, and when
 * neither is reported the bytes are copied into the `generic` array with
 * memcpy.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_load_aligned(const int8_t in[16])
{
#if defined(VEC_COMPILER_HAS_SSE2)
	if (VEC_CPU_have_SSE2()) {
		vint8x16 loaded = { .sse = _mm_load_si128((const __m128i *)in) };
		return loaded;
	}
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (VEC_CPU_have_ALTIVEC()) {
		vint8x16 loaded = { .altivec = vec_ld(0, in) };
		return loaded;
	}
#endif

	/* no SIMD backend reported: plain byte copy */
	vint8x16 copied;
	memcpy(copied.generic, in, sizeof(copied.generic));
	return copied;
}

/*
 * Write the 16 lanes of `vec` to `out` using the aligned-store primitive of
 * whichever backend is reported at run time.
 *
 * SSE2 uses _mm_store_si128, AltiVec uses vec_st with offset 0, and when
 * neither is reported the `generic` array is copied out with memcpy.
 */
VEC_FUNC_KEYWORDS void vint8x16_store_aligned(vint8x16 vec, int8_t out[16])
{
#if defined(VEC_COMPILER_HAS_SSE2)
	if (VEC_CPU_have_SSE2()) {
		_mm_store_si128((__m128i *)out, vec.sse);
		return;
	}
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (VEC_CPU_have_ALTIVEC()) {
		vec_st(vec.altivec, 0, out);
		return;
	}
#endif

	/* no SIMD backend reported: plain byte copy */
	memcpy(out, vec.generic, sizeof(vec.generic));
}

/*
 * Broadcast `x` into all 16 lanes of a vint8x16.
 *
 * SSE2: _mm_set1_epi8.
 * AltiVec: fills an aligned scratch array and loads it with
 * vint8x16_load_aligned. vec_splat_s8 is not usable here because it only
 * accepts a compile-time literal in the range -16..15, not a runtime value.
 * Neither reported: initializes the `generic` array directly.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_splat(int8_t x)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vint8x16){ .sse = _mm_set1_epi8(x) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// no splat intrinsic that takes a runtime scalar is used here;
		// go through an aligned array and the aligned load instead
		VINT8x16_ALIGNED_ARRAY(arr);
		for (int i = 0; i < 16; i++) arr[i] = x;
		return vint8x16_load_aligned(arr);
	}
#endif

	return (vint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} };
}

/*
 * Load 16 signed bytes from `in`, which does not have to be 16-byte aligned.
 *
 * SSE2: _mm_loadu_si128.
 * AltiVec: two aligned loads combined with vec_perm (see the comment in that
 * branch).
 * Neither reported: the bytes are copied into the `generic` array with
 * memcpy.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_load(const int8_t in[16])
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vint8x16) { .sse = _mm_loadu_si128((const __m128i *)in) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// vec_ld ignores the low four address bits, so these two loads fetch
		// the aligned blocks containing in[0] and in[15]; the vec_lvsl mask
		// makes vec_perm pick out the 16 wanted bytes. The second offset is
		// 15 rather than 16 so that an already-aligned `in` never causes a
		// read of the block that lies entirely past the input.
		return (vint8x16) { .altivec = vec_perm(vec_ld(0, in), vec_ld(15, in), vec_lvsl(0, in)) };
	}
#endif

	// no SIMD backend reported: plain byte copy
	vint8x16 vec;
	memcpy(vec.generic, in, sizeof(vec.generic));
	return vec;
}

/*
 * Write the 16 lanes of `vec` to `out`, which does not have to be aligned.
 *
 * SSE2 uses _mm_storeu_si128. When AltiVec is reported there is no unaligned
 * store here, so the vector is written to an aligned scratch array with
 * vint8x16_store_aligned and then copied to `out`. When neither is reported
 * the `generic` array is copied out with memcpy.
 */
VEC_FUNC_KEYWORDS void vint8x16_store(vint8x16 vec, int8_t out[16])
{
#if defined(VEC_COMPILER_HAS_SSE2)
	if (VEC_CPU_have_SSE2()) {
		_mm_storeu_si128((__m128i *)out, vec.sse);
		return;
	}
#endif

	/* nonzero only when AltiVec is compiled in and its check succeeds */
	int altivec_reported = 0;
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	altivec_reported = altivec_reported || VEC_CPU_have_ALTIVEC();
#endif

	if (!altivec_reported) {
		memcpy(out, vec.generic, sizeof(vec.generic));
		return;
	}

	/* aligned store into scratch space, then a byte copy to the caller */
	VINT8x16_ALIGNED_ARRAY(scratch);
	vint8x16_store_aligned(vec, scratch);
	memcpy(out, scratch, 16);
}

/*
 * Per-lane sum of two signed 16-lane vectors.
 *
 * SSE2 uses _mm_add_epi8, AltiVec uses vec_add, and when neither is reported
 * each lane of vec2 is added into the matching lane of vec1, which is then
 * returned.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_add(vint8x16 vec1, vint8x16 vec2)
{
#if defined(VEC_COMPILER_HAS_SSE2)
	if (VEC_CPU_have_SSE2()) {
		vint8x16 sum = { .sse = _mm_add_epi8(vec1.sse, vec2.sse) };
		return sum;
	}
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (VEC_CPU_have_ALTIVEC()) {
		vint8x16 sum = { .altivec = vec_add(vec1.altivec, vec2.altivec) };
		return sum;
	}
#endif

	/* no SIMD backend reported: lane-by-lane */
	for (int lane = 0; lane < 16; lane++) {
		vec1.generic[lane] = vec1.generic[lane] + vec2.generic[lane];
	}
	return vec1;
}

/*
 * Per-lane difference (vec1 - vec2) of two signed 16-lane vectors.
 *
 * SSE2 uses _mm_sub_epi8, AltiVec uses vec_sub, and when neither is reported
 * each lane of vec2 is subtracted from the matching lane of vec1, which is
 * then returned.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_sub(vint8x16 vec1, vint8x16 vec2)
{
#if defined(VEC_COMPILER_HAS_SSE2)
	if (VEC_CPU_have_SSE2()) {
		vint8x16 diff = { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) };
		return diff;
	}
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	if (VEC_CPU_have_ALTIVEC()) {
		vint8x16 diff = { .altivec = vec_sub(vec1.altivec, vec2.altivec) };
		return diff;
	}
#endif

	/* no SIMD backend reported: lane-by-lane */
	for (int lane = 0; lane < 16; lane++) {
		vec1.generic[lane] = vec1.generic[lane] - vec2.generic[lane];
	}
	return vec1;
}

/*
 * Per-lane product of two signed 16-lane vectors.
 *
 * SSE2: expands VEC_SSE2_MUL_8x16() (defined outside this section).
 * AltiVec: uses vec_mul when `vec_mul` is visible as a preprocessor macro.
 * Neither reported: multiplies lane by lane in the `generic` array.
 *
 * Any SIMD branch that does not return falls through to VEC_GENERIC_MUL at
 * the bottom (also defined outside this section).
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_mul(vint8x16 vec1, vint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// NOTE(review): presumably this macro returns the product itself;
		// if it does not, control continues at VEC_GENERIC_MUL below
		VEC_SSE2_MUL_8x16();
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// when vec_mul is not a macro this branch is empty and control
		// continues at VEC_GENERIC_MUL below
# ifdef vec_mul // this isn't available on older compilers
		return (vint8x16) { .altivec = vec_mul(vec1.altivec, vec2.altivec) };
# endif
	} else
#endif
	{
		// no SIMD backend reported: multiply each lane in place
		for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i];
		return vec1;
	}

	// reached only from a SIMD branch above that did not return
	VEC_GENERIC_MUL(, , 8, 16);
}

/*
 * Per-lane quotient (vec1 / vec2) of two signed 16-lane vectors.
 *
 * The backends are tried in this order:
 *   - SSE2 reported: no intrinsic path; delegated to VEC_GENERIC_DIV.
 *   - AltiVec with VSX reported: vec_div.
 *   - plain AltiVec reported: no intrinsic path; delegated to VEC_GENERIC_DIV.
 *   - none reported: lane-by-lane division, where a lane whose divisor is 0
 *     yields 0.
 * VEC_GENERIC_DIV is defined outside this section.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_div(vint8x16 vec1, vint8x16 vec2)
{
	/* nonzero once a backend without its own divide has been reported */
	int use_generic_macro = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	use_generic_macro = use_generic_macro || VEC_CPU_have_SSE2();
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX)
	/* only consulted when SSE2 was not reported, as in the backend order above */
	if (!use_generic_macro && VEC_CPU_have_ALTIVEC_VSX()) {
		vint8x16 quotient = { .altivec = vec_div(vec1.altivec, vec2.altivec) };
		return quotient;
	}
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	use_generic_macro = use_generic_macro || VEC_CPU_have_ALTIVEC();
#endif

	if (!use_generic_macro) {
		for (int lane = 0; lane < 16; lane++) {
			if (vec2.generic[lane]) {
				vec1.generic[lane] = vec1.generic[lane] / vec2.generic[lane];
			} else {
				/* division by zero is defined to give 0 */
				vec1.generic[lane] = 0;
			}
		}
		return vec1;
	}

	VEC_GENERIC_DIV(, , 8, 16);
}

/*
 * Per-lane bitwise AND of two signed 16-lane vectors.
 *
 * SSE2 uses _mm_and_si128, AltiVec uses vec_and, and when neither is reported
 * each lane of vec1 is ANDed in place with the matching lane of vec2 and vec1
 * is returned. Every path returns on its own, so no further fallback is
 * needed after them.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_and(vint8x16 vec1, vint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		return (vint8x16) { .altivec = vec_and(vec1.altivec, vec2.altivec) };
	}
#endif

	// no SIMD backend reported: lane-by-lane
	for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i];
	return vec1;
}

/*
 * Per-lane bitwise OR of two signed 16-lane vectors.
 *
 * SSE2 uses _mm_or_si128, AltiVec uses vec_or, and when neither is reported
 * each lane of vec1 is ORed in place with the matching lane of vec2 and vec1
 * is returned. Every path returns on its own, so no further fallback is
 * needed after them.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_or(vint8x16 vec1, vint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// the intrinsic yields a raw AltiVec vector; wrap it in the
		// vint8x16 type this function returns
		return (vint8x16) { .altivec = vec_or(vec1.altivec, vec2.altivec) };
	}
#endif

	// no SIMD backend reported: lane-by-lane
	for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i];
	return vec1;
}

/*
 * Per-lane bitwise XOR of two signed 16-lane vectors.
 *
 * SSE2 uses _mm_xor_si128, AltiVec uses vec_xor, and when neither is reported
 * each lane of vec1 is XORed in place with the matching lane of vec2 and vec1
 * is returned. Every path returns on its own, so no further fallback is
 * needed after them.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_xor(vint8x16 vec1, vint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		return (vint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) };
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// the intrinsic yields a raw AltiVec vector; wrap it in the
		// vint8x16 type this function returns
		return (vint8x16) { .altivec = vec_xor(vec1.altivec, vec2.altivec) };
	}
#endif

	// no SIMD backend reported: lane-by-lane
	for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i];
	return vec1;
}

/*
 * Per-lane left shift of vec1 by the counts held in vec2.
 *
 * SSE2 reported: no intrinsic path; delegated to VEC_GENERIC_LSHIFT (defined
 * outside this section).
 * AltiVec reported: vec_sl on the underlying AltiVec vectors.
 * Neither reported: each lane goes through the scalar helper vec_lshift
 * (defined outside this section).
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_lshift(vint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// nothing SSE2-specific; continues at VEC_GENERIC_LSHIFT below
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// the intrinsic works on the raw AltiVec vectors, not on the wrapper
		// types, and its result has to be wrapped again for the return
		return (vint8x16) { .altivec = vec_sl(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lshift(vec1.generic[i], vec2.generic[i]);
		return vec1;
	}

	VEC_GENERIC_LSHIFT(, , 8, 16);
}

/*
 * Per-lane right shift of vec1 by the counts held in vec2.
 *
 * SSE2 reported: no intrinsic path; delegated to VEC_GENERIC_RSHIFT (defined
 * outside this section).
 * AltiVec reported: vec_sra (algebraic, i.e. sign-propagating, right shift)
 * on the underlying AltiVec vectors.
 * Neither reported: each lane goes through the scalar helper vec_rshift
 * (defined outside this section).
 *
 * NOTE(review): vec_sra is chosen on the assumption that this is the
 * arithmetic shift and vint8x16_lrshift the logical one -- confirm against
 * the definition of vec_rshift.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_rshift(vint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// nothing SSE2-specific; continues at VEC_GENERIC_RSHIFT below
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// a right-shift intrinsic on the raw AltiVec vectors, with the
		// result wrapped in the vint8x16 type this function returns
		return (vint8x16) { .altivec = vec_sra(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_rshift(vec1.generic[i], vec2.generic[i]);
		return vec1;
	}

	VEC_GENERIC_RSHIFT(, , 8, 16);
}

/*
 * Per-lane right shift of vec1 by the counts held in vec2 ("lrshift").
 *
 * SSE2 reported: no intrinsic path; delegated to VEC_GENERIC_LRSHIFT (defined
 * outside this section).
 * AltiVec reported: vec_sr (zero-filling right shift) on the underlying
 * AltiVec vectors.
 * Neither reported: each lane goes through the scalar helper vec_lrshift
 * (defined outside this section).
 *
 * NOTE(review): vec_sr is chosen on the assumption that the "l" prefix means
 * "logical" -- confirm against the definition of vec_lrshift.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_lrshift(vint8x16 vec1, vuint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// nothing SSE2-specific; continues at VEC_GENERIC_LRSHIFT below
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// a right-shift intrinsic on the raw AltiVec vectors, with the
		// result wrapped in the vint8x16 type this function returns
		return (vint8x16) { .altivec = vec_sr(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lrshift(vec1.generic[i], vec2.generic[i]);
		return vec1;
	}

	VEC_GENERIC_LRSHIFT(, , 8, 16);
}

/*
 * Per-lane average of two signed 16-lane vectors.
 *
 * SSE2 reported: no intrinsic path; computed as
 * vint8x16_div(vint8x16_add(vec1, vec2), vint8x16_splat(2)).
 * AltiVec reported: vec_avg on the underlying AltiVec vectors.
 * Neither reported: per lane, the sum is narrowed to int8_t and then divided
 * by 2.
 *
 * NOTE(review): in the scalar path the narrowing happens before the division,
 * so a sum outside the int8_t range is not preserved; vec_avg is documented
 * to round up and not to lose the carry. Confirm whether the backends are
 * meant to agree.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_avg(vint8x16 vec1, vint8x16 vec2)
{
#ifdef VEC_COMPILER_HAS_SSE2
	if (VEC_CPU_have_SSE2()) {
		// nothing SSE2-specific; continues at the add/div combination below
	} else
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (VEC_CPU_have_ALTIVEC()) {
		// the intrinsic yields a raw AltiVec vector; wrap it in the
		// vint8x16 type this function returns
		return (vint8x16) { .altivec = vec_avg(vec1.altivec, vec2.altivec) };
	} else
#endif
	{
		for (int i = 0; i < 16; i++) vec1.generic[i] = (int8_t)(vec1.generic[i] + vec2.generic[i]) / 2;
		return vec1;
	}

	return vint8x16_div(vint8x16_add(vec1, vec2), vint8x16_splat(2));
}

/*
 * Per-lane signed "less than" of two 16-lane vectors.
 *
 * If a compiled-in SSE2 or AltiVec check succeeds at run time, there is no
 * intrinsic code for this operation here and the comparison is delegated to
 * VEC_GENERIC_CMPLT (defined outside this section). Otherwise each lane of
 * vec1 is assigned UINT8_MAX when vec1 < vec2 in that lane and 0 when it is
 * not, and vec1 is returned.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmplt(vint8x16 vec1, vint8x16 vec2)
{
	/* becomes nonzero as soon as one compiled-in SIMD check succeeds */
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	simd_reported = simd_reported || VEC_CPU_have_SSE2();
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	simd_reported = simd_reported || VEC_CPU_have_ALTIVEC();
#endif

	if (!simd_reported) {
		for (int lane = 0; lane < 16; lane++) {
			if (vec1.generic[lane] < vec2.generic[lane]) {
				vec1.generic[lane] = UINT8_MAX;
			} else {
				vec1.generic[lane] = 0;
			}
		}
		return vec1;
	}

	VEC_GENERIC_CMPLT(, , 8, 16);
}

/*
 * Per-lane signed "less than or equal" of two 16-lane vectors.
 *
 * If a compiled-in SSE2 or AltiVec check succeeds at run time, there is no
 * intrinsic code for this operation here and the comparison is delegated to
 * VEC_GENERIC_CMPLE (defined outside this section). Otherwise each lane of
 * vec1 is assigned UINT8_MAX when vec1 <= vec2 in that lane and 0 when it is
 * not, and vec1 is returned.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmple(vint8x16 vec1, vint8x16 vec2)
{
	/* becomes nonzero as soon as one compiled-in SIMD check succeeds */
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	simd_reported = simd_reported || VEC_CPU_have_SSE2();
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	simd_reported = simd_reported || VEC_CPU_have_ALTIVEC();
#endif

	if (!simd_reported) {
		for (int lane = 0; lane < 16; lane++) {
			if (vec1.generic[lane] <= vec2.generic[lane]) {
				vec1.generic[lane] = UINT8_MAX;
			} else {
				vec1.generic[lane] = 0;
			}
		}
		return vec1;
	}

	VEC_GENERIC_CMPLE(, , 8, 16);
}

/*
 * Per-lane equality test of two signed 16-lane vectors.
 *
 * If a compiled-in SSE2 or AltiVec check succeeds at run time, there is no
 * intrinsic code for this operation here and the comparison is delegated to
 * VEC_GENERIC_CMPEQ (defined outside this section). Otherwise each lane of
 * vec1 is assigned UINT8_MAX when the two lanes are equal and 0 when they
 * differ, and vec1 is returned.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpeq(vint8x16 vec1, vint8x16 vec2)
{
	/* becomes nonzero as soon as one compiled-in SIMD check succeeds */
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	simd_reported = simd_reported || VEC_CPU_have_SSE2();
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	simd_reported = simd_reported || VEC_CPU_have_ALTIVEC();
#endif

	if (!simd_reported) {
		for (int lane = 0; lane < 16; lane++) {
			if (vec1.generic[lane] == vec2.generic[lane]) {
				vec1.generic[lane] = UINT8_MAX;
			} else {
				vec1.generic[lane] = 0;
			}
		}
		return vec1;
	}

	VEC_GENERIC_CMPEQ(, , 8, 16);
}

/*
 * Per-lane signed "greater than" of two 16-lane vectors.
 *
 * If a compiled-in SSE2 or AltiVec check succeeds at run time, there is no
 * intrinsic code for this operation here and the comparison is delegated to
 * VEC_GENERIC_CMPGT (defined outside this section). Otherwise each lane of
 * vec1 is assigned UINT8_MAX when vec1 > vec2 in that lane and 0 when it is
 * not, and vec1 is returned.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpgt(vint8x16 vec1, vint8x16 vec2)
{
	/* becomes nonzero as soon as one compiled-in SIMD check succeeds */
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	simd_reported = simd_reported || VEC_CPU_have_SSE2();
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	simd_reported = simd_reported || VEC_CPU_have_ALTIVEC();
#endif

	if (!simd_reported) {
		for (int lane = 0; lane < 16; lane++) {
			if (vec1.generic[lane] > vec2.generic[lane]) {
				vec1.generic[lane] = UINT8_MAX;
			} else {
				vec1.generic[lane] = 0;
			}
		}
		return vec1;
	}

	VEC_GENERIC_CMPGT(, , 8, 16);
}

/*
 * Per-lane signed "greater than or equal" of two 16-lane vectors.
 *
 * If a compiled-in SSE2 or AltiVec check succeeds at run time, there is no
 * intrinsic code for this operation here and the comparison is delegated to
 * VEC_GENERIC_CMPGE (defined outside this section). Otherwise each lane of
 * vec1 is assigned UINT8_MAX when vec1 >= vec2 in that lane and 0 when it is
 * not, and vec1 is returned.
 */
VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpge(vint8x16 vec1, vint8x16 vec2)
{
	/* becomes nonzero as soon as one compiled-in SIMD check succeeds */
	int simd_reported = 0;

#if defined(VEC_COMPILER_HAS_SSE2)
	simd_reported = simd_reported || VEC_CPU_have_SSE2();
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	simd_reported = simd_reported || VEC_CPU_have_ALTIVEC();
#endif

	if (!simd_reported) {
		for (int lane = 0; lane < 16; lane++) {
			if (vec1.generic[lane] >= vec2.generic[lane]) {
				vec1.generic[lane] = UINT8_MAX;
			} else {
				vec1.generic[lane] = 0;
			}
		}
		return vec1;
	}

	VEC_GENERIC_CMPGE(, , 8, 16);
}

/* ----------------------------------------------------------------- */
/* bitwise NOT is just an XOR with UINT[BITS]_MAX */

/*
 * Generates v<sign>int<bits>x<size>_not(), which XORs `vec` with a vector
 * whose lanes all hold UINT<bits>_MAX cast to the lane's scalar type.
 * `sign` is either empty (signed) or `u` (unsigned).
 */
#define DEFINE_NOT_OPERATION(sign, bits, size) \
	VEC_FUNC_KEYWORDS v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
	{ \
		const sign##int##bits##_t all_bits_set = (sign##int##bits##_t)UINT##bits##_MAX; \
		const v##sign##int##bits##x##size mask = v##sign##int##bits##x##size##_splat(all_bits_set); \
		return v##sign##int##bits##x##size##_xor(vec, mask); \
	}

/* vint8x16_not and vuint8x16_not */
DEFINE_NOT_OPERATION(, 8, 16)
DEFINE_NOT_OPERATION(u, 8, 16)

/* the generator is only needed for the two definitions above */
#undef DEFINE_NOT_OPERATION

/* ---------------------------------------------------------------- */

/* cleanup: undefine the helper macro names listed below so that they are not
 * visible to code that includes this header. An #undef of a name that is not
 * currently defined has no effect.
 *
 * NOTE(review): several VEC_GENERIC_* macros invoked by the functions above
 * (for example VEC_GENERIC_MUL, VEC_GENERIC_DIV and VEC_GENERIC_CMPLE) are
 * not in this list, while VEC_GENERIC_DIVIDE is -- confirm whether the list
 * still matches the macros this header actually defines. */

/* declaration helpers */
#undef VEC_OPERATION_DECL
#undef VEC_OPERATION_THIS_DECL
#undef VEC_TWOWAY_DECL

/* per-operation declaration helpers */
#undef VEC_DECL_SPLAT
#undef VEC_DECL_LOAD
#undef VEC_DECL_STORE
#undef VEC_DECL_ADD
#undef VEC_DECL_SUB
#undef VEC_DECL_MUL
#undef VEC_DECL_DIV
#undef VEC_DECL_AND
#undef VEC_DECL_OR
#undef VEC_DECL_XOR
#undef VEC_DECL_AVG
#undef VEC_DECL_SHIFT
#undef VEC_DECL_NOT

/* comparison declaration helpers */
#undef VEC_DECL_CMPLT
#undef VEC_DECL_CMPGT
#undef VEC_DECL_CMPEQ
#undef VEC_DECL_CMPLE
#undef VEC_DECL_CMPGE

/* generic (non-intrinsic) implementation helpers */
#undef VEC_GENERIC_SPLAT
#undef VEC_GENERIC_DIVIDE
#undef VEC_GENERIC_SHIFT
#undef VEC_GENERIC_SHIFTS
#undef VEC_GENERIC_LSHIFT
#undef VEC_GENERIC_RSHIFT
#undef VEC_GENERIC_LRSHIFT
#undef VEC_GENERIC_AVG
#undef VEC_GENERIC_THAN_OR_EQUAL
#undef VEC_GENERIC_COMPARISON
#undef VEC_GENERIC_COMPARISONS

/* per-vector-type names; lane bits x lane count gives 128 bits in this group */
#undef VEC_VINT8X16
#undef VEC_VINT16X8
#undef VEC_VINT32X4
#undef VEC_VINT64X2
#undef VEC_VUINT8X16
#undef VEC_VUINT16X8
#undef VEC_VUINT32X4
#undef VEC_VUINT64X2

/* per-vector-type names; 256 bits in this group */
#undef VEC_VINT8X32
#undef VEC_VINT16X16
#undef VEC_VINT32X8
#undef VEC_VINT64X4
#undef VEC_VUINT8X32
#undef VEC_VUINT16X16
#undef VEC_VUINT32X8
#undef VEC_VUINT64X4

/* per-vector-type names; 512 bits in this group */
#undef VEC_VINT8X64
#undef VEC_VINT16X32
#undef VEC_VINT32X16
#undef VEC_VINT64X8
#undef VEC_VUINT8X64
#undef VEC_VUINT16X32
#undef VEC_VUINT32X16
#undef VEC_VUINT64X8

#endif /* VEC_VEC_H_ */
