/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_VEC_H_
#define VEC_VEC_H_

#include <stdint.h>
#include <string.h>
#include <limits.h>

/* Generic min / max / clamp helpers.
 * These are function-like macros, so an argument may be evaluated more than
 * once; do not pass expressions with side effects. */
#define VEC_MAX(a, b) (((b) < (a)) ? (a) : (b))
#define VEC_MIN(a, b) (((b) > (a)) ? (a) : (b))
#define VEC_CLAMP(x, min, max) VEC_MIN(VEC_MAX(x, min), max)

/* VEC_SEMVER_ATLEAST(a, b, c, x, y, z)
 * Nonzero when version a.b.c is greater than or equal to version x.y.z,
 * comparing major first, then minor, then patch. Usable inside #if.
 * Every parameter is parenthesized so compound expressions can be passed
 * without changing the meaning of the comparison. */
#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
	(((a) >= (x)) && \
	 ((a) > (x) || (b) >= (y)) && \
	 ((a) > (x) || (b) > (y) || (c) >= (z)))

/* Nonzero when the compiler reports a GNU C version of at least x.y.z.
 * Meant for #if expressions, where the __GNUC__ macros evaluate to 0 if the
 * compiler does not define them. */
#define VEC_GNUC_ATLEAST(x, y, z) \
	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)

/* GCC/clang attributes */
// Preferred path: ask the compiler directly which attributes it supports.
// VEC_ALIGNED(x) requests x-byte alignment for a declaration;
// VEC_COMPILER_HAS_GNUC_VECTORS records that the __vector_size__ attribute
// is available.
#if defined(__has_attribute)
# if __has_attribute(__aligned__)
#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
# endif
# if __has_attribute(__vector_size__)
#  define VEC_COMPILER_HAS_GNUC_VECTORS
# endif
#endif

// Fallback for compilers without __has_attribute: enable VEC_ALIGNED based on
// the reported GNU C version. If neither path defines it, VEC_ALIGNED stays
// undefined and code must test for it with #ifdef.
#ifndef VEC_ALIGNED
# if VEC_GNUC_ATLEAST(2, 7, 0)
#  define VEC_ALIGNED(x) __attribute__((__aligned__(x)))
# endif
#endif

/* VEC_STATIC_ASSERT(x, msg)
 * Compile-time assertion: compilation fails when the constant expression x
 * is zero. msg is only reported by the C11 variant. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
#else
// C99 static assertion: a bit-field of negative width is a constraint
// violation, so a false condition is rejected by the compiler. The enclosing
// extern function declaration generates no code and may be repeated.
// The bit-field name deliberately avoids the reserved `__' prefix.
# define VEC_STATIC_ASSERT(x, msg) \
	extern int (*vec_impl_Static_assert_function_(void)) \
		[!!sizeof (struct { int vec_impl_error_if_negative_: (x) ? 2 : -1; })]
#endif

/* VEC_ASSERT(x, msg)
 * Runtime assertion with an attached message. Users may predefine VEC_ASSERT
 * to supply their own handler, or define VEC_DISABLE_ASSERTIONS to compile
 * the checks out entirely.
 * Both parameters are parenthesized so that operators inside x or msg cannot
 * re-associate with the `&&' that joins them. */
#ifndef VEC_ASSERT
# ifndef VEC_DISABLE_ASSERTIONS
#  include <assert.h>
#  define VEC_ASSERT(x, msg) assert((msg) && (x))
# else
// expands to a harmless expression, mirroring assert() under NDEBUG
#  define VEC_ASSERT(x, msg) ((void)0)
# endif
#endif

/* --------------------------------------------------------------- */
/* Detect compiler SIMD support */

// Alignment values (compared against byte-sized quantities elsewhere in this
// header) associated with each backend.
#define VEC_GENERIC_ALIGNMENT 1
#define VEC_ALTIVEC_ALIGNMENT 16
#define VEC_SSE2_ALIGNMENT    16
#define VEC_AVX2_ALIGNMENT    32
#define VEC_AVX512F_ALIGNMENT 64

// Per-type alignment defaults. Each wider type is defined in terms of the
// type half its width, and the narrowest ones use the generic alignment.
// Because macros are expanded at the point of use, a wider type picks up any
// later redefinition of the narrower macro it refers to, unless the wider
// macro is itself redefined.

// for the generic implementation, 64-bit
#define VINT8x8_ALIGNMENT   VEC_GENERIC_ALIGNMENT
#define VINT16x4_ALIGNMENT  VEC_GENERIC_ALIGNMENT
#define VINT32x2_ALIGNMENT  VEC_GENERIC_ALIGNMENT
#define VUINT8x8_ALIGNMENT  VEC_GENERIC_ALIGNMENT
#define VUINT16x4_ALIGNMENT VEC_GENERIC_ALIGNMENT
#define VUINT32x2_ALIGNMENT VEC_GENERIC_ALIGNMENT

// 128-bit (the 64x2 types have no 64-bit-wide counterpart to inherit from)
#define VINT8x16_ALIGNMENT  VINT8x8_ALIGNMENT
#define VINT16x8_ALIGNMENT  VINT16x4_ALIGNMENT
#define VINT32x4_ALIGNMENT  VINT32x2_ALIGNMENT
#define VINT64x2_ALIGNMENT  VEC_GENERIC_ALIGNMENT
#define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT
#define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT
#define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT
#define VUINT64x2_ALIGNMENT VEC_GENERIC_ALIGNMENT

// 256-bit
#define VINT8x32_ALIGNMENT   VINT8x16_ALIGNMENT
#define VINT16x16_ALIGNMENT  VINT16x8_ALIGNMENT
#define VINT32x8_ALIGNMENT   VINT32x4_ALIGNMENT
#define VINT64x4_ALIGNMENT   VINT64x2_ALIGNMENT
#define VUINT8x32_ALIGNMENT  VUINT8x16_ALIGNMENT
#define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
#define VUINT32x8_ALIGNMENT  VUINT32x4_ALIGNMENT
#define VUINT64x4_ALIGNMENT  VUINT64x2_ALIGNMENT

// 512-bit
#define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
#define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
#define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
#define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
#define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
#define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
#define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
#define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT

#ifndef VEC_SUPPRESS_HW

// IIRC `__VEC__' is also defined, but I don't know for sure.
// IBM says that `__ALTIVEC__' is standard though.
//
// When the compiler targets AltiVec: include <altivec.h>, advertise the
// backend through VEC_COMPILER_HAS_ALTIVEC, and raise the alignment of every
// 128-bit vector type to at least VEC_ALTIVEC_ALIGNMENT.
#ifdef __ALTIVEC__
# include <altivec.h>
# define VEC_COMPILER_HAS_ALTIVEC
// VEC_COMPILER_HAS_ALTIVEC_VSX additionally requires POWER8 and VSX
# if defined(__POWER8__) && defined(__VSX__)
#  define VEC_COMPILER_HAS_ALTIVEC_VSX
# endif
// each alignment is only ever raised, never lowered
# if VINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
#endif

// MMX: include the intrinsics header and advertise the backend through
// VEC_COMPILER_HAS_MMX. No alignment macro is changed here.
#ifdef __MMX__
# include <mmintrin.h>
# define VEC_COMPILER_HAS_MMX
#endif

// SSE2: include <emmintrin.h>, advertise the backend through
// VEC_COMPILER_HAS_SSE2 (plus VEC_COMPILER_HAS_SSE41 when SSE4.1 is also
// enabled), and raise the alignment of every 128-bit vector type to at least
// VEC_SSE2_ALIGNMENT.
#ifdef __SSE2__
# include <emmintrin.h>
# define VEC_COMPILER_HAS_SSE2
# ifdef __SSE4_1__
#  define VEC_COMPILER_HAS_SSE41
# endif
// each alignment is only ever raised, never lowered
# if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
#endif

// AVX2: include <immintrin.h>, advertise the backend through
// VEC_COMPILER_HAS_AVX2, and raise the alignment of every 256-bit vector type
// to at least VEC_AVX2_ALIGNMENT.
#ifdef __AVX2__
# include <immintrin.h>
# define VEC_COMPILER_HAS_AVX2
// each alignment is only ever raised, never lowered
# if VINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT8x32_ALIGNMENT
#  define VINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT16x16_ALIGNMENT
#  define VINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT32x8_ALIGNMENT
#  define VINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT64x4_ALIGNMENT
#  define VINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT8x32_ALIGNMENT
#  define VUINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT16x16_ALIGNMENT
#  define VUINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT32x8_ALIGNMENT
#  define VUINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT64x4_ALIGNMENT
#  define VUINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
#endif

// AVX-512 Foundation: include <immintrin.h>, advertise the backend through
// VEC_COMPILER_HAS_AVX512F, and raise the alignment of every 512-bit vector
// type to at least VEC_AVX512F_ALIGNMENT.
#ifdef __AVX512F__
# include <immintrin.h>
# define VEC_COMPILER_HAS_AVX512F
// each alignment is only ever raised, never lowered
# if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT8x64_ALIGNMENT
#  define VINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT16x32_ALIGNMENT
#  define VINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT32x16_ALIGNMENT
#  define VINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT64x8_ALIGNMENT
#  define VINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT8x64_ALIGNMENT
#  define VUINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT16x32_ALIGNMENT
#  define VUINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT32x16_ALIGNMENT
#  define VUINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT64x8_ALIGNMENT
#  define VUINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
#endif

#endif

/* --------------------------------------------------------------- */
/* bit shift */

/* Logical right shift of an unsigned value.
 * Returns x shifted right by y bit positions; vacated high bits are zero.
 * As with any C shift, y must be smaller than the width of uintmax_t. */
inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y)
{
	const uintmax_t shifted = x >> y;

	return shifted;
}

/* Logical left shift of an unsigned value.
 * Returns x shifted left by y bit positions; bits shifted past the top are
 * discarded. As with any C shift, y must be smaller than the width of
 * uintmax_t. */
inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y)
{
	const uintmax_t shifted = x << y;

	return shifted;
}

/* Logical (zero-filling) right shift of a signed value.
 * The bits of x are reinterpreted as unsigned, shifted right by y positions,
 * and reinterpreted back as signed. y must be smaller than the width of
 * intmax_t.
 *
 * Fix: the shift result used to be discarded (`xx.u >> y;'), so the function
 * returned x unchanged; it is now stored back with `>>='. */
inline intmax_t vec_lrshift(intmax_t x, unsigned int y)
{
	// reinterpret as unsigned integer and then shift
	union {
		intmax_t d;
		uintmax_t u;
	} xx;

	xx.d = x;
	xx.u >>= y;
	return xx.d;
}

/* Logical left shift of a signed value.
 * The bits of x are reinterpreted as unsigned, shifted left by y positions,
 * and reinterpreted back as signed, which avoids the undefined behaviour of
 * left-shifting a negative signed integer. y must be smaller than the width
 * of intmax_t.
 *
 * Fix: the shift result used to be discarded (`xx.u << y;'), so the function
 * returned x unchanged; it is now stored back with `<<='. */
inline intmax_t vec_llshift(intmax_t x, unsigned int y)
{
	// reinterpret as unsigned integer and then shift
	union {
		intmax_t d;
		uintmax_t u;
	} xx;

	xx.d = x;
	xx.u <<= y;
	return xx.d;
}

/* Right shift of an unsigned value.
 * Returns x shifted right by y bit positions (for unsigned operands this is
 * the same operation as vec_ulrshift). y must be smaller than the width of
 * uintmax_t. */
inline uintmax_t vec_urshift(uintmax_t x, unsigned int y)
{
	x >>= y;

	return x;
}

/* Left shift of an unsigned value.
 * Returns x shifted left by y bit positions (for unsigned operands this is
 * the same operation as vec_ullshift). y must be smaller than the width of
 * uintmax_t. */
inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y)
{
	x <<= y;

	return x;
}

/**
 * Arithmetic shifts; based off code from OpenMPT, which is under
 * the Boost Software License:
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 * 
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
**/
/* Arithmetic (sign-preserving) right shift of a signed value, computed
 * entirely with unsigned arithmetic so it does not depend on how the
 * compiler implements `>>' for negative operands.
 * y must be smaller than the width of intmax_t. */
inline intmax_t vec_rshift(intmax_t x, unsigned int y)
{
	// a value with only the most significant (sign) bit set
	static const uintmax_t sign_bias = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);

	union {
		intmax_t d;
		uintmax_t u;
	} pun;

	pun.d = x;

	// Adding the bias flips the sign bit (on a two's complement
	// representation this maps the signed range onto the unsigned range in
	// order). The biased value is shifted logically, then the equally
	// shifted bias is subtracted again to undo the offset.
	pun.u = ((pun.u + sign_bias) >> y) - (sign_bias >> y);

	return pun.d;
}

/* Left shift of a signed value, computed with unsigned arithmetic using the
 * same bias / shift / un-bias sequence as vec_rshift, so no signed operand
 * is ever shifted. Unsigned overflow wraps around.
 * y must be smaller than the width of intmax_t. */
inline intmax_t vec_lshift(intmax_t x, unsigned int y)
{
	// a value with only the most significant (sign) bit set
	static const uintmax_t sign_bias = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1);

	union {
		intmax_t d;
		uintmax_t u;
	} pun;

	pun.d = x;

	// bias, shift, then remove the (equally shifted) bias again
	pun.u = ((pun.u + sign_bias) << y) - (sign_bias << y);

	return pun.d;
}

// The shift helpers above are plain `inline' definitions, which on their own
// do not provide an external definition in C99. When VEC_IMPLEMENTATION is
// defined, these `extern inline' declarations make this translation unit
// supply the external definitions that out-of-line calls link against.
// NOTE(review): presumably VEC_IMPLEMENTATION is meant to be defined in
// exactly one translation unit -- confirm against the project documentation.
#ifdef VEC_IMPLEMENTATION
extern inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y);
extern inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y);
extern inline intmax_t vec_lrshift(intmax_t x, unsigned int y);
extern inline intmax_t vec_llshift(intmax_t x, unsigned int y);
extern inline uintmax_t vec_urshift(uintmax_t x, unsigned int y);
extern inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y);
extern inline intmax_t vec_rshift(intmax_t x, unsigned int y);
extern inline intmax_t vec_lshift(intmax_t x, unsigned int y);
#endif

/* --------------------------------------------------------------- */
/* Array alignment macros */

/* Array alignment helpers.
 *
 * VEC_ALIGNED_ARRAY(type, var, length, align)
 *     Declares `var' as storage for `length' elements of `type', aligned to
 *     `align' bytes.
 * VEC_ALIGNED_ARRAY_SIZEOF(var, align)
 *     Size in bytes of the storage declared for `var' (length * sizeof(type)).
 * VEC_ALIGNED_ARRAY_LENGTH(var, align)
 *     Number of elements declared for `var'.
 *
 * The alignment must be specified in bytes, must be a power of two, and must
 * be a multiple of the type size. It is always assumed that the type will be
 * on a boundary of its size, which may or may not be true.
 *
 * When the compiler has no alignment attribute, the fallback over-allocates a
 * plain array and makes `var' a pointer to the first suitably aligned element
 * inside it; in that case `var' is a pointer, not an array, which is why the
 * SIZEOF/LENGTH macros must be used instead of sizeof(var). */
#ifdef VEC_ALIGNED
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_ALIGNED(align) type var[length]
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(var))
#else
/* The mask is built in uintptr_t so that a narrower unsigned `align' cannot
 * clear the upper bits of the address. */
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_STATIC_ASSERT((align) && (((align) & ((align) - 1)) == 0), "vec: alignment must be a power of two"); \
	type vec_##var##_unaligned_[(length) + ((align) / sizeof(type))]; \
	type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + ((align) - 1)) & ~(uintptr_t)((align) - 1)); \
	VEC_ASSERT(((uintptr_t)var) % (align) == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
/* Subtract exactly the padding elements added above, so the result matches
 * the attribute-based variant (length * sizeof(type)). */
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(vec_##var##_unaligned_) - (((align) / sizeof(*(var))) * sizeof(*(var))))
#endif

/* Takes the alignment as well, matching both VEC_ALIGNED_ARRAY_SIZEOF and the
 * per-type *_ALIGNED_ARRAY_LENGTH wrappers that pass it. */
#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*(var)))

// ------------------------------------------------------------
// predefined variants for each vector type

/* 64-bit vector types. For each type T:
 *   T_ALIGNED_ARRAY(var)         declares `var' as aligned storage holding
 *                                one vector's worth of elements
 *   T_ALIGNED_ARRAY_SIZEOF(var)  size of that storage, via
 *                                VEC_ALIGNED_ARRAY_SIZEOF
 *   T_ALIGNED_ARRAY_LENGTH(var)  element count of that storage, via
 *                                VEC_ALIGNED_ARRAY_LENGTH
 *   T_PTR_ALIGNED(ptr)           nonzero when the address `ptr' is a multiple
 *                                of T_ALIGNMENT
 * `ptr' is parenthesized before the cast so that expressions such as `p + n'
 * are converted as a whole rather than having n added as a byte offset. */
#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 8, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT)
#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x8_ALIGNMENT == 0)

#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 4, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT)
#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x4_ALIGNMENT == 0)

#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 2, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT)
#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x2_ALIGNMENT == 0)

#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 8, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT)
#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x8_ALIGNMENT == 0)

#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 4, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT)
#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x4_ALIGNMENT == 0)

#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 2, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT)
#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x2_ALIGNMENT == 0)

/* 128-bit vector types. For each type T:
 *   T_ALIGNED_ARRAY(var)         declares `var' as aligned storage holding
 *                                one vector's worth of elements
 *   T_ALIGNED_ARRAY_SIZEOF(var)  size of that storage, via
 *                                VEC_ALIGNED_ARRAY_SIZEOF
 *   T_ALIGNED_ARRAY_LENGTH(var)  element count of that storage, via
 *                                VEC_ALIGNED_ARRAY_LENGTH
 *   T_PTR_ALIGNED(ptr)           nonzero when the address `ptr' is a multiple
 *                                of T_ALIGNMENT
 * `ptr' is parenthesized before the cast so that expressions such as `p + n'
 * are converted as a whole rather than having n added as a byte offset. */
#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT)
#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x16_ALIGNMENT == 0)

#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT)
#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x8_ALIGNMENT == 0)

#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT)
#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x4_ALIGNMENT == 0)

#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT)
#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x2_ALIGNMENT == 0)

#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT)
#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x16_ALIGNMENT == 0)

#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT)
#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x8_ALIGNMENT == 0)

#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT)
#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x4_ALIGNMENT == 0)

#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT)
#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x2_ALIGNMENT == 0)

/* 256-bit vector types. For each type T:
 *   T_ALIGNED_ARRAY(var)         declares `var' as aligned storage holding
 *                                one vector's worth of elements
 *   T_ALIGNED_ARRAY_SIZEOF(var)  size of that storage, via
 *                                VEC_ALIGNED_ARRAY_SIZEOF
 *   T_ALIGNED_ARRAY_LENGTH(var)  element count of that storage, via
 *                                VEC_ALIGNED_ARRAY_LENGTH
 *   T_PTR_ALIGNED(ptr)           nonzero when the address `ptr' is a multiple
 *                                of T_ALIGNMENT
 * `ptr' is parenthesized before the cast so that expressions such as `p + n'
 * are converted as a whole rather than having n added as a byte offset. */
#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT)
#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x32_ALIGNMENT == 0)

#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT)
#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x16_ALIGNMENT == 0)

#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT)
#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x8_ALIGNMENT == 0)

#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT)
#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x4_ALIGNMENT == 0)

#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT)
#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x32_ALIGNMENT == 0)

#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT)
#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x16_ALIGNMENT == 0)

#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT)
#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x8_ALIGNMENT == 0)

#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT)
#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x4_ALIGNMENT == 0)

/* 512-bit vector types. For each type T:
 *   T_ALIGNED_ARRAY(var)         declares `var' as aligned storage holding
 *                                one vector's worth of elements
 *   T_ALIGNED_ARRAY_SIZEOF(var)  size of that storage, via
 *                                VEC_ALIGNED_ARRAY_SIZEOF
 *   T_ALIGNED_ARRAY_LENGTH(var)  element count of that storage, via
 *                                VEC_ALIGNED_ARRAY_LENGTH
 *   T_PTR_ALIGNED(ptr)           nonzero when the address `ptr' is a multiple
 *                                of T_ALIGNMENT
 * `ptr' is parenthesized before the cast so that expressions such as `p + n'
 * are converted as a whole rather than having n added as a byte offset.
 * The 16x32 pointer checks test against their own 16x32 alignment (they
 * previously referenced the 16x16 alignment by mistake). */
#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT)
#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x64_ALIGNMENT == 0)

#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT)
#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x32_ALIGNMENT == 0)

#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT)
#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x16_ALIGNMENT == 0)

#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT)
#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x8_ALIGNMENT == 0)

#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 64, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT)
#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x64_ALIGNMENT == 0)

#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT)
#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x32_ALIGNMENT == 0)

#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT)
#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x16_ALIGNMENT == 0)

#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT)
#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x8_ALIGNMENT == 0)

/* --------------------------------------------------------------- */
/* Defines the structures for each vector type */

// 64-bit
// Each 64-bit vector is a union that overlays the MMX register type (only
// when VEC_COMPILER_HAS_MMX is defined) with a plain array of elements used
// by the generic code path.

// 8 unsigned 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif

	uint8_t generic[8];
} vuint8x8;

// 4 unsigned 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif

	uint16_t generic[4];
} vuint16x4;

// 2 unsigned 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif

	uint32_t generic[2];
} vuint32x2;

// 8 signed 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif

	int8_t generic[8];
} vint8x8;

// 4 signed 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif

	int16_t generic[4];
} vint16x4;

// 2 signed 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif

	int32_t generic[2];
} vint32x2;

// 128-bit
// Each 128-bit vector is a union that overlays the SSE2 and/or AltiVec
// register types (when the matching VEC_COMPILER_HAS_* macro is defined) with
// a generic representation. For 8/16/32-bit lanes the generic member is a
// pair of the corresponding 64-bit vectors; the 64-bit lane types use a plain
// two-element array, and their AltiVec member requires
// VEC_COMPILER_HAS_ALTIVEC_VSX.

// 16 unsigned 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned char altivec;
#endif
	vuint8x8 generic[2];
} vuint8x16;

// 8 unsigned 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned short altivec;
#endif
	vuint16x4 generic[2];
} vuint16x8;

// 4 unsigned 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned int altivec;
#endif
	vuint32x2 generic[2];
} vuint32x4;

// 2 unsigned 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
	vector unsigned long long altivec;
#endif
	uint64_t generic[2];
} vuint64x2;

// 16 signed 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed char altivec;
#endif
	vint8x8 generic[2];
} vint8x16;

// 8 signed 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed short altivec;
#endif
	vint16x4 generic[2];
} vint16x8;

// 4 signed 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed int altivec;
#endif
	vint32x2 generic[2];
} vint32x4;

// 2 signed 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
	vector signed long long altivec;
#endif
	int64_t generic[2];
} vint64x2;

// 256-bit
// Each 256-bit vector is a union that overlays the AVX2 register type (only
// when VEC_COMPILER_HAS_AVX2 is defined) with a generic representation made
// of two of the corresponding 128-bit vectors.

// 32 unsigned 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vuint8x16 generic[2];
} vuint8x32;

// 16 unsigned 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vuint16x8 generic[2];
} vuint16x16;

// 8 unsigned 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vuint32x4 generic[2];
} vuint32x8;

// 4 unsigned 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vuint64x2 generic[2];
} vuint64x4;

// 32 signed 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vint8x16 generic[2];
} vint8x32;

// 16 signed 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vint16x8 generic[2];
} vint16x16;

// 8 signed 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vint32x4 generic[2];
} vint32x8;

// 4 signed 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2;
#endif
	vint64x2 generic[2];
} vint64x4;

// 512-bit
// Each 512-bit vector is a union that overlays the AVX-512 register type
// (only when VEC_COMPILER_HAS_AVX512F is defined) with a generic
// representation made of two of the corresponding 256-bit vectors.

// 64 unsigned 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vuint8x32 generic[2];
} vuint8x64;

// 32 unsigned 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vuint16x16 generic[2];
} vuint16x32;

// 16 unsigned 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vuint32x8 generic[2];
} vuint32x16;

// 8 unsigned 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vuint64x4 generic[2];
} vuint64x8;

// 64 signed 8-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vint8x32 generic[2];
} vint8x64;

// 32 signed 16-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vint16x16 generic[2];
} vint16x32;

// 16 signed 32-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vint32x8 generic[2];
} vint32x16;

// 8 signed 64-bit lanes
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f;
#endif
	vint64x4 generic[2];
} vint64x8;

// ---------------------------------------------------------------------------------
// function declarations

// Library initialization entry point; takes no arguments and returns an int.
// NOTE(review): the definition is not visible in this part of the file --
// presumably it selects the backend implementations at runtime and the
// return value is a status code; confirm against the definition.
int vec_init(void);

// VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size)
// Emits the prototypes of the public operations for one vector type
// v<sign>int<bits>x<size>, whose element type is <sign>int<bits>_t:
//   splat                       build a vector from a single scalar
//   load_aligned / load         build a vector from an array of `size' elements
//   store_aligned / store       write a vector out to an array of `size' elements
//   add, sub, mul, div, avg     element arithmetic on two vectors
//   and, or, xor                bitwise operations on two vectors
//   cmplt, cmple, cmpeq,
//   cmpge, cmpgt                comparisons, returning a vector of the same type
//   lshift, rshift, lrshift     shifts; the second operand is always the
//                               unsigned vector type of the same shape
// `sign' is either empty (signed types) or `u' (unsigned types).
#define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]); \
	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);

// Declares the operations for both the signed (empty `sign' argument) and
// unsigned (`u') variant of a given lane width and lane count.
#define VEC_DECLARE_OPERATIONS(bits, size) \
	VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \
	VEC_DECLARE_OPERATIONS_SIGN(u, bits, size)

// 64-bit
VEC_DECLARE_OPERATIONS(8, 8)
VEC_DECLARE_OPERATIONS(16, 4)
VEC_DECLARE_OPERATIONS(32, 2)

// 128-bit
VEC_DECLARE_OPERATIONS(8, 16)
VEC_DECLARE_OPERATIONS(16, 8)
VEC_DECLARE_OPERATIONS(32, 4)
VEC_DECLARE_OPERATIONS(64, 2)

// 256-bit
VEC_DECLARE_OPERATIONS(8, 32)
VEC_DECLARE_OPERATIONS(16, 16)
VEC_DECLARE_OPERATIONS(32, 8)
VEC_DECLARE_OPERATIONS(64, 4)

// 512-bit
VEC_DECLARE_OPERATIONS(8, 64)
VEC_DECLARE_OPERATIONS(16, 32)
VEC_DECLARE_OPERATIONS(32, 16)
VEC_DECLARE_OPERATIONS(64, 8)

// the helper macros are only needed to generate the prototypes above
#undef VEC_DECLARE_OPERATIONS
#undef VEC_DECLARE_OPERATIONS_SIGN

// ---------------------------------------------------------------------------------
// okay, now we can actually implement the functions

#ifdef VEC_IMPLEMENTATION

// Fallback functions, need to be defined before everything else.
#include "impl/fallback.h"

// Per-backend dispatch table for one vector type: one function pointer per
// operation. Each supported backend fills in a table of this type.
//
// The public wrappers further down test each pointer before calling it. A
// NULL entry means "not provided by this backend", in which case the wrapper
// uses the matching _fallback_ function. load_aligned and store_aligned are
// the exception: the wrappers assert that they are present.
//
// The three shift operations take their second operand as the unsigned vector
// type of the same shape (vuint<bits>x<size>), for signed vectors too.
//
// NOTE(review): the members named and/or/xor/not are macros once <iso646.h>
// is included and are alternative operator tokens in C++, so this section
// cannot be compiled in either situation.
#define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \
	typedef struct { \
		v##sign##int##bits##x##size (*splat)(sign##int##bits##_t x); \
		v##sign##int##bits##x##size (*load_aligned)(const sign##int##bits##_t in[size]); \
		v##sign##int##bits##x##size (*load)(const sign##int##bits##_t in[size]); \
		void (*store_aligned)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
		void (*store)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \
		v##sign##int##bits##x##size (*add)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*sub)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*mul)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*div)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*avg)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*and)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*or)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*xor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*not)(v##sign##int##bits##x##size vec); \
		v##sign##int##bits##x##size (*cmplt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmple)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmpeq)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmpge)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmpgt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
		v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
		v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	} v##sign##int##bits##x##size##_impl;

// Defines the dispatch-table struct type for both the signed
// (v int<bits>x<size>_impl, empty `sign`) and the unsigned
// (vuint<bits>x<size>_impl) variant of one vector shape.
#define VEC_DEFINE_IMPL_STRUCT(bits, size) \
	VEC_DEFINE_IMPL_STRUCT_SIGN( , bits, size) \
	VEC_DEFINE_IMPL_STRUCT_SIGN(u, bits, size)

// 64-bit
VEC_DEFINE_IMPL_STRUCT(8, 8)
VEC_DEFINE_IMPL_STRUCT(16, 4)
VEC_DEFINE_IMPL_STRUCT(32, 2)

// 128-bit
VEC_DEFINE_IMPL_STRUCT(8, 16)
VEC_DEFINE_IMPL_STRUCT(16, 8)
VEC_DEFINE_IMPL_STRUCT(32, 4)
VEC_DEFINE_IMPL_STRUCT(64, 2)

// 256-bit
VEC_DEFINE_IMPL_STRUCT(8, 32)
VEC_DEFINE_IMPL_STRUCT(16, 16)
VEC_DEFINE_IMPL_STRUCT(32, 8)
VEC_DEFINE_IMPL_STRUCT(64, 4)

// 512-bit
VEC_DEFINE_IMPL_STRUCT(8, 64)
VEC_DEFINE_IMPL_STRUCT(16, 32)
VEC_DEFINE_IMPL_STRUCT(32, 16)
VEC_DEFINE_IMPL_STRUCT(64, 8)

// Only the generated *_impl typedefs are needed from here on; the generator
// macros themselves are removed again.
#undef VEC_DEFINE_IMPL_STRUCT
#undef VEC_DEFINE_IMPL_STRUCT_SIGN

// ------------------------------------------------------------------------

#ifdef VEC_COMPILER_HAS_ALTIVEC
# include "impl/ppc/altivec.h"
#endif

#ifdef VEC_COMPILER_HAS_AVX512F
# include "impl/x86/avx512f.h"
#endif

#ifdef VEC_COMPILER_HAS_AVX2
# include "impl/x86/avx2.h"
#endif

#ifdef VEC_COMPILER_HAS_SSE2
# include "impl/x86/sse2.h"
#endif

// depends on SSE2 functions; the only thing SSE4.1 provides for us
// is a native 32-bit multiply
#ifdef VEC_COMPILER_HAS_SSE41
# include "impl/x86/sse41.h"
#endif

#ifdef VEC_COMPILER_HAS_MMX
# include "impl/x86/mmx.h"
#endif

#include "impl/generic.h"

/* ---------------------------------------------------------------- */

#include "impl/cpu.h" // CPU detection crap

// Active dispatch table for every vector type. Each v[u]int<bits>x<size>_impl_cpu
// pointer starts out at the generic implementation of its type; vec_init()
// repoints it when a compiled-in backend is usable on the running CPU.
#define VEC_DEFINE_IMPL_CPU_POINTERS(bits, size) \
	static vint##bits##x##size##_impl *vint##bits##x##size##_impl_cpu = &vint##bits##x##size##_impl_generic; \
	static vuint##bits##x##size##_impl *vuint##bits##x##size##_impl_cpu = &vuint##bits##x##size##_impl_generic;

// 64-bit
VEC_DEFINE_IMPL_CPU_POINTERS(8, 8)
VEC_DEFINE_IMPL_CPU_POINTERS(16, 4)
VEC_DEFINE_IMPL_CPU_POINTERS(32, 2)

// 128-bit
VEC_DEFINE_IMPL_CPU_POINTERS(8, 16)
VEC_DEFINE_IMPL_CPU_POINTERS(16, 8)
VEC_DEFINE_IMPL_CPU_POINTERS(32, 4)
VEC_DEFINE_IMPL_CPU_POINTERS(64, 2)

// 256-bit
VEC_DEFINE_IMPL_CPU_POINTERS(8, 32)
VEC_DEFINE_IMPL_CPU_POINTERS(16, 16)
VEC_DEFINE_IMPL_CPU_POINTERS(32, 8)
VEC_DEFINE_IMPL_CPU_POINTERS(64, 4)

// 512-bit
VEC_DEFINE_IMPL_CPU_POINTERS(8, 64)
VEC_DEFINE_IMPL_CPU_POINTERS(16, 32)
VEC_DEFINE_IMPL_CPU_POINTERS(32, 16)
VEC_DEFINE_IMPL_CPU_POINTERS(64, 8)

#undef VEC_DEFINE_IMPL_CPU_POINTERS

/*
 * vec_init - select the backend used by every vector type.
 *
 * Calls vec_get_CPU_features() and then, for each backend that was compiled
 * in (VEC_COMPILER_HAS_*) and whose vec_CPU_have_*() check succeeds, repoints
 * the matching *_impl_cpu dispatch tables at that backend. Tables that no
 * backend claims keep pointing at the generic implementation they were
 * initialized with.
 *
 * This function is NOT thread safe. However, once vec is initialized, all of
 * the vector functions are thread-safe.
 *
 * Calling vec_init() is optional: without it every operation goes through
 * the generic implementation, which performs no explicit vectorization
 * (the compiler may still auto-vectorize it).
 *
 * Returns 0. There is currently no failure path; the value exists so that
 * callers which test the declared int result read a defined value.
 */
int vec_init(void)
{
	vec_get_CPU_features();

#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (vec_CPU_have_ALTIVEC()) {
		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
		// the 64x2 AltiVec tables are only selected when VSX is available
		if (vec_CPU_have_ALTIVEC_VSX()) {
			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
		}
#endif
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
	if (vec_CPU_have_AVX512F()) {
		vint8x64_impl_cpu  = &vint8x64_impl_avx512f;
		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
		vint64x8_impl_cpu  = &vint64x8_impl_avx512f;
		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX2
	if (vec_CPU_have_AVX2()) {
		vint8x32_impl_cpu  = &vint8x32_impl_avx2;
		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
		vint32x8_impl_cpu  = &vint32x8_impl_avx2;
		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
		vint64x4_impl_cpu  = &vint64x4_impl_avx2;
		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE2
	if (vec_CPU_have_SSE2()) {
		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
		// 32x4 is the only shape with a dedicated SSE4.1 table; use it when
		// possible, otherwise stay on the plain SSE2 one.
# ifdef VEC_COMPILER_HAS_SSE41
		if (vec_CPU_have_SSE41()) {
			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
		} else
# endif
		{
			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
		}
		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
	}
#endif
#ifdef VEC_COMPILER_HAS_MMX
	if (vec_CPU_have_MMX()) {
		vint8x8_impl_cpu  = &vint8x8_impl_mmx;
		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
	}
#endif

	// anything not overridden above is still set to the generic tables
	return 0;
}

/* ---------------------------------------------------------------- */

// Defines the public entry points for one vector type
// (v<sign>int<bits>x<size>_<op>). Every entry point dispatches through the
// active table v<sign>int<bits>x<size>_impl_cpu:
//
//  - if the table provides the operation, the backend's result is returned
//    directly;
//  - otherwise the matching v<sign>int<bits>x<size>_fallback_<op> is used.
//
// load_aligned and store_aligned are not given a fallback here: a table that
// leaves them NULL trips VEC_ASSERT (load_aligned then returns a
// zero-initialized vector so the function still returns a value).
//
// The backend call must be a `return`: without it the backend result is
// discarded and the fallback is evaluated and returned unconditionally.
#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->splat) \
			return v##sign##int##bits##x##size##_impl_cpu->splat(x); \
	\
		return v##sign##int##bits##x##size##_fallback_splat(x); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \
			return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \
	\
		VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \
		return (v##sign##int##bits##x##size){0}; \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->load) \
			return v##sign##int##bits##x##size##_impl_cpu->load(in); \
	\
		return v##sign##int##bits##x##size##_fallback_load(in); \
	} \
	\
	void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \
			v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \
			return; \
		} \
	\
		VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \
	} \
	\
	void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->store) { \
			v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \
			return; \
		} \
	\
		v##sign##int##bits##x##size##_fallback_store(vec, out); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->add) \
			return v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_add(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->sub) \
			return v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_sub(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->mul) \
			return v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_mul(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->div) \
			return v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_div(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->avg) \
			return v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_avg(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->and) \
			return v##sign##int##bits##x##size##_impl_cpu->and(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->or) \
			return v##sign##int##bits##x##size##_impl_cpu->or(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->xor) \
			return v##sign##int##bits##x##size##_impl_cpu->xor(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->not) \
			return v##sign##int##bits##x##size##_impl_cpu->not(vec); \
	\
		return v##sign##int##bits##x##size##_fallback_not(vec); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->cmplt) \
			return v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_cmplt(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->cmple) \
			return v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_cmple(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->cmpeq) \
			return v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_cmpeq(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->cmpge) \
			return v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_cmpge(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->cmpgt) \
			return v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_cmpgt(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->lshift) \
			return v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_lshift(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->rshift) \
			return v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_rshift(vec1, vec2); \
	} \
	\
	v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		if (v##sign##int##bits##x##size##_impl_cpu->lrshift) \
			return v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \
	\
		return v##sign##int##bits##x##size##_fallback_lrshift(vec1, vec2); \
	}

// Emits the public entry-point definitions for both the signed (empty `sign`)
// and the unsigned (`u`) vector type of one shape.
#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)

// 64-bit
VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)

// 128-bit
VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

// 256-bit
VEC_DEFINE_OPERATIONS(8, 32)
VEC_DEFINE_OPERATIONS(16, 16)
VEC_DEFINE_OPERATIONS(32, 8)
VEC_DEFINE_OPERATIONS(64, 4)

// 512-bit
VEC_DEFINE_OPERATIONS(8, 64)
VEC_DEFINE_OPERATIONS(16, 32)
VEC_DEFINE_OPERATIONS(32, 16)
VEC_DEFINE_OPERATIONS(64, 8)

// The generator macros have served their purpose; remove them so they do not
// leak out of the implementation section.
#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OPERATIONS_SIGN

#endif /* VEC_IMPLEMENTATION */

#endif /* VEC_VEC_H_ */
