/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024-2025 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_VEC_H_
#define VEC_VEC_H_

#ifdef __cplusplus
extern "C" {
#endif


/* Fixed-width integer aliases used by the rest of this header.
 *
 * Selection order:
 *   1. VEC_HAVE_IMPL_INTEGER_H: the project supplies "impl/integer.h".
 *   2. C++11 or newer: aliases of the <cstdint>/<cstddef> types.
 *   3. C99 or newer: aliases of the <stdint.h>/<stddef.h> types.
 * Anything else is a hard error. */
#if defined(VEC_HAVE_IMPL_INTEGER_H)
# include "impl/integer.h"
#elif defined(__cplusplus) && (__cplusplus >= 201103L)
# include <cstddef>
# include <cstdint>
typedef std::int8_t    vec_int8;
typedef std::int16_t   vec_int16;
typedef std::int32_t   vec_int32;
typedef std::int64_t   vec_int64;
typedef std::intmax_t  vec_intmax;

typedef std::uint8_t   vec_uint8;
typedef std::uint16_t  vec_uint16;
typedef std::uint32_t  vec_uint32;
typedef std::uint64_t  vec_uint64;
typedef std::uintmax_t vec_uintmax;
typedef std::uintptr_t vec_uintptr;

typedef std::size_t    vec_uintsize;
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
# include <stddef.h>
# include <stdint.h>
typedef int8_t    vec_int8;
typedef int16_t   vec_int16;
typedef int32_t   vec_int32;
typedef int64_t   vec_int64;
typedef intmax_t  vec_intmax;

typedef uint8_t   vec_uint8;
typedef uint16_t  vec_uint16;
typedef uint32_t  vec_uint32;
typedef uint64_t  vec_uint64;
typedef uintmax_t vec_uintmax;
typedef uintptr_t vec_uintptr;

typedef size_t    vec_uintsize;
#else
# error Unable to find integer types with known size.
#endif

#include <string.h>
#include <stdlib.h>

/* VEC_SEMVER_ATLEAST(a, b, c, x, y, z)
 *
 * Evaluates to nonzero when version a.b.c is greater than or equal to
 * version x.y.z, comparing major first, then minor, then patch.
 * Every argument is parenthesized at each use so that expression
 * arguments (e.g. `7 & 2`) are compared as a whole. The result is a plain
 * integer expression, so it can be used in `#if` as well as in code. */
#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
	(((a) >= (x)) && \
	 ((a) > (x) || (b) >= (y)) && \
	 ((a) > (x) || (b) > (y) || (c) >= (z)))

/* VEC_GNUC_ATLEAST(x, y, z): nonzero when compiling with a GNU-compatible
 * compiler whose reported version is at least x.y.z; always 0 otherwise. */
#if !defined(__GNUC__)
# define VEC_GNUC_ATLEAST(x, y, z) (0)
#else
# define VEC_GNUC_ATLEAST(x, y, z) \
	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)
#endif

/* GCC/clang attributes */
/* VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch): asks the compiler directly
 * via __has_attribute when that is available; otherwise falls back to a
 * GNU version check against major.minor.patch. */
#if !defined(__has_attribute)
# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch)
#else
# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x)
#endif

/* VEC_STATIC_ASSERT(x, msg): compile-time assertion.
 *
 * Uses static_assert (C++11 / C23) or _Static_assert (C11) when the language
 * provides one. Otherwise it declares an extern function whose return type
 * contains a struct with a bit-field of width -1 when `x` is false, which
 * the compiler must reject; `msg` is unused in that fallback. */
#if (defined(__cplusplus) && (__cplusplus >= 201103L)) \
	|| (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L))
# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg)
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
#else
# define VEC_STATIC_ASSERT(x, msg) \
	extern int (*vec_impl_Static_assert_function_(void)) \
		[!!sizeof (struct { int vec_impl_error_if_negative_: (x) ? 2 : -1; })]
#endif

/* VEC_ASSERT(x, msg): runtime assertion with a message.
 *
 * Users may pre-define VEC_ASSERT to plug in their own handler. Otherwise:
 *  - by default it forwards to assert(); `msg` is and-ed into the condition
 *    so that it shows up in the failed-assertion text;
 *  - with VEC_DISABLE_ASSERTIONS defined it expands to a no-op expression
 *    and `x` is not evaluated.
 * Both arguments are parenthesized so that any expression can be passed. */
#ifndef VEC_ASSERT
# ifndef VEC_DISABLE_ASSERTIONS
#  include <assert.h>
#  define VEC_ASSERT(x, msg) assert((msg) && (x))
# else
#  define VEC_ASSERT(x, msg) ((void)0)
# endif
#endif

/* VEC_ALWAYS_INLINE: the GNU always_inline attribute when the compiler
 * reports support for it, nothing otherwise. */
#if !VEC_GNUC_HAS_ATTRIBUTE(__always_inline__, 4, 0, 0)
# define VEC_ALWAYS_INLINE
#else
# define VEC_ALWAYS_INLINE __attribute__((__always_inline__))
#endif

/* Prefix placed in front of every function defined in this header. */
#define VEC_FUNC_IMPL static inline VEC_ALWAYS_INLINE

/* --------------------------------------------------------------- */
/* Get maximum value for type */

/* Nonzero when integer type `t` is signed: -1 converted to `t` stays below
 * zero only for signed types. */
#define VEC_TYPE_SIGNED(t) (((t)0) > ((t)(-1)))

/* Builds the maximum value of `t` as an unsigned long long: all bits below
 * the top bit are set, then TOPBIT (a 4-bit pattern) is or-ed into the
 * highest nibble. Assumes 8 bits per byte. */
#define VEC_MAX_EX(t, TOPBIT) \
	(((TOPBIT) << ((sizeof(t) * 8ULL) - 4ULL)) | \
	 ((1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL))

/* Top nibble 0x7 leaves the sign bit clear; 0xF sets every bit. */
#define VEC_MAX_OF_SIGNED(t) VEC_MAX_EX(t, 0x7ULL)
#define VEC_MAX_OF_UNSIGNED(t) VEC_MAX_EX(t, 0xFULL)

/* Maximum representable value of integer type `t`, as unsigned long long. */
#define VEC_MAX_OF_TYPE(t) \
	((unsigned long long)(!VEC_TYPE_SIGNED(t) \
		? VEC_MAX_OF_UNSIGNED(t) \
		: VEC_MAX_OF_SIGNED(t)))

/* --------------------------------------------------------------- */
/* Detect compiler SIMD support */

/* Byte alignments that the per-type *_ALIGNMENT macros below are raised to
 * when the matching instruction set is detected later in this file. */
#define VEC_NEON_ALIGNMENT    16
#define VEC_ALTIVEC_ALIGNMENT 16
#define VEC_SSE2_ALIGNMENT    16
#define VEC_AVX2_ALIGNMENT    32
#define VEC_AVX512F_ALIGNMENT 64

// for the generic implementation
/* Baseline per-type alignments, in bytes. Every wider type is defined in
 * terms of the type with half as many lanes. Because macros are expanded at
 * the point of use, redefining a narrower type's alignment also changes
 * every wider type that still refers to it. */

/* 16-bit vectors */
#define VINT8x2_ALIGNMENT   1
#define VUINT8x2_ALIGNMENT  1

/* 32-bit vectors */
#define VINT8x4_ALIGNMENT   VINT8x2_ALIGNMENT
#define VINT16x2_ALIGNMENT  2
#define VUINT8x4_ALIGNMENT  VUINT8x2_ALIGNMENT
#define VUINT16x2_ALIGNMENT 2

/* 64-bit vectors */
#define VINT8x8_ALIGNMENT   VINT8x4_ALIGNMENT
#define VINT16x4_ALIGNMENT  VINT16x2_ALIGNMENT
#define VINT32x2_ALIGNMENT  4
#define VUINT8x8_ALIGNMENT  VUINT8x4_ALIGNMENT
#define VUINT16x4_ALIGNMENT VUINT16x2_ALIGNMENT
#define VUINT32x2_ALIGNMENT 4

/* 128-bit vectors */
#define VINT8x16_ALIGNMENT  VINT8x8_ALIGNMENT
#define VINT16x8_ALIGNMENT  VINT16x4_ALIGNMENT
#define VINT32x4_ALIGNMENT  VINT32x2_ALIGNMENT
#define VINT64x2_ALIGNMENT  8
#define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT
#define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT
#define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT
#define VUINT64x2_ALIGNMENT 8

/* 256-bit vectors */
#define VINT8x32_ALIGNMENT   VINT8x16_ALIGNMENT
#define VINT16x16_ALIGNMENT  VINT16x8_ALIGNMENT
#define VINT32x8_ALIGNMENT   VINT32x4_ALIGNMENT
#define VINT64x4_ALIGNMENT   VINT64x2_ALIGNMENT
#define VUINT8x32_ALIGNMENT  VUINT8x16_ALIGNMENT
#define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
#define VUINT32x8_ALIGNMENT  VUINT32x4_ALIGNMENT
#define VUINT64x4_ALIGNMENT  VUINT64x2_ALIGNMENT

/* 512-bit vectors */
#define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
#define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
#define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
#define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
#define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
#define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
#define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
#define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT

#ifndef VEC_SUPPRESS_HW

// IIRC `__VEC__' is also defined, but I don't know for sure.
// IBM says that `__ALTIVEC__' is standard though.
/* AltiVec detection: pulls in <altivec.h>, defines VEC_COMPILER_HAS_ALTIVEC
 * (and VEC_COMPILER_HAS_ALTIVEC_VSX when both __POWER8__ and __VSX__ are
 * defined), then raises the alignment of every 128-bit vector type to at
 * least VEC_ALTIVEC_ALIGNMENT. */
#ifdef __ALTIVEC__
# include <altivec.h>
# define VEC_COMPILER_HAS_ALTIVEC
# if defined(__POWER8__) && defined(__VSX__)
#  define VEC_COMPILER_HAS_ALTIVEC_VSX
# endif
/* signed 128-bit types */
# if VINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
/* unsigned 128-bit types */
# if VUINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
#endif

/* ARM NEON detection: pulls in <arm_neon.h>, defines VEC_COMPILER_HAS_NEON,
 * then raises the alignment of every 64-bit and 128-bit vector type to at
 * least VEC_NEON_ALIGNMENT.
 *
 * The 64-bit group covers the 8x8, 16x4 and 32x2 types; the 128-bit group
 * covers the 8x16, 16x8, 32x4 and 64x2 types. */
#ifdef __ARM_NEON
# include <arm_neon.h>
# define VEC_COMPILER_HAS_NEON
/* 64-bit vectors, signed */
# if VINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT8x8_ALIGNMENT
#  define VINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT16x4_ALIGNMENT
#  define VINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT32x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT32x2_ALIGNMENT
#  define VINT32x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
/* 64-bit vectors, unsigned */
# if VUINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT8x8_ALIGNMENT
#  define VUINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT16x4_ALIGNMENT
#  define VUINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT32x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT32x2_ALIGNMENT
#  define VUINT32x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
/* 128-bit vectors, signed */
# if VINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
/* 128-bit vectors, unsigned */
# if VUINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
#endif

#ifdef __MMX__
# include <mmintrin.h>
# define VEC_COMPILER_HAS_MMX
#endif

/* SSE2 detection: pulls in <emmintrin.h> and defines VEC_COMPILER_HAS_SSE2.
 * SSE3, SSE4.1 and SSE4.2 are only looked at when SSE2 is present; each one
 * found adds its header and its own VEC_COMPILER_HAS_* macro. Afterwards the
 * alignment of every 128-bit vector type is raised to at least
 * VEC_SSE2_ALIGNMENT. */
#ifdef __SSE2__
# include <emmintrin.h>
# define VEC_COMPILER_HAS_SSE2
# ifdef __SSE3__
#  include <pmmintrin.h>
#  define VEC_COMPILER_HAS_SSE3
# endif
# ifdef __SSE4_1__
#  include <smmintrin.h>
#  define VEC_COMPILER_HAS_SSE41
# endif
# ifdef __SSE4_2__
#  include <nmmintrin.h>
#  define VEC_COMPILER_HAS_SSE42
# endif
/* signed 128-bit types */
# if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
/* unsigned 128-bit types */
# if VUINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
#endif

/* AVX2 detection: pulls in <immintrin.h>, defines VEC_COMPILER_HAS_AVX2 and
 * raises the alignment of every 256-bit vector type to at least
 * VEC_AVX2_ALIGNMENT. */
#ifdef __AVX2__
# include <immintrin.h>
# define VEC_COMPILER_HAS_AVX2
/* signed 256-bit types */
# if VINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT8x32_ALIGNMENT
#  define VINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT16x16_ALIGNMENT
#  define VINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT32x8_ALIGNMENT
#  define VINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT64x4_ALIGNMENT
#  define VINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
/* unsigned 256-bit types */
# if VUINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT8x32_ALIGNMENT
#  define VUINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT16x16_ALIGNMENT
#  define VUINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT32x8_ALIGNMENT
#  define VUINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT64x4_ALIGNMENT
#  define VUINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
#endif

/* AVX-512F detection: pulls in <immintrin.h>, defines
 * VEC_COMPILER_HAS_AVX512F and raises the alignment of every 512-bit vector
 * type to at least VEC_AVX512F_ALIGNMENT. */
#ifdef __AVX512F__
# include <immintrin.h>
# define VEC_COMPILER_HAS_AVX512F
/* signed 512-bit types */
# if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT8x64_ALIGNMENT
#  define VINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT16x32_ALIGNMENT
#  define VINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT32x16_ALIGNMENT
#  define VINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT64x8_ALIGNMENT
#  define VINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
/* unsigned 512-bit types */
# if VUINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT8x64_ALIGNMENT
#  define VUINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT16x32_ALIGNMENT
#  define VUINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT32x16_ALIGNMENT
#  define VUINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT64x8_ALIGNMENT
#  define VUINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
#endif

#endif

/* --------------------------------------------------------------- */
/* bit shift */

/* Right shift of an unsigned value: returns `x` shifted right by `y` bits.
 * The count is handed straight to `>>`, so the usual C limits on shift
 * counts apply. */
VEC_FUNC_IMPL vec_uintmax vec_urshift(vec_uintmax x, unsigned int y)
{
	const vec_uintmax shifted = x >> y;

	return shifted;
}

/* Left shift of an unsigned value: returns `x` shifted left by `y` bits.
 * The count is handed straight to `<<`, so the usual C limits on shift
 * counts apply. */
VEC_FUNC_IMPL vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y)
{
	const vec_uintmax shifted = x << y;

	return shifted;
}

/* Sign-preserving right shift of a signed value.
 *
 * Non-negative inputs are shifted directly. Negative inputs are
 * complemented first (making them non-negative), shifted, and complemented
 * back, so the operator `>>` is never applied to a negative operand. */
VEC_FUNC_IMPL vec_intmax vec_rshift(vec_intmax x, unsigned int y)
{
	if (x < 0) {
		const vec_intmax flipped = ~x;

		return ~(flipped >> y);
	}

	return x >> y;
}

/* Left shift of a signed value.
 *
 * The value is stored into a union, shifted through the union's unsigned
 * member, and read back through the signed member, so the shift itself is
 * performed on an unsigned operand. */
VEC_FUNC_IMPL vec_intmax vec_lshift(vec_intmax x, unsigned int y)
{
	union {
		vec_intmax as_signed;
		vec_uintmax as_unsigned;
	} bits;

	bits.as_signed = x;
	bits.as_unsigned = bits.as_unsigned << y;

	return bits.as_signed;
}

/* Average of two signed values, computed without forming x + y.
 *
 * Each operand is halved separately, and the two remainders from those
 * divisions are combined: their sum contributes its own half, plus one more
 * when that sum leaves a remainder of exactly 1. */
VEC_FUNC_IMPL vec_intmax vec_avg(vec_intmax x, vec_intmax y)
{
	const vec_intmax remainder_sum = (x % 2) + (y % 2);
	vec_intmax average = (x / 2) + (y / 2);

	average += remainder_sum / 2;
	if ((remainder_sum % 2) == 1)
		average += 1;

	return average;
}

/* Average of two unsigned values, computed without forming x + y.
 *
 * Each operand is halved separately; one is added back when either operand
 * has its lowest bit set. */
VEC_FUNC_IMPL vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y)
{
	const vec_uintmax halves = (x >> 1) + (y >> 1);
	const vec_uintmax carry = (x | y) & 1;

	return halves + carry;
}

/* --------------------------------------------------------------- */
/* Array alignment macros */

/* VEC_ALIGNAS(x): alignment specifier for a declaration. Uses alignas
 * (C++11 / C23), _Alignas (C11) or the GNU aligned attribute, in that order.
 * Left undefined when none of them is available. */
#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
# define VEC_ALIGNAS(x) alignas(x)
#elif (__STDC_VERSION__ >= 201112L)
# define VEC_ALIGNAS(x) _Alignas(x)
#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
# define VEC_ALIGNAS(x) __attribute__((__aligned__(x)))
#endif

/* the alignment must be specified in bytes and must be a multiple of the
 * type size. it is always assumed that the type will be on a boundary of
 * its size, which may or may not be true */
/* VEC_ALIGNED_ARRAY(type, var, length, align)
 *     Declares `var` as storage for `length` elements of `type`, aligned to
 *     `align` bytes. With VEC_ALIGNAS this is a real array; without it,
 *     `var` is a pointer into an over-sized byte buffer.
 * VEC_ALIGNED_ARRAY_SIZEOF(var, align)
 *     Size in bytes of the element storage declared by VEC_ALIGNED_ARRAY.
 * VEC_ALIGNED_ARRAY_LENGTH(var, align)
 *     Number of elements in that storage. */
#ifdef VEC_ALIGNAS
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_ALIGNAS(align) type var[length]
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(var))
#else
// use unions to get an aligned offset without triggering strict aliasing
/* The buffer is (align - 1) bytes larger than needed; the pointer is rounded
 * up to the next multiple of `align`. The mask is built at pointer width so
 * that an unsigned `align` cannot clear the upper address bits. */
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_STATIC_ASSERT((align) && (((align) & ((align) - 1)) == 0), "vec: alignment must be a power of two"); \
	union vec_aligned_union_##var##_ { \
		type arr[length]; \
		unsigned char bytes[sizeof(type) * (length)]; \
	}; \
	unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \
	type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + ((align) - 1)) & ~(vec_uintptr)((align) - 1)))->arr; \
	VEC_ASSERT(((vec_uintptr)var) % (align) == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned")
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(vec_unaligned_##var##_) - ((align) - 1))
#endif

/* Takes the same (var, align) pair as VEC_ALIGNED_ARRAY_SIZEOF, which is how
 * every per-type *_ALIGNED_ARRAY_LENGTH wrapper in this file invokes it. */
#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
	(VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*(var)))

//////////////////////////////////////////////////////////////////////////////////////
// predefined variants for each vector type

//////////////////////////////////////////////////////////////////////////////////////
// 16-bit

/* For each 16-bit vector type:
 *   *_ALIGNED_ARRAY(var)         declares suitably aligned element storage
 *   *_ALIGNED_ARRAY_SIZEOF(var)  size of that storage in bytes
 *   *_ALIGNED_ARRAY_LENGTH(var)  number of elements in that storage
 *   *_PTR_ALIGNED(ptr)           nonzero when `ptr` meets the type's alignment
 * LENGTH is computed from the matching SIZEOF macro; PTR_ALIGNED
 * parenthesizes `ptr` so that pointer expressions are cast as a whole. */
#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT)
#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT)
#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) (VINT8x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT8x2_ALIGNMENT == 0)

#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT)
#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT)
#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) (VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT8x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 32-bit

/* For each 32-bit vector type:
 *   *_ALIGNED_ARRAY(var)         declares suitably aligned element storage
 *   *_ALIGNED_ARRAY_SIZEOF(var)  size of that storage in bytes
 *   *_ALIGNED_ARRAY_LENGTH(var)  number of elements in that storage
 *   *_PTR_ALIGNED(ptr)           nonzero when `ptr` meets the type's alignment
 * LENGTH is computed from the matching SIZEOF macro; PTR_ALIGNED
 * parenthesizes `ptr` so that pointer expressions are cast as a whole. */
#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT)
#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT)
#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) (VINT8x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT8x4_ALIGNMENT == 0)

#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT)
#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT)
#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) (VINT16x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT16x2_ALIGNMENT == 0)

#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT)
#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT)
#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) (VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT8x4_ALIGNMENT == 0)

#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT)
#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT)
#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) (VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT16x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 64-bit

/* For each 64-bit vector type:
 *   *_ALIGNED_ARRAY(var)         declares suitably aligned element storage
 *   *_ALIGNED_ARRAY_SIZEOF(var)  size of that storage in bytes
 *   *_ALIGNED_ARRAY_LENGTH(var)  number of elements in that storage
 *   *_PTR_ALIGNED(ptr)           nonzero when `ptr` meets the type's alignment
 * LENGTH is computed from the matching SIZEOF macro; PTR_ALIGNED
 * parenthesizes `ptr` so that pointer expressions are cast as a whole. */
#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) (VINT8x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT8x8_ALIGNMENT == 0)

#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) (VINT16x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT16x4_ALIGNMENT == 0)

#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) (VINT32x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT32x2_ALIGNMENT == 0)

#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) (VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT8x8_ALIGNMENT == 0)

#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) (VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT16x4_ALIGNMENT == 0)

#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) (VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT32x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 128-bit

/* For each 128-bit vector type:
 *   *_ALIGNED_ARRAY(var)         declares suitably aligned element storage
 *   *_ALIGNED_ARRAY_SIZEOF(var)  size of that storage in bytes
 *   *_ALIGNED_ARRAY_LENGTH(var)  number of elements in that storage
 *   *_PTR_ALIGNED(ptr)           nonzero when `ptr` meets the type's alignment
 * LENGTH is computed from the matching SIZEOF macro; PTR_ALIGNED
 * parenthesizes `ptr` so that pointer expressions are cast as a whole. */
#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) (VINT8x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x16_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT8x16_ALIGNMENT == 0)

#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) (VINT16x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT16x8_ALIGNMENT == 0)

#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) (VINT32x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT32x4_ALIGNMENT == 0)

#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) (VINT64x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT64x2_ALIGNMENT == 0)

#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) (VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x16_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT8x16_ALIGNMENT == 0)

#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) (VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT16x8_ALIGNMENT == 0)

#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) (VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT32x4_ALIGNMENT == 0)

#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) (VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x2_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT64x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 256-bit

/* For each 256-bit vector type:
 *   *_ALIGNED_ARRAY(var)         declares suitably aligned element storage
 *   *_ALIGNED_ARRAY_SIZEOF(var)  size of that storage in bytes
 *   *_ALIGNED_ARRAY_LENGTH(var)  number of elements in that storage
 *   *_PTR_ALIGNED(ptr)           nonzero when `ptr` meets the type's alignment
 * LENGTH is computed from the matching SIZEOF macro; PTR_ALIGNED
 * parenthesizes `ptr` so that pointer expressions are cast as a whole. */
#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) (VINT8x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x32_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT8x32_ALIGNMENT == 0)

#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) (VINT16x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x16_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT16x16_ALIGNMENT == 0)

#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) (VINT32x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT32x8_ALIGNMENT == 0)

#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) (VINT64x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT64x4_ALIGNMENT == 0)

#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) (VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x32_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT8x32_ALIGNMENT == 0)

#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) (VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x16_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT16x16_ALIGNMENT == 0)

#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) (VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT32x8_ALIGNMENT == 0)

#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) (VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x4_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT64x4_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 512-bit

/* For each 512-bit vector type:
 *   *_ALIGNED_ARRAY(var)         declares suitably aligned element storage
 *   *_ALIGNED_ARRAY_SIZEOF(var)  size of that storage in bytes
 *   *_ALIGNED_ARRAY_LENGTH(var)  number of elements in that storage
 *   *_PTR_ALIGNED(ptr)           nonzero when `ptr` meets the type's alignment
 * LENGTH is computed from the matching SIZEOF macro; PTR_ALIGNED
 * parenthesizes `ptr` so that pointer expressions are cast as a whole, and
 * every PTR_ALIGNED check uses the alignment of its own type. */
#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) (VINT8x64_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x64_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT8x64_ALIGNMENT == 0)

#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) (VINT16x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x32_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT16x32_ALIGNMENT == 0)

#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) (VINT32x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x16_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT32x16_ALIGNMENT == 0)

#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) (VINT64x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VINT64x8_ALIGNMENT == 0)

#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) (VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x64_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT8x64_ALIGNMENT == 0)

#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) (VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x32_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT16x32_ALIGNMENT == 0)

#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) (VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x16_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT32x16_ALIGNMENT == 0)

#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) (VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x8_PTR_ALIGNED(ptr) (((vec_uintptr)(ptr)) % VUINT64x8_ALIGNMENT == 0)

/* --------------------------------------------------------------- */
/* Defines the structures for each vector type */

// 16-bit
/* Two unsigned 8-bit lanes, stored as a plain array. */
typedef union {
	vec_uint8 generic[2];
} vuint8x2;

/* Two signed 8-bit lanes, stored as a plain array. */
typedef union {
	vec_int8 generic[2];
} vint8x2;

// 32-bit
/* Four unsigned 8-bit lanes, stored as two vuint8x2 halves. */
typedef union {
	vuint8x2 generic[2];
} vuint8x4;

/* Two unsigned 16-bit lanes, stored as a plain array. */
typedef union {
	vec_uint16 generic[2];
} vuint16x2;

/* Four signed 8-bit lanes, stored as two vint8x2 halves. */
typedef union {
	vint8x2 generic[2];
} vint8x4;

/* Two signed 16-bit lanes, stored as a plain array. */
typedef union {
	vec_int16 generic[2];
} vint16x2;

// 64-bit
/* Each 64-bit type overlays the `generic` representation with an `mmx`
 * member when VEC_COMPILER_HAS_MMX is defined and a `neon` member when
 * VEC_COMPILER_HAS_NEON is defined. */

/* Eight unsigned 8-bit lanes; generic form is two vuint8x4 halves. */
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	uint8x8_t neon;
#endif

	vuint8x4 generic[2];
} vuint8x8;

/* Four unsigned 16-bit lanes; generic form is two vuint16x2 halves. */
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	uint16x4_t neon;
#endif

	vuint16x2 generic[2];
} vuint16x4;

/* Two unsigned 32-bit lanes; generic form is a plain array. */
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	uint32x2_t neon;
#endif

	vec_uint32 generic[2];
} vuint32x2;

/* Eight signed 8-bit lanes; generic form is two vint8x4 halves. */
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	int8x8_t neon;
#endif

	vint8x4 generic[2];
} vint8x8;

/* Four signed 16-bit lanes; generic form is two vint16x2 halves. */
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	int16x4_t neon;
#endif

	vint16x2 generic[2];
} vint16x4;

/* Two signed 32-bit lanes; generic form is a plain array. */
typedef union {
#ifdef VEC_COMPILER_HAS_MMX
	__m64 mmx;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	int32x2_t neon;
#endif

	vec_int32 generic[2];
} vint32x2;

// 128-bit
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned char altivec;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	uint8x16_t neon;
#endif
	vuint8x8 generic[2];
} vuint8x16;

typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned short altivec;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	uint16x8_t neon;
#endif
	vuint16x4 generic[2];
} vuint16x8;

typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector unsigned int altivec;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	uint32x4_t neon;
#endif
	vuint32x2 generic[2];
} vuint32x4;

typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse;
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
	vector unsigned long long altivec;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	uint64x2_t neon;
#endif
	vec_uint64 generic[2];
} vuint64x2;

/* 128-bit vector of signed 8-bit integers (16 lanes).
 * All members share the same storage; the native members are only
 * declared when the matching backend was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse; /* SSE2 representation */
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed char altivec; /* AltiVec representation */
#endif
#ifdef VEC_COMPILER_HAS_NEON
	int8x16_t neon; /* NEON representation */
#endif
	vint8x8 generic[2]; /* always present: two vint8x8 halves */
} vint8x16;

/* 128-bit vector of signed 16-bit integers (8 lanes).
 * All members share the same storage; the native members are only
 * declared when the matching backend was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse; /* SSE2 representation */
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed short altivec; /* AltiVec representation */
#endif
#ifdef VEC_COMPILER_HAS_NEON
	int16x8_t neon; /* NEON representation */
#endif
	vint16x4 generic[2]; /* always present: two vint16x4 halves */
} vint16x8;

/* 128-bit vector of signed 32-bit integers (4 lanes).
 * All members share the same storage; the native members are only
 * declared when the matching backend was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse; /* SSE2 representation */
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	vector signed int altivec; /* AltiVec representation */
#endif
#ifdef VEC_COMPILER_HAS_NEON
	int32x4_t neon; /* NEON representation */
#endif
	vint32x2 generic[2]; /* always present: two vint32x2 halves */
} vint32x4;

/* 128-bit vector of signed 64-bit integers (2 lanes).
 * All members share the same storage; the native members are only
 * declared when the matching backend was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_SSE2
	__m128i sse; /* SSE2 representation */
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
	/* unlike the narrower types, this member is gated on the VSX macro */
	vector signed long long altivec;
#endif
#ifdef VEC_COMPILER_HAS_NEON
	int64x2_t neon; /* NEON representation */
#endif
	vec_int64 generic[2]; /* always present: the two lanes as plain scalars */
} vint64x2;

// 256-bit
/* 256-bit vector of unsigned 8-bit integers (32 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vuint8x16 generic[2]; /* always present: two vuint8x16 halves */
} vuint8x32;

/* 256-bit vector of unsigned 16-bit integers (16 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vuint16x8 generic[2]; /* always present: two vuint16x8 halves */
} vuint16x16;

/* 256-bit vector of unsigned 32-bit integers (8 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vuint32x4 generic[2]; /* always present: two vuint32x4 halves */
} vuint32x8;

/* 256-bit vector of unsigned 64-bit integers (4 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vuint64x2 generic[2]; /* always present: two vuint64x2 halves */
} vuint64x4;

/* 256-bit vector of signed 8-bit integers (32 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vint8x16 generic[2]; /* always present: two vint8x16 halves */
} vint8x32;

/* 256-bit vector of signed 16-bit integers (16 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vint16x8 generic[2]; /* always present: two vint16x8 halves */
} vint16x16;

/* 256-bit vector of signed 32-bit integers (8 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vint32x4 generic[2]; /* always present: two vint32x4 halves */
} vint32x8;

/* 256-bit vector of signed 64-bit integers (4 lanes).
 * The AVX2 member shares storage with the generic member and is only
 * declared when AVX2 was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX2
	__m256i avx2; /* AVX2 representation */
#endif
	vint64x2 generic[2]; /* always present: two vint64x2 halves */
} vint64x4;

// 512-bit
/* 512-bit vector of unsigned 8-bit integers (64 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vuint8x32 generic[2]; /* always present: two vuint8x32 halves */
} vuint8x64;

/* 512-bit vector of unsigned 16-bit integers (32 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vuint16x16 generic[2]; /* always present: two vuint16x16 halves */
} vuint16x32;

/* 512-bit vector of unsigned 32-bit integers (16 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vuint32x8 generic[2]; /* always present: two vuint32x8 halves */
} vuint32x16;

/* 512-bit vector of unsigned 64-bit integers (8 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vuint64x4 generic[2]; /* always present: two vuint64x4 halves */
} vuint64x8;

/* 512-bit vector of signed 8-bit integers (64 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vint8x32 generic[2]; /* always present: two vint8x32 halves */
} vint8x64;

/* 512-bit vector of signed 16-bit integers (32 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vint16x16 generic[2]; /* always present: two vint16x16 halves */
} vint16x32;

/* 512-bit vector of signed 32-bit integers (16 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vint32x8 generic[2]; /* always present: two vint32x8 halves */
} vint32x16;

/* 512-bit vector of signed 64-bit integers (8 lanes).
 * The AVX-512F member shares storage with the generic member and is only
 * declared when AVX-512F was detected at compile time. */
typedef union {
#ifdef VEC_COMPILER_HAS_AVX512F
	__m512i avx512f; /* AVX-512F representation */
#endif
	vint64x4 generic[2]; /* always present: two vint64x4 halves */
} vint64x8;

/* ------------------------------------------------------------------------ */
/* finally, we can import the real implementations */

#ifdef VEC_COMPILER_HAS_AVX512F
# include "impl/x86/avx512f.h"
#endif

#ifdef VEC_COMPILER_HAS_AVX2
# include "impl/x86/avx2.h"
#endif

#ifdef VEC_COMPILER_HAS_SSE42
# include "impl/x86/sse42.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE41
# include "impl/x86/sse41.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE3
# include "impl/x86/sse3.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE2
# include "impl/x86/sse2.h"
#endif
#ifdef VEC_COMPILER_HAS_MMX
# include "impl/x86/mmx.h"
#endif

#include "impl/generic.h"

/* ------------------------------------------------------------------------ */
/* very minimal aligned malloc */

/* Alignment, in bytes, that vec_internal_align_ptr_ rounds pointers up to. */
#define VEC_MALLOC_ALIGNMENT (64)

/* The rounding is done with a bit mask, which only works for powers of two. */
VEC_STATIC_ASSERT(!(VEC_MALLOC_ALIGNMENT & (VEC_MALLOC_ALIGNMENT - 1))
	&& (VEC_MALLOC_ALIGNMENT > 0),
	"VEC_MALLOC_ALIGNMENT must be a power of two");

/* Type of the bookkeeping value stored directly in front of every aligned
 * pointer: the distance back to the start of the underlying allocation.
 * NOTE(review): as an unsigned char it can only represent distances up to
 * UCHAR_MAX, so a much larger VEC_MALLOC_ALIGNMENT would need a wider type. */
typedef unsigned char vec_alignment_type;

/* Extra bytes added to every request: the bookkeeping value plus the
 * worst-case padding needed to reach the next aligned address. */
#define VEC_MALLOC_ADDITIONAL_SIZE (sizeof(vec_alignment_type) + (VEC_MALLOC_ALIGNMENT - 1))
/* Largest user size for which adding VEC_MALLOC_ADDITIONAL_SIZE cannot overflow. */
#define VEC_MALLOC_MAX_SIZE (SIZE_MAX - VEC_MALLOC_ADDITIONAL_SIZE)

VEC_FUNC_IMPL void *vec_internal_align_ptr_(void *q)
{
	vec_alignment_type diff;

	diff = (((uintptr_t)q + (VEC_MALLOC_ALIGNMENT - 1)) & ~(VEC_MALLOC_ALIGNMENT - 1)) - (uintptr_t)q;
	q = (char *)q + diff;

	memcpy((char *)q - sizeof(diff), &diff, sizeof(diff));

	return q;
}

/**
 * Inverse of vec_internal_align_ptr_: given an aligned pointer, reads the
 * distance stored directly in front of it and returns the address of the
 * underlying allocation.
 */
VEC_FUNC_IMPL void *vec_internal_unalign_ptr_(void *q)
{
	char *user = (char *)q;
	vec_alignment_type offset;

	/* the bookkeeping value lives in the bytes just before the user pointer */
	memcpy(&offset, user - sizeof(offset), sizeof(offset));

	return user - offset;
}

/**
 * Allocates `size` bytes and returns a pointer that has been passed through
 * vec_internal_align_ptr_.
 *
 * Returns NULL if `size` exceeds VEC_MALLOC_MAX_SIZE or if the underlying
 * malloc fails. Release the result with vec_free.
 */
VEC_FUNC_IMPL void *vec_malloc(size_t size)
{
	void *raw = NULL;

	/* Over-allocate so there is room for the alignment padding and for the
	 * stored offset; without the offset neither vec_realloc nor vec_free
	 * could find the pointer malloc originally returned. */
	if (size <= VEC_MALLOC_MAX_SIZE)
		raw = malloc(size + VEC_MALLOC_ADDITIONAL_SIZE);

	return raw ? vec_internal_align_ptr_(raw) : NULL;
}

/**
 * Allocates zero-initialised storage for `count` objects of `nmemb` bytes
 * each and returns a pointer that has been passed through
 * vec_internal_align_ptr_.
 *
 * Returns NULL if `count * nmemb` exceeds VEC_MALLOC_MAX_SIZE (including
 * when the multiplication would overflow) or if the underlying calloc
 * fails. Release the result with vec_free.
 */
VEC_FUNC_IMPL void *vec_calloc(size_t count, size_t nmemb)
{
	void *q;

	/* Validate with a division *before* multiplying. Checking the product
	 * afterwards misses the case where it wraps around to exactly zero,
	 * which would hand back a tiny block for an enormous request. */
	if (count && nmemb > VEC_MALLOC_MAX_SIZE / count)
		return NULL; /* nope */

	q = calloc(count * nmemb + VEC_MALLOC_ADDITIONAL_SIZE, 1);
	if (!q)
		return NULL;

	return vec_internal_align_ptr_(q);
}

VEC_FUNC_IMPL void *vec_realloc(void *ptr, size_t newsize)
{
	void *q;

	if (!ptr)
		return vec_malloc(newsize);

	if (newsize > VEC_MALLOC_MAX_SIZE)
		return NULL;

	q = realloc(vec_internal_unalign_ptr_(ptr), VEC_MALLOC_ADDITIONAL_SIZE);
	if (!q)
		return NULL;

	return vec_internal_align_ptr_(q);
}

/**
 * Releases a block obtained from vec_malloc, vec_calloc or vec_realloc.
 * Passing NULL is a no-op.
 */
VEC_FUNC_IMPL void vec_free(void *ptr)
{
	if (!ptr)
		return;

	/* free() needs the address the underlying allocator handed out */
	free(vec_internal_unalign_ptr_(ptr));
}

#ifdef __cplusplus
}
#endif

#endif /* VEC_VEC_H_ */
