/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024-2025 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_VEC_H_
#define VEC_VEC_H_

#include "defs.h"

/* --------------------------------------------------------------- */
/* Detect compiler SIMD support */

/* Alignment, in bytes, that each hardware backend's vector types are raised
 * to further down in this file when that backend is detected. */
#define VEC_NEON_ALIGNMENT    16
#define VEC_ALTIVEC_ALIGNMENT 16
#define VEC_SSE2_ALIGNMENT    16
#define VEC_AVX2_ALIGNMENT    32
#define VEC_AVX512F_ALIGNMENT 64

/* Alignments for the generic implementation.
 *
 * Only the smallest type of each element width gets a literal value; every
 * larger type is defined in terms of the type half its size. Because macro
 * expansion is delayed until the point of use, raising a smaller type's
 * alignment later in this file also raises every larger type that still
 * refers to it. As a result a larger type is basically always guaranteed
 * to have at least the alignment of a smaller one (i.e. f64x8 alignment
 * will be >= f64x4 alignment). This is by design. */
#define VINT8x2_ALIGNMENT   1
#define VUINT8x2_ALIGNMENT  1

#define VINT8x4_ALIGNMENT   VINT8x2_ALIGNMENT
#define VINT16x2_ALIGNMENT  2
#define VUINT8x4_ALIGNMENT  VUINT8x2_ALIGNMENT
#define VUINT16x2_ALIGNMENT 2

#define VINT8x8_ALIGNMENT   VINT8x4_ALIGNMENT
#define VINT16x4_ALIGNMENT  VINT16x2_ALIGNMENT
#define VINT32x2_ALIGNMENT  4
#define VUINT8x8_ALIGNMENT  VUINT8x4_ALIGNMENT
#define VUINT16x4_ALIGNMENT VUINT16x2_ALIGNMENT
#define VUINT32x2_ALIGNMENT 4

#define VINT8x16_ALIGNMENT  VINT8x8_ALIGNMENT
#define VINT16x8_ALIGNMENT  VINT16x4_ALIGNMENT
#define VINT32x4_ALIGNMENT  VINT32x2_ALIGNMENT
#define VINT64x2_ALIGNMENT  8
#define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT
#define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT
#define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT
#define VUINT64x2_ALIGNMENT 8

#define VINT8x32_ALIGNMENT   VINT8x16_ALIGNMENT
#define VINT16x16_ALIGNMENT  VINT16x8_ALIGNMENT
#define VINT32x8_ALIGNMENT   VINT32x4_ALIGNMENT
#define VINT64x4_ALIGNMENT   VINT64x2_ALIGNMENT
#define VUINT8x32_ALIGNMENT  VUINT8x16_ALIGNMENT
#define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT
#define VUINT32x8_ALIGNMENT  VUINT32x4_ALIGNMENT
#define VUINT64x4_ALIGNMENT  VUINT64x2_ALIGNMENT

#define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT
#define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT
#define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT
#define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT
#define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT
#define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT
#define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT
#define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT

/* float */

/* The floating point types follow the same scheme: a literal value for the
 * two-lane type, and each wider type refers to the one half its size. */
#define VF32x2_ALIGNMENT 4

#define VF32x4_ALIGNMENT VF32x2_ALIGNMENT
#define VF64x2_ALIGNMENT 8

#define VF32x8_ALIGNMENT VF32x4_ALIGNMENT
#define VF64x4_ALIGNMENT VF64x2_ALIGNMENT

#define VF32x16_ALIGNMENT VF32x8_ALIGNMENT
#define VF64x8_ALIGNMENT  VF64x4_ALIGNMENT

/* allow to suppress hardware, so that we can make sure
 * the generic impl isn't *painfully* slow ;) */
#ifndef VEC_SUPPRESS_HW

/* AltiVec detection.
 *
 * `__VEC__' may also be defined by some compilers, but that is unconfirmed;
 * IBM documents `__ALTIVEC__' as the standard macro, so only that one is
 * tested. VSX support is flagged only when both `__POWER8__' and `__VSX__'
 * are defined.
 *
 * Each 128-bit integer type and the 128-bit float types are raised to at
 * least VEC_ALTIVEC_ALIGNMENT. */
#ifdef __ALTIVEC__
# include <altivec.h>
# define VEC_COMPILER_HAS_ALTIVEC
# if defined(__POWER8__) && defined(__VSX__)
#  define VEC_COMPILER_HAS_ALTIVEC_VSX
# endif
# if VINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
/* floating point */
# if VF32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VF32x4_ALIGNMENT
#  define VF32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
# if VF64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT
#  undef VF64x2_ALIGNMENT
#  define VF64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT
# endif
#endif

/* ARM NEON detection.
 *
 * Raises every 64-bit and 128-bit integer vector type to at least
 * VEC_NEON_ALIGNMENT. The 64-bit group covers the 32x2 types (the unions
 * for those carry a NEON member too); the 32x4 types belong to the 128-bit
 * group only. */
#ifdef __ARM_NEON
# include <arm_neon.h>
# define VEC_COMPILER_HAS_NEON
/* 64-bit vectors */
# if VINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT8x8_ALIGNMENT
#  define VINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT16x4_ALIGNMENT
#  define VINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT32x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT32x2_ALIGNMENT
#  define VINT32x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT8x8_ALIGNMENT
#  define VUINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT16x4_ALIGNMENT
#  define VUINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT32x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT32x2_ALIGNMENT
#  define VUINT32x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
/* 128-bit vectors */
# if VINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT
# endif
#endif

/* MMX detection: pulls in the intrinsics header and records availability.
 * No alignment macro is changed here. */
#ifdef __MMX__
# include <mmintrin.h>
# define VEC_COMPILER_HAS_MMX
#endif

/* SSE2 detection.
 *
 * SSE3, SSE4.1 and SSE4.2 are only probed when SSE2 itself is available.
 * Each 128-bit integer type is raised to at least VEC_SSE2_ALIGNMENT; the
 * floating point alignments are not touched in this block. */
#ifdef __SSE2__
# include <emmintrin.h>
# define VEC_COMPILER_HAS_SSE2
# ifdef __SSE3__
#  include <pmmintrin.h>
#  define VEC_COMPILER_HAS_SSE3
# endif
# ifdef __SSE4_1__
#  include <smmintrin.h>
#  define VEC_COMPILER_HAS_SSE41
# endif
# ifdef __SSE4_2__
#  include <nmmintrin.h>
#  define VEC_COMPILER_HAS_SSE42
# endif
# if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT8x16_ALIGNMENT
#  define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT16x8_ALIGNMENT
#  define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT32x4_ALIGNMENT
#  define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VINT64x2_ALIGNMENT
#  define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT8x16_ALIGNMENT
#  define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT16x8_ALIGNMENT
#  define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT32x4_ALIGNMENT
#  define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
# if VUINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT
#  undef VUINT64x2_ALIGNMENT
#  define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
# endif
#endif

/* AVX2 detection.
 *
 * Each 256-bit integer type is raised to at least VEC_AVX2_ALIGNMENT; the
 * floating point alignments are not touched in this block. */
#ifdef __AVX2__
# include <immintrin.h>
# define VEC_COMPILER_HAS_AVX2
# if VINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT8x32_ALIGNMENT
#  define VINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT16x16_ALIGNMENT
#  define VINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT32x8_ALIGNMENT
#  define VINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VINT64x4_ALIGNMENT
#  define VINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT8x32_ALIGNMENT
#  define VUINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT16x16_ALIGNMENT
#  define VUINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT32x8_ALIGNMENT
#  define VUINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
# if VUINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT
#  undef VUINT64x4_ALIGNMENT
#  define VUINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT
# endif
#endif

/* AVX-512 detection.
 *
 * AVX512BW is only flagged when AVX512F is available. Each 512-bit integer
 * type is raised to at least VEC_AVX512F_ALIGNMENT; the floating point
 * alignments are not touched in this block. */
#ifdef __AVX512F__
# include <immintrin.h>
# ifdef __AVX512BW__
#  define VEC_COMPILER_HAS_AVX512BW
# endif
# define VEC_COMPILER_HAS_AVX512F
# if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT8x64_ALIGNMENT
#  define VINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT16x32_ALIGNMENT
#  define VINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT32x16_ALIGNMENT
#  define VINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VINT64x8_ALIGNMENT
#  define VINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT8x64_ALIGNMENT
#  define VUINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT16x32_ALIGNMENT
#  define VUINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT32x16_ALIGNMENT
#  define VUINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
# if VUINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT
#  undef VUINT64x8_ALIGNMENT
#  define VUINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT
# endif
#endif

#endif /* !defined(VEC_SUPPRESS_HW) */

/* GCC vector extension detection (enabled when VEC_GNUC_ATLEAST(4, 0, 0)
 * holds; note this is outside the VEC_SUPPRESS_HW guard).
 *
 * When the compiler reports __BIGGEST_ALIGNMENT__, the two-lane base type
 * of each element width is raised to at least that value. Wider types that
 * still refer to their half-size type pick the new value up through
 * delayed macro expansion.
 *
 * NOTE(review): only the signed integer and floating point bases are
 * raised; the VUINT*x2 bases keep their generic values. Confirm whether
 * that asymmetry is intended. */
#if VEC_GNUC_ATLEAST(4, 0, 0)
# define VEC_COMPILER_HAS_GCC_VECTORS
# ifdef __BIGGEST_ALIGNMENT__
#  if VINT8x2_ALIGNMENT < __BIGGEST_ALIGNMENT__
#   undef VINT8x2_ALIGNMENT
#   define VINT8x2_ALIGNMENT __BIGGEST_ALIGNMENT__
#  endif
#  if VINT16x2_ALIGNMENT < __BIGGEST_ALIGNMENT__
#   undef VINT16x2_ALIGNMENT
#   define VINT16x2_ALIGNMENT __BIGGEST_ALIGNMENT__
#  endif
#  if VINT32x2_ALIGNMENT < __BIGGEST_ALIGNMENT__
#   undef VINT32x2_ALIGNMENT
#   define VINT32x2_ALIGNMENT __BIGGEST_ALIGNMENT__
#  endif
#  if VINT64x2_ALIGNMENT < __BIGGEST_ALIGNMENT__
#   undef VINT64x2_ALIGNMENT
#   define VINT64x2_ALIGNMENT __BIGGEST_ALIGNMENT__
#  endif
/* the macro that is tested is the one that gets redefined, like every
 * other entry in this list */
#  if VF32x2_ALIGNMENT < __BIGGEST_ALIGNMENT__
#   undef VF32x2_ALIGNMENT
#   define VF32x2_ALIGNMENT __BIGGEST_ALIGNMENT__
#  endif
#  if VF64x2_ALIGNMENT < __BIGGEST_ALIGNMENT__
#   undef VF64x2_ALIGNMENT
#   define VF64x2_ALIGNMENT __BIGGEST_ALIGNMENT__
#  endif
# endif
#endif

/* I don't think this happens on any platform yet, but we should
 * probably take extra care to make sure the alignment of each
 * is at least the alignment of the one half the size... */

#ifdef __cplusplus
extern "C" {
#endif

/* --------------------------------------------------------------- */
/* bit shift */

/* Unsigned right shift: returns `x` shifted right by `y` bit positions.
 * `y` is passed straight to the `>>` operator with no range check. */
VEC_FUNC_IMPL vec_uintmax vec_urshift(vec_uintmax x, unsigned int y)
{
	const vec_uintmax shifted = x >> y;

	return shifted;
}

/* Unsigned left shift: returns `x` shifted left by `y` bit positions.
 * `y` is passed straight to the `<<` operator with no range check. */
VEC_FUNC_IMPL vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y)
{
	const vec_uintmax shifted = x << y;

	return shifted;
}

/* Signed right shift that does not right-shift a negative operand.
 *
 * For negative `x` the bitwise complement (which is non-negative) is
 * shifted and then complemented again; non-negative `x` is shifted
 * directly. */
VEC_FUNC_IMPL vec_intmax vec_rshift(vec_intmax x, unsigned int y)
{
	if (x < 0) {
		/* ~x >= 0 here, so the inner shift operates on a non-negative value */
		return ~(~x >> y);
	}

	return x >> y;
}

/* Signed left shift performed on the unsigned representation.
 *
 * `x` is stored into a union, shifted through the union's unsigned member,
 * and read back through the signed member, so the `<<` itself is never
 * applied to a signed operand. */
VEC_FUNC_IMPL vec_intmax vec_lshift(vec_intmax x, unsigned int y)
{
	union {
		vec_intmax as_signed;
		vec_uintmax as_unsigned;
	} bits;

	bits.as_signed = x;
	bits.as_unsigned = bits.as_unsigned << y;

	return bits.as_signed;
}

/* Signed average of `x` and `y`; this is the general algorithm vec uses for
 * its average implementation.
 *
 * Both operands are halved before they are added, so the full sum x + y is
 * never formed. The two discarded remainders (each in [-1, 1]) are then
 * folded back in: their halved sum is added, plus one more when that sum
 * leaves a remainder of exactly 1. */
VEC_FUNC_IMPL vec_intmax vec_imavg(vec_intmax x, vec_intmax y)
{
	const vec_intmax rem_sum = (x % 2) + (y % 2);
	vec_intmax avg = (x / 2) + (y / 2);

	avg += rem_sum / 2;
	if ((rem_sum % 2) == 1)
		avg += 1;

	return avg;
}

/* Unsigned average of `x` and `y`.
 *
 * Each operand is halved before the addition, so the full sum x + y is
 * never formed; one is added back when either operand has its low bit set. */
VEC_FUNC_IMPL vec_uintmax vec_imuavg(vec_uintmax x, vec_uintmax y)
{
	const vec_uintmax halves = (x >> 1) + (y >> 1);
	const vec_uintmax carry = (x | y) & 1;

	return halves + carry;
}

/* --------------------------------------------------------------- */
/* Array alignment macros */

/* VEC_ALIGNAS(x): alignment specifier for a declaration, picked from
 * (in order) C++11 / C23 `alignas`, C11 `_Alignas`, or the GNU `aligned`
 * attribute. Left undefined when none of them is available. */
#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
# define VEC_ALIGNAS(x) alignas(x)
#elif (__STDC_VERSION__ >= 201112L)
# define VEC_ALIGNAS(x) _Alignas(x)
#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
# define VEC_ALIGNAS(x) __attribute__((__aligned__(x)))
#endif

/* VEC_ALIGNED_ARRAY(type, var, length, align)
 *     declares `var` as storage for `length` elements of `type`, aligned
 *     to `align`.
 * VEC_ALIGNED_ARRAY_SIZEOF(var, align)
 *     size in bytes of the usable part of that storage.
 *
 * the alignment must be specified in bytes and must be a multiple of the
 * type size. it is always assumed that the type will be on a boundary of
 * its size, which may or may not be true */
#ifdef VEC_ALIGNAS
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_ALIGNAS(align) type var[length]
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(var))
#else
/* No alignment specifier available: over-allocate by (align - 1) elements
 * and round the start address up to the next multiple of `align`. The
 * result is reached through a union so that `var` points at an array of
 * `type`. In this branch `var` is a pointer, not an array. The mask is
 * computed in vec_uintptr so that a narrower unsigned `align` cannot
 * truncate the address. */
# define VEC_ALIGNED_ARRAY(type, var, length, align) \
	union vec_aligned_union_##var##_ { type arr[length]; }; \
	type vec_unaligned_##var##_[(length) + (align) - 1]; \
	type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + ((align) - 1)) & ~((vec_uintptr)(align) - 1)))->arr;
/* the padding was added in elements, so it is removed in elements too */
# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(vec_unaligned_##var##_) - (((align) - 1) * sizeof(*vec_unaligned_##var##_)))
#endif

/* Number of elements in an array declared with VEC_ALIGNED_ARRAY. `align`
 * must be the same value that was used for the declaration. */
#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
	(VEC_ALIGNED_ARRAY_SIZEOF(var, align) / sizeof(*(var)))

//////////////////////////////////////////////////////////////////////////////////////
// predefined variants for each vector type

//////////////////////////////////////////////////////////////////////////////////////
// 16-bit

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT */
#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT)
#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT)
#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) (VINT8x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT8x2_ALIGNMENT) == 0)

#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT)
#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT)
#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) (VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT8x2_ALIGNMENT) == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 32-bit

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT */
#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT)
#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT)
#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) (VINT8x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT8x4_ALIGNMENT) == 0)

#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT)
#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT)
#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) (VINT16x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT16x2_ALIGNMENT) == 0)

#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT)
#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT)
#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) (VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT8x4_ALIGNMENT) == 0)

#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT)
#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT)
#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) (VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT16x2_ALIGNMENT) == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 64-bit

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT */
#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) (VINT8x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT8x8_ALIGNMENT) == 0)

#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) (VINT16x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT16x4_ALIGNMENT) == 0)

#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) (VINT32x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT32x2_ALIGNMENT) == 0)

#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) (VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT8x8_ALIGNMENT) == 0)

#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) (VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT16x4_ALIGNMENT) == 0)

#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) (VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT32x2_ALIGNMENT) == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 128-bit

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT */
#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) (VINT8x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT8x16_ALIGNMENT) == 0)

#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) (VINT16x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT16x8_ALIGNMENT) == 0)

#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) (VINT32x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT32x4_ALIGNMENT) == 0)

#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) (VINT64x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT64x2_ALIGNMENT) == 0)

#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) (VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT8x16_ALIGNMENT) == 0)

#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) (VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT16x8_ALIGNMENT) == 0)

#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) (VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT32x4_ALIGNMENT) == 0)

#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) (VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT64x2_ALIGNMENT) == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 256-bit

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT */
#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) (VINT8x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT8x32_ALIGNMENT) == 0)

#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) (VINT16x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT16x16_ALIGNMENT) == 0)

#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) (VINT32x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT32x8_ALIGNMENT) == 0)

#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) (VINT64x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT64x4_ALIGNMENT) == 0)

#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) (VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT8x32_ALIGNMENT) == 0)

#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) (VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT16x16_ALIGNMENT) == 0)

#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) (VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT32x8_ALIGNMENT) == 0)

#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) (VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT64x4_ALIGNMENT) == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 512-bit

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT
 * Every *_PTR_ALIGNED here checks against its own 512-bit alignment. */
#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) (VINT8x64_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT8x64_ALIGNMENT) == 0)

#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) (VINT16x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT16x32_ALIGNMENT) == 0)

#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) (VINT32x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT32x16_ALIGNMENT) == 0)

#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) (VINT64x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VINT64x8_ALIGNMENT) == 0)

#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) (VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT8x64_ALIGNMENT) == 0)

#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) (VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT16x32_ALIGNMENT) == 0)

#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) (VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT32x16_ALIGNMENT) == 0)

#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) (VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VUINT64x8_ALIGNMENT) == 0)

/* --------------------------------------------------------------- */
/* floating point */

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT */
#define VF32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_f32, var, 2, VF32x2_ALIGNMENT)
#define VF32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VF32x2_ALIGNMENT)
#define VF32x2_ALIGNED_ARRAY_LENGTH(var) (VF32x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VF32x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VF32x2_ALIGNMENT) == 0)

#define VF32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_f32, var, 4, VF32x4_ALIGNMENT)
#define VF32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VF32x4_ALIGNMENT)
#define VF32x4_ALIGNED_ARRAY_LENGTH(var) (VF32x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VF32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VF32x4_ALIGNMENT) == 0)

#define VF32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_f32, var, 8, VF32x8_ALIGNMENT)
#define VF32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VF32x8_ALIGNMENT)
#define VF32x8_ALIGNED_ARRAY_LENGTH(var) (VF32x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VF32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VF32x8_ALIGNMENT) == 0)

#define VF32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_f32, var, 16, VF32x16_ALIGNMENT)
#define VF32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VF32x16_ALIGNMENT)
#define VF32x16_ALIGNED_ARRAY_LENGTH(var) (VF32x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VF32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VF32x16_ALIGNMENT) == 0)

/* --------------------------------------------------------------- */
/* double precision floating point */

/* For each vector type <T> below:
 *   <T>_ALIGNED_ARRAY(var)        declares `var` with one element per lane,
 *                                 aligned to <T>_ALIGNMENT
 *   <T>_ALIGNED_ARRAY_SIZEOF(var) size of that storage in bytes
 *   <T>_ALIGNED_ARRAY_LENGTH(var) number of elements in that storage
 *   <T>_PTR_ALIGNED(ptr)          nonzero when the address `ptr` is a
 *                                 multiple of <T>_ALIGNMENT */
#define VF64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_f64, var, 2, VF64x2_ALIGNMENT)
#define VF64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VF64x2_ALIGNMENT)
#define VF64x2_ALIGNED_ARRAY_LENGTH(var) (VF64x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VF64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VF64x2_ALIGNMENT) == 0)

#define VF64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_f64, var, 4, VF64x4_ALIGNMENT)
#define VF64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VF64x4_ALIGNMENT)
#define VF64x4_ALIGNED_ARRAY_LENGTH(var) (VF64x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VF64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VF64x4_ALIGNMENT) == 0)

#define VF64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_f64, var, 8, VF64x8_ALIGNMENT)
#define VF64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VF64x8_ALIGNMENT)
#define VF64x8_ALIGNED_ARRAY_LENGTH(var) (VF64x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VF64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % (VF64x8_ALIGNMENT) == 0)

/* --------------------------------------------------------------- */
/* Defines the structures for each vector type */

// 16-bit
/* Vector of two vec_uint8 lanes. All members overlay the same storage. */
typedef union {
#ifdef VEC_COMPILER_HAS_GCC_VECTORS
	/* GCC vector-extension view (2 bytes wide) */
	vec_uint8 __attribute__((__vector_size__(2))) gcc;
#endif
	/* plain array view, one element per lane */
	vec_uint8 generic[2];
} vuint8x2;

/* Vector of two vec_int8 lanes. All members overlay the same storage. */
typedef union {
#ifdef VEC_COMPILER_HAS_GCC_VECTORS
	/* GCC vector-extension view (2 bytes wide) */
	vec_int8 __attribute__((__vector_size__(2))) gcc;
#endif
	/* plain array view, one element per lane */
	vec_int8 generic[2];
} vint8x2;

// 32-bit
/* Vector of four vec_uint8 lanes. All members overlay the same storage. */
typedef union {
#ifdef VEC_COMPILER_HAS_GCC_VECTORS
	/* GCC vector-extension view (4 bytes wide) */
	vec_uint8 __attribute__((__vector_size__(4))) gcc;
#endif
	/* the same storage viewed as two half-width vectors */
	vuint8x2 dbl[2];

	/* plain array view, one element per lane */
	vec_uint8 generic[4];
} vuint8x4;

/* Vector of two vec_uint16 lanes. All members overlay the same storage.
 * There is no `dbl` view here: no single-lane vector type exists to split
 * into. */
typedef union {
#ifdef VEC_COMPILER_HAS_GCC_VECTORS
	/* GCC vector-extension view (4 bytes wide) */
	vec_uint16 __attribute__((__vector_size__(4))) gcc;
#endif
	/* plain array view, one element per lane */
	vec_uint16 generic[2];
} vuint16x2;

/* Vector of four vec_int8 lanes. All members overlay the same storage. */
typedef union {
#ifdef VEC_COMPILER_HAS_GCC_VECTORS
	/* GCC vector-extension view (4 bytes wide) */
	vec_int8 __attribute__((__vector_size__(4))) gcc;
#endif
	/* the same storage viewed as two half-width vectors */
	vint8x2 dbl[2];

	/* plain array view, one element per lane */
	vec_int8 generic[4];
} vint8x4;

/* Vector of two vec_int16 lanes. All members overlay the same storage.
 * There is no `dbl` view here: no single-lane vector type exists to split
 * into. */
typedef union {
#ifdef VEC_COMPILER_HAS_GCC_VECTORS
	/* GCC vector-extension view (4 bytes wide) */
	vec_int16 __attribute__((__vector_size__(4))) gcc;
#endif
	/* plain array view, one element per lane */
	vec_int16 generic[2];
} vint16x2;

// 64-bit
/* vuint8x8: eight vec_uint8 lanes.
 * The native views (`mmx`, `neon`, `gcc`) exist only when the matching
 * VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as two
 * vuint8x4 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_MMX)
	__m64 mmx;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	uint8x8_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint8 __attribute__((__vector_size__(8))) gcc;
#endif

	vuint8x4 dbl[2];

	vec_uint8 generic[8];
} vuint8x8;

/* vuint16x4: four vec_uint16 lanes.
 * The native views (`mmx`, `neon`, `gcc`) exist only when the matching
 * VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as two
 * vuint16x2 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_MMX)
	__m64 mmx;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	uint16x4_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint16 __attribute__((__vector_size__(8))) gcc;
#endif

	vuint16x2 dbl[2];

	vec_uint16 generic[4];
} vuint16x4;

/* vuint32x2: two vec_uint32 lanes.
 * The native views (`mmx`, `neon`, `gcc`) exist only when the matching
 * VEC_COMPILER_HAS_* macro is defined; `generic` is the plain array view.
 * This type has no `dbl` member. */
typedef union {
#if defined(VEC_COMPILER_HAS_MMX)
	__m64 mmx;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	uint32x2_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint32 __attribute__((__vector_size__(8))) gcc;
#endif

	vec_uint32 generic[2];
} vuint32x2;

/* vint8x8: eight vec_int8 lanes.
 * The native views (`mmx`, `neon`, `gcc`) exist only when the matching
 * VEC_COMPILER_HAS_* macro is defined; `generic` is the plain array view;
 * `dbl` exposes the vector as two vint8x4 halves. */
typedef union {
#if defined(VEC_COMPILER_HAS_MMX)
	__m64 mmx;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	int8x8_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int8 __attribute__((__vector_size__(8))) gcc;
#endif

	vec_int8 generic[8];

	vint8x4 dbl[2];
} vint8x8;

/* vint16x4: four vec_int16 lanes.
 * The native views (`mmx`, `neon`, `gcc`) exist only when the matching
 * VEC_COMPILER_HAS_* macro is defined; `generic` is the plain array view;
 * `dbl` exposes the vector as two vint16x2 halves. */
typedef union {
#if defined(VEC_COMPILER_HAS_MMX)
	__m64 mmx;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	int16x4_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int16 __attribute__((__vector_size__(8))) gcc;
#endif

	vec_int16 generic[4];

	vint16x2 dbl[2];
} vint16x4;

/* vint32x2: two vec_int32 lanes.
 * The native views (`mmx`, `neon`, `gcc`) exist only when the matching
 * VEC_COMPILER_HAS_* macro is defined; `generic` is the plain array view.
 * This type has no `dbl` member. */
typedef union {
#if defined(VEC_COMPILER_HAS_MMX)
	__m64 mmx;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	int32x2_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int32 __attribute__((__vector_size__(8))) gcc;
#endif

	vec_int32 generic[2];
} vint32x2;

// 128-bit
/* vuint8x16: sixteen vec_uint8 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as
 * two vuint8x8 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	vector unsigned char altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	uint8x16_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint8 __attribute__((__vector_size__(16))) gcc;
#endif
	vuint8x8 dbl[2];

	vec_uint8 generic[16];
} vuint8x16;

/* vuint16x8: eight vec_uint16 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as
 * two vuint16x4 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	vector unsigned short altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	uint16x8_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint16 __attribute__((__vector_size__(16))) gcc;
#endif
	vuint16x4 dbl[2];

	vec_uint16 generic[8];
} vuint16x8;

/* vuint32x4: four vec_uint32 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as
 * two vuint32x2 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	vector unsigned int altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	uint32x4_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint32 __attribute__((__vector_size__(16))) gcc;
#endif
	vuint32x2 dbl[2];

	vec_uint32 generic[4];
} vuint32x4;

/* vuint64x2: two vec_uint64 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; note that `altivec` is
 * guarded by VEC_COMPILER_HAS_ALTIVEC_VSX here. `generic` is the plain array
 * view. This type has no `dbl` member. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX)
	vector unsigned long long altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	uint64x2_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint64 __attribute__((__vector_size__(16))) gcc;
#endif
	vec_uint64 generic[2];
} vuint64x2;

/* vint8x16: sixteen vec_int8 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as
 * two vint8x8 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	vector signed char altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	int8x16_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int8 __attribute__((__vector_size__(16))) gcc;
#endif
	vint8x8 dbl[2];

	vec_int8 generic[16];
} vint8x16;

/* vint16x8: eight vec_int16 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as
 * two vint16x4 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	vector signed short altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	int16x8_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int16 __attribute__((__vector_size__(16))) gcc;
#endif
	vint16x4 dbl[2];

	vec_int16 generic[8];
} vint16x8;

/* vint32x4: four vec_int32 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; `dbl` exposes the vector as
 * two vint32x2 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	vector signed int altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	int32x4_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int32 __attribute__((__vector_size__(16))) gcc;
#endif
	vint32x2 dbl[2];

	vec_int32 generic[4];
} vint32x4;

/* vint64x2: two vec_int64 lanes.
 * The native views (`sse`, `altivec`, `neon`, `gcc`) exist only when the
 * matching VEC_COMPILER_HAS_* macro is defined; note that `altivec` is
 * guarded by VEC_COMPILER_HAS_ALTIVEC_VSX here. `generic` is the plain array
 * view. This type has no `dbl` member. */
typedef union {
#if defined(VEC_COMPILER_HAS_SSE2)
	__m128i sse;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX)
	vector signed long long altivec;
#endif
#if defined(VEC_COMPILER_HAS_NEON)
	int64x2_t neon;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int64 __attribute__((__vector_size__(16))) gcc;
#endif
	vec_int64 generic[2];
} vint64x2;

// 256-bit
/* vuint8x32: thirty-two vec_uint8 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint8x16 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint8 __attribute__((__vector_size__(32))) gcc;
#endif
	vuint8x16 dbl[2];

	vec_uint8 generic[32];
} vuint8x32;

/* vuint16x16: sixteen vec_uint16 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint16x8 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint16 __attribute__((__vector_size__(32))) gcc;
#endif
	vuint16x8 dbl[2];

	vec_uint16 generic[16];
} vuint16x16;

/* vuint32x8: eight vec_uint32 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint32x4 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint32 __attribute__((__vector_size__(32))) gcc;
#endif
	vuint32x4 dbl[2];

	vec_uint32 generic[8];
} vuint32x8;

/* vuint64x4: four vec_uint64 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint64x2 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint64 __attribute__((__vector_size__(32))) gcc;
#endif
	vuint64x2 dbl[2];

	vec_uint64 generic[4];
} vuint64x4;

/* vint8x32: thirty-two vec_int8 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint8x16 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int8 __attribute__((__vector_size__(32))) gcc;
#endif
	vint8x16 dbl[2];

	vec_int8 generic[32];
} vint8x32;

/* vint16x16: sixteen vec_int16 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint16x8 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int16 __attribute__((__vector_size__(32))) gcc;
#endif
	vint16x8 dbl[2];

	vec_int16 generic[16];
} vint16x16;

/* vint32x8: eight vec_int32 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint32x4 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int32 __attribute__((__vector_size__(32))) gcc;
#endif
	vint32x4 dbl[2];

	vec_int32 generic[8];
} vint32x8;

/* vint64x4: four vec_int64 lanes.
 * `avx2` and `gcc` exist only when VEC_COMPILER_HAS_AVX2 /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint64x2 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX2)
	__m256i avx2;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int64 __attribute__((__vector_size__(32))) gcc;
#endif
	vint64x2 dbl[2];

	vec_int64 generic[4];
} vint64x4;

// 512-bit
/* vuint8x64: sixty-four vec_uint8 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint8x32 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint8 __attribute__((__vector_size__(64))) gcc;
#endif
	vuint8x32 dbl[2];

	vec_uint8 generic[64];
} vuint8x64;

/* vuint16x32: thirty-two vec_uint16 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint16x16 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint16 __attribute__((__vector_size__(64))) gcc;
#endif
	vuint16x16 dbl[2];

	vec_uint16 generic[32];
} vuint16x32;

/* vuint32x16: sixteen vec_uint32 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint32x8 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint32 __attribute__((__vector_size__(64))) gcc;
#endif
	vuint32x8 dbl[2];

	vec_uint32 generic[16];
} vuint32x16;

/* vuint64x8: eight vec_uint64 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vuint64x4 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_uint64 __attribute__((__vector_size__(64))) gcc;
#endif
	vuint64x4 dbl[2];

	vec_uint64 generic[8];
} vuint64x8;

/* vint8x64: sixty-four vec_int8 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint8x32 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int8 __attribute__((__vector_size__(64))) gcc;
#endif
	vint8x32 dbl[2];

	vec_int8 generic[64];
} vint8x64;

/* vint16x32: thirty-two vec_int16 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint16x16 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int16 __attribute__((__vector_size__(64))) gcc;
#endif
	vint16x16 dbl[2];

	vec_int16 generic[32];
} vint16x32;

/* vint32x16: sixteen vec_int32 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint32x8 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int32 __attribute__((__vector_size__(64))) gcc;
#endif
	vint32x8 dbl[2];

	vec_int32 generic[16];
} vint32x16;

/* vint64x8: eight vec_int64 lanes.
 * `avx512f` and `gcc` exist only when VEC_COMPILER_HAS_AVX512F /
 * VEC_COMPILER_HAS_GCC_VECTORS are defined; `dbl` exposes the vector as two
 * vint64x4 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_AVX512F)
	__m512i avx512f;
#endif
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_int64 __attribute__((__vector_size__(64))) gcc;
#endif
	vint64x4 dbl[2];

	vec_int64 generic[8];
} vint64x8;

/* ------- Floating-point types */

/* vf32x2: two vec_f32 lanes.
 * `gcc` is the GCC vector-extension view (only with
 * VEC_COMPILER_HAS_GCC_VECTORS); `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_f32 __attribute__((__vector_size__(8))) gcc;
#endif
	vec_f32 generic[2];
} vf32x2;

/* vf32x4: four vec_f32 lanes.
 * `gcc` and `altivec` exist only when VEC_COMPILER_HAS_GCC_VECTORS /
 * VEC_COMPILER_HAS_ALTIVEC are defined; `dbl` exposes the vector as two
 * vf32x2 halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_f32 __attribute__((__vector_size__(16))) gcc;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC)
	vector float altivec;
#endif

	vf32x2 dbl[2];

	vec_f32 generic[4];
} vf32x4;

/* vf32x8: eight vec_f32 lanes.
 * `gcc` is the GCC vector-extension view (only with
 * VEC_COMPILER_HAS_GCC_VECTORS); `dbl` exposes the vector as two vf32x4
 * halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_f32 __attribute__((__vector_size__(32))) gcc;
#endif

	vf32x4 dbl[2];

	vec_f32 generic[8];
} vf32x8;

/* vf32x16: sixteen vec_f32 lanes.
 * `gcc` is the GCC vector-extension view (only with
 * VEC_COMPILER_HAS_GCC_VECTORS); `dbl` exposes the vector as two vf32x8
 * halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_f32 __attribute__((__vector_size__(64))) gcc;
#endif

	vf32x8 dbl[2];

	vec_f32 generic[16];
} vf32x16;

/* vf64x2: two vec_f64 lanes.
 * `gcc` and `altivec` exist only when VEC_COMPILER_HAS_GCC_VECTORS /
 * VEC_COMPILER_HAS_ALTIVEC_VSX are defined; `generic` is the plain array
 * view. This type has no `dbl` member. */
typedef union {
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_f64 __attribute__((__vector_size__(16))) gcc;
#endif
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX)
	vector double altivec;
#endif

	vec_f64 generic[2];
} vf64x2;

/* vf64x4: four vec_f64 lanes.
 * `gcc` is the GCC vector-extension view (only with
 * VEC_COMPILER_HAS_GCC_VECTORS); `dbl` exposes the vector as two vf64x2
 * halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_f64 __attribute__((__vector_size__(32))) gcc;
#endif

	vf64x2 dbl[2];

	vec_f64 generic[4];
} vf64x4;

/* vf64x8: eight vec_f64 lanes.
 * `gcc` is the GCC vector-extension view (only with
 * VEC_COMPILER_HAS_GCC_VECTORS); `dbl` exposes the vector as two vf64x4
 * halves; `generic` is the plain array view. */
typedef union {
#if defined(VEC_COMPILER_HAS_GCC_VECTORS)
	vec_f64 __attribute__((__vector_size__(64))) gcc;
#endif

	vf64x4 dbl[2];

	vec_f64 generic[8];
} vf64x8;

/* ------------------------------------------------------------------------ */
/* x86 */

#ifdef VEC_COMPILER_HAS_AVX512BW
# include "impl/x86/avx512bw.h"
#endif

#ifdef VEC_COMPILER_HAS_AVX512F
# include "impl/x86/avx512f.h"
#endif

#ifdef VEC_COMPILER_HAS_AVX2
# include "impl/x86/avx2.h"
#endif

#ifdef VEC_COMPILER_HAS_SSE42
# include "impl/x86/sse42.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE41
# include "impl/x86/sse41.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE3
# include "impl/x86/sse3.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE2
# include "impl/x86/sse2.h"
#endif
#ifdef VEC_COMPILER_HAS_MMX
# include "impl/x86/mmx.h"
#endif

/* ------------------------------------------------------------------------ */
/* PowerPC */

#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
# include "impl/ppc/vsx.h"
#endif

#ifdef VEC_COMPILER_HAS_ALTIVEC
# include "impl/ppc/altivec.h"
#endif

/* ------------------------------------------------------------------------ */
/* By this point, if we've defined native intrinsics, we'll just want to
 * double them, rather than use GCC extensions.
 * In particular with very old GCC, it can generate very bad asm that
 * can perform even worse than non-vectorized code. */

#include "impl/double.h"

/* ------------------------------------------------------------------------ */
/* Use GCC's vector extensions, if available. */

#ifdef VEC_COMPILER_HAS_GCC_VECTORS
# include "impl/gcc.h"
#endif

/* We don't need to double here, because gcc defines literally everything :) */

/* ------------------------------------------------------------------------ */
/* Fill in anything remaining with a generic array-based implementation. */

#include "impl/generic.h"

/* ------------------------------------------------------------------------ */

#ifdef __cplusplus
}
#endif

#endif /* VEC_VEC_H_ */
