/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_VEC_H_
#define VEC_VEC_H_

#ifdef __cplusplus
extern "C" {
#endif

// different on every implementation
#include "vec/types.h"

// Evaluates to non-zero when version a.b.c is at least version x.y.z,
// comparing major first, then minor, then patch.
// Every argument is fully parenthesized so that an expression argument
// (for example `1 | 2`) is compared as a whole instead of being split by
// operator precedence. Only integer operators are used, so the macro can
// also be evaluated inside #if.
#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \
	(((a) >= (x)) && \
	 ((a) > (x) || (b) >= (y)) && \
	 ((a) > (x) || (b) > (y) || (c) >= (z)))

// MSVC reports its version through differently shaped macros depending on
// the compiler's age, so normalise whatever is available into
// VEC_MSVC_VERSION_{MAJOR,MINOR,PATCH}. Nothing is defined on non-MSVC
// compilers.
#if defined(_MSC_FULL_VER)
# if (_MSC_FULL_VER >= 140000000)
// decoded as MMmmPPPPP (five patch digits)
#  define VEC_MSVC_VERSION_MAJOR (_MSC_FULL_VER / 10000000)
#  define VEC_MSVC_VERSION_MINOR ((_MSC_FULL_VER % 10000000) / 100000)
#  define VEC_MSVC_VERSION_PATCH (_MSC_FULL_VER % 100000)
# else
// decoded as MMmmPPPP (four patch digits)
#  define VEC_MSVC_VERSION_MAJOR (_MSC_FULL_VER / 1000000)
#  define VEC_MSVC_VERSION_MINOR ((_MSC_FULL_VER % 1000000) / 10000)
#  define VEC_MSVC_VERSION_PATCH (_MSC_FULL_VER % 10000)
# endif
#elif defined(_MSC_VER)
// only MMmm is available here, so the patch level is reported as zero
# define VEC_MSVC_VERSION_MAJOR (_MSC_VER / 100)
# define VEC_MSVC_VERSION_MINOR (_MSC_VER % 100)
# define VEC_MSVC_VERSION_PATCH (0)
#endif

// VEC_MSVC_ATLEAST(x, y, z): non-zero when the normalised MSVC version is
// at least x.y.z. When no MSVC version was detected it is the constant 0,
// so it is always safe to use in #if.
#if !defined(VEC_MSVC_VERSION_MAJOR)
# define VEC_MSVC_ATLEAST(x, y, z) (0)
#else
# define VEC_MSVC_ATLEAST(x, y, z) \
	VEC_SEMVER_ATLEAST(VEC_MSVC_VERSION_MAJOR, VEC_MSVC_VERSION_MINOR, VEC_MSVC_VERSION_PATCH, x, y, z)
#endif

// GNU C dialect detection (any compiler defining __GNUC__, not only GCC).
// VEC_GNUC_ATLEAST(x, y, z): non-zero when the advertised GNU C version is
// at least x.y.z; the constant 0 when __GNUC__ is not defined.
#if !defined(__GNUC__)
# define VEC_GNUC_ATLEAST(x, y, z) (0)
#else
# define VEC_GNUC_ATLEAST(x, y, z) \
	VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z)
#endif

// VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch): intended for use in #if.
// When the compiler provides __has_attribute, attribute `x` is queried
// directly and the version numbers are ignored; otherwise the answer falls
// back to a GNU C version check against major.minor.patch.
#ifndef __has_attribute
# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch)
#else
# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x)
#endif

// this isn't used anywhere (yet!) but still useful to have
// VEC_STATIC_ASSERT(x, msg): compile-time assertion on the constant
// expression `x`.
//   - C++11 and C23 onwards: static_assert
//   - C11 onwards:           _Static_assert
//   - anything older:        an extern function declaration whose array
//     bound contains sizeof of a struct with a bit-field of width -1 when
//     `x` is false (`msg` is not used by this form)
#if defined(__cplusplus) && (__cplusplus >= 201103L)
# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg)
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L)
# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg)
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg)
#else
# define VEC_STATIC_ASSERT(x, msg) \
	extern int (*vec_impl_Static_assert_function_(void)) \
		[!!sizeof (struct { int __error_if_negative: (x) ? 2 : -1; })]
#endif

//////////////////////////////////////////////////////////////////////////////
// Detect compiler SIMD support

// Current known alignments for each implementation, ordered by
// architecture and instruction set:
//
// /---------------------------------------------------\
// | Architecture | Instruction Set | Bits | Alignment |
// |---------------------------------------------------|
// | ARM          | NEON            | 64   | 8 bytes   |
// | ARM          | NEON            | 128  | 16 bytes  |
// | PowerPC      | AltiVec         | 128  | 16 bytes  |
// | x86          | MMX             | 64   | None?     |
// | x86          | SSE2            | 128  | 16 bytes  |
// | x86          | AVX2            | 256  | 32 bytes  |
// | x86          | AVX512-F        | 512  | 64 bytes  |
// \---------------------------------------------------/
//
// If these ever have to be extended or changed, there absolutely *must*
// be a new major release of vec, since that would change the ABI...

// Alignment, in bytes, used for each vector type. In every case below the
// value equals the total size of the vector in bytes (lane width times
// lane count).

// 16-bit
#define VINT8x2_ALIGNMENT   2
#define VUINT8x2_ALIGNMENT  2

// 32-bit
#define VINT8x4_ALIGNMENT   4
#define VINT16x2_ALIGNMENT  4
#define VUINT8x4_ALIGNMENT  4
#define VUINT16x2_ALIGNMENT 4

// 64-bit
#define VINT8x8_ALIGNMENT   8
#define VINT16x4_ALIGNMENT  8
#define VINT32x2_ALIGNMENT  8
#define VUINT8x8_ALIGNMENT  8
#define VUINT16x4_ALIGNMENT 8
#define VUINT32x2_ALIGNMENT 8

// 128-bit
#define VINT8x16_ALIGNMENT  16
#define VINT16x8_ALIGNMENT  16
#define VINT32x4_ALIGNMENT  16
#define VINT64x2_ALIGNMENT  16
#define VUINT8x16_ALIGNMENT 16
#define VUINT16x8_ALIGNMENT 16
#define VUINT32x4_ALIGNMENT 16
#define VUINT64x2_ALIGNMENT 16

// 256-bit
#define VINT8x32_ALIGNMENT   32
#define VINT16x16_ALIGNMENT  32
#define VINT32x8_ALIGNMENT   32
#define VINT64x4_ALIGNMENT   32
#define VUINT8x32_ALIGNMENT  32
#define VUINT16x16_ALIGNMENT 32
#define VUINT32x8_ALIGNMENT  32
#define VUINT64x4_ALIGNMENT  32

// 512-bit
#define VINT8x64_ALIGNMENT   64
#define VINT16x32_ALIGNMENT  64
#define VINT32x16_ALIGNMENT  64
#define VINT64x8_ALIGNMENT   64
#define VUINT8x64_ALIGNMENT  64
#define VUINT16x32_ALIGNMENT 64
#define VUINT32x16_ALIGNMENT 64
#define VUINT64x8_ALIGNMENT  64

//////////////////////////////////////////////////////////////////////////////
// portable bit shift

// these functions aren't very necessary :/
// Returns `x` shifted right by `y` bits.
// `y` must be smaller than the bit width of vec_uintmax; C leaves larger
// shift counts undefined and no check is made here.
inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y)
{
	x >>= y;
	return x;
}

// Returns `x` shifted left by `y` bits.
// `y` must be smaller than the bit width of vec_uintmax; C leaves larger
// shift counts undefined and no check is made here.
inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y)
{
	const vec_uintmax shifted = x << y;

	return shifted;
}

// Returns `x` shifted right by `y` bits (same computation as vec_lrshift).
// `y` must be smaller than the bit width of vec_uintmax; C leaves larger
// shift counts undefined and no check is made here.
inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y)
{
	const vec_uintmax shifted = x >> y;

	return shifted;
}

// Returns `x` shifted left by `y` bits (same computation as vec_llshift).
// `y` must be smaller than the bit width of vec_uintmax; C leaves larger
// shift counts undefined and no check is made here.
inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y)
{
	x <<= y;
	return x;
}

/**
 * Arithmetic shifts; based off code from OpenMPT, which is under
 * the Boost Software License:
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 * 
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
**/
// Arithmetic right shift of the signed value `x` by `y` bits, computed
// entirely with unsigned operations so the result does not rely on how the
// compiler implements `>>` on negative values.
//
// `y` must be smaller than the bit width of vec_intmax (the shifts below
// are otherwise undefined). The bit width is computed as sizeof * 8, i.e.
// 8-bit bytes are assumed.
// NOTE(review): the union reinterpretation presumably assumes a two's
// complement representation for vec_intmax -- confirm for exotic targets.
inline vec_intmax vec_rshift(vec_intmax x, unsigned int y)
{
	// only the top (sign) bit set
	static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1);

	// reinterpret the signed bits as unsigned and back without casts
	union {
		vec_intmax d;
		vec_uintmax u;
	} xx;

	xx.d = x;

	// bias the value by the sign-bit offset so the unsigned shift preserves
	// ordering, shift, then remove the (equally shifted) bias again
	xx.u += roffset;
	xx.u >>= y;
	xx.u -= roffset >> y;

	return xx.d;
}

// Left shift of the signed value `x` by `y` bits, computed with unsigned
// operations only (left-shifting a negative signed value directly is
// undefined in C). Mirrors vec_rshift step for step; because unsigned
// arithmetic wraps, the added and subtracted offsets cancel and the result
// has the same bit pattern as shifting the unsigned representation of `x`.
//
// `y` must be smaller than the bit width of vec_intmax. The bit width is
// computed as sizeof * 8, i.e. 8-bit bytes are assumed.
inline vec_intmax vec_lshift(vec_intmax x, unsigned int y)
{
	// only the top (sign) bit set
	static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1);

	// reinterpret the signed bits as unsigned and back without casts
	union {
		vec_intmax d;
		vec_uintmax u;
	} xx;

	xx.d = x;

	// add the offset, shift, then subtract the equally shifted offset
	xx.u += roffset;
	xx.u <<= y;
	xx.u -= roffset << y;

	return xx.d;
}

//////////////////////////////////////////////////////////////////////////////
// array alignment

// VEC_ALIGNOF(type): alignment requirement of `type`.
//   - C++11 and C23 onwards: alignof
//   - C11 onwards:           _Alignof
//   - older, with HAVE_STDDEF_H defined: offsetof of a member placed after
//     a single char
//   - otherwise: the same member-offset computation written out by hand
#if defined(__cplusplus) && (__cplusplus >= 201103L)
# define VEC_ALIGNOF(type) alignof(type)
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L)
# define VEC_ALIGNOF(type) alignof(type)
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define VEC_ALIGNOF(type) _Alignof(type)
#elif defined(HAVE_STDDEF_H) // already included
# define VEC_ALIGNOF(type) \
	(offsetof(struct { char slot1; type slot2; }, slot2))
#else
// inline offsetof
# define VEC_ALIGNOF(type) \
	((vec_uintsize)((char *)&((struct { char slot1; type slot2; } *)0)->slot2 - (char *)0))
#endif

// VEC_ALIGNAS(x): specifier that requests `x`-byte alignment for the
// declaration it prefixes. Picks, in order: the standard keyword for the
// language version in effect, the GNU `aligned` attribute, or MSVC's
// __declspec(align). Compilation stops with an error when none is
// available.
#if defined(__cplusplus) && (__cplusplus >= 201103L)
# define VEC_ALIGNAS(x) alignas(x)
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L)
# define VEC_ALIGNAS(x) alignas(x)
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define VEC_ALIGNAS(x) _Alignas(x)
#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
# define VEC_ALIGNAS(x) __attribute__((__aligned__(x)))
#elif VEC_MSVC_ATLEAST(0, 0, 0) // FIXME which version?
# define VEC_ALIGNAS(x) __declspec(align(x))
#else
# error vec: vec requires compiler alignment support
#endif

// this wart is here because originally vec didn't require that
// there be compiler support for alignment. now that we *do*,
// we should at least keep providing this macro...
// Generic helpers for aligned arrays:
//   VEC_ALIGNED_ARRAY(type, var, length, align)
//       declares `type var[length]` aligned to `align` bytes
//   VEC_ALIGNED_ARRAY_SIZEOF(var, align)
//       size of `var` in bytes
//   VEC_ALIGNED_ARRAY_LENGTH(var, align)
//       number of elements in `var`
// `align` is not needed to compute the size or length, but all three macros
// take it so they share one calling convention. LENGTH takes the same
// (var, align) pair as SIZEOF and forwards it; with a single parameter it
// could not expand at all, because SIZEOF requires two arguments.
// No #ifdef VEC_ALIGNAS guard is used: a missing VEC_ALIGNAS is already a
// hard #error, and macros are only expanded at their point of use.
#define VEC_ALIGNED_ARRAY(type, var, length, align) \
	VEC_ALIGNAS(align) type var[length]
#define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \
	(sizeof(var))
#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \
	(VEC_ALIGNED_ARRAY_SIZEOF(var, align) / sizeof(*(var)))

//////////////////////////////////////////////////////////////////////////////////////
// predefined variants for each vector type

//////////////////////////////////////////////////////////////////////////////////////
// 16-bit

// Per-type helpers for the 16-bit vector types:
//   <T>_ALIGNED_ARRAY(var)         declares a correctly aligned lane array
//   <T>_ALIGNED_ARRAY_SIZEOF(var)  size of such an array in bytes
//   <T>_ALIGNED_ARRAY_LENGTH(var)  number of elements in such an array
//   <T>_PTR_ALIGNED(ptr)           non-zero when `ptr` meets <T>_ALIGNMENT
// LENGTH is derived from the per-type SIZEOF so it does not depend on the
// arity of the generic length macro; `var` and `ptr` are parenthesized so
// expression arguments such as `base + 1` are handled as a whole.
#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT)
#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT)
#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) (VINT8x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x2_ALIGNMENT == 0)

#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT)
#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT)
#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) (VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 32-bit

// Per-type helpers for the 32-bit vector types:
//   <T>_ALIGNED_ARRAY(var)         declares a correctly aligned lane array
//   <T>_ALIGNED_ARRAY_SIZEOF(var)  size of such an array in bytes
//   <T>_ALIGNED_ARRAY_LENGTH(var)  number of elements in such an array
//   <T>_PTR_ALIGNED(ptr)           non-zero when `ptr` meets <T>_ALIGNMENT
// LENGTH is derived from the per-type SIZEOF so it does not depend on the
// arity of the generic length macro; `var` and `ptr` are parenthesized so
// expression arguments such as `base + 1` are handled as a whole.
#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT)
#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT)
#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) (VINT8x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x4_ALIGNMENT == 0)

#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT)
#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT)
#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) (VINT16x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x2_ALIGNMENT == 0)

#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT)
#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT)
#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) (VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x4_ALIGNMENT == 0)

#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT)
#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT)
#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) (VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 64-bit

// Per-type helpers for the 64-bit vector types:
//   <T>_ALIGNED_ARRAY(var)         declares a correctly aligned lane array
//   <T>_ALIGNED_ARRAY_SIZEOF(var)  size of such an array in bytes
//   <T>_ALIGNED_ARRAY_LENGTH(var)  number of elements in such an array
//   <T>_PTR_ALIGNED(ptr)           non-zero when `ptr` meets <T>_ALIGNMENT
// LENGTH is derived from the per-type SIZEOF so it does not depend on the
// arity of the generic length macro; `var` and `ptr` are parenthesized so
// expression arguments such as `base + 1` are handled as a whole.
#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT)
#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) (VINT8x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x8_ALIGNMENT == 0)

#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT)
#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) (VINT16x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x4_ALIGNMENT == 0)

#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT)
#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) (VINT32x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x2_ALIGNMENT == 0)

#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT)
#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) (VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x8_ALIGNMENT == 0)

#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT)
#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) (VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x4_ALIGNMENT == 0)

#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT)
#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) (VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 128-bit

// Per-type helpers for the 128-bit vector types:
//   <T>_ALIGNED_ARRAY(var)         declares a correctly aligned lane array
//   <T>_ALIGNED_ARRAY_SIZEOF(var)  size of such an array in bytes
//   <T>_ALIGNED_ARRAY_LENGTH(var)  number of elements in such an array
//   <T>_PTR_ALIGNED(ptr)           non-zero when `ptr` meets <T>_ALIGNMENT
// LENGTH is derived from the per-type SIZEOF so it does not depend on the
// arity of the generic length macro; `var` and `ptr` are parenthesized so
// expression arguments such as `base + 1` are handled as a whole.
#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT)
#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) (VINT8x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x16_ALIGNMENT == 0)

#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT)
#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) (VINT16x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x8_ALIGNMENT == 0)

#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT)
#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) (VINT32x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x4_ALIGNMENT == 0)

#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT)
#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) (VINT64x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x2_ALIGNMENT == 0)

#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT)
#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) (VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x16_ALIGNMENT == 0)

#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT)
#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) (VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x8_ALIGNMENT == 0)

#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT)
#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) (VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x4_ALIGNMENT == 0)

#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT)
#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) (VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x2_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 256-bit

// Per-type helpers for the 256-bit vector types:
//   <T>_ALIGNED_ARRAY(var)         declares a correctly aligned lane array
//   <T>_ALIGNED_ARRAY_SIZEOF(var)  size of such an array in bytes
//   <T>_ALIGNED_ARRAY_LENGTH(var)  number of elements in such an array
//   <T>_PTR_ALIGNED(ptr)           non-zero when `ptr` meets <T>_ALIGNMENT
// LENGTH is derived from the per-type SIZEOF so it does not depend on the
// arity of the generic length macro; `var` and `ptr` are parenthesized so
// expression arguments such as `base + 1` are handled as a whole.
#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT)
#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) (VINT8x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x32_ALIGNMENT == 0)

#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT)
#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) (VINT16x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x16_ALIGNMENT == 0)

#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT)
#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) (VINT32x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x8_ALIGNMENT == 0)

#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT)
#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) (VINT64x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x4_ALIGNMENT == 0)

#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT)
#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) (VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x32_ALIGNMENT == 0)

#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT)
#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) (VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x16_ALIGNMENT == 0)

#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT)
#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) (VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x8_ALIGNMENT == 0)

#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT)
#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) (VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x4_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////////////
// 512-bit

// Per-type helpers for the 512-bit vector types:
//   <T>_ALIGNED_ARRAY(var)         declares a correctly aligned lane array
//   <T>_ALIGNED_ARRAY_SIZEOF(var)  size of such an array in bytes
//   <T>_ALIGNED_ARRAY_LENGTH(var)  number of elements in such an array
//   <T>_PTR_ALIGNED(ptr)           non-zero when `ptr` meets <T>_ALIGNMENT
// LENGTH is derived from the per-type SIZEOF so it does not depend on the
// arity of the generic length macro; `var` and `ptr` are parenthesized so
// expression arguments such as `base + 1` are handled as a whole.
// Every PTR_ALIGNED check uses its own type's alignment constant: the 16x32
// types must be tested against their 64-byte alignment, not the 32-byte
// alignment of the 16x16 types.
#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT)
#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) (VINT8x64_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT8x64_ALIGNMENT == 0)

#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT)
#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) (VINT16x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT16x32_ALIGNMENT == 0)

#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT)
#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) (VINT32x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT32x16_ALIGNMENT == 0)

#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT)
#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) (VINT64x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VINT64x8_ALIGNMENT == 0)

#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT)
#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) (VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT8x64_ALIGNMENT == 0)

#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT)
#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) (VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT16x32_ALIGNMENT == 0)

#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT)
#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) (VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT32x16_ALIGNMENT == 0)

#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT)
#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) (VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) / sizeof(*(var)))
#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)(ptr)) % VUINT64x8_ALIGNMENT == 0)

//////////////////////////////////////////////////////////////////////////////
// Defines the structures for each vector type

// 16-bit
// 16-bit vector types. Each is a struct holding 2 raw bytes aligned to the
// type's *_ALIGNMENT. The signed and unsigned variants use the same
// vec_uint8 byte storage.
typedef struct {
	VEC_ALIGNAS(VUINT8x2_ALIGNMENT) vec_uint8 bytes[2];
} vuint8x2;

typedef struct {
	VEC_ALIGNAS(VINT8x2_ALIGNMENT) vec_uint8 bytes[2];
} vint8x2;

// 32-bit
// 32-bit vector types. Each is a struct holding 4 raw bytes aligned to the
// type's *_ALIGNMENT, regardless of lane width or signedness.
typedef struct {
	VEC_ALIGNAS(VUINT8x4_ALIGNMENT) vec_uint8 bytes[4];
} vuint8x4;

typedef struct {
	VEC_ALIGNAS(VUINT16x2_ALIGNMENT) vec_uint8 bytes[4];
} vuint16x2;

typedef struct {
	VEC_ALIGNAS(VINT8x4_ALIGNMENT) vec_uint8 bytes[4];
} vint8x4;

typedef struct {
	VEC_ALIGNAS(VINT16x2_ALIGNMENT) vec_uint8 bytes[4];
} vint16x2;

// 64-bit
// 64-bit vector types. Each is a struct holding 8 raw bytes aligned to the
// type's *_ALIGNMENT, regardless of lane width or signedness.
typedef struct {
	VEC_ALIGNAS(VUINT8x8_ALIGNMENT) vec_uint8 bytes[8];
} vuint8x8;

typedef struct {
	VEC_ALIGNAS(VUINT16x4_ALIGNMENT) vec_uint8 bytes[8];
} vuint16x4;

typedef struct {
	VEC_ALIGNAS(VUINT32x2_ALIGNMENT) vec_uint8 bytes[8];
} vuint32x2;

typedef struct {
	VEC_ALIGNAS(VINT8x8_ALIGNMENT) vec_uint8 bytes[8];
} vint8x8;

typedef struct {
	VEC_ALIGNAS(VINT16x4_ALIGNMENT) vec_uint8 bytes[8];
} vint16x4;

typedef struct {
	VEC_ALIGNAS(VINT32x2_ALIGNMENT) vec_uint8 bytes[8];
} vint32x2;

// 128-bit
// 128-bit vector types. Each is a single-member union holding 16 raw bytes
// aligned to the type's *_ALIGNMENT, regardless of lane width or
// signedness. (The narrower vector types are declared as structs; from
// 128 bits upward they are unions.)
typedef union {
	VEC_ALIGNAS(VUINT8x16_ALIGNMENT) vec_uint8 bytes[16];
} vuint8x16;

typedef union {
	VEC_ALIGNAS(VUINT16x8_ALIGNMENT) vec_uint8 bytes[16];
} vuint16x8;

typedef union {
	VEC_ALIGNAS(VUINT32x4_ALIGNMENT) vec_uint8 bytes[16];
} vuint32x4;

typedef union {
	VEC_ALIGNAS(VUINT64x2_ALIGNMENT) vec_uint8 bytes[16];
} vuint64x2;

typedef union {
	VEC_ALIGNAS(VINT8x16_ALIGNMENT) vec_uint8 bytes[16];
} vint8x16;

typedef union {
	VEC_ALIGNAS(VINT16x8_ALIGNMENT) vec_uint8 bytes[16];
} vint16x8;

typedef union {
	VEC_ALIGNAS(VINT32x4_ALIGNMENT) vec_uint8 bytes[16];
} vint32x4;

typedef union {
	VEC_ALIGNAS(VINT64x2_ALIGNMENT) vec_uint8 bytes[16];
} vint64x2;

// 256-bit
// 256-bit vector types. Each is a single-member union holding 32 raw bytes
// aligned to the type's *_ALIGNMENT, regardless of lane width or
// signedness.
typedef union {
	VEC_ALIGNAS(VUINT8x32_ALIGNMENT) vec_uint8 bytes[32];
} vuint8x32;

typedef union {
	VEC_ALIGNAS(VUINT16x16_ALIGNMENT) vec_uint8 bytes[32];
} vuint16x16;

typedef union {
	VEC_ALIGNAS(VUINT32x8_ALIGNMENT) vec_uint8 bytes[32];
} vuint32x8;

typedef union {
	VEC_ALIGNAS(VUINT64x4_ALIGNMENT) vec_uint8 bytes[32];
} vuint64x4;

typedef union {
	VEC_ALIGNAS(VINT8x32_ALIGNMENT) vec_uint8 bytes[32];
} vint8x32;

typedef union {
	VEC_ALIGNAS(VINT16x16_ALIGNMENT) vec_uint8 bytes[32];
} vint16x16;

typedef union {
	VEC_ALIGNAS(VINT32x8_ALIGNMENT) vec_uint8 bytes[32];
} vint32x8;

typedef union {
	VEC_ALIGNAS(VINT64x4_ALIGNMENT) vec_uint8 bytes[32];
} vint64x4;

// 512-bit
// 512-bit vector types. Each is a single-member union holding 64 raw bytes
// aligned to the type's *_ALIGNMENT, regardless of lane width or
// signedness.
typedef union {
	VEC_ALIGNAS(VUINT8x64_ALIGNMENT) vec_uint8 bytes[64];
} vuint8x64;

typedef union {
	VEC_ALIGNAS(VUINT16x32_ALIGNMENT) vec_uint8 bytes[64];
} vuint16x32;

typedef union {
	VEC_ALIGNAS(VUINT32x16_ALIGNMENT) vec_uint8 bytes[64];
} vuint32x16;

typedef union {
	VEC_ALIGNAS(VUINT64x8_ALIGNMENT) vec_uint8 bytes[64];
} vuint64x8;

typedef union {
	VEC_ALIGNAS(VINT8x64_ALIGNMENT) vec_uint8 bytes[64];
} vint8x64;

typedef union {
	VEC_ALIGNAS(VINT16x32_ALIGNMENT) vec_uint8 bytes[64];
} vint16x32;

typedef union {
	VEC_ALIGNAS(VINT32x16_ALIGNMENT) vec_uint8 bytes[64];
} vint32x16;

typedef union {
	VEC_ALIGNAS(VINT64x8_ALIGNMENT) vec_uint8 bytes[64];
} vint64x8;

// ---------------------------------------------------------------------------------
// function declarations

// Library initialisation entry point; only the declaration lives in this
// header.
// NOTE(review): presumably this must run before any *_impl_cpu table is
// used, and the int is a status code -- confirm against the definition.
int vec_init(void);

//////////////////////////////////////////////////////////////////////////////
// these are, for the most part, meant to be used internally

// okay, these are filled in for each supported backend.
// `and', `or', `xor', and `not' have to be prefixed with
// `b' because of <iso646.h>/cxxisms
// VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) declares the typedef
// v<sign>int<bits>x<size>_impl: a table of function pointers covering
// every operation on that vector type.
//   sign  empty for the signed type, `u` for the unsigned type
//   bits  width of one lane in bits
//   size  number of lanes
// Points worth knowing when filling in a table:
//   - load/store take arrays of `size` scalar lanes (vec_<sign>int<bits>)
//   - the shift entries take the shift amounts as the *unsigned* vector of
//     the same shape, for both signed and unsigned tables
//   - the cmp* entries return a vector of the same type as their operands
#define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \
	typedef struct { \
		v##sign##int##bits##x##size (*splat)(vec_##sign##int##bits x); \
		v##sign##int##bits##x##size (*load_aligned)(const vec_##sign##int##bits in[size]); \
		v##sign##int##bits##x##size (*load)(const vec_##sign##int##bits in[size]); \
		void (*store_aligned)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
		void (*store)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
		v##sign##int##bits##x##size (*add)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*sub)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*mul)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*div)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*avg)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*band)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*bor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*bxor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*bnot)(v##sign##int##bits##x##size vec); \
		v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
		v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
		v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmplt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmple)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmpeq)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmpge)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
		v##sign##int##bits##x##size (*cmpgt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	} v##sign##int##bits##x##size##_impl;

// Declares both the signed (empty `sign`) and the unsigned (`u`) table type
// for one lane width / lane count combination.
#define VEC_DEFINE_IMPL_STRUCT(bits, size) \
	VEC_DEFINE_IMPL_STRUCT_SIGN( , bits, size) \
	VEC_DEFINE_IMPL_STRUCT_SIGN(u, bits, size)

// Declare the signed and unsigned implementation-table types for every
// supported vector shape, grouped by total vector width. The helper macros
// are #undef'd afterwards so they do not leak to users of this header.

// 16-bit
VEC_DEFINE_IMPL_STRUCT(8, 2)

// 32-bit
VEC_DEFINE_IMPL_STRUCT(8, 4)
VEC_DEFINE_IMPL_STRUCT(16, 2)

// 64-bit
VEC_DEFINE_IMPL_STRUCT(8, 8)
VEC_DEFINE_IMPL_STRUCT(16, 4)
VEC_DEFINE_IMPL_STRUCT(32, 2)

// 128-bit
VEC_DEFINE_IMPL_STRUCT(8, 16)
VEC_DEFINE_IMPL_STRUCT(16, 8)
VEC_DEFINE_IMPL_STRUCT(32, 4)
VEC_DEFINE_IMPL_STRUCT(64, 2)

// 256-bit
VEC_DEFINE_IMPL_STRUCT(8, 32)
VEC_DEFINE_IMPL_STRUCT(16, 16)
VEC_DEFINE_IMPL_STRUCT(32, 8)
VEC_DEFINE_IMPL_STRUCT(64, 4)

// 512-bit
VEC_DEFINE_IMPL_STRUCT(8, 64)
VEC_DEFINE_IMPL_STRUCT(16, 32)
VEC_DEFINE_IMPL_STRUCT(32, 16)
VEC_DEFINE_IMPL_STRUCT(64, 8)

#undef VEC_DEFINE_IMPL_STRUCT
#undef VEC_DEFINE_IMPL_STRUCT_SIGN

// One pointer per vector type to the implementation table in use for it.
// The tables themselves are const; the pointers are not, so they can be
// re-pointed. They are only declared here and defined elsewhere.
// NOTE(review): presumably assigned by vec_init() according to the detected
// CPU features -- confirm against the definition.

// 16-bit
extern const vint8x2_impl   *vint8x2_impl_cpu;
extern const vuint8x2_impl  *vuint8x2_impl_cpu;

// 32-bit
extern const vint8x4_impl   *vint8x4_impl_cpu;
extern const vuint8x4_impl  *vuint8x4_impl_cpu;
extern const vint16x2_impl  *vint16x2_impl_cpu;
extern const vuint16x2_impl *vuint16x2_impl_cpu;

// 64-bit
extern const vint8x8_impl   *vint8x8_impl_cpu;
extern const vuint8x8_impl  *vuint8x8_impl_cpu;
extern const vint16x4_impl  *vint16x4_impl_cpu;
extern const vuint16x4_impl *vuint16x4_impl_cpu;
extern const vint32x2_impl  *vint32x2_impl_cpu;
extern const vuint32x2_impl *vuint32x2_impl_cpu;

// 128-bit
extern const vint8x16_impl  *vint8x16_impl_cpu;
extern const vuint8x16_impl *vuint8x16_impl_cpu;
extern const vint16x8_impl  *vint16x8_impl_cpu;
extern const vuint16x8_impl *vuint16x8_impl_cpu;
extern const vint32x4_impl  *vint32x4_impl_cpu;
extern const vuint32x4_impl *vuint32x4_impl_cpu;
extern const vint64x2_impl  *vint64x2_impl_cpu;
extern const vuint64x2_impl *vuint64x2_impl_cpu;

// 256-bit
extern const vint8x32_impl   *vint8x32_impl_cpu;
extern const vuint8x32_impl  *vuint8x32_impl_cpu;
extern const vint16x16_impl  *vint16x16_impl_cpu;
extern const vuint16x16_impl *vuint16x16_impl_cpu;
extern const vint32x8_impl   *vint32x8_impl_cpu;
extern const vuint32x8_impl  *vuint32x8_impl_cpu;
extern const vint64x4_impl   *vint64x4_impl_cpu;
extern const vuint64x4_impl  *vuint64x4_impl_cpu;

// 512-bit
extern const vint8x64_impl  *vint8x64_impl_cpu;
extern const vuint8x64_impl *vuint8x64_impl_cpu;
extern const vint16x32_impl  *vint16x32_impl_cpu;
extern const vuint16x32_impl *vuint16x32_impl_cpu;
extern const vint32x16_impl  *vint32x16_impl_cpu;
extern const vuint32x16_impl *vuint32x16_impl_cpu;
extern const vint64x8_impl  *vint64x8_impl_cpu;
extern const vuint64x8_impl *vuint64x8_impl_cpu;

//////////////////////////////////////////////////////////////////////////////
// declared as inline for  ! performance : )

// VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size)
//
// Emits the public inline wrappers for one vector type,
// v<sign>int<bits>x<size> (`sign` is empty for the signed type and `u` for
// the unsigned one; element type is vec_<sign>int<bits>, `size` elements).
//
// Every wrapper forwards its arguments unchanged to the member of
// v<sign>int<bits>x<size>_impl_cpu with the same name, and returns whatever
// that member returns.  The only renames are the bitwise wrappers:
// _and/_or/_xor/_not call the members band/bor/bxor/bnot.
//
// The three shift wrappers take their second operand as the *unsigned*
// vector of the same shape (vuint<bits>x<size>), regardless of `sign`.
//
// The two store wrappers return void, so they call the member without
// `return`: ISO C does not allow `return <expr>;` in a void function.
//
// NOTE(review): the wrappers are plain `inline` rather than `static inline`;
// under C99 rules some translation unit must then also supply the external
// definitions -- presumably the library source does; confirm before touching
// the linkage.
//
// Only /* */ comments may be used inside the macro body: a // comment would
// swallow the following backslash-continued lines.
#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	/* construction and memory access */ \
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->splat(x); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->load(in); \
	} \
	\
	inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \
	} \
	\
	inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \
	} \
	\
	/* arithmetic */ \
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \
	} \
	\
	/* bitwise: public names and/or/xor/not map to members band/bor/bxor/bnot */ \
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \
	} \
	\
	/* comparisons: the result has the same vector type as the operands */ \
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \
	} \
	\
	/* shifts: vec2 is always the unsigned vector of the same shape */ \
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \
	} \
	\
	inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \
	{ \
		return v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \
	}

// Instantiates the wrappers for both signednesses of one <bits>x<size>
// shape: the empty first argument produces the vint<bits>x<size> names,
// `u` produces the vuint<bits>x<size> names.
#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)

// One instantiation per vector shape (element bits, element count), grouped
// by total vector width.  The list mirrors the *_impl_cpu declarations that
// the generated wrappers dereference.

// 16-bit
VEC_DEFINE_OPERATIONS(8, 2)

// 32-bit
VEC_DEFINE_OPERATIONS(8, 4)
VEC_DEFINE_OPERATIONS(16, 2)

// 64-bit
VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)

// 128-bit
VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

// 256-bit
VEC_DEFINE_OPERATIONS(8, 32)
VEC_DEFINE_OPERATIONS(16, 16)
VEC_DEFINE_OPERATIONS(32, 8)
VEC_DEFINE_OPERATIONS(64, 4)

// 512-bit
VEC_DEFINE_OPERATIONS(8, 64)
VEC_DEFINE_OPERATIONS(16, 32)
VEC_DEFINE_OPERATIONS(32, 16)
VEC_DEFINE_OPERATIONS(64, 8)

// The generator macros are only needed above; undefine them so they do not
// leak into code that includes this header.
#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OPERATIONS_SIGN

#ifdef __cplusplus
}
#endif

#endif /* VEC_VEC_H_ */
