view src/vec.c @ 28:c6c99ab1088a

*: add min/max functions and a big big refactor (again) agh, this time I added a few more implementations (and generally made the code just a little faster...)
author Paper <paper@tflc.us>
date Thu, 24 Apr 2025 00:54:02 -0400
parents 92156fe32755
children e59c91d050c0
line wrap: on
line source

/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#include "vec/vec.h"
#include "vec/cpu.h"
#include "vec/impl/generic.h"
#include "vec/impl/fallback.h"
#ifdef VEC_COMPILER_HAS_MMX
# include "vec/impl/x86/mmx.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE2
# include "vec/impl/x86/sse2.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE3
# include "vec/impl/x86/sse3.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE41
# include "vec/impl/x86/sse41.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE42
# include "vec/impl/x86/sse42.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX2
# include "vec/impl/x86/avx2.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
# include "vec/impl/x86/avx512f.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512BW
# include "vec/impl/x86/avx512bw.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512DQ
# include "vec/impl/x86/avx512dq.h"
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
# include "vec/impl/ppc/altivec.h"
#endif
#ifdef VEC_COMPILER_HAS_NEON
# include "vec/impl/arm/neon.h"
#endif

/* C99 `extern inline` semantics: the headers declare these helpers as
 * plain `inline`, which by itself emits no callable symbol. These
 * declarations force exactly one external definition of each helper to
 * be emitted into this translation unit so non-inlined calls link. */
extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);

extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);

/* Runtime dispatch tables, one per vector type (sign x element width x
 * lane count). Each *_impl_cpu struct holds the function pointers the
 * public API actually calls; vec_init() fills them from the best
 * backend the running CPU supports and tops up any remaining gaps with
 * the generic implementations. Zero-initialized so an unfilled slot is
 * a null pointer (which is what FILL_GIVEN_FUNC_PTR tests for). */

// 16-bit
vint8x2_impl   vint8x2_impl_cpu   = {0};
vuint8x2_impl  vuint8x2_impl_cpu  = {0};

// 32-bit
vint8x4_impl   vint8x4_impl_cpu   = {0};
vuint8x4_impl  vuint8x4_impl_cpu  = {0};
vint16x2_impl  vint16x2_impl_cpu  = {0};
vuint16x2_impl vuint16x2_impl_cpu = {0};

// 64-bit
vint8x8_impl   vint8x8_impl_cpu   = {0};
vuint8x8_impl  vuint8x8_impl_cpu  = {0};
vint16x4_impl  vint16x4_impl_cpu  = {0};
vuint16x4_impl vuint16x4_impl_cpu = {0};
vint32x2_impl  vint32x2_impl_cpu  = {0};
vuint32x2_impl vuint32x2_impl_cpu = {0};

// 128-bit
vint8x16_impl  vint8x16_impl_cpu  = {0};
vuint8x16_impl vuint8x16_impl_cpu = {0};
vint16x8_impl  vint16x8_impl_cpu  = {0};
vuint16x8_impl vuint16x8_impl_cpu = {0};
vint32x4_impl  vint32x4_impl_cpu  = {0};
vuint32x4_impl vuint32x4_impl_cpu = {0};
vint64x2_impl  vint64x2_impl_cpu  = {0};
vuint64x2_impl vuint64x2_impl_cpu = {0};

// 256-bit
vint8x32_impl   vint8x32_impl_cpu   = {0};
vuint8x32_impl  vuint8x32_impl_cpu  = {0};
vint16x16_impl  vint16x16_impl_cpu  = {0};
vuint16x16_impl vuint16x16_impl_cpu = {0};
vint32x8_impl   vint32x8_impl_cpu   = {0};
vuint32x8_impl  vuint32x8_impl_cpu  = {0};
vint64x4_impl   vint64x4_impl_cpu   = {0};
vuint64x4_impl  vuint64x4_impl_cpu  = {0};

// 512-bit
vint8x64_impl   vint8x64_impl_cpu   = {0};
vuint8x64_impl  vuint8x64_impl_cpu  = {0};
vint16x32_impl  vint16x32_impl_cpu  = {0};
vuint16x32_impl vuint16x32_impl_cpu = {0};
vint32x16_impl  vint32x16_impl_cpu  = {0};
vuint32x16_impl vuint32x16_impl_cpu = {0};
vint64x8_impl   vint64x8_impl_cpu   = {0};
vuint64x8_impl  vuint64x8_impl_cpu  = {0};

/* Nonzero once vec_init() has completed. Plain int, not atomic:
 * initialization is documented as not thread-safe (see vec_init). */
static int vec_init_spinner = 0;

/* Copy one function pointer from a backend impl table into a runtime
 * table, but only if the runtime slot is still empty AND the backend
 * actually provides that function. This "first writer wins" rule is
 * why vec_init() probes the newest/most capable backends first. */
#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
	do { \
		if (!(cpu).func && (impl).func) \
			(cpu).func = (impl).func; \
	} while (0)

/* Fill every known operation slot of one runtime table from one
 * backend impl table. The operation list here must stay in sync with
 * the impl struct definitions in the headers. */
#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
	do { \
		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
	} while (0)

/* Name-mangling convenience wrapper: e.g.
 * FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2) fills vuint8x16_impl_cpu from
 * vuint8x16_impl_sse2. An empty `sign` argument selects the signed
 * variant (vint...). */
#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)

/**
 * Detect CPU features and populate every *_impl_cpu dispatch table.
 *
 * Probing order is most-capable backend first; FILL_GIVEN_FUNC_PTR only
 * writes empty slots, so a newer backend's pointers are never clobbered
 * by an older one. Anything still unfilled at the end falls back to the
 * generic implementations, so every slot is non-null afterwards.
 *
 * Returns 0 on success or a negative error code on failure. Calling it
 * more than once is a harmless no-op.
 */
int vec_init(void)
{
	// This function is NOT thread safe. However, once vec
	// is initialized, all of the vector functions are thread-safe.

	if (vec_init_spinner)
		return 0; // already initialized, do nothing

	vec_uint32 cpu = vec_get_CPU_features();

	/* Okay, this might be a little confusing:
	 * The way we do this is because of x86. For weird reasons,
	 * Intel decided to extend their prior CPU extensions to
	 * where SSE4.1 has some extended features of SSE2, AVX2
	 * has some extended features that should've been in SSE
	 * in general, etc.
	 *
	 * For this, I've just decided to keep the function
	 * definitions private, and fill in as we go, with newer
	 * intrinsics preferred. Others are arbitrary and are
	 * mutually exclusive (i.e. Altivec vs NEON). This is simply
	 * the easiest way to go about it :) */

	/* --- 512-bit */
#ifdef VEC_COMPILER_HAS_AVX512DQ
	if (cpu & VEC_CPU_HAS_AVX512DQ) {
		/* these give us native multiply instructions */
		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512BW
	if (cpu & VEC_CPU_HAS_AVX512BW) {
		FILL_GIVEN_FUNC_PTRS( , 8,  64, avx512bw);
		FILL_GIVEN_FUNC_PTRS(u, 8,  64, avx512bw);
		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
	if (cpu & VEC_CPU_HAS_AVX512F) {
		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
		FILL_GIVEN_FUNC_PTRS( , 64, 8,  avx512f);
		FILL_GIVEN_FUNC_PTRS(u, 64, 8,  avx512f);
	}
#endif

	/* --- 256-bit */
#ifdef VEC_COMPILER_HAS_AVX2
	if (cpu & VEC_CPU_HAS_AVX2) {
		FILL_GIVEN_FUNC_PTRS( , 8, 32,  avx2);
		FILL_GIVEN_FUNC_PTRS(u, 8, 32,  avx2);
		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
		FILL_GIVEN_FUNC_PTRS( , 32, 8,  avx2);
		FILL_GIVEN_FUNC_PTRS(u, 32, 8,  avx2);
		FILL_GIVEN_FUNC_PTRS( , 64, 4,  avx2);
		FILL_GIVEN_FUNC_PTRS(u, 64, 4,  avx2);
	}
#endif

	/* --- 128-bit */
#ifdef VEC_COMPILER_HAS_SSE42
	/* BUG FIX: this previously tested VEC_CPU_HAS_SSE41, which would
	 * install the SSE4.2 paths on CPUs that have SSE4.1 but lack
	 * SSE4.2 (illegal instruction at runtime). */
	if (cpu & VEC_CPU_HAS_SSE42) {
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE41
	if (cpu & VEC_CPU_HAS_SSE41) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE3
	if (cpu & VEC_CPU_HAS_SSE3) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE2
	if (cpu & VEC_CPU_HAS_SSE2) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
	}
#endif
#ifdef VEC_COMPILER_HAS_NEON
	if (cpu & VEC_CPU_HAS_NEON) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (cpu & VEC_CPU_HAS_ALTIVEC) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
	}
#endif

	/* --- 64-bit */
#ifdef VEC_COMPILER_HAS_MMX
	if (cpu & VEC_CPU_HAS_MMX) {
		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
	}
#endif
#ifdef VEC_COMPILER_HAS_NEON
	if (cpu & VEC_CPU_HAS_NEON) {
		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
	}
#endif

	/* fill any remaining function pointers with generics */
	FILL_GIVEN_FUNC_PTRS( , 8, 64,  generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 64,  generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
	FILL_GIVEN_FUNC_PTRS( , 64, 8,  generic);
	FILL_GIVEN_FUNC_PTRS(u, 64, 8,  generic);

	FILL_GIVEN_FUNC_PTRS( , 8, 32,  generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 32,  generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 8,  generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 8,  generic);
	FILL_GIVEN_FUNC_PTRS( , 64, 4,  generic);
	FILL_GIVEN_FUNC_PTRS(u, 64, 4,  generic);

	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);

	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);

	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);

	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);

	vec_init_spinner++;

	return 0;
}

/* ---------------------------------------------------------------- */

/* Emit the single external definition of every public wrapper that the
 * headers declare as `inline` (C99 extern inline semantics), for one
 * sign/width/lane-count combination. The operation list must match the
 * inline definitions in the headers exactly. */
#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);

/* Expand both the signed and unsigned variants for one width/count. */
#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)

// 16-bit
VEC_DEFINE_OPERATIONS(8, 2)

// 32-bit
VEC_DEFINE_OPERATIONS(8, 4)
VEC_DEFINE_OPERATIONS(16, 2)

// 64-bit
VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)

// 128-bit
VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

// 256-bit
VEC_DEFINE_OPERATIONS(8, 32)
VEC_DEFINE_OPERATIONS(16, 16)
VEC_DEFINE_OPERATIONS(32, 8)
VEC_DEFINE_OPERATIONS(64, 4)

// 512-bit
VEC_DEFINE_OPERATIONS(8, 64)
VEC_DEFINE_OPERATIONS(16, 32)
VEC_DEFINE_OPERATIONS(32, 16)
VEC_DEFINE_OPERATIONS(64, 8)

#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OPERATIONS_SIGN