diff src/vec.c @ 36:677c03c382b8

Backed out changeset e26874655738
author Paper <paper@tflc.us>
date Fri, 25 Apr 2025 17:40:55 -0400
parents 8b5e0974fd41
children
line wrap: on
line diff
--- a/src/vec.c	Fri Apr 25 17:40:51 2025 -0400
+++ b/src/vec.c	Fri Apr 25 17:40:55 2025 -0400
@@ -1,286 +1,2 @@
-/**
- * vec - a tiny SIMD vector library in C99
- * 
- * Copyright (c) 2024 Paper
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
+#define VEC_IMPLEMENTATION
 #include "vec/vec.h"
-#include "vec/cpu.h"
-#include "vec/impl/generic.h"
-#include "vec/impl/fallback.h"
-#ifdef VEC_COMPILER_HAS_MMX
-# include "vec/impl/x86/mmx.h"
-#endif
-#ifdef VEC_COMPILER_HAS_SSE2
-# include "vec/impl/x86/sse2.h"
-#endif
-#ifdef VEC_COMPILER_HAS_SSE41
-# include "vec/impl/x86/sse41.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX2
-# include "vec/impl/x86/avx2.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512F
-# include "vec/impl/x86/avx512f.h"
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-# include "vec/impl/ppc/altivec.h"
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-# include "vec/impl/arm/neon.h"
-#endif
-
-extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
-extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
-extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
-
-// 16-bit
-const vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
-const vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;
-
-// 32-bit
-const vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
-const vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
-const vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
-const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
-
-// 64-bit
-const vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
-const vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
-const vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
-const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
-const vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
-const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
-
-// 128-bit
-const vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
-const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
-const vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
-const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
-const vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
-const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
-const vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
-const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
-
-// 256-bit
-const vint8x32_impl   *vint8x32_impl_cpu   = &vint8x32_impl_generic;
-const vuint8x32_impl  *vuint8x32_impl_cpu  = &vuint8x32_impl_generic;
-const vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
-const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
-const vint32x8_impl   *vint32x8_impl_cpu   = &vint32x8_impl_generic;
-const vuint32x8_impl  *vuint32x8_impl_cpu  = &vuint32x8_impl_generic;
-const vint64x4_impl   *vint64x4_impl_cpu   = &vint64x4_impl_generic;
-const vuint64x4_impl  *vuint64x4_impl_cpu  = &vuint64x4_impl_generic;
-
-// 512-bit
-const vint8x64_impl   *vint8x64_impl_cpu   = &vint8x64_impl_generic;
-const vuint8x64_impl  *vuint8x64_impl_cpu  = &vuint8x64_impl_generic;
-const vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
-const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
-const vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
-const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
-const vint64x8_impl   *vint64x8_impl_cpu   = &vint64x8_impl_generic;
-const vuint64x8_impl  *vuint64x8_impl_cpu  = &vuint64x8_impl_generic;
-
-static int vec_init_spinner = 0;
-
-// returns 0 or a negative error code on failure
-int vec_init(void)
-{
-	// This function is NOT thread safe. However, once vec
-	// is initialized, all of the vector functions are thread-safe.
-	//
-	// In fact, it's possible to use vec without calling
-	// vec_init() at all, but it would be completely useless since
-	// it would just use a generic implementation without any
-	// vectorization whatsoever (unless maybe the compiler is
-	// smart enough to optimize it into vectors)
-
-	if (vec_init_spinner)
-		return 0; // already initialized, do nothing
-
-	vec_uint32 cpu = vec_get_CPU_features();
-
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
-		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
-		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
-		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
-		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
-		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
-			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
-			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
-		}
-#endif
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512F
-	if (cpu & VEC_CPU_HAS_AVX512F) {
-		vint8x64_impl_cpu  = &vint8x64_impl_avx512f;
-		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
-		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
-		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
-		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
-		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
-		vint64x8_impl_cpu  = &vint64x8_impl_avx512f;
-		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX2
-	if (cpu & VEC_CPU_HAS_AVX2) {
-		vint8x32_impl_cpu  = &vint8x32_impl_avx2;
-		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
-		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
-		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
-		vint32x8_impl_cpu  = &vint32x8_impl_avx2;
-		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
-		vint64x4_impl_cpu  = &vint64x4_impl_avx2;
-		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (cpu & VEC_CPU_HAS_SSE2) {
-		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
-		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
-		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
-		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
-# ifdef VEC_COMPILER_HAS_SSE41
-		if (cpu & VEC_CPU_HAS_SSE41) {
-			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
-		} else
-# endif
-		{
-			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
-		}
-		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
-		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_MMX
-	if (cpu & VEC_CPU_HAS_MMX) {
-		vint8x8_impl_cpu  = &vint8x8_impl_mmx;
-		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
-		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
-		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
-		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
-		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	if (cpu & VEC_CPU_HAS_NEON) {
-		// 64-bit
-		vint8x8_impl_cpu  = &vint8x8_impl_neon;
-		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
-		vint16x4_impl_cpu  = &vint16x4_impl_neon;
-		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
-		vint32x2_impl_cpu  = &vint32x2_impl_neon;
-		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
-
-		// 128-bit
-		vint8x16_impl_cpu  = &vint8x16_impl_neon;
-		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
-		vint16x8_impl_cpu  = &vint16x8_impl_neon;
-		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
-		vint32x4_impl_cpu  = &vint32x4_impl_neon;
-		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
-		vint64x2_impl_cpu  = &vint64x2_impl_neon;
-		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
-	}
-#endif
-	{
-		// do nothing, they're already set to generics
-	}
-
-	vec_init_spinner++;
-
-	return 0;
-}
-
-/* ---------------------------------------------------------------- */
-
-#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
-	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
-	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
-
-#define VEC_DEFINE_OPERATIONS(bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
-
-// 16-bit
-VEC_DEFINE_OPERATIONS(8, 2)
-
-// 32-bit
-VEC_DEFINE_OPERATIONS(8, 4)
-VEC_DEFINE_OPERATIONS(16, 2)
-
-// 64-bit
-VEC_DEFINE_OPERATIONS(8, 8)
-VEC_DEFINE_OPERATIONS(16, 4)
-VEC_DEFINE_OPERATIONS(32, 2)
-
-// 128-bit
-VEC_DEFINE_OPERATIONS(8, 16)
-VEC_DEFINE_OPERATIONS(16, 8)
-VEC_DEFINE_OPERATIONS(32, 4)
-VEC_DEFINE_OPERATIONS(64, 2)
-
-// 256-bit
-VEC_DEFINE_OPERATIONS(8, 32)
-VEC_DEFINE_OPERATIONS(16, 16)
-VEC_DEFINE_OPERATIONS(32, 8)
-VEC_DEFINE_OPERATIONS(64, 4)
-
-// 512-bit
-VEC_DEFINE_OPERATIONS(8, 64)
-VEC_DEFINE_OPERATIONS(16, 32)
-VEC_DEFINE_OPERATIONS(32, 16)
-VEC_DEFINE_OPERATIONS(64, 8)
-
-#undef VEC_DEFINE_OPERATIONS
-#undef VEC_DEFINE_OPERATIONS_SIGN