view src/vec.c @ 27:d00b95f95dd1 default tip

impl/arm/neon: it compiles again, but is untested
author Paper <paper@tflc.us>
date Mon, 25 Nov 2024 00:33:02 -0500
parents 92156fe32755
children
line wrap: on
line source

/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#include "vec/vec.h"
#include "vec/cpu.h"
#include "vec/impl/generic.h"
#include "vec/impl/fallback.h"
#ifdef VEC_COMPILER_HAS_MMX
# include "vec/impl/x86/mmx.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE2
# include "vec/impl/x86/sse2.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE41
# include "vec/impl/x86/sse41.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX2
# include "vec/impl/x86/avx2.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
# include "vec/impl/x86/avx512f.h"
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
# include "vec/impl/ppc/altivec.h"
#endif
#ifdef VEC_COMPILER_HAS_NEON
# include "vec/impl/arm/neon.h"
#endif

/* C99 `extern inline` pattern: these helpers are (presumably) defined as
 * `inline` in the vec headers included above — confirm in vec/vec.h.
 * Redeclaring them `extern inline` here forces the compiler to emit the
 * one external, out-of-line definition in this translation unit, for any
 * caller where the inline copy was not used. */
extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);

extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);

/* Runtime dispatch tables, one pointer per vector type (grouped by total
 * vector width in bits). Every pointer starts out at the portable generic
 * implementation, so the library works even without initialization;
 * vec_init() repoints them to CPU-specific backends (SSE2, AVX2, NEON,
 * AltiVec, ...) for whichever features it detects at runtime. */
// 16-bit
const vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
const vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;

// 32-bit
const vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
const vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
const vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;

// 64-bit
const vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
const vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
const vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
const vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;

// 128-bit
const vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
const vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
const vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
const vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;

// 256-bit
const vint8x32_impl   *vint8x32_impl_cpu   = &vint8x32_impl_generic;
const vuint8x32_impl  *vuint8x32_impl_cpu  = &vuint8x32_impl_generic;
const vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
const vint32x8_impl   *vint32x8_impl_cpu   = &vint32x8_impl_generic;
const vuint32x8_impl  *vuint32x8_impl_cpu  = &vuint32x8_impl_generic;
const vint64x4_impl   *vint64x4_impl_cpu   = &vint64x4_impl_generic;
const vuint64x4_impl  *vuint64x4_impl_cpu  = &vuint64x4_impl_generic;

// 512-bit
const vint8x64_impl   *vint8x64_impl_cpu   = &vint8x64_impl_generic;
const vuint8x64_impl  *vuint8x64_impl_cpu  = &vuint8x64_impl_generic;
const vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
const vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
const vint64x8_impl   *vint64x8_impl_cpu   = &vint64x8_impl_generic;
const vuint64x8_impl  *vuint64x8_impl_cpu  = &vuint64x8_impl_generic;

/* Nonzero once vec_init() has completed, so repeated calls are no-ops.
 * Plain int, not atomic -- see the thread-safety note on vec_init(). */
static int vec_init_spinner = 0;

/**
 * Detect CPU features at runtime and repoint the per-type dispatch
 * pointers from the generic defaults to the best compiled-in backend
 * for this machine.
 *
 * This function is NOT thread safe; call it once before any concurrent
 * use of vec. However, once vec is initialized, all of the vector
 * functions are thread-safe.
 *
 * In fact, it's possible to use vec without calling vec_init() at all,
 * but it would be completely useless since it would just use a generic
 * implementation without any vectorization whatsoever (unless maybe the
 * compiler is smart enough to optimize it into vectors).
 *
 * Returns 0 on success; a negative error code is reserved for future
 * failure modes (none exist currently).
 */
int vec_init(void)
{
	if (vec_init_spinner)
		return 0; // already initialized, do nothing

	vec_uint32 cpu = vec_get_CPU_features();
	(void)cpu; // silence unused warning when no SIMD backend is compiled in

#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (cpu & VEC_CPU_HAS_ALTIVEC) {
		// AltiVec provides the 128-bit types; the 64-bit-lane types
		// additionally require VSX.
		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
		}
#endif
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
	if (cpu & VEC_CPU_HAS_AVX512F) {
		// AVX-512F covers only the 512-bit types; narrower widths are
		// handled by the AVX2/SSE2 blocks below.
		vint8x64_impl_cpu   = &vint8x64_impl_avx512f;
		vuint8x64_impl_cpu  = &vuint8x64_impl_avx512f;
		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
		vint64x8_impl_cpu   = &vint64x8_impl_avx512f;
		vuint64x8_impl_cpu  = &vuint64x8_impl_avx512f;
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX2
	if (cpu & VEC_CPU_HAS_AVX2) {
		// AVX2 covers the 256-bit types.
		vint8x32_impl_cpu   = &vint8x32_impl_avx2;
		vuint8x32_impl_cpu  = &vuint8x32_impl_avx2;
		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
		vint32x8_impl_cpu   = &vint32x8_impl_avx2;
		vuint32x8_impl_cpu  = &vuint32x8_impl_avx2;
		vint64x4_impl_cpu   = &vint64x4_impl_avx2;
		vuint64x4_impl_cpu  = &vuint64x4_impl_avx2;
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE2
	if (cpu & VEC_CPU_HAS_SSE2) {
		// SSE2 covers the 128-bit types; the 32x4 pair is upgraded
		// to the SSE4.1 backend when that feature is present.
		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
# ifdef VEC_COMPILER_HAS_SSE41
		if (cpu & VEC_CPU_HAS_SSE41) {
			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
		} else
# endif
		{
			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
		}
		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
	}
#endif
#ifdef VEC_COMPILER_HAS_MMX
	if (cpu & VEC_CPU_HAS_MMX) {
		// MMX covers the 64-bit types.
		vint8x8_impl_cpu   = &vint8x8_impl_mmx;
		vuint8x8_impl_cpu  = &vuint8x8_impl_mmx;
		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
	}
#endif
#ifdef VEC_COMPILER_HAS_NEON
	if (cpu & VEC_CPU_HAS_NEON) {
		// NEON covers both the 64-bit and the 128-bit types.

		// 64-bit
		vint8x8_impl_cpu   = &vint8x8_impl_neon;
		vuint8x8_impl_cpu  = &vuint8x8_impl_neon;
		vint16x4_impl_cpu  = &vint16x4_impl_neon;
		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
		vint32x2_impl_cpu  = &vint32x2_impl_neon;
		vuint32x2_impl_cpu = &vuint32x2_impl_neon;

		// 128-bit
		vint8x16_impl_cpu  = &vint8x16_impl_neon;
		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
		vint16x8_impl_cpu  = &vint16x8_impl_neon;
		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
		vint32x4_impl_cpu  = &vint32x4_impl_neon;
		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
		vint64x2_impl_cpu  = &vint64x2_impl_neon;
		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
	}
#endif
	// Any dispatch pointer not overridden above keeps its generic default.

	vec_init_spinner = 1; // mark initialization as complete

	return 0;
}

/* ---------------------------------------------------------------- */

/* Emit the external (non-inline) definitions for the full per-type
 * operation set -- splat, aligned/unaligned load/store, add/sub/mul/div,
 * avg, and/or/xor/not, the five comparisons (lt/le/eq/ge/gt), and the
 * three shifts (lshift/rshift/lrshift) -- via the same C99 `extern inline`
 * pattern used for the scalar helpers near the top of this file.
 * `sign` is empty for the signed variant or `u` for unsigned; `bits` is
 * the lane width and `size` the lane count of the vector type.
 * (No comments inside the macro body: a `//` comment would swallow the
 * backslash continuations during line splicing.) */
#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);

/* Instantiate both the signed and the unsigned variant of one lane layout. */
#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)

/* One instantiation per supported total vector width, grouped the same way
 * as the dispatch pointers earlier in this file. */
// 16-bit
VEC_DEFINE_OPERATIONS(8, 2)

// 32-bit
VEC_DEFINE_OPERATIONS(8, 4)
VEC_DEFINE_OPERATIONS(16, 2)

// 64-bit
VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)

// 128-bit
VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

// 256-bit
VEC_DEFINE_OPERATIONS(8, 32)
VEC_DEFINE_OPERATIONS(16, 16)
VEC_DEFINE_OPERATIONS(32, 8)
VEC_DEFINE_OPERATIONS(64, 4)

// 512-bit
VEC_DEFINE_OPERATIONS(8, 64)
VEC_DEFINE_OPERATIONS(16, 32)
VEC_DEFINE_OPERATIONS(32, 16)
VEC_DEFINE_OPERATIONS(64, 8)

// The macros are local to this file; drop them to avoid leaking the names.
#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OPERATIONS_SIGN