view src/vec.c @ 25:92156fe32755

impl/ppc/altivec: update to new implementation. The signed average function is wrong; it needs to round up the number when only one of the operands is odd, but that doesn't necessarily seem to be true because AltiVec is weird, and those are the quirks we need to emulate. Ugh. Also, the AltiVec backend uses the generic functions instead of fallbacks, because it does indeed use the exact same memory structure as the generic implementation...
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 11:15:59 +0000
parents e26874655738
children
line wrap: on
line source

/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#include "vec/vec.h"
#include "vec/cpu.h"
#include "vec/impl/generic.h"
#include "vec/impl/fallback.h"
#ifdef VEC_COMPILER_HAS_MMX
# include "vec/impl/x86/mmx.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE2
# include "vec/impl/x86/sse2.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE41
# include "vec/impl/x86/sse41.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX2
# include "vec/impl/x86/avx2.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
# include "vec/impl/x86/avx512f.h"
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
# include "vec/impl/ppc/altivec.h"
#endif
#ifdef VEC_COMPILER_HAS_NEON
# include "vec/impl/arm/neon.h"
#endif

// Scalar helper functions operating on the widest integer types
// (vec_uintmax / vec_intmax).
//
// Re-declaring an inline function with `extern inline` in exactly one
// translation unit is the C99 way of requesting that its external
// (out-of-line) definition be emitted here, so that calls which the
// compiler chooses not to inline still have something to link against.
// NOTE(review): this assumes the inline definitions themselves live in
// the vec headers included above -- confirm against vec/vec.h.

// shift helpers (unsigned and signed operands; shift count is unsigned int)
extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);

// averaging helpers (signed and unsigned)
extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);

// Per-type implementation pointers, grouped by total vector width.
//
// Every pointer starts out referring to the generic implementation of
// its vector type. vec_init() may later redirect the 64-, 128-, 256- and
// 512-bit pointers to a CPU-specific implementation; nothing in this
// file ever reassigns the 16-bit and 32-bit ones.
//
// The pointed-to implementation objects are const, but the pointers
// themselves are plain mutable globals (that is what vec_init() writes).

// 16-bit
const vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
const vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;

// 32-bit
const vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
const vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
const vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;

// 64-bit
const vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
const vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
const vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
const vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;

// 128-bit
const vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
const vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
const vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
const vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;

// 256-bit
const vint8x32_impl   *vint8x32_impl_cpu   = &vint8x32_impl_generic;
const vuint8x32_impl  *vuint8x32_impl_cpu  = &vuint8x32_impl_generic;
const vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
const vint32x8_impl   *vint32x8_impl_cpu   = &vint32x8_impl_generic;
const vuint32x8_impl  *vuint32x8_impl_cpu  = &vuint32x8_impl_generic;
const vint64x4_impl   *vint64x4_impl_cpu   = &vint64x4_impl_generic;
const vuint64x4_impl  *vuint64x4_impl_cpu  = &vuint64x4_impl_generic;

// 512-bit
const vint8x64_impl   *vint8x64_impl_cpu   = &vint8x64_impl_generic;
const vuint8x64_impl  *vuint8x64_impl_cpu  = &vuint8x64_impl_generic;
const vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
const vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
const vint64x8_impl   *vint64x8_impl_cpu   = &vint64x8_impl_generic;
const vuint64x8_impl  *vuint64x8_impl_cpu  = &vuint64x8_impl_generic;

// Non-zero once vec_init() has run to completion; lets repeated calls
// return immediately instead of redoing the pointer selection.
static int vec_init_spinner = 0;

/**
 * Selects, for every vector type, the implementation that the
 * v*_impl_cpu pointers refer to, based on the backends compiled in
 * (VEC_COMPILER_HAS_*) and the CPU features reported at runtime by
 * vec_get_CPU_features().
 *
 * This function is NOT thread safe. However, once vec is initialized,
 * all of the vector functions are thread-safe.
 *
 * In fact, it's possible to use vec without calling vec_init() at all,
 * but it would be completely useless since it would just use a generic
 * implementation without any vectorization whatsoever (unless maybe the
 * compiler is smart enough to optimize it into vectors).
 *
 * Calling it more than once is harmless: later calls do nothing.
 *
 * Returns 0 on success. Negative values are reserved for error codes;
 * every path currently in this function returns 0.
**/
int vec_init(void)
{
	if (vec_init_spinner)
		return 0; // already initialized, do nothing

	vec_uint32 cpu = vec_get_CPU_features();

	// `cpu` is only read inside the conditionally compiled backend
	// blocks below; when none of them is compiled in, this keeps the
	// build free of unused-variable warnings.
	(void)cpu;

	// NOTE: the order of the blocks below matters whenever two of them
	// assign the same pointer -- the later assignment wins.

#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (cpu & VEC_CPU_HAS_ALTIVEC) {
		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
		// the 64x2 types are only switched over when VSX is present
		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
		}
#endif
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
	if (cpu & VEC_CPU_HAS_AVX512F) {
		vint8x64_impl_cpu   = &vint8x64_impl_avx512f;
		vuint8x64_impl_cpu  = &vuint8x64_impl_avx512f;
		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
		vint64x8_impl_cpu   = &vint64x8_impl_avx512f;
		vuint64x8_impl_cpu  = &vuint64x8_impl_avx512f;
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX2
	if (cpu & VEC_CPU_HAS_AVX2) {
		vint8x32_impl_cpu   = &vint8x32_impl_avx2;
		vuint8x32_impl_cpu  = &vuint8x32_impl_avx2;
		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
		vint32x8_impl_cpu   = &vint32x8_impl_avx2;
		vuint32x8_impl_cpu  = &vuint32x8_impl_avx2;
		vint64x4_impl_cpu   = &vint64x4_impl_avx2;
		vuint64x4_impl_cpu  = &vuint64x4_impl_avx2;
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE2
	if (cpu & VEC_CPU_HAS_SSE2) {
		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
		// 32x4: use the SSE4.1 implementation when it is both compiled
		// in and reported by the CPU, otherwise the SSE2 one
# ifdef VEC_COMPILER_HAS_SSE41
		if (cpu & VEC_CPU_HAS_SSE41) {
			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
		} else
# endif
		{
			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
		}
		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
	}
#endif
#ifdef VEC_COMPILER_HAS_MMX
	if (cpu & VEC_CPU_HAS_MMX) {
		vint8x8_impl_cpu   = &vint8x8_impl_mmx;
		vuint8x8_impl_cpu  = &vuint8x8_impl_mmx;
		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
	}
#endif
#ifdef VEC_COMPILER_HAS_NEON
	if (cpu & VEC_CPU_HAS_NEON) {
		// 64-bit
		vint8x8_impl_cpu   = &vint8x8_impl_neon;
		vuint8x8_impl_cpu  = &vuint8x8_impl_neon;
		vint16x4_impl_cpu  = &vint16x4_impl_neon;
		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
		vint32x2_impl_cpu  = &vint32x2_impl_neon;
		vuint32x2_impl_cpu = &vuint32x2_impl_neon;

		// 128-bit
		vint8x16_impl_cpu  = &vint8x16_impl_neon;
		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
		vint16x8_impl_cpu  = &vint16x8_impl_neon;
		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
		vint32x4_impl_cpu  = &vint32x4_impl_neon;
		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
		vint64x2_impl_cpu  = &vint64x2_impl_neon;
		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
	}
#endif

	// Anything not reassigned above keeps pointing at its generic
	// implementation.

	vec_init_spinner = 1;

	return 0;
}

/* ---------------------------------------------------------------- */

// Emits `extern inline` declarations for every operation of one vector
// type, v<sign>int<bits>x<size>:
//
//   sign - empty for the signed type, `u` for the unsigned type
//   bits - width of each element in bits
//   size - number of elements in the vector
//
// The operations covered are splat, load_aligned, load, store_aligned,
// store, add, sub, mul, div, avg, and, or, xor, not, cmplt, cmple, cmpeq,
// cmpge, cmpgt, lshift, rshift and lrshift. The shift operations always
// take their shift amounts as the unsigned vector type (vuint<bits>x<size>),
// regardless of `sign`.
//
// NOTE(review): as with any C99 `extern inline` redeclaration, this is
// presumably here to emit the external definitions of inline functions
// defined in the vec headers -- confirm against vec/vec.h.
#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);

// Emits the operation declarations for both the signed and the unsigned
// vector type with the given element width (`bits`) and element count
// (`size`). The first expansion deliberately passes an empty `sign`
// argument, which yields the signed v int<bits>x<size> names (no `u`).
#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)

// Instantiate the declarations for every supported vector type. Each
// invocation is (element bits, element count); the groups below are
// ordered by total vector width, i.e. bits * count.

// 16-bit
VEC_DEFINE_OPERATIONS(8, 2)

// 32-bit
VEC_DEFINE_OPERATIONS(8, 4)
VEC_DEFINE_OPERATIONS(16, 2)

// 64-bit
VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)

// 128-bit
VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

// 256-bit
VEC_DEFINE_OPERATIONS(8, 32)
VEC_DEFINE_OPERATIONS(16, 16)
VEC_DEFINE_OPERATIONS(32, 8)
VEC_DEFINE_OPERATIONS(64, 4)

// 512-bit
VEC_DEFINE_OPERATIONS(8, 64)
VEC_DEFINE_OPERATIONS(16, 32)
VEC_DEFINE_OPERATIONS(32, 16)
VEC_DEFINE_OPERATIONS(64, 8)

// The helper macros are only needed for the instantiations above.
#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OPERATIONS_SIGN