src/vec.c @ 28:c6c99ab1088a
*: add min/max functions and a big big refactor (again)
agh, this time I added a few more implementations (and generally
made the code just a little faster...)
author:   Paper <paper@tflc.us>
date:     Thu, 24 Apr 2025 00:54:02 -0400
parents:  92156fe32755
children: e59c91d050c0
/**
 * vec - a tiny SIMD vector library in C99
 *
 * Copyright (c) 2024 Paper
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#include "vec/vec.h"
#include "vec/cpu.h"
#include "vec/impl/generic.h"
#include "vec/impl/fallback.h"

#ifdef VEC_COMPILER_HAS_MMX
# include "vec/impl/x86/mmx.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE2
# include "vec/impl/x86/sse2.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE3
# include "vec/impl/x86/sse3.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE41
# include "vec/impl/x86/sse41.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE42
# include "vec/impl/x86/sse42.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX2
# include "vec/impl/x86/avx2.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
# include "vec/impl/x86/avx512f.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512BW
# include "vec/impl/x86/avx512bw.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512DQ
# include "vec/impl/x86/avx512dq.h"
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
# include "vec/impl/ppc/altivec.h"
#endif
#ifdef VEC_COMPILER_HAS_NEON
# include "vec/impl/arm/neon.h"
#endif

extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);
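/* Per-element-type dispatch tables. Each table starts zeroed; vec_init()
 * below fills it with the best implementation the running CPU reports,
 * and the generic fallbacks claim whatever is still unset. */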
// 16-bit
vint8x2_impl vint8x2_impl_cpu = {0};
vuint8x2_impl vuint8x2_impl_cpu = {0};

// 32-bit
vint8x4_impl vint8x4_impl_cpu = {0};
vuint8x4_impl vuint8x4_impl_cpu = {0};
vint16x2_impl vint16x2_impl_cpu = {0};
vuint16x2_impl vuint16x2_impl_cpu = {0};

// 64-bit
vint8x8_impl vint8x8_impl_cpu = {0};
vuint8x8_impl vuint8x8_impl_cpu = {0};
vint16x4_impl vint16x4_impl_cpu = {0};
vuint16x4_impl vuint16x4_impl_cpu = {0};
vint32x2_impl vint32x2_impl_cpu = {0};
vuint32x2_impl vuint32x2_impl_cpu = {0};

// 128-bit
vint8x16_impl vint8x16_impl_cpu = {0};
vuint8x16_impl vuint8x16_impl_cpu = {0};
vint16x8_impl vint16x8_impl_cpu = {0};
vuint16x8_impl vuint16x8_impl_cpu = {0};
vint32x4_impl vint32x4_impl_cpu = {0};
vuint32x4_impl vuint32x4_impl_cpu = {0};
vint64x2_impl vint64x2_impl_cpu = {0};
vuint64x2_impl vuint64x2_impl_cpu = {0};

// 256-bit
vint8x32_impl vint8x32_impl_cpu = {0};
vuint8x32_impl vuint8x32_impl_cpu = {0};
vint16x16_impl vint16x16_impl_cpu = {0};
vuint16x16_impl vuint16x16_impl_cpu = {0};
vint32x8_impl vint32x8_impl_cpu = {0};
vuint32x8_impl vuint32x8_impl_cpu = {0};
vint64x4_impl vint64x4_impl_cpu = {0};
vuint64x4_impl vuint64x4_impl_cpu = {0};

// 512-bit
vint8x64_impl vint8x64_impl_cpu = {0};
vuint8x64_impl vuint8x64_impl_cpu = {0};
vint16x32_impl vint16x32_impl_cpu = {0};
vuint16x32_impl vuint16x32_impl_cpu = {0};
vint32x16_impl vint32x16_impl_cpu = {0};
vuint32x16_impl vuint32x16_impl_cpu = {0};
vint64x8_impl vint64x8_impl_cpu = {0};
vuint64x8_impl vuint64x8_impl_cpu = {0};

static int vec_init_spinner = 0;

#define FILL_GIVEN_FUNC_PTR(cpu, impl, func) \
	do { \
		if (!(cpu).func && (impl).func) \
			(cpu).func = (impl).func; \
	} while (0)

#define FILL_GIVEN_FUNC_PTRS_EX(cpu, impl) \
	do { \
		FILL_GIVEN_FUNC_PTR(cpu, impl, splat); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, load_aligned); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, load); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, store_aligned); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, store); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, add); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, sub); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, mul); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, div); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, avg); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, band); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, bor); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, bxor); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, lshift); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, rshift); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, lrshift); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmplt); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmple); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpeq); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpge); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, cmpgt); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, min); \
		FILL_GIVEN_FUNC_PTR(cpu, impl, max); \
	} while (0)

#define FILL_GIVEN_FUNC_PTRS(sign, bits, size, impl) \
	FILL_GIVEN_FUNC_PTRS_EX(v##sign##int##bits##x##size##_impl_cpu, v##sign##int##bits##x##size##_impl_##impl)
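/* For example (a sketch of the expansion, not extra code):
 *
 *	FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2)
 *
 * pastes together into
 *
 *	FILL_GIVEN_FUNC_PTRS_EX(vint32x4_impl_cpu, vint32x4_impl_sse2)
 *
 * which, for each operation, copies the SSE2 function pointer into the
 * dispatch table only if no better implementation claimed the slot first:
 *
 *	if (!vint32x4_impl_cpu.add && vint32x4_impl_sse2.add)
 *		vint32x4_impl_cpu.add = vint32x4_impl_sse2.add;
 */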
// returns 0 or a negative error code on failure
int vec_init(void)
{
	// This function is NOT thread safe. However, once vec
	// is initialized, all of the vector functions are thread-safe.
	if (vec_init_spinner)
		return 0; // already initialized, do nothing

	vec_uint32 cpu = vec_get_CPU_features();

	/* Okay, this might be a little confusing:
	 * The way we do this is because of x86. For weird reasons,
	 * Intel decided to extend their prior CPU extensions to
	 * where SSE4.1 has some extended features of SSE2, AVX2
	 * has some extended features that should've been in SSE
	 * in general, etc.
	 *
	 * For this, I've just decided to keep the function
	 * definitions private, and fill in as we go, with newer
	 * intrinsics preferred. Others are arbitrary and are
	 * mutually exclusive (e.g. AltiVec vs NEON). This is simply
	 * the easiest way to go about it :) */

	/* --- 512-bit */
#ifdef VEC_COMPILER_HAS_AVX512DQ
	if (cpu & VEC_CPU_HAS_AVX512DQ) {
		/* these give us native multiply instructions */
		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512dq);
		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512dq);
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512BW
	if (cpu & VEC_CPU_HAS_AVX512BW) {
		FILL_GIVEN_FUNC_PTRS( , 8, 64, avx512bw);
		FILL_GIVEN_FUNC_PTRS(u, 8, 64, avx512bw);
		FILL_GIVEN_FUNC_PTRS( , 16, 32, avx512bw);
		FILL_GIVEN_FUNC_PTRS(u, 16, 32, avx512bw);
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
	if (cpu & VEC_CPU_HAS_AVX512F) {
		FILL_GIVEN_FUNC_PTRS( , 32, 16, avx512f);
		FILL_GIVEN_FUNC_PTRS(u, 32, 16, avx512f);
		FILL_GIVEN_FUNC_PTRS( , 64, 8, avx512f);
		FILL_GIVEN_FUNC_PTRS(u, 64, 8, avx512f);
	}
#endif

	/* --- 256-bit */
#ifdef VEC_COMPILER_HAS_AVX2
	if (cpu & VEC_CPU_HAS_AVX2) {
		FILL_GIVEN_FUNC_PTRS( , 8, 32, avx2);
		FILL_GIVEN_FUNC_PTRS(u, 8, 32, avx2);
		FILL_GIVEN_FUNC_PTRS( , 16, 16, avx2);
		FILL_GIVEN_FUNC_PTRS(u, 16, 16, avx2);
		FILL_GIVEN_FUNC_PTRS( , 32, 8, avx2);
		FILL_GIVEN_FUNC_PTRS(u, 32, 8, avx2);
		FILL_GIVEN_FUNC_PTRS( , 64, 4, avx2);
		FILL_GIVEN_FUNC_PTRS(u, 64, 4, avx2);
	}
#endif

	/* --- 128-bit */
#ifdef VEC_COMPILER_HAS_SSE42
	if (cpu & VEC_CPU_HAS_SSE42) {
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse42);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse42);
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE41
	if (cpu & VEC_CPU_HAS_SSE41) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse41);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse41);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse41);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse41);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse41);
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE3
	if (cpu & VEC_CPU_HAS_SSE3) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse3);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse3);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse3);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse3);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse3);
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE2
	if (cpu & VEC_CPU_HAS_SSE2) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, sse2);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, sse2);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, sse2);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, sse2);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, sse2);
	}
#endif
#ifdef VEC_COMPILER_HAS_NEON
	if (cpu & VEC_CPU_HAS_NEON) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, neon);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, neon);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, neon);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, neon);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, neon);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, neon);
		FILL_GIVEN_FUNC_PTRS( , 64, 2, neon);
		FILL_GIVEN_FUNC_PTRS(u, 64, 2, neon);
	}
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (cpu & VEC_CPU_HAS_ALTIVEC) {
		FILL_GIVEN_FUNC_PTRS( , 8, 16, altivec);
		FILL_GIVEN_FUNC_PTRS(u, 8, 16, altivec);
		FILL_GIVEN_FUNC_PTRS( , 16, 8, altivec);
		FILL_GIVEN_FUNC_PTRS(u, 16, 8, altivec);
		FILL_GIVEN_FUNC_PTRS( , 32, 4, altivec);
		FILL_GIVEN_FUNC_PTRS(u, 32, 4, altivec);
	}
#endif
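	/* Worked example of the cascade: on a CPU reporting AVX512BW, SSE4.1,
	 * and SSE2, the vint8x64 slots are claimed by avx512bw above, the
	 * vint8x16 slots are tried against sse41 first, then sse3, then sse2,
	 * and anything still unset after every ISA block is picked up by the
	 * generic fills at the end of this function. */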
	/* --- 64-bit */
#ifdef VEC_COMPILER_HAS_MMX
	if (cpu & VEC_CPU_HAS_MMX) {
		FILL_GIVEN_FUNC_PTRS( , 8, 8, mmx);
		FILL_GIVEN_FUNC_PTRS(u, 8, 8, mmx);
		FILL_GIVEN_FUNC_PTRS( , 16, 4, mmx);
		FILL_GIVEN_FUNC_PTRS(u, 16, 4, mmx);
		FILL_GIVEN_FUNC_PTRS( , 32, 2, mmx);
		FILL_GIVEN_FUNC_PTRS(u, 32, 2, mmx);
	}
#endif
#ifdef VEC_COMPILER_HAS_NEON
	if (cpu & VEC_CPU_HAS_NEON) {
		FILL_GIVEN_FUNC_PTRS( , 8, 8, neon);
		FILL_GIVEN_FUNC_PTRS(u, 8, 8, neon);
		FILL_GIVEN_FUNC_PTRS( , 16, 4, neon);
		FILL_GIVEN_FUNC_PTRS(u, 16, 4, neon);
		FILL_GIVEN_FUNC_PTRS( , 32, 2, neon);
		FILL_GIVEN_FUNC_PTRS(u, 32, 2, neon);
	}
#endif

	/* fill any remaining function pointers with generics */
	FILL_GIVEN_FUNC_PTRS( , 8, 64, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 64, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 32, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 32, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 16, generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 16, generic);
	FILL_GIVEN_FUNC_PTRS( , 64, 8, generic);
	FILL_GIVEN_FUNC_PTRS(u, 64, 8, generic);
	FILL_GIVEN_FUNC_PTRS( , 8, 32, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 32, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 16, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 16, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 8, generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 8, generic);
	FILL_GIVEN_FUNC_PTRS( , 64, 4, generic);
	FILL_GIVEN_FUNC_PTRS(u, 64, 4, generic);
	FILL_GIVEN_FUNC_PTRS( , 8, 16, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 16, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 8, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 8, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 4, generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 4, generic);
	FILL_GIVEN_FUNC_PTRS( , 64, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 64, 2, generic);
	FILL_GIVEN_FUNC_PTRS( , 8, 8, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 8, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 4, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 4, generic);
	FILL_GIVEN_FUNC_PTRS( , 32, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 32, 2, generic);
	FILL_GIVEN_FUNC_PTRS( , 8, 4, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 4, generic);
	FILL_GIVEN_FUNC_PTRS( , 16, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 16, 2, generic);
	FILL_GIVEN_FUNC_PTRS( , 8, 2, generic);
	FILL_GIVEN_FUNC_PTRS(u, 8, 2, generic);

	vec_init_spinner++;

	return 0;
}

/* ---------------------------------------------------------------- */
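/* Usage sketch (a hypothetical caller, not part of this file; assumes the
 * vint32x4 API declared in vec/vec.h and defined out-of-line below, with
 * element-wise addition):
 *
 *	vec_int32 a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, out[4];
 *
 *	vec_init(); // fill the dispatch tables first
 *	vint32x4 v = vint32x4_add(vint32x4_load(a), vint32x4_load(b));
 *	vint32x4_store(v, out); // out == {6, 8, 10, 12}
 */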
#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2);

#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)

// 16-bit
VEC_DEFINE_OPERATIONS(8, 2)

// 32-bit
VEC_DEFINE_OPERATIONS(8, 4)
VEC_DEFINE_OPERATIONS(16, 2)

// 64-bit
VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)

// 128-bit
VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

// 256-bit
VEC_DEFINE_OPERATIONS(8, 32)
VEC_DEFINE_OPERATIONS(16, 16)
VEC_DEFINE_OPERATIONS(32, 8)
VEC_DEFINE_OPERATIONS(64, 4)

// 512-bit
VEC_DEFINE_OPERATIONS(8, 64)
VEC_DEFINE_OPERATIONS(16, 32)
VEC_DEFINE_OPERATIONS(32, 16)
VEC_DEFINE_OPERATIONS(64, 8)

#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OPERATIONS_SIGN
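/* For reference, VEC_DEFINE_OPERATIONS(32, 4) expands (for the signed
 * half) to one declaration per operation, e.g.:
 *
 *	extern inline vint32x4 vint32x4_add(vint32x4 vec1, vint32x4 vec2);
 *
 * emitting the out-of-line copies of the inline functions declared in
 * vec/vec.h, as C99's inline model requires. */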