Mercurial > vec
view src/vec.c @ 27:d00b95f95dd1 default tip
impl/arm/neon: it compiles again, but is untested
author | Paper <paper@tflc.us> |
---|---|
date | Mon, 25 Nov 2024 00:33:02 -0500 |
parents | 92156fe32755 |
children |
line wrap: on
line source
/**
 * vec - a tiny SIMD vector library in C99
 *
 * Copyright (c) 2024 Paper
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#include "vec/vec.h"
#include "vec/cpu.h"
#include "vec/impl/generic.h"
#include "vec/impl/fallback.h"

/* Backend headers are only pulled in when the compiler can build them. */
#ifdef VEC_COMPILER_HAS_MMX
# include "vec/impl/x86/mmx.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE2
# include "vec/impl/x86/sse2.h"
#endif
#ifdef VEC_COMPILER_HAS_SSE41
# include "vec/impl/x86/sse41.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX2
# include "vec/impl/x86/avx2.h"
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
# include "vec/impl/x86/avx512f.h"
#endif
#ifdef VEC_COMPILER_HAS_ALTIVEC
# include "vec/impl/ppc/altivec.h"
#endif
#ifdef VEC_COMPILER_HAS_NEON
# include "vec/impl/arm/neon.h"
#endif

/*
 * Re-declare the scalar helpers as `extern inline` so that this translation
 * unit provides their external definitions under the C99 inline rules.
 * NOTE(review): this assumes the headers define them as plain `inline`;
 * confirm against vec/vec.h.
 */
extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
extern inline vec_intmax vec_avg(vec_intmax x, vec_intmax y);
extern inline vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y);

/*
 * Implementation pointers, one per vector type. Every pointer starts out
 * aimed at the generic implementation; vec_init() re-points them at a SIMD
 * backend when one is both compiled in and reported by the CPU.
 */

// 16-bit
const vint8x2_impl   *vint8x2_impl_cpu   = &vint8x2_impl_generic;
const vuint8x2_impl  *vuint8x2_impl_cpu  = &vuint8x2_impl_generic;

// 32-bit
const vint8x4_impl   *vint8x4_impl_cpu   = &vint8x4_impl_generic;
const vuint8x4_impl  *vuint8x4_impl_cpu  = &vuint8x4_impl_generic;
const vint16x2_impl  *vint16x2_impl_cpu  = &vint16x2_impl_generic;
const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;

// 64-bit
const vint8x8_impl   *vint8x8_impl_cpu   = &vint8x8_impl_generic;
const vuint8x8_impl  *vuint8x8_impl_cpu  = &vuint8x8_impl_generic;
const vint16x4_impl  *vint16x4_impl_cpu  = &vint16x4_impl_generic;
const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
const vint32x2_impl  *vint32x2_impl_cpu  = &vint32x2_impl_generic;
const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;

// 128-bit
const vint8x16_impl  *vint8x16_impl_cpu  = &vint8x16_impl_generic;
const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
const vint16x8_impl  *vint16x8_impl_cpu  = &vint16x8_impl_generic;
const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
const vint32x4_impl  *vint32x4_impl_cpu  = &vint32x4_impl_generic;
const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
const vint64x2_impl  *vint64x2_impl_cpu  = &vint64x2_impl_generic;
const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;

// 256-bit
const vint8x32_impl   *vint8x32_impl_cpu   = &vint8x32_impl_generic;
const vuint8x32_impl  *vuint8x32_impl_cpu  = &vuint8x32_impl_generic;
const vint16x16_impl  *vint16x16_impl_cpu  = &vint16x16_impl_generic;
const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
const vint32x8_impl   *vint32x8_impl_cpu   = &vint32x8_impl_generic;
const vuint32x8_impl  *vuint32x8_impl_cpu  = &vuint32x8_impl_generic;
const vint64x4_impl   *vint64x4_impl_cpu   = &vint64x4_impl_generic;
const vuint64x4_impl  *vuint64x4_impl_cpu  = &vuint64x4_impl_generic;

// 512-bit
const vint8x64_impl   *vint8x64_impl_cpu   = &vint8x64_impl_generic;
const vuint8x64_impl  *vuint8x64_impl_cpu  = &vuint8x64_impl_generic;
const vint16x32_impl  *vint16x32_impl_cpu  = &vint16x32_impl_generic;
const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
const vint32x16_impl  *vint32x16_impl_cpu  = &vint32x16_impl_generic;
const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
const vint64x8_impl   *vint64x8_impl_cpu   = &vint64x8_impl_generic;
const vuint64x8_impl  *vuint64x8_impl_cpu  = &vuint64x8_impl_generic;

/* Non-zero once vec_init() has completed; makes repeated calls a no-op. */
static int vec_init_spinner = 0;

/**
 * Points every `*_impl_cpu` table at the best implementation that is both
 * compiled in (VEC_COMPILER_HAS_*) and reported by vec_get_CPU_features().
 *
 * This function is NOT thread safe. However, once vec is initialized, all
 * of the vector functions are thread-safe.
 *
 * In fact, it's possible to use vec without calling vec_init() at all, but
 * it would be completely useless since it would just use a generic
 * implementation without any vectorization whatsoever (unless maybe the
 * compiler is smart enough to optimize it into vectors).
 *
 * Returns 0 or a negative error code on failure. Every path in the current
 * implementation returns 0.
 */
int vec_init(void)
{
	if (vec_init_spinner)
		return 0; // already initialized, do nothing

	vec_uint32 cpu = vec_get_CPU_features();

	// When no SIMD backend is compiled in, nothing below reads `cpu`;
	// reference it here so such builds don't warn about an unused variable.
	(void)cpu;

#ifdef VEC_COMPILER_HAS_ALTIVEC
	if (cpu & VEC_CPU_HAS_ALTIVEC) {
		vint8x16_impl_cpu  = &vint8x16_impl_altivec;
		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
		vint16x8_impl_cpu  = &vint16x8_impl_altivec;
		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
		vint32x4_impl_cpu  = &vint32x4_impl_altivec;
		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
		// the 64-bit lanes are only switched over when VSX is present
		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
			vint64x2_impl_cpu  = &vint64x2_impl_altivec;
			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
		}
#endif
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX512F
	if (cpu & VEC_CPU_HAS_AVX512F) {
		vint8x64_impl_cpu   = &vint8x64_impl_avx512f;
		vuint8x64_impl_cpu  = &vuint8x64_impl_avx512f;
		vint16x32_impl_cpu  = &vint16x32_impl_avx512f;
		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
		vint32x16_impl_cpu  = &vint32x16_impl_avx512f;
		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
		vint64x8_impl_cpu   = &vint64x8_impl_avx512f;
		vuint64x8_impl_cpu  = &vuint64x8_impl_avx512f;
	}
#endif
#ifdef VEC_COMPILER_HAS_AVX2
	if (cpu & VEC_CPU_HAS_AVX2) {
		vint8x32_impl_cpu   = &vint8x32_impl_avx2;
		vuint8x32_impl_cpu  = &vuint8x32_impl_avx2;
		vint16x16_impl_cpu  = &vint16x16_impl_avx2;
		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
		vint32x8_impl_cpu   = &vint32x8_impl_avx2;
		vuint32x8_impl_cpu  = &vuint32x8_impl_avx2;
		vint64x4_impl_cpu   = &vint64x4_impl_avx2;
		vuint64x4_impl_cpu  = &vuint64x4_impl_avx2;
	}
#endif
#ifdef VEC_COMPILER_HAS_SSE2
	if (cpu & VEC_CPU_HAS_SSE2) {
		vint8x16_impl_cpu  = &vint8x16_impl_sse2;
		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
		vint16x8_impl_cpu  = &vint16x8_impl_sse2;
		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
		// 32x4 uses the SSE4.1 tables when available, SSE2 otherwise
# ifdef VEC_COMPILER_HAS_SSE41
		if (cpu & VEC_CPU_HAS_SSE41) {
			vint32x4_impl_cpu  = &vint32x4_impl_sse41;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
		} else
# endif
		{
			vint32x4_impl_cpu  = &vint32x4_impl_sse2;
			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
		}
		vint64x2_impl_cpu  = &vint64x2_impl_sse2;
		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
	}
#endif
#ifdef VEC_COMPILER_HAS_MMX
	if (cpu & VEC_CPU_HAS_MMX) {
		vint8x8_impl_cpu   = &vint8x8_impl_mmx;
		vuint8x8_impl_cpu  = &vuint8x8_impl_mmx;
		vint16x4_impl_cpu  = &vint16x4_impl_mmx;
		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
		vint32x2_impl_cpu  = &vint32x2_impl_mmx;
		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
	}
#endif
#ifdef VEC_COMPILER_HAS_NEON
	if (cpu & VEC_CPU_HAS_NEON) {
		// 64-bit
		vint8x8_impl_cpu   = &vint8x8_impl_neon;
		vuint8x8_impl_cpu  = &vuint8x8_impl_neon;
		vint16x4_impl_cpu  = &vint16x4_impl_neon;
		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
		vint32x2_impl_cpu  = &vint32x2_impl_neon;
		vuint32x2_impl_cpu = &vuint32x2_impl_neon;

		// 128-bit
		vint8x16_impl_cpu  = &vint8x16_impl_neon;
		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
		vint16x8_impl_cpu  = &vint16x8_impl_neon;
		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
		vint32x4_impl_cpu  = &vint32x4_impl_neon;
		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
		vint64x2_impl_cpu  = &vint64x2_impl_neon;
		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
	}
#endif

	// anything not assigned above is still set to its generic implementation

	vec_init_spinner = 1;

	return 0;
}

/* ---------------------------------------------------------------- */

/*
 * Declares every per-type vector operation `extern inline` for one
 * signedness (`sign` is empty or `u`), element width `bits` and lane count
 * `size`. The shift operations take an unsigned vector as the shift amount
 * regardless of `sign`.
 */
#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);

/* Expands the declarations above for both the signed and unsigned variant. */
#define VEC_DEFINE_OPERATIONS(bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)

// 16-bit
VEC_DEFINE_OPERATIONS(8, 2)

// 32-bit
VEC_DEFINE_OPERATIONS(8, 4)
VEC_DEFINE_OPERATIONS(16, 2)

// 64-bit
VEC_DEFINE_OPERATIONS(8, 8)
VEC_DEFINE_OPERATIONS(16, 4)
VEC_DEFINE_OPERATIONS(32, 2)

// 128-bit
VEC_DEFINE_OPERATIONS(8, 16)
VEC_DEFINE_OPERATIONS(16, 8)
VEC_DEFINE_OPERATIONS(32, 4)
VEC_DEFINE_OPERATIONS(64, 2)

// 256-bit
VEC_DEFINE_OPERATIONS(8, 32)
VEC_DEFINE_OPERATIONS(16, 16)
VEC_DEFINE_OPERATIONS(32, 8)
VEC_DEFINE_OPERATIONS(64, 4)

// 512-bit
VEC_DEFINE_OPERATIONS(8, 64)
VEC_DEFINE_OPERATIONS(16, 32)
VEC_DEFINE_OPERATIONS(32, 16)
VEC_DEFINE_OPERATIONS(64, 8)

#undef VEC_DEFINE_OPERATIONS
#undef VEC_DEFINE_OPERATIONS_SIGN