diff src/vec.c @ 36:677c03c382b8
Backed out changeset e26874655738
author:   Paper <paper@tflc.us>
date:     Fri, 25 Apr 2025 17:40:55 -0400
parents:  8b5e0974fd41
children: (none)
line diff
--- a/src/vec.c	Fri Apr 25 17:40:51 2025 -0400
+++ b/src/vec.c	Fri Apr 25 17:40:55 2025 -0400
@@ -1,286 +1,2 @@
-/**
- * vec - a tiny SIMD vector library in C99
- *
- * Copyright (c) 2024 Paper
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
+#define VEC_IMPLEMENTATION
 #include "vec/vec.h"
-#include "vec/cpu.h"
-#include "vec/impl/generic.h"
-#include "vec/impl/fallback.h"
-#ifdef VEC_COMPILER_HAS_MMX
-# include "vec/impl/x86/mmx.h"
-#endif
-#ifdef VEC_COMPILER_HAS_SSE2
-# include "vec/impl/x86/sse2.h"
-#endif
-#ifdef VEC_COMPILER_HAS_SSE41
-# include "vec/impl/x86/sse41.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX2
-# include "vec/impl/x86/avx2.h"
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512F
-# include "vec/impl/x86/avx512f.h"
-#endif
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-# include "vec/impl/ppc/altivec.h"
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-# include "vec/impl/arm/neon.h"
-#endif
-
-extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y);
-extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y);
-extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y);
-extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y);
-
-// 16-bit
-const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic;
-const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic;
-
-// 32-bit
-const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic;
-const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic;
-const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic;
-const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic;
-
-// 64-bit
-const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic;
-const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic;
-const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic;
-const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic;
-const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic;
-const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic;
-
-// 128-bit
-const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic;
-const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic;
-const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic;
-const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic;
-const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic;
-const vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic;
-const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic;
-const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic;
-
-// 256-bit
-const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic;
-const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic;
-const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic;
-const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic;
-const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic;
-const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic;
-const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic;
-const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic;
-
-// 512-bit
-const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic;
-const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic;
-const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic;
-const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic;
-const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic;
-const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic;
-const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic;
-const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic;
-
-static int vec_init_spinner = 0;
-
-// returns 0 or a negative error code on failure
-int vec_init(void)
-{
-	// This function is NOT thread safe. However, once vec
-	// is initialized, all of the vector functions are thread-safe.
-	//
-	// In fact, it's possible to use vec without calling
-	// vec_init() at all, but it would be completely useless since
-	// it would just use a generic implementation without any
-	// vectorization whatsoever (unless maybe the compiler is
-	// smart enough to optimize it into vectors)
-
-	if (vec_init_spinner)
-		return 0; // already initialized, do nothing
-
-	vec_uint32 cpu = vec_get_CPU_features();
-
-#ifdef VEC_COMPILER_HAS_ALTIVEC
-	if (cpu & VEC_CPU_HAS_ALTIVEC) {
-		vint8x16_impl_cpu = &vint8x16_impl_altivec;
-		vuint8x16_impl_cpu = &vuint8x16_impl_altivec;
-		vint16x8_impl_cpu = &vint16x8_impl_altivec;
-		vuint16x8_impl_cpu = &vuint16x8_impl_altivec;
-		vint32x4_impl_cpu = &vint32x4_impl_altivec;
-		vuint32x4_impl_cpu = &vuint32x4_impl_altivec;
-#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX
-		if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) {
-			vint64x2_impl_cpu = &vint64x2_impl_altivec;
-			vuint64x2_impl_cpu = &vuint64x2_impl_altivec;
-		}
-#endif
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX512F
-	if (cpu & VEC_CPU_HAS_AVX512F) {
-		vint8x64_impl_cpu = &vint8x64_impl_avx512f;
-		vuint8x64_impl_cpu = &vuint8x64_impl_avx512f;
-		vint16x32_impl_cpu = &vint16x32_impl_avx512f;
-		vuint16x32_impl_cpu = &vuint16x32_impl_avx512f;
-		vint32x16_impl_cpu = &vint32x16_impl_avx512f;
-		vuint32x16_impl_cpu = &vuint32x16_impl_avx512f;
-		vint64x8_impl_cpu = &vint64x8_impl_avx512f;
-		vuint64x8_impl_cpu = &vuint64x8_impl_avx512f;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_AVX2
-	if (cpu & VEC_CPU_HAS_AVX2) {
-		vint8x32_impl_cpu = &vint8x32_impl_avx2;
-		vuint8x32_impl_cpu = &vuint8x32_impl_avx2;
-		vint16x16_impl_cpu = &vint16x16_impl_avx2;
-		vuint16x16_impl_cpu = &vuint16x16_impl_avx2;
-		vint32x8_impl_cpu = &vint32x8_impl_avx2;
-		vuint32x8_impl_cpu = &vuint32x8_impl_avx2;
-		vint64x4_impl_cpu = &vint64x4_impl_avx2;
-		vuint64x4_impl_cpu = &vuint64x4_impl_avx2;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_SSE2
-	if (cpu & VEC_CPU_HAS_SSE2) {
-		vint8x16_impl_cpu = &vint8x16_impl_sse2;
-		vuint8x16_impl_cpu = &vuint8x16_impl_sse2;
-		vint16x8_impl_cpu = &vint16x8_impl_sse2;
-		vuint16x8_impl_cpu = &vuint16x8_impl_sse2;
-# ifdef VEC_COMPILER_HAS_SSE41
-		if (cpu & VEC_CPU_HAS_SSE41) {
-			vint32x4_impl_cpu = &vint32x4_impl_sse41;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse41;
-		} else
-# endif
-		{
-			vint32x4_impl_cpu = &vint32x4_impl_sse2;
-			vuint32x4_impl_cpu = &vuint32x4_impl_sse2;
-		}
-		vint64x2_impl_cpu = &vint64x2_impl_sse2;
-		vuint64x2_impl_cpu = &vuint64x2_impl_sse2;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_MMX
-	if (cpu & VEC_CPU_HAS_MMX) {
-		vint8x8_impl_cpu = &vint8x8_impl_mmx;
-		vuint8x8_impl_cpu = &vuint8x8_impl_mmx;
-		vint16x4_impl_cpu = &vint16x4_impl_mmx;
-		vuint16x4_impl_cpu = &vuint16x4_impl_mmx;
-		vint32x2_impl_cpu = &vint32x2_impl_mmx;
-		vuint32x2_impl_cpu = &vuint32x2_impl_mmx;
-	}
-#endif
-#ifdef VEC_COMPILER_HAS_NEON
-	if (cpu & VEC_CPU_HAS_NEON) {
-		// 64-bit
-		vint8x8_impl_cpu = &vint8x8_impl_neon;
-		vuint8x8_impl_cpu = &vuint8x8_impl_neon;
-		vint16x4_impl_cpu = &vint16x4_impl_neon;
-		vuint16x4_impl_cpu = &vuint16x4_impl_neon;
-		vint32x2_impl_cpu = &vint32x2_impl_neon;
-		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
-
-		// 128-bit
-		vint8x16_impl_cpu = &vint8x16_impl_neon;
-		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
-		vint16x8_impl_cpu = &vint16x8_impl_neon;
-		vuint16x8_impl_cpu = &vuint16x8_impl_neon;
-		vint32x4_impl_cpu = &vint32x4_impl_neon;
-		vuint32x4_impl_cpu = &vuint32x4_impl_neon;
-		vint64x2_impl_cpu = &vint64x2_impl_neon;
-		vuint64x2_impl_cpu = &vuint64x2_impl_neon;
-	}
-#endif
-	{
-		// do nothing, they're already set to generics
-	}
-
-	vec_init_spinner++;
-
-	return 0;
-}
-
-/* ---------------------------------------------------------------- */
-
-#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \
-	extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
-	extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \
-	extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2);
-
-#define VEC_DEFINE_OPERATIONS(bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \
-	VEC_DEFINE_OPERATIONS_SIGN(u, bits, size)
-
-// 16-bit
-VEC_DEFINE_OPERATIONS(8, 2)
-
-// 32-bit
-VEC_DEFINE_OPERATIONS(8, 4)
-VEC_DEFINE_OPERATIONS(16, 2)
-
-// 64-bit
-VEC_DEFINE_OPERATIONS(8, 8)
-VEC_DEFINE_OPERATIONS(16, 4)
-VEC_DEFINE_OPERATIONS(32, 2)
-
-// 128-bit
-VEC_DEFINE_OPERATIONS(8, 16)
-VEC_DEFINE_OPERATIONS(16, 8)
-VEC_DEFINE_OPERATIONS(32, 4)
-VEC_DEFINE_OPERATIONS(64, 2)
-
-// 256-bit
-VEC_DEFINE_OPERATIONS(8, 32)
-VEC_DEFINE_OPERATIONS(16, 16)
-VEC_DEFINE_OPERATIONS(32, 8)
-VEC_DEFINE_OPERATIONS(64, 4)
-
-// 512-bit
-VEC_DEFINE_OPERATIONS(8, 64)
-VEC_DEFINE_OPERATIONS(16, 32)
-VEC_DEFINE_OPERATIONS(32, 16)
-VEC_DEFINE_OPERATIONS(64, 8)
-
-#undef VEC_DEFINE_OPERATIONS
-#undef VEC_DEFINE_OPERATIONS_SIGN
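Context for the backout (not part of the changeset): the removed file is the pre-backout translation unit, where vec_init() queried CPU features once and swapped the per-width implementation pointers away from the generic defaults; after the backout, vec.c only defines VEC_IMPLEMENTATION before including "vec/vec.h". A minimal sketch of how the pre-backout API above would have been used, built only from names visible in this diff (vec_init, vec_int32, and the vint32x4 functions emitted by VEC_DEFINE_OPERATIONS(32, 4)):

	/* usage sketch, assuming the pre-backout build that links vec.c */
	#include "vec/vec.h"

	int main(void)
	{
		vec_int32 a[4]   = {1, 2, 3, 4};
		vec_int32 b[4]   = {10, 20, 30, 40};
		vec_int32 out[4];

		vec_init(); /* one-time, non-thread-safe CPU detection; generics are used if skipped */

		vint32x4 va   = vint32x4_load(a);   /* unaligned load (aligned variant also declared) */
		vint32x4 vb   = vint32x4_load(b);
		vint32x4 vsum = vint32x4_add(va, vb);
		vint32x4_store(vsum, out);          /* out = {11, 22, 33, 44} */

		return 0;
	}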