vec: changeset 36:677c03c382b8
Backed out changeset e26874655738

author:    Paper <paper@tflc.us>
date:      Fri, 25 Apr 2025 17:40:55 -0400
parents:   99e4539f922f
children:  4b5a557aa64f
files:     .hgignore CMakeLists.txt README include/vec/cpu.h include/vec/impl/align.h include/vec/impl/arm/neon.h include/vec/impl/cpu.h include/vec/impl/fallback.h include/vec/impl/generic.h include/vec/impl/integer.h.in include/vec/impl/ppc/altivec.h include/vec/impl/x86/avx2.h include/vec/impl/x86/avx512f.h include/vec/impl/x86/mmx.h include/vec/impl/x86/sse2.h include/vec/impl/x86/sse41.h include/vec/types.h.in include/vec/vec.h src/cpu.c src/impl/arm/neon.c src/impl/fallback.c src/impl/generic.c src/impl/ppc/altivec.c src/impl/x86/avx2.c src/impl/x86/avx512f.c src/impl/x86/mmx.c src/impl/x86/sse2.c src/impl/x86/sse41.c src/vec.c test/CMakeLists.txt test/Makefile.ppc test/Makefile.template test/Makefile.x86 test/test.cc test/test_arith.h test/test_shift.h
diffstat:  36 files changed, 3877 insertions(+), 4201 deletions(-)
--- a/.hgignore Fri Apr 25 17:40:51 2025 -0400 +++ b/.hgignore Fri Apr 25 17:40:55 2025 -0400 @@ -40,4 +40,3 @@ # Build dir ^build/ -^test/build/
--- a/CMakeLists.txt Fri Apr 25 17:40:51 2025 -0400 +++ b/CMakeLists.txt Fri Apr 25 17:40:55 2025 -0400 @@ -1,114 +1,49 @@ cmake_minimum_required(VERSION 3.23) -project(vec VERSION 3.0.0 DESCRIPTION "a tiny C99 SIMD vector library") +project(vec VERSION 2.0.0 DESCRIPTION "a tiny C99 SIMD vector library") + +add_library(vec SHARED src/vec.c) -add_library(vec SHARED "src/vec.c;src/cpu.c;src/impl/generic.c;src/impl/fallback.c") +target_sources(vec PUBLIC + $<INSTALL_INTERFACE:vec/vec.h> + $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/vec.h> + $<INSTALL_INTERFACE:vec/impl/integer.h> + $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/vec/impl/integer.h> +) include(CheckCCompilerFlag) if(MSVC) - # Untested! - - if(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(COMPILER_HAS_MMX OFF) - else() - set(COMPILER_HAS_MMX ON) - set(COMPILER_MMX_FLAGS "") # none? - endif() - check_c_compiler_flag("/arch:SSE2" COMPILER_HAS_SSE2) - if(COMPILER_HAS_SSE2) - set(COMPILER_SSE2_FLAGS "/arch:SSE2") - endif() - check_c_compiler_flag("/arch:SSE4.2" COMPILER_HAS_SSE41) - if(COMPILER_HAS_SSE41) - set(COMPILER_SSE41_FLAGS "/arch:SSE4.2") + # TODO ? +else() + check_c_compiler_flag("-maltivec" COMPILER_HAS_ALTIVEC) + if(COMPILER_HAS_ALTIVEC) + target_compile_options(vec PRIVATE "-maltivec") endif() - check_c_compiler_flag("/arch:AVX2" COMPILER_HAS_AVX2) - if(COMPILER_HAS_AVX2) - set(COMPILER_AVX2_FLAGS "/arch:AVX2") - endif() - check_c_compiler_flag("/arch:AVX512" COMPILER_HAS_AVX512F) - if(COMPILER_HAS_AVX512F) - set(COMPILER_AVX512F_FLAGS "/arch:AVX512") - endif() - # TODO we have to try_compile to detect NEON -else() - #check_c_compiler_flag("-maltivec" COMPILER_HAS_ALTIVEC) - #if(COMPILER_HAS_ALTIVEC) - # set(COMPILER_ALTIVEC_FLAGS "-maltivec") - #endif() - #check_c_compiler_flag("-mfpu=neon" COMPILER_HAS_NEON) - #if(COMPILER_HAS_NEON) - # set(COMPILER_NEON_FLAGS "-mfpu=neon") - #endif() check_c_compiler_flag("-mmmx" COMPILER_HAS_MMX) if(COMPILER_HAS_MMX) - set(COMPILER_MMX_FLAGS "-mmmx") + target_compile_options(vec PRIVATE "-mmmx") endif() check_c_compiler_flag("-msse2" COMPILER_HAS_SSE2) if(COMPILER_HAS_SSE2) - set(COMPILER_SSE2_FLAGS "-msse2") + target_compile_options(vec PRIVATE "-msse2") endif() check_c_compiler_flag("-msse4.1" COMPILER_HAS_SSE41) if(COMPILER_HAS_SSE41) - set(COMPILER_SSE41_FLAGS "-msse4.1") + target_compile_options(vec PRIVATE "-msse4.1") endif() check_c_compiler_flag("-mavx2" COMPILER_HAS_AVX2) if(COMPILER_HAS_AVX2) - set(COMPILER_AVX2_FLAGS "-mavx2") + target_compile_options(vec PRIVATE "-mavx2") endif() check_c_compiler_flag("-mavx512f" COMPILER_HAS_AVX512F) if(COMPILER_HAS_AVX512F) - set(COMPILER_AVX512F_FLAGS "-mavx512f") + target_compile_options(vec PRIVATE "-mavx512f") endif() endif() -if(COMPILER_HAS_ALTIVEC) - target_sources(vec PRIVATE "src/impl/ppc/altivec.c") - set_source_files_properties("src/impl/ppc/altivec.c" PROPERTIES COMPILE_FLAGS "${COMPILER_ALTIVEC_FLAGS}") - target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_ALTIVEC") -endif() - -if(COMPILER_HAS_NEON) - target_sources(vec PRIVATE "src/impl/arm/neon.c") - set_source_files_properties("src/impl/arm/neon.c" PROPERTIES COMPILE_FLAGS "${COMPILER_NEON_FLAGS}") - target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_NEON") -endif() - -if(COMPILER_HAS_MMX) - target_sources(vec PRIVATE "src/impl/x86/mmx.c") - set_source_files_properties("src/impl/x86/mmx.c" PROPERTIES COMPILE_FLAGS "${COMPILER_MMX_FLAGS}") - target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_MMX") -endif() - -if(COMPILER_HAS_SSE2) - 
target_sources(vec PRIVATE "src/impl/x86/sse2.c") - set_source_files_properties("src/impl/x86/sse2.c" PROPERTIES COMPILE_FLAGS "${COMPILER_SSE2_FLAGS}") - target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_SSE2") -endif() - -if(COMPILER_HAS_SSE41) - target_sources(vec PRIVATE "src/impl/x86/sse41.c") - set_source_files_properties("src/impl/x86/sse41.c" PROPERTIES COMPILE_FLAGS "${COMPILER_SSE41_FLAGS}") - target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_SSE41") -endif() - -if(COMPILER_HAS_AVX2) - target_sources(vec PRIVATE "src/impl/x86/avx2.c") - set_source_files_properties("src/impl/x86/avx2.c" PROPERTIES COMPILE_FLAGS "${COMPILER_AVX2_FLAGS}") - target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_AVX2") -endif() - -if(COMPILER_HAS_AVX512F) - target_sources(vec PRIVATE "src/impl/x86/avx512f.c") - set_source_files_properties("src/impl/x86/avx512f.c" PROPERTIES COMPILE_FLAGS "${COMPILER_AVX512F_FLAGS}") - target_compile_definitions(vec PRIVATE "-DVEC_COMPILER_HAS_AVX512F") -endif() - - ######################################################################### -# integer types; it's nice to accommodate for older broken systems that -# may not have stdint.h. +# integer types include(CheckTypeSize) @@ -126,7 +61,6 @@ check_type_size("long" LONG_SIZE LANGUAGE C) check_type_size("long long" LONG_LONG_SIZE LANGUAGE C) check_type_size("uintptr_t" UINTPTR_T_SIZE LANGUAGE C) -check_type_size("size_t" SIZE_T_SIZE LANGUAGE C) if(INT16_T_SIZE EQUAL 2) set(SIZE16 "int16_t") @@ -134,8 +68,6 @@ set(SIZE16 "short") elseif(INT_SIZE EQUAL 2) set(SIZE16 "int") -else() - message(FATAL_ERROR "Failed to find a signed 16-bit integer type.") endif() if(UINT16_T_SIZE EQUAL 2) @@ -146,8 +78,6 @@ set(USIZE16 "unsigned short") elseif(INT_SIZE EQUAL 2) set(USIZE16 "unsigned int") -else() - message(FATAL_ERROR "Failed to find an unsigned 16-bit integer type.") endif() if(INT32_T_SIZE EQUAL 4) @@ -158,8 +88,6 @@ set(SIZE32 "int") elseif(LONG_SIZE EQUAL 4) set(SIZE32 "long") -else() - message(FATAL_ERROR "Failed to find a signed 32-bit integer type.") endif() if(UINT32_T_SIZE EQUAL 4) @@ -172,8 +100,6 @@ set(USIZE32 "unsigned int") elseif(LONG_SIZE EQUAL 4) set(USIZE32 "unsigned long") -else() - message(FATAL_ERROR "Failed to find an unsigned 32-bit integer type.") endif() if(INT64_T_SIZE EQUAL 8) @@ -186,8 +112,6 @@ set(SIZE64 "long") elseif(LONG_LONG_SIZE EQUAL 8) set(SIZE64 "long long") -else() - message(FATAL_ERROR "Failed to find a signed 64-bit integer type.") endif() if(UINT64_T_SIZE EQUAL 8) @@ -202,46 +126,28 @@ set(USIZE64 "unsigned long") elseif(LONG_LONG_SIZE EQUAL 8) set(USIZE64 "unsigned long long") -else() - message(FATAL_ERROR "Failed to find an unsigned 64-bit integer type.") endif() if(CMAKE_SIZEOF_VOID_P EQUAL UINTPTR_T_SIZE) set(USIZEPTR "uintptr_t") -elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 1) +elseif(CMAKE_SIZEOF_VOID_P EQUAL 1) set(USIZEPTR "unsigned char") -elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 2) +elseif(CMAKE_SIZEOF_VOID_P EQUAL 2) set(USIZEPTR "${USIZE16}") -elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 4) +elseif(CMAKE_SIZEOF_VOID_P EQUAL 4) set(USIZEPTR "${USIZE32}") -elseif(CMAKE_SIZEOF_VOID_P LESS_EQUAL 8) +elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) set(USIZEPTR "${USIZE64}") -else() - message(FATAL_ERROR "Failed to find an integer type that can fit a pointer.") endif() -if(NOT SIZE_T_SIZE EQUAL 0 AND NOT SIZE_T_SIZE EQUAL "") - set(USIZESIZE "size_t") -else() - # should be close enough I guess - set(USIZESIZE "${USIZEPTR}") -endif() +configure_file(include/vec/impl/integer.h.in 
include/vec/impl/integer.h @ONLY) -configure_file(include/vec/types.h.in include/vec/types.h @ONLY) +target_compile_definitions(vec PRIVATE "VEC_HAVE_IMPL_INTEGER_H") ######################################################################### -target_sources(vec PUBLIC - $<INSTALL_INTERFACE:vec/vec.h> - $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/vec.h> - $<INSTALL_INTERFACE:vec/types.h> - $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/vec/types.h> - $<INSTALL_INTERFACE:vec/cpu.h> - $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/cpu.h> -) - target_compile_features(vec PRIVATE $<IF:$<COMPILE_FEATURES:c_std_11>,c_std_11,c_std_99>) -target_include_directories(vec PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_BINARY_DIR}/include") +target_include_directories(vec PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_BINARY_DIR}/include/vec") # Installing
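
The CheckTypeSize ladder above only picks, for each width, the first C type whose size matches, and configure_file() bakes those choices into include/vec/impl/integer.h. A minimal C sketch of the resulting contract is below; the vec_int16/vec_int32/vec_int64 names come from the other headers in this changeset, while the concrete base types and the vec_assert_* helpers are placeholders, not the contents of integer.h.in.

    /* Sketch only: the real typedefs are generated from integer.h.in by
     * CMake; the base types shown here stand in for whatever @SIZE16@,
     * @SIZE32@ and @SIZE64@ resolved to on a given platform. */
    typedef short     vec_int16;
    typedef int       vec_int32;
    typedef long long vec_int64;

    /* C99-compatible compile-time size checks mirroring the CMake logic
     * (a negative array size forces a compile error if a size is wrong). */
    typedef char vec_assert_int16[sizeof(vec_int16) == 2 ? 1 : -1];
    typedef char vec_assert_int32[sizeof(vec_int32) == 4 ? 1 : -1];
    typedef char vec_assert_int64[sizeof(vec_int64) == 8 ? 1 : -1];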
--- a/README Fri Apr 25 17:40:51 2025 -0400 +++ b/README Fri Apr 25 17:40:55 2025 -0400 @@ -1,4 +1,4 @@ -vec - a tiny SIMD vector library written in C99 +vec - a tiny SIMD vector header-only library written in C99 it comes with an extremely basic API that is similar to other intrinsics libraries; each type is in the exact same format: @@ -12,13 +12,6 @@ on processors where vec has an implementation and falls back to array-based implementations where they are not. -to initialize vec, you MUST call `vec_init()' when your program starts up. - -note that `vec_init()' is NOT thread-safe, and things can and will -blow up if you call it simultaneously from different threads (i.e. you -try to only initialize it when you need to... please just initialize -it on startup so you don't have to worry about that!!!) - all of these have many operations that are prefixed with the name of the type and an underscore, for example: @@ -113,3 +106,10 @@ the result vector if the value in `vec1' is greater than or equal to the corresponding value in `vec2', else all of the bits are turned off. + +to initialize vec, you MUST call `vec_init()' when your programs starts up. + +note that `vec_init()' is NOT thread-safe, and things can and will +blow up if you call it simultaneously from different threads (i.e. you +try to only initialize it when you need to... please just initialize +it on startup so you don't have to worry about that!!!)
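
Because the README above only describes the API in prose, here is a minimal, hedged usage sketch. vec_init() and the v[u]intNxM naming come straight from the README; the exact spelling of the load/add/store helpers and the VUINT32x4_ALIGNED_ARRAY macro are inferred from the implementation tables and align.h added elsewhere in this changeset, so treat the snippet as an illustration rather than an excerpt from the test suite.

    #include <stdio.h>

    #include "vec/vec.h"

    int main(void)
    {
        /* one-time, single-threaded setup, as the README requires */
        vec_init();

        VUINT32x4_ALIGNED_ARRAY(a);
        VUINT32x4_ALIGNED_ARRAY(b);
        VUINT32x4_ALIGNED_ARRAY(sum);

        for (int i = 0; i < 4; i++) {
            a[i] = (vec_uint32)i;
            b[i] = (vec_uint32)(10 * i);
        }

        /* load two aligned arrays, add element-wise, store the result */
        vuint32x4 v1 = vuint32x4_load_aligned(a);
        vuint32x4 v2 = vuint32x4_load_aligned(b);
        vuint32x4_store_aligned(vuint32x4_add(v1, v2), sum);

        for (int i = 0; i < 4; i++)
            printf("%u\n", (unsigned int)sum[i]);

        return 0;
    }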
--- a/include/vec/cpu.h Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -// Sure, this can be a public API. - -#ifndef VEC_CPU_H_ -#define VEC_CPU_H_ - -#include "vec/vec.h" - -enum { - VEC_CPU_HAS_ALTIVEC = (1 << 0), - VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1), - VEC_CPU_HAS_MMX = (1 << 2), - VEC_CPU_HAS_SSE = (1 << 3), - VEC_CPU_HAS_SSE2 = (1 << 4), - VEC_CPU_HAS_SSE3 = (1 << 5), - VEC_CPU_HAS_SSE41 = (1 << 6), - VEC_CPU_HAS_SSE42 = (1 << 7), - VEC_CPU_HAS_AVX = (1 << 8), - VEC_CPU_HAS_AVX2 = (1 << 9), - VEC_CPU_HAS_AVX512F = (1 << 10), - VEC_CPU_HAS_NEON = (1 << 11), -}; - -// NOT thread-safe. -vec_uint32 vec_get_CPU_features(void); - -#endif /* VEC_CPU_H_ */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/align.h Fri Apr 25 17:40:55 2025 -0400 @@ -0,0 +1,267 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_ALIGN_H_ +#define VEC_IMPL_ALIGN_H_ + +// Array alignment macros + +#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) +# define VEC_ALIGNAS(x) alignas(x) +#elif (__STDC_VERSION__ >= 201112L) +# define VEC_ALIGNAS(x) _Alignas(x) +#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0) +# define VEC_ALIGNAS(x) __attribute__((__aligned__(x))) +#endif + +/* the alignment must be specified in bytes and must be a multiple of the + * type size. it is always assumed that the type will be on a boundary of + * its size, which may or may not be true */ +#ifdef VEC_ALIGNAS +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_ALIGNAS(align) type var[length] +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(var)) +#else +// use unions to get an aligned offset without triggering strict aliasing +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \ + union vec_aligned_union_##var##_ { \ + type arr[length]; \ + unsigned char bytes[sizeof(type) * length]; \ + }; \ + unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \ + type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \ + VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(vec_unaligned_##var##_) - (align - 1)) +#endif + +#define VEC_ALIGNED_ARRAY_LENGTH(var) \ + (VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var)) + +////////////////////////////////////////////////////////////////////////////////////// +// predefined variants for each vector type + +////////////////////////////////////////////////////////////////////////////////////// +// 16-bit + +#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT) +#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT) +#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT) +#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0) + +#define VUINT8x2_ALIGNED_ARRAY(var) 
VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT) +#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT) +#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT) +#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 32-bit + +#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT) +#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT) +#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT) +#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0) + +#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT) +#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT) +#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT) +#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0) + +#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT) +#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT) +#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT) +#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0) + +#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT) +#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT) +#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT) +#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 64-bit + +#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT) +#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0) + +#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT) +#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0) + +#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT) +#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0) + +#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0) + +#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT) +#define 
VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0) + +#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 128-bit + +#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) +#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) + +#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) +#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) + +#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) +#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) + +#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) +#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) + +#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) + +#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) + +#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) + +#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 256-bit + +#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) +#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) + +#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) +#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) + +#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) +#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) + +#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) +#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) + +#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) +#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) + +#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) +#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) + +#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) +#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) + +#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) +#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 512-bit + +#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) +#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) + +#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT) +#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) + +#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) +#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) + +#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) +#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) + +#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) +#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) + +#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT) +#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) + +#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) +#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) + +#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) +#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// + +#endif /* VEC_IMPL_ALIGN_H_ */
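
The fallback branch of VEC_ALIGNED_ARRAY above uses the classic over-allocate-and-round-up trick: reserve (length * sizeof(type)) + align - 1 bytes, then round the start address up to the next multiple of align. A standalone sketch of just that address arithmetic, using plain stdint types instead of the vec_uintptr typedef:

    #include <assert.h>
    #include <stdint.h>

    /* Round `addr` up to the next multiple of `align` (align must be a
     * power of two), exactly as the fallback VEC_ALIGNED_ARRAY does. */
    static uintptr_t vec_align_up(uintptr_t addr, uintptr_t align)
    {
        assert(align && (align & (align - 1)) == 0);
        return (addr + (align - 1)) & ~(align - 1);
    }

    /* e.g. with a 16-byte requirement:
     *   vec_align_up(0x1001, 16) == 0x1010
     *   vec_align_up(0x1010, 16) == 0x1010   (already aligned, unchanged)
     * Because the padded buffer is (length * sizeof(type)) + align - 1 bytes,
     * the rounded-up pointer always stays inside the allocation. */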
--- a/include/vec/impl/arm/neon.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/arm/neon.h Fri Apr 25 17:40:55 2025 -0400 @@ -25,22 +25,465 @@ #ifndef VEC_IMPL_ARM_NEON_H_ #define VEC_IMPL_ARM_NEON_H_ -#include "vec/vec.h" +#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vld1_##sign##bits(in); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + vstore_lane_##bits(sign, vec.neon, out); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_add, \ + v##sign##int##bits##x##size##_neon_sub, \ + v##sign##int##bits##x##size##_neon_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_neon_and, \ + v##sign##int##bits##x##size##_neon_or, \ + v##sign##int##bits##x##size##_neon_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_neon_lshift, \ + /* .rshift = */ NULL, \ + /* .lrshift = */ NULL, \ + }; + +#define VEC_DEFINE_OPERATIONS(bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +// 
Ok, we'll start out with the 64-bit types. -extern const vint8x8_impl vint8x8_impl_neon; -extern const vint16x4_impl vint16x4_impl_neon; -extern const vint32x2_impl vint32x2_impl_neon; -extern const vuint8x8_impl vuint8x8_impl_neon; -extern const vuint16x4_impl vuint16x4_impl_neon; -extern const vuint32x2_impl vuint32x2_impl_neon; +#define vadd_8 vadd_s8 +#define vadd_16 vadd_s16 +#define vadd_32 vadd_s32 +#define vsub_8 vsub_s8 +#define vsub_16 vsub_s16 +#define vsub_32 vsub_s32 +#define vmul_8 vmul_s8 +#define vmul_16 vmul_s16 +#define vmul_32 vmul_s32 +#define vshl_8 vshl_s8 +#define vshl_16 vshl_s16 +#define vshl_32 vshl_s32 +#define veor_8 veor_s8 +#define veor_16 veor_s16 +#define veor_32 veor_s32 +#define vorr_8 vorr_s8 +#define vorr_16 vorr_s16 +#define vorr_32 vorr_s32 +#define vand_8 vand_s8 +#define vand_16 vand_s16 +#define vand_32 vand_s32 +#define vld1_8 vld1_s8 +#define vld1_16 vld1_s16 +#define vld1_32 vld1_s32 +#define vget_lane_8 vget_lane_s8 +#define vget_lane_16 vget_lane_s16 +#define vget_lane_32 vget_lane_s32 +#define vstore_lane_8(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##8(vec, 0); \ + out[1] = vget_lane_##sign##8(vec, 1); \ + out[2] = vget_lane_##sign##8(vec, 2); \ + out[3] = vget_lane_##sign##8(vec, 3); \ + out[4] = vget_lane_##sign##8(vec, 4); \ + out[5] = vget_lane_##sign##8(vec, 5); \ + out[6] = vget_lane_##sign##8(vec, 6); \ + out[7] = vget_lane_##sign##8(vec, 7); \ + } while (0) +#define vstore_lane_16(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##16(vec, 0); \ + out[1] = vget_lane_##sign##16(vec, 1); \ + out[2] = vget_lane_##sign##16(vec, 2); \ + out[3] = vget_lane_##sign##16(vec, 3); \ + } while (0) +#define vstore_lane_32(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##32(vec, 0); \ + out[1] = vget_lane_##sign##32(vec, 1); \ + } while (0) +#define vreinterpret_8_u8(x) vreinterpret_s8_u8(x) +#define vreinterpret_16_u16(x) vreinterpret_s16_u16(x) +#define vreinterpret_32_u32(x) vreinterpret_s32_u32(x) + +VEC_DEFINE_OPERATIONS(8, 8) +VEC_DEFINE_OPERATIONS(16, 4) +VEC_DEFINE_OPERATIONS(32, 2) + +#undef vadd_8 +#undef vadd_16 +#undef vadd_32 +#undef vsub_8 +#undef vsub_16 +#undef vsub_32 +#undef vmul_8 +#undef vmul_16 +#undef vmul_32 +#undef vshl_8 +#undef vshl_16 +#undef vshl_32 +#undef veor_8 +#undef veor_16 +#undef veor_32 +#undef vorr_8 +#undef vorr_16 +#undef vorr_32 +#undef vand_8 +#undef vand_16 +#undef vand_32 +#undef vld1_8 +#undef vld1_16 +#undef vld1_32 +#undef vget_lane_8 +#undef vget_lane_16 +#undef vget_lane_32 +#undef vstore_lane_8 +#undef vstore_lane_16 +#undef vstore_lane_32 +#undef vreinterpret_8_u8 +#undef vreinterpret_16_u16 +#undef vreinterpret_32_u32 + +/////////////////////////////////////////////////////////////////////////////// +// 128-bit + +// Now we can go ahead and do the 128-bit ones. 
+ +// NEON doesn't have native 64-bit multiplication, so we have +// to do it ourselves +static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b) +{ + const uint32x2_t ac = vreinterpret_u32_s32(vmovn_s64(a)); + const uint32x2_t pr = vreinterpret_u32_s32(vmovn_s64(b)); + + const int32x4_t hi = vmulq_s32(vreinterpretq_s32_s64(b), vreinterpretq_s32_s64(a)); + + return vreinterpretq_s64_u64(vmlal_u32(vreinterpretq_u64_s64(vshlq_n_s64(vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s32(hi))), 32)), ac, pr)); +} + +static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b) +{ + const uint32x2_t ac = vmovn_u64(a); + const uint32x2_t pr = vmovn_u64(b); + + const uint32x4_t hi = vmulq_u32(vreinterpretq_u32_u64(b), vreinterpretq_u32_u64(a)); + + return vmlal_u32(vshlq_n_u64(vpaddlq_u32(hi), 32), ac, pr); +} -extern const vint8x16_impl vint8x16_impl_neon; -extern const vint16x8_impl vint16x8_impl_neon; -extern const vint32x4_impl vint32x4_impl_neon; -extern const vint64x2_impl vint64x2_impl_neon; -extern const vuint8x16_impl vuint8x16_impl_neon; -extern const vuint16x8_impl vuint16x8_impl_neon; -extern const vuint32x4_impl vuint32x4_impl_neon; -extern const vuint64x2_impl vuint64x2_impl_neon; +#define vadd_8 vaddq_s8 +#define vadd_16 vaddq_s16 +#define vadd_32 vaddq_s32 +#define vadd_64 vaddq_s64 +#define vadd_u8 vaddq_u8 +#define vadd_u16 vaddq_u16 +#define vadd_u32 vaddq_u32 +#define vadd_u64 vaddq_u64 +#define vsub_8 vsubq_s8 +#define vsub_16 vsubq_s16 +#define vsub_32 vsubq_s32 +#define vsub_64 vsubq_s64 +#define vsub_u8 vsubq_u8 +#define vsub_u16 vsubq_u16 +#define vsub_u32 vsubq_u32 +#define vsub_u64 vsubq_u64 +#define vmul_8 vmulq_s8 +#define vmul_16 vmulq_s16 +#define vmul_32 vmulq_s32 +#define vmul_64 vmulq_s64 +#define vmul_u8 vmulq_u8 +#define vmul_u16 vmulq_u16 +#define vmul_u32 vmulq_u32 +#define vmul_u64 vmulq_u64 +#define vshl_8 vshlq_s8 +#define vshl_16 vshlq_s16 +#define vshl_32 vshlq_s32 +#define vshl_64 vshlq_s64 +#define vshl_u8 vshlq_u8 +#define vshl_u16 vshlq_u16 +#define vshl_u32 vshlq_u32 +#define vshl_u64 vshlq_u64 +#define veor_8 veorq_s8 +#define veor_16 veorq_s16 +#define veor_32 veorq_s32 +#define veor_64 veorq_s64 +#define veor_u8 veorq_u8 +#define veor_u16 veorq_u16 +#define veor_u32 veorq_u32 +#define veor_u64 veorq_u64 +#define vorr_8 vorrq_s8 +#define vorr_16 vorrq_s16 +#define vorr_32 vorrq_s32 +#define vorr_64 vorrq_s64 +#define vorr_u8 vorrq_u8 +#define vorr_u16 vorrq_u16 +#define vorr_u32 vorrq_u32 +#define vorr_u64 vorrq_u64 +#define vand_8 vandq_s8 +#define vand_16 vandq_s16 +#define vand_32 vandq_s32 +#define vand_64 vandq_s64 +#define vand_u8 vandq_u8 +#define vand_u16 vandq_u16 +#define vand_u32 vandq_u32 +#define vand_u64 vandq_u64 +#define vld1_8 vld1q_s8 +#define vld1_16 vld1q_s16 +#define vld1_32 vld1q_s32 +#define vld1_64 vld1q_s64 +#define vld1_u8 vld1q_u8 +#define vld1_u16 vld1q_u16 +#define vld1_u32 vld1q_u32 +#define vld1_u64 vld1q_u64 +#define vget_lane_8 vgetq_lane_s8 +#define vget_lane_16 vgetq_lane_s16 +#define vget_lane_32 vgetq_lane_s32 +#define vget_lane_64 vgetq_lane_s64 +#define vget_lane_u8 vgetq_lane_u8 +#define vget_lane_u16 vgetq_lane_u16 +#define vget_lane_u32 vgetq_lane_u32 +#define vget_lane_u64 vgetq_lane_u64 +#define vstore_lane_8(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##8(vec, 0); \ + out[1] = vget_lane_##sign##8(vec, 1); \ + out[2] = vget_lane_##sign##8(vec, 2); \ + out[3] = vget_lane_##sign##8(vec, 3); \ + out[4] = vget_lane_##sign##8(vec, 4); \ + out[5] = 
vget_lane_##sign##8(vec, 5); \ + out[6] = vget_lane_##sign##8(vec, 6); \ + out[7] = vget_lane_##sign##8(vec, 7); \ + out[8] = vget_lane_##sign##8(vec, 8); \ + out[9] = vget_lane_##sign##8(vec, 9); \ + out[10] = vget_lane_##sign##8(vec, 10); \ + out[11] = vget_lane_##sign##8(vec, 11); \ + out[12] = vget_lane_##sign##8(vec, 12); \ + out[13] = vget_lane_##sign##8(vec, 13); \ + out[14] = vget_lane_##sign##8(vec, 14); \ + out[15] = vget_lane_##sign##8(vec, 15); \ + } while (0) +#define vstore_lane_16(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##16(vec, 0); \ + out[1] = vget_lane_##sign##16(vec, 1); \ + out[2] = vget_lane_##sign##16(vec, 2); \ + out[3] = vget_lane_##sign##16(vec, 3); \ + out[4] = vget_lane_##sign##16(vec, 4); \ + out[5] = vget_lane_##sign##16(vec, 5); \ + out[6] = vget_lane_##sign##16(vec, 6); \ + out[7] = vget_lane_##sign##16(vec, 7); \ + } while (0) +#define vstore_lane_32(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##32(vec, 0); \ + out[1] = vget_lane_##sign##32(vec, 1); \ + out[2] = vget_lane_##sign##32(vec, 2); \ + out[3] = vget_lane_##sign##32(vec, 3); \ + } while (0) +#define vstore_lane_64(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##64(vec, 0); \ + out[1] = vget_lane_##sign##64(vec, 1); \ + } while (0) +#define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x) +#define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x) +#define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x) +#define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x) + +#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vld1_##sign##bits(in); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + vstore_lane_##bits(sign, vec.neon, out); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; 
\ + vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_add, \ + v##sign##int##bits##x##size##_neon_sub, \ + v##sign##int##bits##x##size##_neon_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_neon_and, \ + v##sign##int##bits##x##size##_neon_or, \ + v##sign##int##bits##x##size##_neon_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_neon_lshift, \ + /* .rshift = */ NULL, \ + /* .lrshift = */ NULL, \ + }; + +#define VEC_DEFINE_OPERATIONS(bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +VEC_DEFINE_OPERATIONS(8, 16) +VEC_DEFINE_OPERATIONS(16, 8) +VEC_DEFINE_OPERATIONS(32, 4) +VEC_DEFINE_OPERATIONS(64, 2) + +#undef vadd_8 +#undef vadd_16 +#undef vadd_32 +#undef vadd_64 +#undef vsub_8 +#undef vsub_16 +#undef vsub_32 +#undef vsub_64 +#undef vmul_8 +#undef vmul_16 +#undef vmul_32 +#undef vmul_64 +#undef vshl_8 +#undef vshl_16 +#undef vshl_32 +#undef vshl_64 +#undef veor_8 +#undef veor_16 +#undef veor_32 +#undef veor_64 +#undef vorr_8 +#undef vorr_16 +#undef vorr_32 +#undef vorr_64 +#undef vand_8 +#undef vand_16 +#undef vand_32 +#undef vand_64 +#undef vld1_8 +#undef vld1_16 +#undef vld1_32 +#undef vld1_64 +#undef vget_lane_8 +#undef vget_lane_16 +#undef vget_lane_32 +#undef vget_lane_64 +#undef vstore_lane_8 +#undef vstore_lane_16 +#undef vstore_lane_32 +#undef vstore_lane_64 + +#undef VEC_DEFINE_OPERATIONS +#undef VEC_DEFINE_OPERATIONS_SIGN #endif /* VEC_IMPL_ARM_NEON_H_ */
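
The vmulq_s64/vmulq_u64 helpers above build a 64-bit lane multiply out of 32-bit pieces because NEON has no native 64-bit multiply. Writing a = a_hi*2^32 + a_lo and b = b_hi*2^32 + b_lo, the low 64 bits of a*b are a_lo*b_lo + ((a_hi*b_lo + a_lo*b_hi) << 32); the a_hi*b_hi term only affects bits 64 and up and is dropped. A scalar sketch of that identity follows (note that on NEON the two cross terms are normally obtained by multiplying one operand against the half-swapped other, e.g. via vrev64q_u32, before the widening accumulate):

    #include <stdint.h>

    /* Scalar model of the 64x64 -> low-64 multiply that the NEON helpers
     * emulate: split each operand into 32-bit halves and keep only the
     * partial products that land in the low 64 bits. */
    static uint64_t vec_mul64_lo(uint64_t a, uint64_t b)
    {
        uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
        uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;

        /* a*b mod 2^64 = a_lo*b_lo + ((a_hi*b_lo + a_lo*b_hi) << 32) */
        return a_lo * b_lo + ((a_hi * b_lo + a_lo * b_hi) << 32);
    }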
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/cpu.h Fri Apr 25 17:40:55 2025 -0400 @@ -0,0 +1,512 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_CPU_H_ +#define VEC_IMPL_CPU_H_ + +/* Detect CPU SIMD support. Much of this code was stolen from SDL. + * + * Simple DirectMedia Layer + * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org> + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+*/ + +#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) +# include <sys/sysctl.h> // For AltiVec check +#elif defined(__OpenBSD__) && defined(__powerpc__) +# include <sys/types.h> +# include <sys/sysctl.h> // For AltiVec check +# include <machine/cpu.h> +#elif defined(__FreeBSD__) && defined(__powerpc__) +# include <machine/cpu.h> +# include <sys/auxv.h> +#elif defined(__ALTIVEC__) +# include <signal.h> +# include <setjmp.h> +#endif + +#ifdef __FreeBSD__ +# include <sys/param.h> +#endif + +#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__) +# include <unistd.h> +# include <sys/types.h> +# include <sys/stat.h> +# include <fcntl.h> +# include <elf.h> + +/*#include <asm/hwcap.h>*/ +# ifndef AT_HWCAP +# define AT_HWCAP 16 +# endif +# ifndef AT_PLATFORM +# define AT_PLATFORM 15 +# endif +# ifndef HWCAP_NEON +# define HWCAP_NEON (1 << 12) +# endif +#endif + +static inline int vec_CPU_have_CPUID(void) +{ + int has_CPUID = 0; + +#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) + __asm__ ( +" pushfl # Get original EFLAGS \n" +" popl %%eax \n" +" movl %%eax,%%ecx \n" +" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" +" pushl %%eax # Save new EFLAGS value on stack \n" +" popfl # Replace current EFLAGS value \n" +" pushfl # Get new EFLAGS \n" +" popl %%eax # Store new EFLAGS in EAX \n" +" xorl %%ecx,%%eax # Can not toggle ID bit, \n" +" jz 1f # Processor=80486 \n" +" movl $1,%0 # We have CPUID support \n" +"1: \n" + : "=m" (has_CPUID) + : + : "%eax", "%ecx" + ); +#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) +/* Technically, if this is being compiled under __x86_64__ then it has + CPUid by definition. But it's nice to be able to prove it. :) */ + __asm__ ( +" pushfq # Get original EFLAGS \n" +" popq %%rax \n" +" movq %%rax,%%rcx \n" +" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" +" pushq %%rax # Save new EFLAGS value on stack \n" +" popfq # Replace current EFLAGS value \n" +" pushfq # Get new EFLAGS \n" +" popq %%rax # Store new EFLAGS in EAX \n" +" xorl %%ecx,%%eax # Can not toggle ID bit, \n" +" jz 1f # Processor=80486 \n" +" movl $1,%0 # We have CPUID support \n" +"1: \n" + : "=m" (has_CPUID) + : + : "%rax", "%rcx" + ); +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) + __asm { + pushfd ; Get original EFLAGS + pop eax + mov ecx, eax + xor eax, 200000h ; Flip ID bit in EFLAGS + push eax ; Save new EFLAGS value on stack + popfd ; Replace current EFLAGS value + pushfd ; Get new EFLAGS + pop eax ; Store new EFLAGS in EAX + xor eax, ecx ; Can not toggle ID bit, + jz done ; Processor=80486 + mov has_CPUID,1 ; We have CPUID support +done: + } +#elif defined(_MSC_VER) && defined(_M_X64) + has_CPUID = 1; +#elif defined(__sun) && defined(__i386) + __asm ( +" pushfl \n" +" popl %eax \n" +" movl %eax,%ecx \n" +" xorl $0x200000,%eax \n" +" pushl %eax \n" +" popfl \n" +" pushfl \n" +" popl %eax \n" +" xorl %ecx,%eax \n" +" jz 1f \n" +" movl $1,-8(%ebp) \n" +"1: \n" + ); +#elif defined(__sun) && defined(__amd64) + __asm ( +" pushfq \n" +" popq %rax \n" +" movq %rax,%rcx \n" +" xorl $0x200000,%eax \n" +" pushq %rax \n" +" popfq \n" +" pushfq \n" +" popq %rax \n" +" xorl %ecx,%eax \n" +" jz 1f \n" +" movl $1,-8(%rbp) \n" +"1: \n" + ); +#endif + + return has_CPUID; +} + +#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm__ __volatile__( \ + " pushl %%ebx \n" \ + " xorl %%ecx,%%ecx \n" \ + " cpuid \n" \ + " movl %%ebx, %%esi \n" \ + " popl %%ebx 
\n" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "a"(func)) +#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm__ __volatile__( \ + " pushq %%rbx \n" \ + " xorq %%rcx,%%rcx \n" \ + " cpuid \n" \ + " movq %%rbx, %%rsi \n" \ + " popq %%rbx \n" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "a"(func)) +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm { \ + __asm mov eax, func \ + __asm xor ecx, ecx \ + __asm cpuid \ + __asm mov a, eax \ + __asm mov b, ebx \ + __asm mov c, ecx \ + __asm mov d, edx \ + } +#elif (defined(_MSC_VER) && defined(_M_X64)) +// Use __cpuidex instead of __cpuid because ICL does not clear ecx register +# define VEC_CPU_CPUID(func, a, b, c, d) \ + do { \ + int CPUInfo[4]; \ + __cpuidex(CPUInfo, func, 0); \ + a = CPUInfo[0]; \ + b = CPUInfo[1]; \ + c = CPUInfo[2]; \ + d = CPUInfo[3]; \ + } while (0) +#else +# define VEC_CPU_CPUID(func, a, b, c, d) \ + do { \ + a = b = c = d = 0; \ + (void)a; \ + (void)b; \ + (void)c; \ + (void)d; \ + } while (0) +#endif + +// --------------------------------------------------------------- + +static int vec_CPU_CPUIDFeatures[4]; +static int vec_CPU_CPUIDMaxFunction = 0; +static int vec_CPU_OSSavesYMM = 0; +static int vec_CPU_OSSavesZMM = 0; + +static inline void vec_CPU_get_CPUID_features(void) +{ + static int checked = 0; + if (!checked) { + checked = 1; + if (vec_CPU_have_CPUID()) { + int a, b, c, d; + VEC_CPU_CPUID(0, a, b, c, d); + vec_CPU_CPUIDMaxFunction = a; + if (vec_CPU_CPUIDMaxFunction >= 1) { + VEC_CPU_CPUID(1, a, b, c, d); + vec_CPU_CPUIDFeatures[0] = a; + vec_CPU_CPUIDFeatures[1] = b; + vec_CPU_CPUIDFeatures[2] = c; + vec_CPU_CPUIDFeatures[3] = d; + + // Check to make sure we can call xgetbv + if (c & 0x08000000) { + // Call xgetbv to see if YMM (etc) register state is saved +#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__)) + __asm__(".byte 0x0f, 0x01, 0xd0" + : "=a"(a) + : "c"(0) + : "%edx"); +#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1 + a = (int)_xgetbv(0); +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) + __asm { + xor ecx, ecx + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 + mov a, eax + } +#endif + vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0; + vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 
1 : 0; + } + } + } + } +} + +#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) +static jmp_buf vec_jmpbuf; +static void vec_CPU_illegal_instruction(int sig) +{ + longjmp(vec_jmpbuf, 1); +} +#endif + +static int vec_CPU_have_ALTIVEC(void) +{ + volatile int altivec = 0; +#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__)) + int selectors[2] = { +# ifdef __OpenBSD__ + CTL_MACHDEP, CPU_ALTIVEC +# else + CTL_HW, HW_VECTORUNIT +# endif + }; + int hasVectorUnit = 0; + vec_uintsize length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if (!error) + altivec = (hasVectorUnit != 0); +#elif defined(__FreeBSD__) && defined(__powerpc__) + unsigned long cpufeatures = 0; + elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)); + altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC; +#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) + void (*handler)(int sig); + handler = signal(SIGILL, vec_CPU_illegal_instruction); + if (!setjmp(vec_jmpbuf)) { + vector unsigned char vec; + vec_and(vec, vec); + altivec = 1; + } + signal(SIGILL, handler); +#endif + return altivec; +} + +static int vec_CPU_have_ALTIVEC_VSX(void) +{ + volatile int vsx = 0; +#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__) +# warning Compiling UNTESTED code for VSX. + void (*handler)(int sig); + handler = signal(SIGILL, vec_CPU_illegal_instruction); + if (!setjmp(vec_jmpbuf)) { + // this is completely untested + //__asm__ __volatile__("mtspr 256, %0\n\t" + // "xxland %%v0, %%v0, %%v0" ::"r"(-1)); + //vsx = 1; + } + signal(SIGILL, handler); +#endif + return vsx; +} + +#define vec_CPU_have_MMX() (vec_CPU_CPUIDFeatures[3] & 0x00800000) +#define vec_CPU_have_SSE() (vec_CPU_CPUIDFeatures[3] & 0x02000000) +#define vec_CPU_have_SSE2() (vec_CPU_CPUIDFeatures[3] & 0x04000000) +#define vec_CPU_have_SSE3() (vec_CPU_CPUIDFeatures[2] & 0x00000001) +#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000) +#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000) +#define vec_CPU_have_AVX() (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000)) + +static inline int vec_CPU_have_AVX2(void) +{ + if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { + int a, b, c, d; + VEC_CPU_CPUID(7, a, b, c, d); + return b & 0x00000020; + (void)a, (void)c, (void)d; + } + return 0; +} + +static inline int vec_CPU_have_AVX512F(void) +{ + if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { + int a, b, c, d; + VEC_CPU_CPUID(7, a, b, c, d); + return b & 0x00000020; + (void)a, (void)c, (void)d; + } + return 0; +} + +#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL) +static int readProcAuxvForNeon(void) +{ + int neon = 0; + int fd; + + fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); + if (fd >= 0) { + Elf32_auxv_t aux; + while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) { + if (aux.a_type == AT_HWCAP) { + neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON; + break; + } + } + close(fd); + } + return neon; +} +#endif + +static int vec_CPU_have_NEON(void) +{ +/* The way you detect NEON is a privileged instruction on ARM, so you have + query the OS kernel in a platform-specific way. 
:/ */ +#if defined(SDL_CPUINFO_DISABLED) + return 0; /* disabled */ +#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64)) +/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */ +/* Seems to have been removed */ +#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE +#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19 +#endif + /* All WinRT ARM devices are required to support NEON, but just in case. */ + return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0; +#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__) + return 1; /* ARMv8 always has non-optional NEON support. */ +#elif defined(__VITA__) + return 1; +#elif defined(__3DS__) + return 0; +#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) + /* (note that sysctlbyname("hw.optional.neon") doesn't work!) */ + return 1; /* all Apple ARMv7 chips and later have NEON. */ +#elif defined(__APPLE__) + return 0; /* assume anything else from Apple doesn't have NEON. */ +#elif !defined(__arm__) + return 0; /* not an ARM CPU at all. */ +#elif defined(__OpenBSD__) + return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */ +#elif defined(HAVE_ELF_AUX_INFO) + unsigned long hasneon = 0; + if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0) + return 0; + + return ((hasneon & HWCAP_NEON) == HWCAP_NEON); +#elif defined(__QNXNTO__) + return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON; +#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON; +#elif defined(__linux__) + return readProcAuxvForNeon(); +#elif defined(__ANDROID__) + /* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */ + { + AndroidCpuFamily cpu_family = android_getCpuFamily(); + if (cpu_family == ANDROID_CPU_FAMILY_ARM) { + uint64_t cpu_features = android_getCpuFeatures(); + if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) { + return 1; + } + } + return 0; + } +#elif defined(__RISCOS__) + /* Use the VFPSupport_Features SWI to access the MVFR registers */ + { + _kernel_swi_regs regs; + regs.r[0] = 0; + if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) { + if ((regs.r[2] & 0xFFF000) == 0x111000) { + return 1; + } + } + return 0; + } +#else +#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me. 
+ return 0; +#endif +} + +enum { + VEC_CPU_HAS_ALTIVEC = (1 << 0), + VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1), + VEC_CPU_HAS_MMX = (1 << 2), + VEC_CPU_HAS_SSE = (1 << 3), + VEC_CPU_HAS_SSE2 = (1 << 4), + VEC_CPU_HAS_SSE3 = (1 << 5), + VEC_CPU_HAS_SSE41 = (1 << 6), + VEC_CPU_HAS_SSE42 = (1 << 7), + VEC_CPU_HAS_AVX = (1 << 8), + VEC_CPU_HAS_AVX2 = (1 << 9), + VEC_CPU_HAS_AVX512F = (1 << 10), + VEC_CPU_HAS_NEON = (1 << 11), +}; + +#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF) + +static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET; + +static void vec_get_CPU_features(void) +{ + vec_CPU_get_CPUID_features(); + vec_CPU_features = 0; + if (vec_CPU_have_ALTIVEC()) + vec_CPU_features |= VEC_CPU_HAS_ALTIVEC; + if (vec_CPU_have_ALTIVEC_VSX()) + vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX; + if (vec_CPU_have_MMX()) + vec_CPU_features |= VEC_CPU_HAS_MMX; + if (vec_CPU_have_SSE()) + vec_CPU_features |= VEC_CPU_HAS_SSE; + if (vec_CPU_have_SSE2()) + vec_CPU_features |= VEC_CPU_HAS_SSE2; + if (vec_CPU_have_SSE3()) + vec_CPU_features |= VEC_CPU_HAS_SSE3; + if (vec_CPU_have_SSE41()) + vec_CPU_features |= VEC_CPU_HAS_SSE41; + if (vec_CPU_have_SSE42()) + vec_CPU_features |= VEC_CPU_HAS_SSE42; + if (vec_CPU_have_AVX()) + vec_CPU_features |= VEC_CPU_HAS_AVX; + if (vec_CPU_have_AVX2()) + vec_CPU_features |= VEC_CPU_HAS_AVX2; + if (vec_CPU_have_AVX512F()) + vec_CPU_features |= VEC_CPU_HAS_AVX512F; + if (vec_CPU_have_NEON()) + vec_CPU_features |= VEC_CPU_HAS_NEON; +} + +#endif /* VEC_IMPL_CPU_H_ */
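Editor's note on the cpu.h hunk above: all probing is cached in static state -- the CPUID feature words, the maximum CPUID leaf, and whether the OS actually saves YMM/ZMM state (checked via xgetbv, since the AVX CPUID bit alone is not sufficient; for reference, leaf 7 reports AVX2 in EBX bit 5 and AVX-512F in EBX bit 16). A minimal sketch of the same detect-once pattern, using hypothetical names rather than the library's real probes or public API:

#include <stdint.h>

/* Sketch only: my_detect_features() and MY_CPU_HAS_* are stand-ins. */
#define MY_FEATURES_UNSET UINT32_C(0xFFFFFFFF)   /* sentinel: not probed yet */

enum {
    MY_CPU_HAS_SSE2 = 1u << 0,
    MY_CPU_HAS_AVX2 = 1u << 1
};

static uint32_t my_cpu_features = MY_FEATURES_UNSET;

static uint32_t my_detect_features(void)
{
    uint32_t mask = 0;
    /* ... run the CPUID / xgetbv / sysctl probes exactly once here ... */
    return mask;
}

uint32_t my_get_cpu_features(void)
{
    if (my_cpu_features == MY_FEATURES_UNSET)
        my_cpu_features = my_detect_features();  /* first call does the work */
    return my_cpu_features;
}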
--- a/include/vec/impl/fallback.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/fallback.h Fri Apr 25 17:40:55 2025 -0400 @@ -25,29 +25,148 @@ #ifndef VEC_IMPL_FALLBACK_H_ #define VEC_IMPL_FALLBACK_H_ -#include "vec/vec.h" +#include <string.h> + +// Fallback implementations - this is what an implementation should use if it +// doesn't support a specific function. Note that the load_aligned and +// store_aligned functions are not implemented here - this is on purpose; +// every single implementation *needs* to have one of these. + +#define VEC_FALLBACK_OPERATION(op, sign, csign, bits, size) \ + do { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \ + \ + v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ + v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \ + \ + for (int i = 0; i < size; i++) varr1[i] = (op); \ + \ + return v##sign##int##bits##x##size##_load_aligned(varr1); \ + } while (0) + +#define VEC_FALLBACK_CMP(op, sign, csign, bits, size) \ + VEC_FALLBACK_OPERATION((varr1[i] op varr2[i]) ? UINT##bits##_MAX : 0, sign, csign, bits, size) + +#define VEC_FALLBACK_SHIFT(op, sign, csign, bits, size) \ + do { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ + VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \ + \ + v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ + vuint##bits##x##size##_store_aligned(vec2, varr2); \ + \ + for (int i = 0; i < size; i++) varr1[i] = (op); \ + \ + return v##sign##int##bits##x##size##_load_aligned(varr1); \ + } while (0) #define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]); \ - void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - 
v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x) \ + { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ + for (int i = 0; i < size; i++) arr[i] = x; \ + return v##sign##int##bits##x##size##_load_aligned(arr); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]) \ + { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ + memcpy(arr, in, sizeof(vec_##sign##int##bits) * size); \ + return v##sign##int##bits##x##size##_load_aligned(arr); \ + } \ + \ + static void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ + v##sign##int##bits##x##size##_store_aligned(vec, arr); \ + memcpy(out, arr, sizeof(vec_##sign##int##bits) * size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr2[i] ? 
(varr1[i] / varr2[i]) : 0, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \ + { \ + return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)UINT##bits##_MAX)); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(<, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(<=, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(==, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(>=, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(>, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_FALLBACK_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_FALLBACK_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_FALLBACK_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size); \ + } #define VEC_DEFINE_FALLBACK_OPERATIONS(bits, size) \ VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( , , bits, size) \ @@ -83,6 +202,9 @@ VEC_DEFINE_FALLBACK_OPERATIONS(32, 16) VEC_DEFINE_FALLBACK_OPERATIONS(64, 8) +#undef VEC_FALLBACK_OPERATION +#undef VEC_FALLBACK_CMP +#undef VEC_FALLBACK_SHIFT #undef VEC_DEFINE_FALLBACK_OPERATIONS #undef VEC_DEFINE_FALLBACK_OPERATIONS_SIGN
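Editor's note on the fallback.h hunk above: every fallback follows one pattern -- spill both operands to aligned scalar arrays, loop over the lanes, and load the result back. Roughly what VEC_FALLBACK_OPERATION expands to for the add of a 32-bit, 4-lane type (the names follow the macro concatenation; treat this as an illustration, not a copy of the generated code):

/* Illustrative expansion of VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], ...) */
static vint32x4 vint32x4_fallback_add(vint32x4 vec1, vint32x4 vec2)
{
    VINT32x4_ALIGNED_ARRAY(varr1);          /* aligned scratch arrays */
    VINT32x4_ALIGNED_ARRAY(varr2);

    vint32x4_store_aligned(vec1, varr1);    /* spill both operands */
    vint32x4_store_aligned(vec2, varr2);

    for (int i = 0; i < 4; i++)
        varr1[i] = varr1[i] + varr2[i];     /* per-lane scalar operation */

    return vint32x4_load_aligned(varr1);    /* reload the result */
}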
--- a/include/vec/impl/generic.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/generic.h Fri Apr 25 17:40:55 2025 -0400 @@ -27,113 +27,114 @@ #ifndef VEC_IMPL_GENERIC_H_ #define VEC_IMPL_GENERIC_H_ -#include "vec/vec.h" +#include <string.h> + +// ----------------------------------------------------------------- + +// TODO implement these so we don't waste stack space by doing the +// fallbacks +#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + memcpy(vec.generic, in, sizeof(vec_##sign##int##bits) * size); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + memcpy(out, vec.generic, sizeof(vec_##sign##int##bits) * size); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_generic_load_aligned, \ + v##sign##int##bits##x##size##_generic_load_aligned, \ + v##sign##int##bits##x##size##_generic_store_aligned, \ + v##sign##int##bits##x##size##_generic_store_aligned, \ + }; + +#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +VEC_GENERIC_DEFINE_OPERATIONS(8, 2) +VEC_GENERIC_DEFINE_OPERATIONS(16, 2) +VEC_GENERIC_DEFINE_OPERATIONS(32, 2) +VEC_GENERIC_DEFINE_OPERATIONS(64, 2) + +#undef VEC_GENERIC_DEFINE_OPERATIONS +#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN -#define VEC_DEFINE_GENERIC_OPERATIONS_SIGN(sign, csign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_splat(vec_##sign##int##bits x); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load(const vec_##sign##int##bits in[size]); \ - void v##sign##int##bits##x##size##_generic_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_not(v##sign##int##bits##x##size vec); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_generic_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); +// ----------------------------------------------------------------- +// now we can just keep doubling the same implementation -#define VEC_DEFINE_GENERIC_OPERATIONS(bits, size) \ - VEC_DEFINE_GENERIC_OPERATIONS_SIGN( , , bits, size) \ - VEC_DEFINE_GENERIC_OPERATIONS_SIGN(u, U, bits, size) +#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.generic[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \ + vec.generic[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.generic[0] = v##sign##int##bits##x##halfsize##_load(in); \ + vec.generic[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[0], out); \ + v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[1], out + halfsize); \ + } \ + \ + static void v##sign##int##bits##x##size##_generic_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + v##sign##int##bits##x##halfsize##_store(vec.generic[0], out); \ + v##sign##int##bits##x##halfsize##_store(vec.generic[1], out + halfsize); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_generic_load_aligned, \ + v##sign##int##bits##x##size##_generic_load, \ + v##sign##int##bits##x##size##_generic_store_aligned, \ + v##sign##int##bits##x##size##_generic_store, \ + }; -// 16-bit -VEC_DEFINE_GENERIC_OPERATIONS(8, 2) +#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size, halfsize) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize) // 32-bit -VEC_DEFINE_GENERIC_OPERATIONS(8, 4) -VEC_DEFINE_GENERIC_OPERATIONS(16, 2) +VEC_GENERIC_DEFINE_OPERATIONS(8, 4, 2) // 64-bit -VEC_DEFINE_GENERIC_OPERATIONS(8, 8) -VEC_DEFINE_GENERIC_OPERATIONS(16, 4) -VEC_DEFINE_GENERIC_OPERATIONS(32, 2) +VEC_GENERIC_DEFINE_OPERATIONS(8, 8, 4) +VEC_GENERIC_DEFINE_OPERATIONS(16, 4, 2) // 128-bit 
-VEC_DEFINE_GENERIC_OPERATIONS(8, 16) -VEC_DEFINE_GENERIC_OPERATIONS(16, 8) -VEC_DEFINE_GENERIC_OPERATIONS(32, 4) -VEC_DEFINE_GENERIC_OPERATIONS(64, 2) +VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8) +VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4) +VEC_GENERIC_DEFINE_OPERATIONS(32, 4, 2) // 256-bit -VEC_DEFINE_GENERIC_OPERATIONS(8, 32) -VEC_DEFINE_GENERIC_OPERATIONS(16, 16) -VEC_DEFINE_GENERIC_OPERATIONS(32, 8) -VEC_DEFINE_GENERIC_OPERATIONS(64, 4) +VEC_GENERIC_DEFINE_OPERATIONS(8, 32, 16) +VEC_GENERIC_DEFINE_OPERATIONS(16, 16, 8) +VEC_GENERIC_DEFINE_OPERATIONS(32, 8, 4) +VEC_GENERIC_DEFINE_OPERATIONS(64, 4, 2) // 512-bit -VEC_DEFINE_GENERIC_OPERATIONS(8, 64) -VEC_DEFINE_GENERIC_OPERATIONS(16, 32) -VEC_DEFINE_GENERIC_OPERATIONS(32, 16) -VEC_DEFINE_GENERIC_OPERATIONS(64, 8) - -#undef VEC_DEFINE_GENERIC_OPERATIONS -#undef VEC_DEFINE_GENERIC_OPERATIONS_SIGN - -// 16-bit -extern const vint8x2_impl vint8x2_impl_generic; -extern const vuint8x2_impl vuint8x2_impl_generic; - -// 32-bit -extern const vint8x4_impl vint8x4_impl_generic; -extern const vuint8x4_impl vuint8x4_impl_generic; -extern const vint16x2_impl vint16x2_impl_generic; -extern const vuint16x2_impl vuint16x2_impl_generic; - -// 64-bit -extern const vint8x8_impl vint8x8_impl_generic; -extern const vuint8x8_impl vuint8x8_impl_generic; -extern const vint16x4_impl vint16x4_impl_generic; -extern const vuint16x4_impl vuint16x4_impl_generic; -extern const vint32x2_impl vint32x2_impl_generic; -extern const vuint32x2_impl vuint32x2_impl_generic; +VEC_GENERIC_DEFINE_OPERATIONS(8, 64, 32) +VEC_GENERIC_DEFINE_OPERATIONS(16, 32, 16) +VEC_GENERIC_DEFINE_OPERATIONS(32, 16, 8) +VEC_GENERIC_DEFINE_OPERATIONS(64, 8, 4) -// 128-bit -extern const vint8x16_impl vint8x16_impl_generic; -extern const vuint8x16_impl vuint8x16_impl_generic; -extern const vint16x8_impl vint16x8_impl_generic; -extern const vuint16x8_impl vuint16x8_impl_generic; -extern const vint32x4_impl vint32x4_impl_generic; -extern const vuint32x4_impl vuint32x4_impl_generic; -extern const vint64x2_impl vint64x2_impl_generic; -extern const vuint64x2_impl vuint64x2_impl_generic; - -// 256-bit -extern const vint8x32_impl vint8x32_impl_generic; -extern const vuint8x32_impl vuint8x32_impl_generic; -extern const vint16x16_impl vint16x16_impl_generic; -extern const vuint16x16_impl vuint16x16_impl_generic; -extern const vint32x8_impl vint32x8_impl_generic; -extern const vuint32x8_impl vuint32x8_impl_generic; -extern const vint64x4_impl vint64x4_impl_generic; -extern const vuint64x4_impl vuint64x4_impl_generic; - -// 512-bit -extern const vint8x64_impl vint8x64_impl_generic; -extern const vuint8x64_impl vuint8x64_impl_generic; -extern const vint16x32_impl vint16x32_impl_generic; -extern const vuint16x32_impl vuint16x32_impl_generic; -extern const vint32x16_impl vint32x16_impl_generic; -extern const vuint32x16_impl vuint32x16_impl_generic; -extern const vint64x8_impl vint64x8_impl_generic; -extern const vuint64x8_impl vuint64x8_impl_generic; +#undef VEC_GENERIC_DEFINE_OPERATIONS +#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN #endif /* VEC_IMPL_GENERIC_H_ */
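Editor's note on the generic.h hunk above: beyond the base sizes, every wider vector is composed of two halves kept in vec.generic[0] and vec.generic[1], so only the load/store plumbing is defined per size and everything else can fall through to the fallback layer. Roughly what the doubling macro produces for a 4-lane 8-bit vector built from two 2-lane halves:

/* Illustrative expansion of VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , 8, 4, 2) */
static vint8x4 vint8x4_generic_load_aligned(const vec_int8 in[4])
{
    vint8x4 vec;
    vec.generic[0] = vint8x2_load_aligned(in);      /* low half  */
    vec.generic[1] = vint8x2_load_aligned(in + 2);  /* high half */
    return vec;
}

static void vint8x4_generic_store_aligned(vint8x4 vec, vec_int8 out[4])
{
    vint8x2_store_aligned(vec.generic[0], out);
    vint8x2_store_aligned(vec.generic[1], out + 2);
}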
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/integer.h.in Fri Apr 25 17:40:55 2025 -0400 @@ -0,0 +1,58 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_INTEGER_H_ +#define VEC_IMPL_INTEGER_H_ + +#cmakedefine HAVE_SYS_TYPES_H +#cmakedefine HAVE_STDDEF_H +#cmakedefine HAVE_STDINT_H + +#ifdef HAVE_SYS_TYPES_H +# include <sys/types.h> +#endif +#ifdef HAVE_STDDEF_H +# include <stddef.h> +#endif +#ifdef HAVE_STDINT_H +# include <stdint.h> +#endif + +typedef signed char vec_int8; +typedef @SIZE16@ vec_int16; +typedef @SIZE32@ vec_int32; +typedef @SIZE64@ vec_int64; + +typedef unsigned char vec_uint8; +typedef @USIZE16@ vec_uint16; +typedef @USIZE32@ vec_uint32; +typedef @USIZE64@ vec_uint64; + +/* this is only used for bitshifting right now */ +typedef vec_int64 vec_intmax; +typedef vec_uint64 vec_uintmax; + +typedef @USIZEPTR@ vec_uintptr; + +#endif /* VEC_IMPL_INTEGER_H_ */ \ No newline at end of file
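Editor's note on the new integer.h.in: it is a configure-time template -- CMake resolves the #cmakedefine lines and substitutes the @SIZE..@ placeholders with whatever fixed-width types its checks find. On a typical platform where <stdint.h> exists, the generated include/vec/impl/integer.h would plausibly look like the following (an assumption about the configure results, not the actual generated file; include guard omitted):

/* Hypothetical configured output of integer.h.in on a stdint.h platform */
#define HAVE_STDINT_H
#include <stdint.h>

typedef signed char vec_int8;
typedef int16_t     vec_int16;
typedef int32_t     vec_int32;
typedef int64_t     vec_int64;

typedef unsigned char vec_uint8;
typedef uint16_t      vec_uint16;
typedef uint32_t      vec_uint32;
typedef uint64_t      vec_uint64;

/* this is only used for bitshifting right now */
typedef vec_int64  vec_intmax;
typedef vec_uint64 vec_uintmax;

typedef uintptr_t  vec_uintptr;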
--- a/include/vec/impl/ppc/altivec.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/ppc/altivec.h Fri Apr 25 17:40:55 2025 -0400 @@ -27,13 +27,228 @@ #ifndef VEC_IMPL_PPC_ALTIVEC_H_ #define VEC_IMPL_PPC_ALTIVEC_H_ -#include "vec/vec.h" +#include <altivec.h> + +/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */ +#ifdef vec_mul +# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_mul(vec1.altivec, vec2.altivec); \ + return vec; \ + } +# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \ + v##sign##int##bits##x##size##_altivec_mul +#else +# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) +# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) NULL +#endif + +#ifdef vec_splats +# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(vec_##sign##int##bits x) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_splats(x); \ + return vec; \ + } +# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \ + v##sign##int##bits##x##size##_altivec_splat +#else +# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) +# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) NULL +#endif + +#define VEC_ALTIVEC_uRSHIFT vec_sr +#define VEC_ALTIVEC_RSHIFT vec_sra + +#define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_sr(vec1.altivec, vec2.altivec); \ + return vec; \ + } +#define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \ + v##sign##int##bits##x##size##_altivec_lrshift + +#define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size) +#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) NULL + +#define VEC_ALTIVEC_CAST_BOOL_8 (vector signed char) +#define VEC_ALTIVEC_CAST_BOOL_U8 (vector unsigned char) +#define VEC_ALTIVEC_CAST_BOOL_16 (vector signed short) +#define VEC_ALTIVEC_CAST_BOOL_U16 (vector unsigned short) +#define VEC_ALTIVEC_CAST_BOOL_32 (vector signed int) +#define VEC_ALTIVEC_CAST_BOOL_U32 (vector unsigned int) -extern const vint8x16_impl vint8x16_impl_altivec; -extern const vint16x8_impl vint16x8_impl_altivec; -extern const vint32x4_impl vint32x4_impl_altivec; -extern const vuint8x16_impl vuint8x16_impl_altivec; -extern const vuint16x8_impl vuint16x8_impl_altivec; -extern const vuint32x4_impl vuint32x4_impl_altivec; +/* Since altivec conveniently made their API super user friendly, we can just use + * one giant macro to define literally everything */ +#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_ld(0, in); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, 
vec_##sign##int##bits out[size]) \ + { \ + vec_st(vec.altivec, 0, out); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_add(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_sub(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_sl(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_avg(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_and(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_or(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_xor(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmplt(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmplt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpeq(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpge(v##sign##int##bits##x##size 
vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmpgt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpgt(vec1.altivec, vec2.altivec); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \ + VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size), \ + v##sign##int##bits##x##size##_altivec_load_aligned, \ + v##sign##int##bits##x##size##_altivec_load, \ + v##sign##int##bits##x##size##_altivec_store_aligned, \ + /* .store = */ NULL, \ + v##sign##int##bits##x##size##_altivec_add, \ + v##sign##int##bits##x##size##_altivec_sub, \ + VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size), \ + /* .div = */ NULL, \ + v##sign##int##bits##x##size##_altivec_avg, \ + v##sign##int##bits##x##size##_altivec_and, \ + v##sign##int##bits##x##size##_altivec_or, \ + v##sign##int##bits##x##size##_altivec_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_altivec_lshift, \ + v##sign##int##bits##x##size##_altivec_rshift, \ + VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size), \ + v##sign##int##bits##x##size##_altivec_cmplt, \ + v##sign##int##bits##x##size##_altivec_cmple, \ + v##sign##int##bits##x##size##_altivec_cmpeq, \ + v##sign##int##bits##x##size##_altivec_cmpge, \ + v##sign##int##bits##x##size##_altivec_cmpgt, \ + }; + +#define VEC_DEFINE_OPERATIONS(bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +VEC_DEFINE_OPERATIONS(8, 16) +VEC_DEFINE_OPERATIONS(16, 8) +VEC_DEFINE_OPERATIONS(32, 4) +//#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX +//VEC_DEFINE_OPERATIONS(64, 2) +//#endif + +#undef VEC_DEFINE_OPERATIONS +#undef VEC_DEFINE_OPERATIONS_SIGN +#undef VEC_ALTIVEC_DEFINE_MUL +#undef VEC_ALTIVEC_STRUCT_MUL +#undef VEC_ALTIVEC_DEFINE_LRSHIFT +#undef VEC_ALTIVEC_STRUCT_LRSHIFT +#undef VEC_ALTIVEC_DEFINE_uLRSHIFT +#undef VEC_ALTIVEC_STRUCT_uLRSHIFT +#undef VEC_ALTIVEC_DEFINE_SPLAT +#undef VEC_ALTIVEC_STRUCT_SPLAT +#undef VEC_ALTIVEC_uRSHIFT +#undef VEC_ALTIVEC_RSHIFT #endif /* VEC_IMPL_PPC_ALTIVEC_H_ */
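Editor's note on the altivec.h hunk above: the unaligned load is the classic two-loads-plus-permute idiom -- vec_ld masks off the low four address bits, so the two aligned blocks straddling the pointer are loaded and vec_perm stitches the wanted 16 bytes together using the mask from vec_lvsl. A standalone version of just that idiom (PowerPC only, built with -maltivec):

#include <altivec.h>

/* Unaligned 16-byte load from two aligned loads and a permute, mirroring
 * the generated _altivec_load functions above. */
static vector signed char load_unaligned_s8(const signed char *in)
{
    vector unsigned char perm = vec_lvsl(0, in);  /* permute mask from the misalignment */
    vector signed char lo = vec_ld(0, in);        /* aligned block containing in[0] */
    vector signed char hi = vec_ld(16, in);       /* following aligned block */
    return vec_perm(lo, hi, perm);
}

A known caveat of this idiom is that the second vec_ld can read past the end of the 16-byte source buffer, e.g. when the pointer happens to be aligned already.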
--- a/include/vec/impl/x86/avx2.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/x86/avx2.h Fri Apr 25 17:40:55 2025 -0400 @@ -25,15 +25,272 @@ #ifndef VEC_IMPL_X86_AVX2_H_ #define VEC_IMPL_X86_AVX2_H_ -#include "vec/vec.h" +#define VEC_AVX2_OPERATION_8x32_16x16(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m256i dst_even = _mm256_##op##_epi16(vec1.avx2, vec2.avx2); \ + __m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1.avx2, 8), _mm256_srli_epi16(vec2.avx2, 8)); \ + \ + /* repack */ \ + v##sign##int8x32 vec; \ + vec.avx2 = _mm256_or_si256( \ + _mm256_slli_epi16(dst_odd, 8), \ + _mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \ + ); \ + return vec; \ + } while (0) + +#define VEC_AVX2_OPERATION_8x32_32x8(op, sign) \ + do { \ + /* unpack */ \ + __m256i dst_1 = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \ + __m256i dst_2 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 8), _mm256_srli_epi32(vec2.avx2, 8)); \ + __m256i dst_3 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \ + __m256i dst_4 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 24), _mm256_srli_epi32(vec2.avx2, 24)); \ + \ + /* repack */ \ + v##sign##int8x32 vec; \ + vec.avx2 = _mm256_or_si256( \ + _mm256_or_si256( \ + _mm256_slli_epi32(dst_4, 8), \ + _mm256_srli_epi32(_mm256_slli_epi32(dst_3, 8), 8) \ + ), \ + _mm256_or_si256( \ + _mm256_slli_epi32(_mm256_slli_epi32(dst_2, 8), 16), \ + _mm256_srli_epi32(_mm256_slli_epi32(dst_1, 8), 24) \ + ) \ + ); \ + return vec; \ + } while (0) + +#define VEC_AVX2_OPERATION_16x16(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m256i dst_even = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \ + __m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \ + \ + /* repack */ \ + v##sign##int16x16 vec; \ + vec.avx2 = _mm256_or_si256( \ + _mm256_slli_epi32(dst_odd, 16), \ + _mm256_srli_epi32(_mm256_slli_epi16(dst_even, 16), 16) \ + ); \ + return vec; \ + } while (0) + +// shifting + +#define VEC_AVX2_LSHIFT_8x32(sign) \ + VEC_AVX2_OPERATION_8x32_32x8(sllv, sign) + +#define VEC_AVX2_LSHIFT_16x16(sign) \ + VEC_AVX2_OPERATION_16x16(sllv, sign) + +#define VEC_AVX2_LSHIFT_32x8(sign) \ + do { \ + v##sign##int32x8 vec; \ + vec.avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2); \ + return vec; \ + } while (0) + +#define VEC_AVX2_LSHIFT_64x4(sign) \ + do { \ + v##sign##int64x4 vec; \ + vec.avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2); \ + return vec; \ + } while (0) + +#define VEC_AVX2_RSHIFT_8x32(sign, aORl) \ + VEC_AVX2_OPERATION_8x32_32x8(sr##aORl##v, sign) + +#define VEC_AVX2_RSHIFT_16x16(sign, aORl) \ + VEC_AVX2_OPERATION_16x16(sr##aORl##v, sign) + +#define VEC_AVX2_RSHIFT_32x8(sign, aORl) \ + do { \ + v##sign##int32x8 vec; \ + vec.avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2); \ + return vec; \ + } while (0) + +#define VEC_AVX2_aRSHIFT_64x4(sign) \ + do { \ + return v##sign##int64x4_fallback_rshift(vec1, vec2); \ + } while (0) + +#define VEC_AVX2_lRSHIFT_64x4(sign) \ + do { \ + v##sign##int64x4 vec; \ + vec.avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2); \ + return vec; \ + } while (0) + +#define VEC_AVX2_RSHIFT_64x4(sign, aORl) \ + VEC_AVX2_##aORl##RSHIFT_64x4(sign) + +// multiplication + +#define VEC_AVX2_MUL_8x32(sign) \ + VEC_AVX2_OPERATION_8x32_16x16(mullo, sign) + +#define VEC_AVX2_MUL_16x16(sign) \ + do { \ + v##sign##int16x16 vec; \ + vec.avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2); \ + return vec; \ + } while (0) + +#define VEC_AVX2_MUL_32x8(sign) \ + do { \ 
+ v##sign##int32x8 vec; \ + vec.avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2); \ + return vec; \ + } while (0) -extern const vint8x32_impl vint8x32_impl_avx2; -extern const vint16x16_impl vint16x16_impl_avx2; -extern const vint32x8_impl vint32x8_impl_avx2; -extern const vint64x4_impl vint64x4_impl_avx2; -extern const vuint8x32_impl vuint8x32_impl_avx2; -extern const vuint16x16_impl vuint16x16_impl_avx2; -extern const vuint32x8_impl vuint32x8_impl_avx2; -extern const vuint64x4_impl vuint64x4_impl_avx2; +#define VEC_AVX2_MUL_64x4(sign) \ + do { \ + __m256i ac = _mm256_mul_epu32(vec1.avx2, vec2.avx2); \ + __m256i b = _mm256_srli_epi64(vec1.avx2, 32); \ + __m256i bc = _mm256_mul_epu32(b, vec2.avx2); \ + __m256i d = _mm256_srli_epi64(vec2.avx2, 32); \ + __m256i ad = _mm256_mul_epu32(vec1.avx2, d); \ + __m256i hi = _mm256_add_epi64(bc, ad); \ + hi = _mm256_slli_epi64(hi, 32); \ + \ + v##sign##int64x4 vec; \ + vec.avx2 = _mm256_add_epi64(hi, ac); \ + return vec; \ + } while (0) + +// operations + +#define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_load_si256((const __m256i *)in); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_loadu_si256((const __m256i *)in); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm256_store_si256((__m256i *)out, vec.avx2); \ + } \ + \ + static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm256_storeu_si256((__m256i *)out, vec.avx2); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX2_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_avx2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX2_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX2_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX2_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_avx2_load_aligned, \ + v##sign##int##bits##x##size##_avx2_load, \ + v##sign##int##bits##x##size##_avx2_store_aligned, \ + v##sign##int##bits##x##size##_avx2_store, \ + v##sign##int##bits##x##size##_avx2_add, \ + v##sign##int##bits##x##size##_avx2_sub, \ + v##sign##int##bits##x##size##_avx2_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_avx2_and, \ + v##sign##int##bits##x##size##_avx2_or, \ + v##sign##int##bits##x##size##_avx2_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_avx2_lshift, \ + v##sign##int##bits##x##size##_avx2_rshift, \ + v##sign##int##bits##x##size##_avx2_lrshift, \ + }; + +#define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \ + VEC_AVX2_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_AVX2_DEFINE_OPERATIONS_SIGN(u, bits, size) + +VEC_AVX2_DEFINE_OPERATIONS(8, 32) +VEC_AVX2_DEFINE_OPERATIONS(16, 16) +VEC_AVX2_DEFINE_OPERATIONS(32, 8) +VEC_AVX2_DEFINE_OPERATIONS(64, 4) + +#undef VEC_AVX2_DEFINE_OPERATIONS +#undef VEC_AVX2_DEFINE_OPERATIONS_SIGN +#undef VEC_AVX2_MUL_8x32 +#undef VEC_AVX2_MUL_16x16 +#undef VEC_AVX2_MUL_32x8 +#undef VEC_AVX2_MUL_64x4 +#undef VEC_AVX2_OPERATION_8x32_16x16 +#undef VEC_AVX2_OPERATION_8x32_32x8 +#undef VEC_AVX2_OPERATION_16x16 +#undef VEC_AVX2_LSHIFT_8x32 +#undef VEC_AVX2_LSHIFT_16x16 +#undef VEC_AVX2_LSHIFT_32x8 +#undef VEC_AVX2_LSHIFT_64x4 +#undef VEC_AVX2_RSHIFT_8x32 +#undef VEC_AVX2_RSHIFT_16x16 +#undef VEC_AVX2_RSHIFT_32x8 +#undef VEC_AVX2_aRSHIFT_64x4 +#undef VEC_AVX2_lRSHIFT_64x4 +#undef VEC_AVX2_RSHIFT_64x4 #endif /* VEC_IMPL_X86_AVX2_H_ */
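Editor's note on the avx2.h hunk above: without AVX-512DQ there is no packed 64-bit multiply, so VEC_AVX2_MUL_64x4 assembles the low 64 bits of each product from _mm256_mul_epu32 partial products, relying on lo64(x*y) = a*c + ((a*d + b*c) << 32), where a,b and c,d are the low/high 32-bit halves of x and y. A scalar check of that identity (portable C, runnable anywhere):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the partial-product 64-bit multiply used above. */
static uint64_t mul64_from_32(uint64_t x, uint64_t y)
{
    uint64_t a = (uint32_t)x, b = x >> 32;  /* low/high halves of x */
    uint64_t c = (uint32_t)y, d = y >> 32;  /* low/high halves of y */
    uint64_t ac = a * c;                    /* _mm256_mul_epu32(vec1, vec2)       */
    uint64_t bc = b * c;                    /* _mm256_mul_epu32(vec1 >> 32, vec2) */
    uint64_t ad = a * d;                    /* _mm256_mul_epu32(vec1, vec2 >> 32) */
    return ac + ((bc + ad) << 32);          /* b*d only contributes to bits >= 64 */
}

int main(void)
{
    uint64_t x = 0x123456789abcdef0u, y = 0xfedcba9876543210u;
    printf("%d\n", mul64_from_32(x, y) == x * y);  /* prints 1 */
    return 0;
}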
--- a/include/vec/impl/x86/avx512f.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/x86/avx512f.h Fri Apr 25 17:40:55 2025 -0400 @@ -25,15 +25,272 @@ #ifndef VEC_IMPL_X86_AVX512F_H_ #define VEC_IMPL_X86_AVX512F_H_ -#include "vec/vec.h" +#define VEC_AVX512F_OPERATION_8x64(op, sign) \ + do { \ + /* unpack and add */ \ + __m512i dst_1 = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \ + __m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 8), _mm512_srli_epi32(vec2.avx512f, 8)); \ + __m512i dst_3 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \ + __m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 24), _mm512_srli_epi32(vec2.avx512f, 24)); \ + \ + /* repack */ \ + v##sign##int8x64 vec; \ + vec.avx512f = _mm512_or_si512( \ + _mm512_or_si512( \ + _mm512_slli_epi32(dst_4, 8), \ + _mm512_srli_epi32(_mm512_slli_epi32(dst_3, 8), 8) \ + ), \ + _mm512_or_si512( \ + _mm512_slli_epi32(_mm512_slli_epi32(dst_2, 8), 16), \ + _mm512_srli_epi32(_mm512_slli_epi32(dst_1, 8), 24) \ + ) \ + ); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_OPERATION_16x32(op, sign) \ + do { \ + /* unpack and add */ \ + __m512i dst_even = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \ + __m512i dst_odd = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \ + \ + /* repack */ \ + v##sign##int16x32 vec; \ + vec.avx512f = _mm512_or_si512( \ + _mm512_slli_epi32(dst_odd, 16), \ + _mm512_srli_epi32(_mm512_slli_epi32(dst_even, 16), 16) \ + ); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_ADD_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(add, sign) + +#define VEC_AVX512F_ADD_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(add, sign) + +#define VEC_AVX512F_ADD_32x16(sign) \ + do { \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_ADD_64x8(sign) \ + do { \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_SUB_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(sub, sign) + +#define VEC_AVX512F_SUB_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(sub, sign) + +#define VEC_AVX512F_SUB_32x16(sign) \ + do { \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_SUB_64x8(sign) \ + do { \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_MUL_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(mullo, sign) + +#define VEC_AVX512F_MUL_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(mullo, sign) + +#define VEC_AVX512F_MUL_32x16(sign) \ + do { \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_MUL_64x8(sign) \ + do { \ + __m512i ac = _mm512_mul_epu32(vec1.avx512f, vec2.avx512f); \ + __m512i b = _mm512_srli_epi64(vec1.avx512f, 32); \ + __m512i bc = _mm512_mul_epu32(b, vec2.avx512f); \ + __m512i d = _mm512_srli_epi64(vec2.avx512f, 32); \ + __m512i ad = _mm512_mul_epu32(vec1.avx512f, d); \ + __m512i hi = _mm512_add_epi64(bc, ad); \ + hi = _mm512_slli_epi64(hi, 32); \ + \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_add_epi64(hi, ac); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_LSHIFT_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(sllv, sign) + +#define 
VEC_AVX512F_LSHIFT_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(sllv, sign) + +#define VEC_AVX512F_LSHIFT_32x16(sign) \ + do { \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_LSHIFT_64x8(sign) \ + do { \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) \ + VEC_AVX512F_OPERATION_8x64(sr##aORl##v, sign) + +#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) \ + VEC_AVX512F_OPERATION_16x32(sr##aORl##v, sign) -extern const vint8x64_impl vint8x64_impl_avx512f; -extern const vint16x32_impl vint16x32_impl_avx512f; -extern const vint32x16_impl vint32x16_impl_avx512f; -extern const vint64x8_impl vint64x8_impl_avx512f; -extern const vuint8x64_impl vuint8x64_impl_avx512f; -extern const vuint16x32_impl vuint16x32_impl_avx512f; -extern const vuint32x16_impl vuint32x16_impl_avx512f; -extern const vuint64x8_impl vuint64x8_impl_avx512f; +#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \ + do { \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \ + do { \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } while (0) + +#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_load_si512((const __m512i *)in); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_loadu_si512((const __m512i *)in); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm512_store_si512((__m512i *)out, vec.avx512f); \ + } \ + \ + static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm512_storeu_si512((__m512i *)out, vec.avx512f); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX512F_ADD_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX512F_SUB_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX512F_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f); \ + return 
vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX512F_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX512F_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_avx512f_load_aligned, \ + v##sign##int##bits##x##size##_avx512f_load, \ + v##sign##int##bits##x##size##_avx512f_store_aligned, \ + v##sign##int##bits##x##size##_avx512f_store, \ + v##sign##int##bits##x##size##_avx512f_add, \ + v##sign##int##bits##x##size##_avx512f_sub, \ + v##sign##int##bits##x##size##_avx512f_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_avx512f_and, \ + v##sign##int##bits##x##size##_avx512f_or, \ + v##sign##int##bits##x##size##_avx512f_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_avx512f_lshift, \ + v##sign##int##bits##x##size##_avx512f_rshift, \ + v##sign##int##bits##x##size##_avx512f_lrshift, \ + }; + +#define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \ + VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size) + +VEC_AVX512F_DEFINE_OPERATIONS(8, 64) +VEC_AVX512F_DEFINE_OPERATIONS(16, 32) +VEC_AVX512F_DEFINE_OPERATIONS(32, 16) +VEC_AVX512F_DEFINE_OPERATIONS(64, 8) + +#undef VEC_AVX512F_DEFINE_OPERATIONS +#undef VEC_AVX512F_DEFINE_OPERATIONS_SIGN +#undef VEC_AVX512F_MUL_8x64 +#undef VEC_AVX512F_MUL_16x32 +#undef VEC_AVX512F_MUL_32x16 +#undef VEC_AVX512F_MUL_64x8 + +#undef VEC_AVX512F_LSHIFT_8x64 +#undef VEC_AVX512F_LSHIFT_16x32 +#undef VEC_AVX512F_LSHIFT_32x16 +#undef VEC_AVX512F_LSHIFT_64x8 + +#undef VEC_AVX512F_RSHIFT_8x64 +#undef VEC_AVX512F_RSHIFT_16x32 +#undef VEC_AVX512F_RSHIFT_32x16 +#undef VEC_AVX512F_RSHIFT_64x8 #endif /* VEC_IMPL_X86_AVX512F_H_ */
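Editor's note on the avx512f.h hunk above: AVX-512F itself only has 32- and 64-bit packed arithmetic (byte/word operations need AVX-512BW), which is why VEC_AVX512F_OPERATION_8x64 and _16x32 compute on shifted 32-bit lanes and then mask and shift the partial results back into place. The same even/odd repacking in scalar form, for two 16-bit lanes packed in one 32-bit word:

#include <stdint.h>

/* Scalar model of VEC_AVX512F_OPERATION_16x32(add, ...): two 16-bit adds
 * carried out with 32-bit arithmetic only, then repacked. */
static uint32_t add16x2(uint32_t v1, uint32_t v2)
{
    uint32_t even = v1 + v2;                    /* low lane ok; high lane may absorb a carry */
    uint32_t odd  = (v1 >> 16) + (v2 >> 16);    /* high lane computed separately */
    return (odd << 16) | ((even << 16) >> 16);  /* keep only the low 16 bits of each result */
}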
--- a/include/vec/impl/x86/mmx.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/x86/mmx.h Fri Apr 25 17:40:55 2025 -0400 @@ -25,13 +25,190 @@ #ifndef VEC_IMPL_X86_MMX_H_ #define VEC_IMPL_X86_MMX_H_ -#include "vec/vec.h" +#define VEC_MMX_OPERATION_8x8(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m64 dst_even = _mm_##op##_pi16(vec1.mmx, vec2.mmx); \ + __m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1.mmx, 8), _mm_srli_pi16(vec2.mmx, 8)); \ + \ + /* repack */ \ + v##sign##int8x8 vec; \ + vec.mmx = _mm_or_si64( \ + _mm_slli_pi16(dst_odd, 8), \ + _mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \ + ); \ + return vec; \ + } while (0) + +// shifting +#define VEC_MMX_LSHIFT_8x8(sign) \ + VEC_MMX_OPERATION_8x8(sll, sign) + +#define VEC_MMX_LSHIFT_16x4(sign) \ + do { \ + v##sign##int16x4 vec; \ + vec.mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx); \ + return vec; \ + } while (0) + +#define VEC_MMX_LSHIFT_32x2(sign) \ + do { \ + v##sign##int32x2 vec; \ + vec.mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx); \ + return vec; \ + } while (0) + +#define VEC_MMX_RSHIFT_8x8(sign, aORl) \ + VEC_MMX_OPERATION_8x8(sr##aORl, sign) + +#define VEC_MMX_RSHIFT_16x4(sign, aORl) \ + do { \ + v##sign##int16x4 vec; \ + vec.mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx); \ + return vec; \ + } while (0) + +#define VEC_MMX_RSHIFT_32x2(sign, aORl) \ + do { \ + v##sign##int32x2 vec; \ + vec.mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx); \ + return vec; \ + } while (0) + +// shared between MMX variations +#define VEC_MMX_MUL_8x8(sign) \ + VEC_MMX_OPERATION_8x8(mullo, sign) + +#define VEC_MMX_MUL_16x4(sign) \ + do { \ + /* we have a real instruction for this */ \ + v##sign##int16x4 vec; \ + vec.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \ + return vec; \ + } while (0) + +#define VEC_MMX_MUL_32x2(sign) \ + do { \ + __m64 ac = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \ + __m64 b = _mm_srli_pi32(vec1.mmx, 16); \ + __m64 bc = _mm_mullo_pi16(b, vec2.mmx); \ + __m64 d = _mm_srli_pi32(vec2.mmx, 16); \ + __m64 ad = _mm_mullo_pi16(vec1.mmx, d); \ + __m64 hi = _mm_add_pi32(bc, ad); \ + hi = _mm_slli_pi32(hi, 16); \ + \ + v##sign##int32x2 vec; \ + vec.mmx = _mm_add_pi32(hi, ac); \ + return vec; \ + } while (0) -extern const vint8x8_impl vint8x8_impl_mmx; -extern const vint16x4_impl vint16x4_impl_mmx; -extern const vint32x2_impl vint32x2_impl_mmx; -extern const vuint8x8_impl vuint8x8_impl_mmx; -extern const vuint16x4_impl vuint16x4_impl_mmx; -extern const vuint32x2_impl vuint32x2_impl_mmx; +#define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + memcpy(&vec.mmx, in, sizeof(vec.mmx)); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + memcpy(out, &vec.mmx, sizeof(vec.mmx)); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_MMX_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_and_si64(vec1.mmx, vec2.mmx); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_or_si64(vec1.mmx, vec2.mmx); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_xor_si64(vec1.mmx, vec2.mmx); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_MMX_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_MMX_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_MMX_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_mmx_load_aligned, \ + v##sign##int##bits##x##size##_mmx_load_aligned, \ + v##sign##int##bits##x##size##_mmx_store_aligned, \ + v##sign##int##bits##x##size##_mmx_store_aligned, \ + v##sign##int##bits##x##size##_mmx_add, \ + v##sign##int##bits##x##size##_mmx_sub, \ + v##sign##int##bits##x##size##_mmx_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_mmx_and, \ + v##sign##int##bits##x##size##_mmx_or, \ + v##sign##int##bits##x##size##_mmx_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_mmx_lshift, \ + v##sign##int##bits##x##size##_mmx_rshift, \ + v##sign##int##bits##x##size##_mmx_lrshift, \ + }; + +#define VEC_MMX_DEFINE_OPERATIONS(bits, size) \ + VEC_MMX_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_MMX_DEFINE_OPERATIONS_SIGN(u, bits, size) + +VEC_MMX_DEFINE_OPERATIONS(8, 8) +VEC_MMX_DEFINE_OPERATIONS(16, 4) +VEC_MMX_DEFINE_OPERATIONS(32, 2) + +#undef VEC_MMX_DEFINE_OPERATIONS +#undef VEC_MMX_DEFINE_OPERATIONS_SIGN +#undef VEC_MMX_MUL_8x8 +#undef VEC_MMX_MUL_16x4 +#undef VEC_MMX_MUL_32x2 +#undef VEC_MMX_OPERATION_8x8 +#undef VEC_MMX_LSHIFT_8x8 +#undef VEC_MMX_LSHIFT_16x4 +#undef VEC_MMX_LSHIFT_32x2 +#undef VEC_MMX_RSHIFT_8x8 +#undef VEC_MMX_RSHIFT_16x4 +#undef VEC_MMX_RSHIFT_32x2 #endif /* VEC_IMPL_X86_MMX_H_ */
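
VEC_MMX_OPERATION_8x8 above (like its SSE2 twin further down) builds 8-bit lanes out of 16-bit instructions: the even bytes are operated on in place, the odd bytes are shifted down first, and the two partial results are masked back to single bytes and OR'd together. The same repacking on one 16-bit lane in scalar C, as a rough sketch assuming wrap-around multiplication (not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* one 16-bit lane holding two packed bytes, multiplied pairwise the way the
     * macro does it: even byte in place, odd bytes shifted down, then repack */
    static uint16_t mul_bytes_packed(uint16_t a, uint16_t b)
    {
        uint16_t even = (uint16_t)(a * b);               /* low byte is (a0*b0) & 0xFF */
        uint16_t odd  = (uint16_t)((a >> 8) * (b >> 8)); /* low byte is (a1*b1) & 0xFF */

        /* keep only the low byte of each partial product and recombine */
        return (uint16_t)((odd << 8) | ((uint16_t)(even << 8) >> 8));
    }

    int main(void)
    {
        uint16_t a = (200 << 8) | 37; /* bytes: a1=200, a0=37 */
        uint16_t b = (3 << 8) | 10;   /* bytes: b1=3,   b0=10 */
        uint16_t r = mul_bytes_packed(a, b);

        printf("low:  %d (expect %d)\n", r & 0xFF, (37 * 10) & 0xFF);
        printf("high: %d (expect %d)\n", r >> 8, (200 * 3) & 0xFF);
        return 0;
    }
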
--- a/include/vec/impl/x86/sse2.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/x86/sse2.h Fri Apr 25 17:40:55 2025 -0400 @@ -25,42 +25,290 @@ #ifndef VEC_IMPL_X86_SSE2_H_ #define VEC_IMPL_X86_SSE2_H_ -#include "vec/vec.h" +#define VEC_SSE2_OPERATION_8x16(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m128i dst_even = _mm_##op##_epi16(vec1.sse, vec2.sse); \ + __m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \ + \ + /* repack */ \ + v##sign##int8x16 vec; \ + vec.sse = _mm_or_si128( \ + _mm_slli_epi16(dst_odd, 8), \ + _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ + ); \ + return vec; \ + } while (0) + +// shifting +#define VEC_SSE2_LSHIFT_8x16(sign) \ + VEC_SSE2_OPERATION_8x16(sll, sign) + +#define VEC_SSE2_LSHIFT_16x8(sign) \ + do { \ + v##sign##int16x8 vec; \ + vec.sse = _mm_sll_epi16(vec1.sse, vec2.sse); \ + return vec; \ + } while (0) + +#define VEC_SSE2_LSHIFT_32x4(sign) \ + do { \ + v##sign##int32x4 vec; \ + vec.sse = _mm_sll_epi32(vec1.sse, vec2.sse); \ + return vec; \ + } while (0) + +#define VEC_SSE2_LSHIFT_64x2(sign) \ + do { \ + v##sign##int64x2 vec; \ + vec.sse = _mm_sll_epi64(vec1.sse, vec2.sse); \ + return vec; \ + } while (0) + +#define VEC_SSE2_RSHIFT_8x16(sign, aORl) \ + VEC_SSE2_OPERATION_8x16(sr##aORl, sign) + +#define VEC_SSE2_RSHIFT_16x8(sign, aORl) \ + do { \ + v##sign##int16x8 vec; \ + vec.sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse); \ + return vec; \ + } while (0) + +#define VEC_SSE2_RSHIFT_32x4(sign, aORl) \ + do { \ + v##sign##int32x4 vec; \ + vec.sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse); \ + return vec; \ + } while (0) + +#define VEC_SSE2_aRSHIFT_64x2(sign) \ + do { \ + return v##sign##int64x2_fallback_rshift(vec1, vec2); \ + } while (0) + +#define VEC_SSE2_lRSHIFT_64x2(sign) \ + do { \ + v##sign##int64x2 vec; \ + vec.sse = _mm_srl_epi64(vec1.sse, vec2.sse); \ + return vec; \ + } while (0) -// These are only extern because the SSE 4.1 translation unit needs to access it. 
-#define VEC_DEFINE_SSE2_OPERATIONS_SIGN(sign, csign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_splat(vec_##sign##int##bits x); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]); \ - void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); +#define VEC_SSE2_RSHIFT_64x2(sign, aORl) \ + VEC_SSE2_##aORl##RSHIFT_64x2(sign) + +// shared between SSE2 variations +#define VEC_SSE2_MUL_8x16(sign) \ + VEC_SSE2_OPERATION_8x16(mullo, sign) + +#define VEC_SSE2_MUL_16x8(sign) \ + do { \ + /* we have a real instruction for this */ \ + vec1.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \ + return vec1; \ + } while (0) + +#define VEC_SSE2_MUL_32x4(sign) \ + do { \ + /* this was stolen from... 
somewhere :) */ \ + __m128i a13 = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \ + __m128i b13 = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \ + __m128i prod02 = _mm_mul_epu32(vec1.sse, vec2.sse); /* (-,a2*b2,-,a0*b0) */ \ + __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ + __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ + __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ + \ + vec1.sse = _mm_unpacklo_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \ + return vec1; \ + } while (0) + +#define VEC_SSE2_MUL_64x2(sign) \ + do { \ + __m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \ + __m128i b = _mm_srli_epi64(vec1.sse, 32); /* b = vec1 >> 32; */ \ + __m128i bc = _mm_mul_epu32(b, vec2.sse); /* bc = b * (vec2 & UINT32_MAX); */ \ + __m128i d = _mm_srli_epi64(vec2.sse, 32); /* d = vec2 >> 32; */ \ + __m128i ad = _mm_mul_epu32(vec1.sse, d); /* ad = (vec1 & UINT32_MAX) * d; */ \ + __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \ + hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \ + \ + vec1.sse = _mm_add_epi64(hi, ac); /* (ab1,ab0) */ \ + return vec1; \ + } while (0) + +#define VEC_SSE2_CMPEQ_8x16(sign) \ + do { \ + vec1.sse = _mm_cmpeq_epi8(vec1.sse, vec2.sse); \ + return vec1; \ + } while (0) + +#define VEC_SSE2_CMPEQ_16x8(sign) \ + do { \ + vec1.sse = _mm_cmpeq_epi16(vec1.sse, vec2.sse); \ + return vec1; \ + } while (0) + +#define VEC_SSE2_CMPEQ_32x4(sign) \ + do { \ + vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \ + return vec1; \ + } while (0) + +// SSE2 doesn't have an intrinsic for 64x2 equality comparison, +// so how can we take a 32x4 comparison result and turn it into +// a 64x2 comparison result? +// +// well, Intel conveniently provided an operation where we can +// shuffle around 32-bit integers (_mm_shuffle_epi32). +// +// this means all we have to do is simply do the 32-bit operation, +// shuffle the parts, and then return a bitwise AND of the result. 
-#define VEC_DEFINE_SSE2_OPERATIONS(bits, size) \ - VEC_DEFINE_SSE2_OPERATIONS_SIGN( , , bits, size) \ - VEC_DEFINE_SSE2_OPERATIONS_SIGN(u, U, bits, size) - -VEC_DEFINE_SSE2_OPERATIONS(8, 16) -VEC_DEFINE_SSE2_OPERATIONS(16, 8) -VEC_DEFINE_SSE2_OPERATIONS(32, 4) -VEC_DEFINE_SSE2_OPERATIONS(64, 2) +#define VEC_SSE2_CMPEQ_64x2(sign) \ + do { \ + vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \ + vec2.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(1, 1, 3, 3)); \ + vec1.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(0, 0, 2, 2)); \ + vec1.sse = _mm_and_si128(vec1.sse, vec2.sse); \ + return vec1; \ + } while (0) -#undef VEC_DEFINE_SSE2_OPERATIONS -#undef VEC_DEFINE_SSE2_OPERATIONS_SIGN +#define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_load_si128((const __m128i *)in); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_loadu_si128((const __m128i *)in); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm_store_si128((__m128i *)out, vec.sse); \ + } \ + \ + static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm_storeu_si128((__m128i *)out, vec.sse); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_add_epi##bits(vec1.sse, vec2.sse); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_sub_epi##bits(vec1.sse, vec2.sse); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_SSE2_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_and_si128(vec1.sse, vec2.sse); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_or_si128(vec1.sse, vec2.sse); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_xor_si128(vec1.sse, vec2.sse); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_SSE2_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_SSE2_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static 
v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_SSE2_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_SSE2_CMPEQ_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_sse2_load_aligned, \ + v##sign##int##bits##x##size##_sse2_load, \ + v##sign##int##bits##x##size##_sse2_store_aligned, \ + v##sign##int##bits##x##size##_sse2_store, \ + v##sign##int##bits##x##size##_sse2_add, \ + v##sign##int##bits##x##size##_sse2_sub, \ + v##sign##int##bits##x##size##_sse2_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_sse2_and, \ + v##sign##int##bits##x##size##_sse2_or, \ + v##sign##int##bits##x##size##_sse2_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_sse2_lshift, \ + v##sign##int##bits##x##size##_sse2_rshift, \ + v##sign##int##bits##x##size##_sse2_lrshift, \ + /* .cmplt = */ NULL, \ + /* .cmple = */ NULL, \ + v##sign##int##bits##x##size##_sse2_cmpeq, \ + /* .cmpge = */ NULL, \ + /* .cmpgt = */ NULL, \ + }; -extern const vint8x16_impl vint8x16_impl_sse2; -extern const vint16x8_impl vint16x8_impl_sse2; -extern const vint32x4_impl vint32x4_impl_sse2; -extern const vint64x2_impl vint64x2_impl_sse2; -extern const vuint8x16_impl vuint8x16_impl_sse2; -extern const vuint16x8_impl vuint16x8_impl_sse2; -extern const vuint32x4_impl vuint32x4_impl_sse2; -extern const vuint64x2_impl vuint64x2_impl_sse2; +#define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \ + VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size) + +// SSE is *only* 128-bit +VEC_SSE2_DEFINE_OPERATIONS(8, 16) +VEC_SSE2_DEFINE_OPERATIONS(16, 8) +VEC_SSE2_DEFINE_OPERATIONS(32, 4) +VEC_SSE2_DEFINE_OPERATIONS(64, 2) + +#undef VEC_SSE2_DEFINE_OPERATIONS +#undef VEC_SSE2_DEFINE_OPERATIONS_SIGN +#undef VEC_SSE2_MUL_8x16 +#undef VEC_SSE2_MUL_16x8 +#undef VEC_SSE2_MUL_32x4 +#undef VEC_SSE2_MUL_64x2 +#undef VEC_SSE2_OPERATION_8x16 +#undef VEC_SSE2_LSHIFT_8x16 +#undef VEC_SSE2_LSHIFT_16x8 +#undef VEC_SSE2_LSHIFT_32x4 +#undef VEC_SSE2_LSHIFT_64x2 +#undef VEC_SSE2_RSHIFT_8x16 +#undef VEC_SSE2_RSHIFT_16x8 +#undef VEC_SSE2_RSHIFT_32x4 +#undef VEC_SSE2_aRSHIFT_64x2 +#undef VEC_SSE2_lRSHIFT_64x2 +#undef VEC_SSE2_RSHIFT_64x2 #endif /* VEC_IMPL_X86_SSE2_H_ */
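
VEC_SSE2_MUL_64x2 composes a 64x64 -> 64 multiply out of _mm_mul_epu32 (32x32 -> 64) using the schoolbook split; the product of the two high halves lands entirely above bit 63 and can be dropped. The identity it relies on, checked on plain scalars (a standalone sketch, not from the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* (a * b) mod 2^64 from three 32x32 partial products, mirroring the macro:
     * a = ah*2^32 + al, b = bh*2^32 + bl
     * a*b = al*bl + ((ah*bl + al*bh) << 32)  (mod 2^64; the ah*bh term vanishes) */
    static uint64_t mul64_from_mul32(uint64_t a, uint64_t b)
    {
        uint64_t al = a & 0xFFFFFFFFu, ah = a >> 32;
        uint64_t bl = b & 0xFFFFFFFFu, bh = b >> 32;

        uint64_t ac = al * bl;         /* _mm_mul_epu32(vec1, vec2) */
        uint64_t bc = ah * bl;         /* _mm_mul_epu32(vec1 >> 32, vec2) */
        uint64_t ad = al * bh;         /* _mm_mul_epu32(vec1, vec2 >> 32) */
        uint64_t hi = (bc + ad) << 32; /* high partials shifted into place */

        return hi + ac;
    }

    int main(void)
    {
        uint64_t a = 0x123456789ABCDEF0u, b = 0xFEDCBA9876543210u;
        printf("%d\n", mul64_from_mul32(a, b) == a * b); /* 1 */
        return 0;
    }
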
--- a/include/vec/impl/x86/sse41.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/impl/x86/sse41.h Fri Apr 25 17:40:55 2025 -0400 @@ -25,9 +25,43 @@ #ifndef VEC_IMPL_X86_SSE41_H_ #define VEC_IMPL_X86_SSE41_H_ -#include "vec/vec.h" +// SSE 4.1 provides a real _mm_mullo_epi32 +#define VEC_SSE41_DEFINE_OPERATIONS(sign) \ + static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \ + { \ + v##sign##int32x4 vec; \ + vec.sse = _mm_mullo_epi32(vec1.sse, vec2.sse); \ + return vec; \ + } \ + \ + static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \ + /* .splat = */ NULL, \ + v##sign##int32x4_sse2_load_aligned, \ + v##sign##int32x4_sse2_load, \ + v##sign##int32x4_sse2_store_aligned, \ + v##sign##int32x4_sse2_store, \ + v##sign##int32x4_sse2_add, \ + v##sign##int32x4_sse2_sub, \ + v##sign##int32x4_sse41_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int32x4_sse2_and, \ + v##sign##int32x4_sse2_or, \ + v##sign##int32x4_sse2_xor, \ + /* .not = */ NULL, \ + v##sign##int32x4_sse2_lshift, \ + v##sign##int32x4_sse2_rshift, \ + v##sign##int32x4_sse2_lrshift, \ + /* .cmplt = */ NULL, \ + /* .cmple = */ NULL, \ + v##sign##int32x4_sse2_cmpeq, \ + /* .cmpge = */ NULL, \ + /* .cmpgt = */ NULL, \ + }; -extern const vint32x4_impl vint32x4_impl_sse41; -extern const vuint32x4_impl vuint32x4_impl_sse41; +VEC_SSE41_DEFINE_OPERATIONS() +VEC_SSE41_DEFINE_OPERATIONS(u) + +#undef VEC_SSE41_DEFINE_OPERATIONS #endif /* VEC_IMPL_X86_SSE41_H_ */
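
The SSE 4.1 table above reuses every SSE2 entry point and overrides only the 32-bit multiply, which finally has a real intrinsic there. A toy model of that table-override pattern; the names below are invented for illustration and are not vec's API:

    #include <stdio.h>

    /* invented miniature of the per-type implementation tables: a newer ISA's
     * table reuses the older entries and overrides only what it does better */
    typedef struct {
        int (*add)(int, int);
        int (*mul)(int, int);
    } toy_impl;

    static int toy_add(int a, int b) { return a + b; }
    static int toy_mul_emulated(int a, int b) { return a * b; } /* stands in for the SSE2 workaround */
    static int toy_mul_native(int a, int b)   { return a * b; } /* stands in for _mm_mullo_epi32 */

    static const toy_impl toy_impl_sse2  = { toy_add, toy_mul_emulated };
    static const toy_impl toy_impl_sse41 = { toy_add, toy_mul_native }; /* add reused, mul swapped */

    int main(void)
    {
        const toy_impl *best = &toy_impl_sse41; /* presumably chosen at runtime, cf. vec_init() */
        printf("%d\n", best->mul(6, 7));
        return 0;
    }
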
--- a/include/vec/types.h.in Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ -/** - * vec - a tiny SIMD vector library in plain C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#ifndef VEC_TYPES_H_ -#define VEC_TYPES_H_ - -#cmakedefine HAVE_SYS_TYPES_H -#cmakedefine HAVE_STDDEF_H -#cmakedefine HAVE_STDINT_H - -#ifdef HAVE_SYS_TYPES_H -# include <sys/types.h> -#endif -#ifdef HAVE_STDDEF_H -# include <stddef.h> -#endif -#ifdef HAVE_STDINT_H -# include <stdint.h> -#endif - -typedef signed char vec_int8; -typedef @SIZE16@ vec_int16; -typedef @SIZE32@ vec_int32; -typedef @SIZE64@ vec_int64; - -typedef unsigned char vec_uint8; -typedef @USIZE16@ vec_uint16; -typedef @USIZE32@ vec_uint32; -typedef @USIZE64@ vec_uint64; - -/* this is only used for bitshifting right now */ -typedef vec_int64 vec_intmax; -typedef vec_uint64 vec_uintmax; - -typedef @USIZESIZE@ vec_uintsize; -typedef @USIZEPTR@ vec_uintptr; - -// okay, now we have to do this crap. -#ifdef HAVE_STDINT_H -# define VEC_INT8_C(x) INT8_C(x) -# define VEC_UINT8_C(x) UINT8_C(x) -# define VEC_INT16_C(x) INT16_C(x) -# define VEC_UINT16_C(x) UINT16_C(x) -# define VEC_INT32_C(x) INT32_C(x) -# define VEC_UINT32_C(x) UINT32_C(x) -# define VEC_INT64_C(x) INT64_C(x) -# define VEC_UINT64_C(x) UINT64_C(x) -# define VEC_INTMAX_C(x) INTMAX_C(x) -# define VEC_UINTMAX_C(x) UINTMAX_C(x) - -# define VEC_INT8_MAX INT8_MAX -# define VEC_INT8_MIN INT8_MIN -# define VEC_UINT8_MAX UINT8_MAX -# define VEC_INT16_MAX INT16_MAX -# define VEC_INT16_MIN INT16_MIN -# define VEC_UINT16_MAX UINT16_MAX -# define VEC_INT32_MAX INT32_MAX -# define VEC_INT32_MIN INT32_MIN -# define VEC_UINT32_MAX UINT32_MAX -# define VEC_INT64_MAX INT64_MAX -# define VEC_INT64_MIN INT64_MIN -# define VEC_UINT64_MAX UINT64_MAX -#else -// These are based on the minimum sizes for each integer type. -// -// i.e. long is guaranteed to be at least 32 bits, long long is -// guaranteed to be at least 64 bits, etc. 
-# define VEC_INT8_C(x) x -# define VEC_UINT8_C(x) x##U -# define VEC_INT16_C(x) x -# define VEC_UINT16_C(x) x##U -# define VEC_INT32_C(x) x##L -# define VEC_UINT32_C(x) x##UL -# define VEC_INT64_C(x) x##LL -# define VEC_UINT64_C(x) x##ULL -# define VEC_INTMAX_C(x) VEC_INT64_C(x) -# define VEC_UINTMAX_C(x) VEC_UINT64_C(x) - -# define VEC_INT8_MAX 0x7F -# define VEC_INT8_MIN (-0x7F - 1) -# define VEC_UINT8_MAX 0xFFU -# define VEC_INT16_MAX 0x7FFF -# define VEC_INT16_MIN (-0x7FFF - 1) -# define VEC_UINT16_MAX 0xFFFFU -# define VEC_INT32_MAX 0x7FFFFFFFL -# define VEC_INT32_MIN (-0x7FFFFFFFL - 1L) -# define VEC_UINT32_MAX 0xFFFFFFFFUL -# define VEC_INT64_MAX 0x7FFFFFFFFFFFFFFFLL -# define VEC_INT64_MIN (-0x7FFFFFFFFFFFFFFFLL - 1LL) -# define VEC_UINT64_MAX 0xFFFFFFFFFFFFFFFFULL -#endif - -#endif /* VEC_TYPES_H_ */
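
One detail worth noting in the deleted fallback above: the signed minima are spelled (-MAX - 1) rather than as a single negative literal. A short illustration of why (MY_INT64_MIN is a hypothetical name, not vec's):

    #include <stdio.h>
    #include <stdint.h>

    /* the constant is parsed before the unary minus, and 9223372036854775808
     * does not fit in any signed type, so "minimum = -maximum - 1" is the
     * portable spelling */
    #define MY_INT64_MIN (-0x7FFFFFFFFFFFFFFFLL - 1LL)

    int main(void)
    {
        printf("%d\n", MY_INT64_MIN == INT64_MIN); /* 1 wherever stdint.h exists */
        return 0;
    }
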
--- a/include/vec/vec.h Fri Apr 25 17:40:51 2025 -0400 +++ b/include/vec/vec.h Fri Apr 25 17:40:55 2025 -0400 @@ -29,37 +29,52 @@ extern "C" { #endif -// different on every implementation -#include "vec/types.h" + +#ifdef VEC_HAVE_IMPL_INTEGER_H +# include "impl/integer.h" +#else +# if __cplusplus >= (201103L) +# include <cstdint> +# include <cstddef> +typedef std::size_t vec_uintsize; + +typedef std::uint8_t vec_uint8; +typedef std::uint16_t vec_uint16; +typedef std::uint32_t vec_uint32; +typedef std::uint64_t vec_uint64; +typedef std::uintmax_t vec_uintmax; +typedef std::uintptr_t vec_uintptr; + +typedef std::int8_t vec_int8; +typedef std::int16_t vec_int16; +typedef std::int32_t vec_int32; +typedef std::int64_t vec_int64; +typedef std::intmax_t vec_intmax; +# elif __STDC_VERSION__ >= 199901L +# include <stdint.h> +# include <stddef.h> +typedef uint8_t vec_uint8; +typedef uint16_t vec_uint16; +typedef uint32_t vec_uint32; +typedef uint64_t vec_uint64; +typedef uintmax_t vec_uintmax; +typedef uintptr_t vec_uintptr; +typedef size_t vec_uintsize; +typedef int8_t vec_int8; +typedef int16_t vec_int16; +typedef int32_t vec_int32; +typedef int64_t vec_int64; +typedef intmax_t vec_intmax; +# else +# error Unable to find integer types with known size. +# endif +#endif #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ (((a) >= (x)) && \ ((a) > x || (b) >= (y)) && \ ((a) > x || (b) > (y) || (c) >= (z))) -// MSVC sucks and its a pain in the ass to find out this stuff -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) -# define VEC_MSVC_VERSION_MAJOR (_MSC_FULL_VER / 10000000) -# define VEC_MSVC_VERSION_MINOR ((_MSC_FULL_VER % 10000000) / 100000) -# define VEC_MSVC_VERSION_PATCH (_MSC_FULL_VER % 100000) -#elif defined(_MSC_FULL_VER) -# define VEC_MSVC_VERSION_MAJOR (_MSC_FULL_VER / 1000000) -# define VEC_MSVC_VERSION_MINOR ((_MSC_FULL_VER % 1000000) / 10000) -# define VEC_MSVC_VERSION_PATCH (_MSC_FULL_VER % 10000) -#elif defined(_MSC_VER) -# define VEC_MSVC_VERSION_MAJOR (_MSC_VER / 100) -# define VEC_MSVC_VERSION_MINOR (_MSC_VER % 100) -# define VEC_MSVC_VERSION_PATCH (0) -#endif - -#ifdef VEC_MSVC_VERSION_MAJOR -# define VEC_MSVC_ATLEAST(x, y, z) \ - VEC_SEMVER_ATLEAST(VEC_MSVC_VERSION_MAJOR, VEC_MSVC_VERSION_MINOR, VEC_MSVC_VERSION_PATCH, x, y, z) -#else -# define VEC_MSVC_ATLEAST(x, y, z) (0) -#endif - -// now we get to GNU C stuff (not necessarily GCC) #ifdef __GNUC__ # define VEC_GNUC_ATLEAST(x, y, z) \ VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z) @@ -67,13 +82,13 @@ # define VEC_GNUC_ATLEAST(x, y, z) (0) #endif +/* GCC/clang attributes */ #if defined(__has_attribute) # define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x) #else # define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch) #endif -// this isn't used anywhere (yet!) but still useful to have #if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) # define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg) #elif (__STDC_VERSION__ >= 201112L) @@ -84,86 +99,332 @@ [!!sizeof (struct { int __error_if_negative: (x) ? 
2 : -1; })] #endif -////////////////////////////////////////////////////////////////////////////// -// Detect compiler SIMD support +#ifndef VEC_ASSERT +# ifndef VEC_DISABLE_ASSERTIONS +# include <assert.h> +# define VEC_ASSERT(x, msg) assert(msg && x) +# else +# define VEC_ASSERT(x, msg) +# endif +#endif + +/* --------------------------------------------------------------- */ +/* Detect compiler SIMD support */ + +#define VEC_NEON_ALIGNMENT 16 +#define VEC_ALTIVEC_ALIGNMENT 16 +#define VEC_SSE2_ALIGNMENT 16 +#define VEC_AVX2_ALIGNMENT 32 +#define VEC_AVX512F_ALIGNMENT 64 + +// for the generic implementation +#define VINT8x2_ALIGNMENT 1 +#define VUINT8x2_ALIGNMENT 1 + +#define VINT8x4_ALIGNMENT VINT8x2_ALIGNMENT +#define VINT16x2_ALIGNMENT 2 +#define VUINT8x4_ALIGNMENT VUINT8x2_ALIGNMENT +#define VUINT16x2_ALIGNMENT 2 + +#define VINT8x8_ALIGNMENT VINT8x4_ALIGNMENT +#define VINT16x4_ALIGNMENT VINT16x2_ALIGNMENT +#define VINT32x2_ALIGNMENT 4 +#define VUINT8x8_ALIGNMENT VUINT8x4_ALIGNMENT +#define VUINT16x4_ALIGNMENT VUINT16x2_ALIGNMENT +#define VUINT32x2_ALIGNMENT 4 + +#define VINT8x16_ALIGNMENT VINT8x8_ALIGNMENT +#define VINT16x8_ALIGNMENT VINT16x4_ALIGNMENT +#define VINT32x4_ALIGNMENT VINT32x2_ALIGNMENT +#define VINT64x2_ALIGNMENT 8 +#define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT +#define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT +#define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT +#define VUINT64x2_ALIGNMENT 8 + +#define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT +#define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT +#define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT +#define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT +#define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT +#define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT +#define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT +#define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT -// Current known alignments for each implementation, ordered by -// architecture and instruction set: -// -// /---------------------------------------------------\ -// | Architecture | Instruction Set | Bits | Alignment | -// |---------------------------------------------------| -// | ARM | NEON | 64 | 8 bytes | -// | ARM | NEON | 128 | 16 bytes | -// | PowerPC | AltiVec | 128 | 16 bytes | -// | x86 | MMX | 64 | None? | -// | x86 | SSE2 | 128 | 16 bytes | -// | x86 | AVX2 | 256 | 32 bytes | -// | x86 | AVX512-F | 512 | 64 bytes | -// \---------------------------------------------------/ -// -// If these ever have to be extended or changed, there absolutely *must* -// be a new major release of vec, since that would change the ABI... +#define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT +#define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT +#define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT +#define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT +#define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT +#define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT +#define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT +#define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT + +#ifndef VEC_SUPPRESS_HW -#define VINT8x2_ALIGNMENT 2 -#define VUINT8x2_ALIGNMENT 2 - -#define VINT8x4_ALIGNMENT 4 -#define VINT16x2_ALIGNMENT 4 -#define VUINT8x4_ALIGNMENT 4 -#define VUINT16x2_ALIGNMENT 4 +// IIRC `__VEC__' is also defined, but I don't know for sure. +// IBM says that `__ALTIVEC__' is standard though. 
+#ifdef __ALTIVEC__ +# include <altivec.h> +# define VEC_COMPILER_HAS_ALTIVEC +# if defined(__POWER8__) && defined(__VSX__) +# define VEC_COMPILER_HAS_ALTIVEC_VSX +# endif +# if VINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +#endif -// 64-bit -#define VINT8x8_ALIGNMENT 8 -#define VINT16x4_ALIGNMENT 8 -#define VINT32x2_ALIGNMENT 8 -#define VUINT8x8_ALIGNMENT 8 -#define VUINT16x4_ALIGNMENT 8 -#define VUINT32x2_ALIGNMENT 8 +#ifdef __ARM_NEON +# include <arm_neon.h> +# define VEC_COMPILER_HAS_NEON +# if VINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT8x8_ALIGNMENT +# define VINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT16x4_ALIGNMENT +# define VINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT8x8_ALIGNMENT +# define VUINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT16x4_ALIGNMENT +# define VUINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +#endif -// 128-bit -#define VINT8x16_ALIGNMENT 16 -#define VINT16x8_ALIGNMENT 16 -#define VINT32x4_ALIGNMENT 16 -#define VINT64x2_ALIGNMENT 16 
-#define VUINT8x16_ALIGNMENT 16 -#define VUINT16x8_ALIGNMENT 16 -#define VUINT32x4_ALIGNMENT 16 -#define VUINT64x2_ALIGNMENT 16 +#ifdef __MMX__ +# include <mmintrin.h> +# define VEC_COMPILER_HAS_MMX +#endif -// 256-bit -#define VINT8x32_ALIGNMENT 32 -#define VINT16x16_ALIGNMENT 32 -#define VINT32x8_ALIGNMENT 32 -#define VINT64x4_ALIGNMENT 32 -#define VUINT8x32_ALIGNMENT 32 -#define VUINT16x16_ALIGNMENT 32 -#define VUINT32x8_ALIGNMENT 32 -#define VUINT64x4_ALIGNMENT 32 +#ifdef __SSE2__ +# include <emmintrin.h> +# define VEC_COMPILER_HAS_SSE2 +# ifdef __SSE4_1__ +# define VEC_COMPILER_HAS_SSE41 +# endif +# if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +#endif -// 512-bit -#define VINT8x64_ALIGNMENT 64 -#define VINT16x32_ALIGNMENT 64 -#define VINT32x16_ALIGNMENT 64 -#define VINT64x8_ALIGNMENT 64 -#define VUINT8x64_ALIGNMENT 64 -#define VUINT16x32_ALIGNMENT 64 -#define VUINT32x16_ALIGNMENT 64 -#define VUINT64x8_ALIGNMENT 64 +#ifdef __AVX2__ +# include <immintrin.h> +# define VEC_COMPILER_HAS_AVX2 +# if VINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT8x32_ALIGNMENT +# define VINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT16x16_ALIGNMENT +# define VINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT32x8_ALIGNMENT +# define VINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT64x4_ALIGNMENT +# define VINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT8x32_ALIGNMENT +# define VUINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT16x16_ALIGNMENT +# define VUINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT32x8_ALIGNMENT +# define VUINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT64x4_ALIGNMENT +# define VUINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +#endif -////////////////////////////////////////////////////////////////////////////// -// bit shift +#ifdef __AVX512F__ +# include <immintrin.h> +# define VEC_COMPILER_HAS_AVX512F +# if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT8x64_ALIGNMENT +# define VINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT16x32_ALIGNMENT +# define VINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if 
VINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT32x16_ALIGNMENT +# define VINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT64x8_ALIGNMENT +# define VINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT8x64_ALIGNMENT +# define VUINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT16x32_ALIGNMENT +# define VUINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT32x16_ALIGNMENT +# define VUINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT64x8_ALIGNMENT +# define VUINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +#endif -inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y) +#endif + +/* --------------------------------------------------------------- */ +/* bit shift */ + +inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y) { return x >> y; } -inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y) +inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y) { return x << y; } +inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y) +{ + // reinterpret as unsigned integer and then shift + union { + vec_intmax d; + vec_uintmax u; + } xx; + + xx.d = x; + xx.u >>= y; + return xx.d; +} + +inline vec_intmax vec_llshift(vec_intmax x, unsigned int y) +{ + // reinterpret as unsigned integer and then shift + union { + vec_intmax d; + vec_uintmax u; + } xx; + + xx.d = x; + xx.u <<= y; + return xx.d; +} + inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y) { return x >> y; @@ -236,399 +497,334 @@ return xx.d; } -////////////////////////////////////////////////////////////////////////////// -// array alignment - -#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) -# define VEC_ALIGNOF(type) alignof(x) -#elif (__STDC_VERSION__ >= 201112L) -# define VEC_ALIGNOF(type) _Alignof(x) -#elif defined(HAVE_STDDEF_H) // already included -# define VEC_ALIGNOF(type) \ - (offsetof(struct { char slot1; x slot2; }, slot2)) -#else -// inline offsetof -# define VEC_ALIGNOF(type) \ - ((vec_uintsize)((char *)&((struct { char slot1; x slot2; } *)0)->slot2 - (char *)0)) -#endif - -#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) -# define VEC_ALIGNAS(x) alignas(x) -#elif (__STDC_VERSION__ >= 201112L) -# define VEC_ALIGNAS(x) _Alignas(x) -#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0) -# define VEC_ALIGNAS(x) __attribute__((__aligned__(x))) -#elif VEC_MSVC_ATLEAST(0, 0, 0) // FIXME which version? -# define VEC_ALIGNAS(x) __declspec(align(x)) -#else -# error vec: vec requires compiler alignment support -#endif - -// this wart is here because originally vec didn't require that -// there be compiler support for alignment. 
now that we *do*, -// we need to -#ifdef VEC_ALIGNAS -# define VEC_ALIGNED_ARRAY(type, var, length, align) \ - VEC_ALIGNAS(align) type var[length] -# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(var)) +#ifdef VEC_IMPLEMENTATION +extern inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y); +extern inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y); +extern inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y); +extern inline vec_intmax vec_llshift(vec_intmax x, unsigned int y); +extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y); +extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y); +extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y); +extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y); #endif -#define VEC_ALIGNED_ARRAY_LENGTH(var) \ - (VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var)) - -////////////////////////////////////////////////////////////////////////////////////// -// predefined variants for each vector type - -////////////////////////////////////////////////////////////////////////////////////// -// 16-bit - -#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT) -#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT) -#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT) -#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0) - -#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT) -#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT) -#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT) -#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 32-bit - -#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT) -#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT) -#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT) -#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0) - -#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT) -#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT) -#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT) -#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0) - -#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT) -#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT) -#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT) -#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0) - -#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT) -#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT) -#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT) -#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 64-bit - -#define VINT8x8_ALIGNED_ARRAY(var) 
VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT) -#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT) -#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT) -#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0) - -#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT) -#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT) -#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT) -#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0) - -#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT) -#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT) -#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT) -#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0) - -#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT) -#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT) -#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT) -#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0) - -#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT) -#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT) -#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT) -#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0) - -#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT) -#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT) -#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT) -#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 128-bit - -#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) -#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) - -#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) -#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) - -#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) -#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) - -#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT) -#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) -#define 
VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) -#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) - -#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) -#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) +/* --------------------------------------------------------------- */ -#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) -#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) - -#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) -#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) - -#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) -#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 256-bit - -#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) -#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) - -#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) -#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) - -#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT) -#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) -#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) -#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) - -#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) -#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) - -#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) -#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) - -#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) -#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) +#include "impl/align.h" -#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) -#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) - -#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) -#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 512-bit - -#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) -#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) - -#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT) -#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) - -#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) -#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) - -#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) -#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) - -#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) -#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) - -#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT) -#define 
VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) - -#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) -#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) - -#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) -#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////// -// Defines the structures for each vector type +/* --------------------------------------------------------------- */ +/* Defines the structures for each vector type */ // 16-bit -typedef struct { - VEC_ALIGNAS(VUINT8x2_ALIGNMENT) vec_uint8 bytes[2]; +typedef union { + vec_uint8 generic[2]; } vuint8x2; -typedef struct { - VEC_ALIGNAS(VINT8x2_ALIGNMENT) vec_uint8 bytes[2]; +typedef union { + vec_int8 generic[2]; } vint8x2; // 32-bit -typedef struct { - VEC_ALIGNAS(VUINT8x4_ALIGNMENT) vec_uint8 bytes[4]; +typedef union { + vuint8x2 generic[2]; } vuint8x4; -typedef struct { - VEC_ALIGNAS(VUINT16x2_ALIGNMENT) vec_uint8 bytes[4]; +typedef union { + vec_uint16 generic[2]; } vuint16x2; -typedef struct { - VEC_ALIGNAS(VINT8x4_ALIGNMENT) vec_uint8 bytes[4]; +typedef union { + vint8x2 generic[2]; } vint8x4; -typedef struct { - VEC_ALIGNAS(VINT16x2_ALIGNMENT) vec_uint8 bytes[4]; +typedef union { + vec_int16 generic[2]; } vint16x2; // 64-bit -typedef struct { - VEC_ALIGNAS(VUINT8x8_ALIGNMENT) vec_uint8 bytes[8]; +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif +#ifdef VEC_COMPILER_HAS_NEON + uint8x8_t neon; +#endif + + vuint8x4 generic[2]; } vuint8x8; -typedef struct { - VEC_ALIGNAS(VUINT16x4_ALIGNMENT) vec_uint8 bytes[8]; +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif +#ifdef VEC_COMPILER_HAS_NEON + uint16x4_t neon; +#endif + + vuint16x2 generic[2]; } vuint16x4; -typedef struct { - VEC_ALIGNAS(VUINT32x2_ALIGNMENT) vec_uint8 bytes[8]; +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif +#ifdef VEC_COMPILER_HAS_NEON + uint32x2_t neon; +#endif + + vec_uint32 generic[2]; } vuint32x2; -typedef struct { - VEC_ALIGNAS(VINT8x8_ALIGNMENT) vec_uint8 bytes[8]; +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif +#ifdef VEC_COMPILER_HAS_NEON + int8x8_t neon; +#endif + + vint8x4 generic[2]; } vint8x8; -typedef struct { - VEC_ALIGNAS(VINT16x4_ALIGNMENT) vec_uint8 bytes[8]; +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif +#ifdef VEC_COMPILER_HAS_NEON + int16x4_t neon; +#endif + + vint16x2 generic[2]; } vint16x4; -typedef struct { - VEC_ALIGNAS(VINT32x2_ALIGNMENT) vec_uint8 bytes[8]; +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif +#ifdef VEC_COMPILER_HAS_NEON + int32x2_t neon; +#endif + + vec_int32 generic[2]; } vint32x2; // 128-bit typedef union { - VEC_ALIGNAS(VUINT8x16_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector unsigned char altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + uint8x16_t neon; +#endif + vuint8x8 generic[2]; } 
vuint8x16; typedef union { - VEC_ALIGNAS(VUINT16x8_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector unsigned short altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + uint16x8_t neon; +#endif + vuint16x4 generic[2]; } vuint16x8; typedef union { - VEC_ALIGNAS(VUINT32x4_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector unsigned int altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + uint32x4_t neon; +#endif + vuint32x2 generic[2]; } vuint32x4; typedef union { - VEC_ALIGNAS(VUINT64x2_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + vector unsigned long long altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + uint64x2_t neon; +#endif + vec_uint64 generic[2]; } vuint64x2; typedef union { - VEC_ALIGNAS(VINT8x16_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector signed char altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + int8x16_t neon; +#endif + vint8x8 generic[2]; } vint8x16; typedef union { - VEC_ALIGNAS(VINT16x8_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector signed short altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + int16x8_t neon; +#endif + vint16x4 generic[2]; } vint16x8; typedef union { - VEC_ALIGNAS(VINT32x4_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC + vector signed int altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + int32x4_t neon; +#endif + vint32x2 generic[2]; } vint32x4; typedef union { - VEC_ALIGNAS(VINT64x2_ALIGNMENT) vec_uint8 bytes[16]; +#ifdef VEC_COMPILER_HAS_SSE2 + __m128i sse; +#endif +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + vector signed long long altivec; +#endif +#ifdef VEC_COMPILER_HAS_NEON + int64x2_t neon; +#endif + vec_int64 generic[2]; } vint64x2; // 256-bit typedef union { - VEC_ALIGNAS(VUINT8x32_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vuint8x16 generic[2]; } vuint8x32; typedef union { - VEC_ALIGNAS(VUINT16x16_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vuint16x8 generic[2]; } vuint16x16; typedef union { - VEC_ALIGNAS(VUINT32x8_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vuint32x4 generic[2]; } vuint32x8; typedef union { - VEC_ALIGNAS(VUINT64x4_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vuint64x2 generic[2]; } vuint64x4; typedef union { - VEC_ALIGNAS(VINT8x32_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vint8x16 generic[2]; } vint8x32; typedef union { - VEC_ALIGNAS(VINT16x16_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vint16x8 generic[2]; } vint16x16; typedef union { - VEC_ALIGNAS(VINT32x8_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vint32x4 generic[2]; } vint32x8; typedef union { - VEC_ALIGNAS(VINT64x4_ALIGNMENT) vec_uint8 bytes[32]; +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif + vint64x2 generic[2]; } vint64x4; // 512-bit typedef union { - VEC_ALIGNAS(VUINT8x64_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vuint8x32 generic[2]; } vuint8x64; typedef union { - 
VEC_ALIGNAS(VUINT16x32_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vuint16x16 generic[2]; } vuint16x32; typedef union { - VEC_ALIGNAS(VUINT32x16_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vuint32x8 generic[2]; } vuint32x16; typedef union { - VEC_ALIGNAS(VUINT64x8_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vuint64x4 generic[2]; } vuint64x8; typedef union { - VEC_ALIGNAS(VINT8x64_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vint8x32 generic[2]; } vint8x64; typedef union { - VEC_ALIGNAS(VINT16x32_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vint16x16 generic[2]; } vint16x32; typedef union { - VEC_ALIGNAS(VINT32x16_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vint32x8 generic[2]; } vint32x16; typedef union { - VEC_ALIGNAS(VINT64x8_ALIGNMENT) vec_uint8 bytes[64]; +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif + vint64x4 generic[2]; } vint64x8; // --------------------------------------------------------------------------------- @@ -636,12 +832,77 @@ int vec_init(void); -////////////////////////////////////////////////////////////////////////////// -// these are, for the most part, meant to be used internally +#define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \ + void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ + void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); + +#define VEC_DECLARE_OPERATIONS(bits, size) \ + VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \ + VEC_DECLARE_OPERATIONS_SIGN(u, bits, size) + +// 16-bit +VEC_DECLARE_OPERATIONS(8, 2) + +// 32-bit +VEC_DECLARE_OPERATIONS(8, 4) +VEC_DECLARE_OPERATIONS(16, 2) + +// 64-bit +VEC_DECLARE_OPERATIONS(8, 8) +VEC_DECLARE_OPERATIONS(16, 4) +VEC_DECLARE_OPERATIONS(32, 2) + +// 128-bit +VEC_DECLARE_OPERATIONS(8, 16) +VEC_DECLARE_OPERATIONS(16, 8) +VEC_DECLARE_OPERATIONS(32, 4) +VEC_DECLARE_OPERATIONS(64, 2) + +// 256-bit +VEC_DECLARE_OPERATIONS(8, 32) +VEC_DECLARE_OPERATIONS(16, 16) +VEC_DECLARE_OPERATIONS(32, 8) +VEC_DECLARE_OPERATIONS(64, 4) + +// 512-bit +VEC_DECLARE_OPERATIONS(8, 64) +VEC_DECLARE_OPERATIONS(16, 32) +VEC_DECLARE_OPERATIONS(32, 16) +VEC_DECLARE_OPERATIONS(64, 8) + +#undef VEC_DECLARE_OPERATIONS +#undef VEC_DECLARE_OPERATIONS_SIGN + +// --------------------------------------------------------------------------------- +// okay, now we can actually implement the functions + +#ifdef VEC_IMPLEMENTATION + +// Fallback functions, need to be defined before everything else. +#include "impl/fallback.h" // okay, these are filled in for each supported backend. 
// `and', `or', `xor', and `nor' have to be prefixed with -// `b' because of <iso646.h>/cxxisms +// `b' because of <iso646.h> #define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \ typedef struct { \ v##sign##int##bits##x##size (*splat)(vec_##sign##int##bits x); \ @@ -705,166 +966,389 @@ #undef VEC_DEFINE_IMPL_STRUCT #undef VEC_DEFINE_IMPL_STRUCT_SIGN +// ------------------------------------------------------------------------ + +#ifdef VEC_COMPILER_HAS_ALTIVEC +# include "impl/ppc/altivec.h" +#endif + +#ifdef VEC_COMPILER_HAS_AVX512F +# include "impl/x86/avx512f.h" +#endif + +#ifdef VEC_COMPILER_HAS_AVX2 +# include "impl/x86/avx2.h" +#endif + +#ifdef VEC_COMPILER_HAS_SSE2 +# include "impl/x86/sse2.h" +#endif + +// depends on SSE2 functions; the only thing SSE4.1 provides for us +// is a native 32-bit multiply +#ifdef VEC_COMPILER_HAS_SSE41 +# include "impl/x86/sse41.h" +#endif + +#ifdef VEC_COMPILER_HAS_MMX +# include "impl/x86/mmx.h" +#endif + +#ifdef VEC_COMPILER_HAS_NEON +# include "impl/arm/neon.h" +#endif + +#include "impl/generic.h" + +/* ---------------------------------------------------------------- */ + +#include "impl/cpu.h" // CPU detection crap + // 16-bit -extern const vint8x2_impl *vint8x2_impl_cpu; -extern const vuint8x2_impl *vuint8x2_impl_cpu; +static vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; +static vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; // 32-bit -extern const vint8x4_impl *vint8x4_impl_cpu; -extern const vuint8x4_impl *vuint8x4_impl_cpu; -extern const vint16x2_impl *vint16x2_impl_cpu; -extern const vuint16x2_impl *vuint16x2_impl_cpu; +static vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; +static vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; +static vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; +static vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; // 64-bit -extern const vint8x8_impl *vint8x8_impl_cpu; -extern const vuint8x8_impl *vuint8x8_impl_cpu; -extern const vint16x4_impl *vint16x4_impl_cpu; -extern const vuint16x4_impl *vuint16x4_impl_cpu; -extern const vint32x2_impl *vint32x2_impl_cpu; -extern const vuint32x2_impl *vuint32x2_impl_cpu; +static vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; +static vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; +static vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; +static vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; +static vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; +static vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; // 128-bit -extern const vint8x16_impl *vint8x16_impl_cpu; -extern const vuint8x16_impl *vuint8x16_impl_cpu; -extern const vint16x8_impl *vint16x8_impl_cpu; -extern const vuint16x8_impl *vuint16x8_impl_cpu; -extern const vint32x4_impl *vint32x4_impl_cpu; -extern const vuint32x4_impl *vuint32x4_impl_cpu; -extern const vint64x2_impl *vint64x2_impl_cpu; -extern const vuint64x2_impl *vuint64x2_impl_cpu; +static vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; +static vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; +static vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; +static vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; +static vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; +static vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; +static vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; +static vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; // 256-bit 
-extern const vint8x32_impl *vint8x32_impl_cpu; -extern const vuint8x32_impl *vuint8x32_impl_cpu; -extern const vint16x16_impl *vint16x16_impl_cpu; -extern const vuint16x16_impl *vuint16x16_impl_cpu; -extern const vint32x8_impl *vint32x8_impl_cpu; -extern const vuint32x8_impl *vuint32x8_impl_cpu; -extern const vint64x4_impl *vint64x4_impl_cpu; -extern const vuint64x4_impl *vuint64x4_impl_cpu; +static vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; +static vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; +static vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; +static vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; +static vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; +static vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; +static vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; +static vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; // 512-bit -extern const vint8x64_impl *vint8x64_impl_cpu; -extern const vuint8x64_impl *vuint8x64_impl_cpu; -extern const vint16x32_impl *vint16x32_impl_cpu; -extern const vuint16x32_impl *vuint16x32_impl_cpu; -extern const vint32x16_impl *vint32x16_impl_cpu; -extern const vuint32x16_impl *vuint32x16_impl_cpu; -extern const vint64x8_impl *vint64x8_impl_cpu; -extern const vuint64x8_impl *vuint64x8_impl_cpu; +static vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; +static vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; +static vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; +static vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; +static vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; +static vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; +static vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; +static vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; + +// returns 0 or a negative error code on failure +int vec_init(void) +{ + // This function is NOT thread safe. However, once vec + // is initialized, all of the vector functions are thread-safe. + // + // In fact, it's possible to use vec without calling + // vec_init() at all, but it would be completely useless since + // it would just use a generic implementation without any + // vectorization whatsoever (unless maybe the compiler is + // smart enough to optimize it into vectors) + + vec_get_CPU_features(); -////////////////////////////////////////////////////////////////////////////// -// declared as inline for ! 
performance : ) +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (vec_CPU_have_ALTIVEC()) { + vint8x16_impl_cpu = &vint8x16_impl_altivec; + vuint8x16_impl_cpu = &vuint8x16_impl_altivec; + vint16x8_impl_cpu = &vint16x8_impl_altivec; + vuint16x8_impl_cpu = &vuint16x8_impl_altivec; + vint32x4_impl_cpu = &vint32x4_impl_altivec; + vuint32x4_impl_cpu = &vuint32x4_impl_altivec; +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + if (vec_CPU_have_ALTIVEC_VSX()) { + vint64x2_impl_cpu = &vint64x2_impl_altivec; + vuint64x2_impl_cpu = &vuint64x2_impl_altivec; + } +#endif + } +#endif +#ifdef VEC_COMPILER_HAS_AVX512F + if (vec_CPU_have_AVX512F()) { + vint8x64_impl_cpu = &vint8x64_impl_avx512f; + vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; + vint16x32_impl_cpu = &vint16x32_impl_avx512f; + vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; + vint32x16_impl_cpu = &vint32x16_impl_avx512f; + vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; + vint64x8_impl_cpu = &vint64x8_impl_avx512f; + vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; + } +#endif +#ifdef VEC_COMPILER_HAS_AVX2 + if (vec_CPU_have_AVX2()) { + vint8x32_impl_cpu = &vint8x32_impl_avx2; + vuint8x32_impl_cpu = &vuint8x32_impl_avx2; + vint16x16_impl_cpu = &vint16x16_impl_avx2; + vuint16x16_impl_cpu = &vuint16x16_impl_avx2; + vint32x8_impl_cpu = &vint32x8_impl_avx2; + vuint32x8_impl_cpu = &vuint32x8_impl_avx2; + vint64x4_impl_cpu = &vint64x4_impl_avx2; + vuint64x4_impl_cpu = &vuint64x4_impl_avx2; + } +#endif +#ifdef VEC_COMPILER_HAS_SSE2 + if (vec_CPU_have_SSE2()) { + vint8x16_impl_cpu = &vint8x16_impl_sse2; + vuint8x16_impl_cpu = &vuint8x16_impl_sse2; + vint16x8_impl_cpu = &vint16x8_impl_sse2; + vuint16x8_impl_cpu = &vuint16x8_impl_sse2; +# ifdef VEC_COMPILER_HAS_SSE41 + if (vec_CPU_have_SSE41()) { + vint32x4_impl_cpu = &vint32x4_impl_sse41; + vuint32x4_impl_cpu = &vuint32x4_impl_sse41; + } else +# endif + { + vint32x4_impl_cpu = &vint32x4_impl_sse2; + vuint32x4_impl_cpu = &vuint32x4_impl_sse2; + } + vint64x2_impl_cpu = &vint64x2_impl_sse2; + vuint64x2_impl_cpu = &vuint64x2_impl_sse2; + } +#endif +#ifdef VEC_COMPILER_HAS_MMX + if (vec_CPU_have_MMX()) { + vint8x8_impl_cpu = &vint8x8_impl_mmx; + vuint8x8_impl_cpu = &vuint8x8_impl_mmx; + vint16x4_impl_cpu = &vint16x4_impl_mmx; + vuint16x4_impl_cpu = &vuint16x4_impl_mmx; + vint32x2_impl_cpu = &vint32x2_impl_mmx; + vuint32x2_impl_cpu = &vuint32x2_impl_mmx; + } +#endif +#ifdef VEC_COMPILER_HAS_NEON + if (vec_CPU_have_NEON()) { + // 64-bit + vint8x8_impl_cpu = &vint8x8_impl_neon; + vuint8x8_impl_cpu = &vuint8x8_impl_neon; + vint16x4_impl_cpu = &vint16x4_impl_neon; + vuint16x4_impl_cpu = &vuint16x4_impl_neon; + vint32x2_impl_cpu = &vint32x2_impl_neon; + vuint32x2_impl_cpu = &vuint32x2_impl_neon; + + // 128-bit + vint8x16_impl_cpu = &vint8x16_impl_neon; + vuint8x16_impl_cpu = &vuint8x16_impl_neon; + vint16x8_impl_cpu = &vint16x8_impl_neon; + vuint16x8_impl_cpu = &vuint16x8_impl_neon; + vint32x4_impl_cpu = &vint32x4_impl_neon; + vuint32x4_impl_cpu = &vuint32x4_impl_neon; + vint64x2_impl_cpu = &vint64x2_impl_neon; + vuint64x2_impl_cpu = &vuint64x2_impl_neon; + } +#endif + { + // do nothing, they're already set to generics + } + + return 0; +} + +/* ---------------------------------------------------------------- */ #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ - { \ - return v##sign##int##bits##x##size##_impl_cpu->splat(x); \ - } \ - \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const 
vec_##sign##int##bits in[size]) \ - { \ - return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \ - } \ - \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->load(in); \ - } \ + if (v##sign##int##bits##x##size##_impl_cpu->splat) \ + return v##sign##int##bits##x##size##_impl_cpu->splat(x); \ \ - inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \ - } \ - \ - inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - return v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \ + return v##sign##int##bits##x##size##_fallback_splat(x); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \ + v##sign##int##bits##x##size err = {0}; \ + \ + if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \ + return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \ + \ + VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \ + \ + return err; \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->load) \ + return v##sign##int##bits##x##size##_impl_cpu->load(in); \ + \ + return v##sign##int##bits##x##size##_fallback_load(in); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \ + v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \ + return; \ + } \ + \ + VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->store) { \ + v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \ + return; \ + } \ + \ + v##sign##int##bits##x##size##_fallback_store(vec, out); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \ + if 
(v##sign##int##bits##x##size##_impl_cpu->add) \ + v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_add(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->sub) \ + v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_sub(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->mul) \ + v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_mul(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->div) \ + v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_div(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->avg) \ + v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_avg(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \ + if (v##sign##int##bits##x##size##_impl_cpu->band) \ + v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->bor) \ + v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->bxor) \ + v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->bnot) \ + v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \ + \ + return v##sign##int##bits##x##size##_fallback_not(vec); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->cmplt) \ + v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmplt(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->cmple) \ + v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmple(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->cmpeq) \ + v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmpeq(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->cmpge) \ + v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmpge(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->cmpgt) \ + v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmpgt(vec1, vec2); \ } \ \ - inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->lshift) \ + v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_lshift(vec1, vec2); \ } \ \ - inline 
v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ { \ - return v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->rshift) \ + v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_rshift(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->lrshift) \ + v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_lrshift(vec1, vec2); \ } #define VEC_DEFINE_OPERATIONS(bits, size) \ @@ -904,6 +1388,8 @@ #undef VEC_DEFINE_OPERATIONS #undef VEC_DEFINE_OPERATIONS_SIGN +#endif /* VEC_IMPLEMENTATION */ + #ifdef __cplusplus } #endif
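
Taken together, the reworked vec.h above turns each public operation into a thin dispatcher: a per-type impl vtable pointer starts out aimed at the generic backend, vec_init() repoints it according to the detected CPU features, and each wrapper prefers the backend slot and otherwise calls the scalar fallback. A minimal sketch of that pattern, with hypothetical names (the my_* identifiers are illustrative, not part of vec's API); note the sketch returns the backend's result when the slot is present:

/* Sketch only: per-type vtable, runtime backend selection, fallback wrapper. */
#include <stddef.h>
#include <stdint.h>

typedef struct { int32_t v[4]; } my_vi32x4;

typedef struct {
	my_vi32x4 (*add)(my_vi32x4 a, my_vi32x4 b); /* NULL => use the fallback */
} my_vi32x4_impl;

static my_vi32x4 my_fallback_add(my_vi32x4 a, my_vi32x4 b)
{
	for (int i = 0; i < 4; i++) a.v[i] += b.v[i];
	return a;
}

static const my_vi32x4_impl my_impl_generic = { NULL };
static const my_vi32x4_impl *my_impl_cpu = &my_impl_generic;

/* vec_init()-style selection: pick a backend once, based on CPU features. */
void my_init(int have_sse2, const my_vi32x4_impl *sse2_impl)
{
	if (have_sse2 && sse2_impl)
		my_impl_cpu = sse2_impl;
}

/* Public wrapper: return the backend's result if available, else the fallback. */
my_vi32x4 my_vi32x4_add(my_vi32x4 a, my_vi32x4 b)
{
	if (my_impl_cpu->add)
		return my_impl_cpu->add(a, b);
	return my_fallback_add(a, b);
}

The same shape scales to the macro-generated wrappers above; the only per-operation differences are the vtable slot consulted and the fallback function called.
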
--- a/src/cpu.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,497 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -/* Detect CPU SIMD support. Much of this code was stolen from SDL. - * - * Simple DirectMedia Layer - * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org> - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
-*/ - -#include "vec/cpu.h" - -#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) -# include <sys/sysctl.h> // For AltiVec check -#elif defined(__OpenBSD__) && defined(__powerpc__) -# include <sys/types.h> -# include <sys/sysctl.h> // For AltiVec check -# include <machine/cpu.h> -#elif defined(__FreeBSD__) && defined(__powerpc__) -# include <machine/cpu.h> -# include <sys/auxv.h> -#elif defined(__ALTIVEC__) -# include <signal.h> -# include <setjmp.h> -#endif - -#ifdef __FreeBSD__ -# include <sys/param.h> -#endif - -#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__) -# include <unistd.h> -# include <sys/types.h> -# include <sys/stat.h> -# include <fcntl.h> -# include <elf.h> - -/*#include <asm/hwcap.h>*/ -# ifndef AT_HWCAP -# define AT_HWCAP 16 -# endif -# ifndef AT_PLATFORM -# define AT_PLATFORM 15 -# endif -# ifndef HWCAP_NEON -# define HWCAP_NEON (1 << 12) -# endif -#endif - -static inline int vec_CPU_have_CPUID(void) -{ - int has_CPUID = 0; - -#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) - __asm__ ( -" pushfl # Get original EFLAGS \n" -" popl %%eax \n" -" movl %%eax,%%ecx \n" -" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" -" pushl %%eax # Save new EFLAGS value on stack \n" -" popfl # Replace current EFLAGS value \n" -" pushfl # Get new EFLAGS \n" -" popl %%eax # Store new EFLAGS in EAX \n" -" xorl %%ecx,%%eax # Can not toggle ID bit, \n" -" jz 1f # Processor=80486 \n" -" movl $1,%0 # We have CPUID support \n" -"1: \n" - : "=m" (has_CPUID) - : - : "%eax", "%ecx" - ); -#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) -/* Technically, if this is being compiled under __x86_64__ then it has - CPUid by definition. But it's nice to be able to prove it. :) */ - __asm__ ( -" pushfq # Get original EFLAGS \n" -" popq %%rax \n" -" movq %%rax,%%rcx \n" -" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" -" pushq %%rax # Save new EFLAGS value on stack \n" -" popfq # Replace current EFLAGS value \n" -" pushfq # Get new EFLAGS \n" -" popq %%rax # Store new EFLAGS in EAX \n" -" xorl %%ecx,%%eax # Can not toggle ID bit, \n" -" jz 1f # Processor=80486 \n" -" movl $1,%0 # We have CPUID support \n" -"1: \n" - : "=m" (has_CPUID) - : - : "%rax", "%rcx" - ); -#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) - __asm { - pushfd ; Get original EFLAGS - pop eax - mov ecx, eax - xor eax, 200000h ; Flip ID bit in EFLAGS - push eax ; Save new EFLAGS value on stack - popfd ; Replace current EFLAGS value - pushfd ; Get new EFLAGS - pop eax ; Store new EFLAGS in EAX - xor eax, ecx ; Can not toggle ID bit, - jz done ; Processor=80486 - mov has_CPUID,1 ; We have CPUID support -done: - } -#elif defined(_MSC_VER) && defined(_M_X64) - has_CPUID = 1; -#elif defined(__sun) && defined(__i386) - __asm ( -" pushfl \n" -" popl %eax \n" -" movl %eax,%ecx \n" -" xorl $0x200000,%eax \n" -" pushl %eax \n" -" popfl \n" -" pushfl \n" -" popl %eax \n" -" xorl %ecx,%eax \n" -" jz 1f \n" -" movl $1,-8(%ebp) \n" -"1: \n" - ); -#elif defined(__sun) && defined(__amd64) - __asm ( -" pushfq \n" -" popq %rax \n" -" movq %rax,%rcx \n" -" xorl $0x200000,%eax \n" -" pushq %rax \n" -" popfq \n" -" pushfq \n" -" popq %rax \n" -" xorl %ecx,%eax \n" -" jz 1f \n" -" movl $1,-8(%rbp) \n" -"1: \n" - ); -#endif - - return has_CPUID; -} - -#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) -# define VEC_CPU_CPUID(func, a, b, c, d) \ - __asm__ __volatile__( \ - " pushl %%ebx \n" \ - " xorl %%ecx,%%ecx \n" \ - " cpuid \n" \ - " movl %%ebx, 
%%esi \n" \ - " popl %%ebx \n" \ - : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ - : "a"(func)) -#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) -# define VEC_CPU_CPUID(func, a, b, c, d) \ - __asm__ __volatile__( \ - " pushq %%rbx \n" \ - " xorq %%rcx,%%rcx \n" \ - " cpuid \n" \ - " movq %%rbx, %%rsi \n" \ - " popq %%rbx \n" \ - : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ - : "a"(func)) -#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) -# define VEC_CPU_CPUID(func, a, b, c, d) \ - __asm { \ - __asm mov eax, func \ - __asm xor ecx, ecx \ - __asm cpuid \ - __asm mov a, eax \ - __asm mov b, ebx \ - __asm mov c, ecx \ - __asm mov d, edx \ - } -#elif (defined(_MSC_VER) && defined(_M_X64)) -// Use __cpuidex instead of __cpuid because ICL does not clear ecx register -# define VEC_CPU_CPUID(func, a, b, c, d) \ - do { \ - int CPUInfo[4]; \ - __cpuidex(CPUInfo, func, 0); \ - a = CPUInfo[0]; \ - b = CPUInfo[1]; \ - c = CPUInfo[2]; \ - d = CPUInfo[3]; \ - } while (0) -#else -# define VEC_CPU_CPUID(func, a, b, c, d) \ - do { \ - a = b = c = d = 0; \ - (void)a; \ - (void)b; \ - (void)c; \ - (void)d; \ - } while (0) -#endif - -// --------------------------------------------------------------- - -static int vec_CPU_CPUIDFeatures[4]; -static int vec_CPU_CPUIDMaxFunction = 0; -static int vec_CPU_OSSavesYMM = 0; -static int vec_CPU_OSSavesZMM = 0; - -static inline void vec_CPU_get_CPUID_features(void) -{ - static int checked = 0; - if (!checked) { - checked = 1; - if (vec_CPU_have_CPUID()) { - int a, b, c, d; - VEC_CPU_CPUID(0, a, b, c, d); - vec_CPU_CPUIDMaxFunction = a; - if (vec_CPU_CPUIDMaxFunction >= 1) { - VEC_CPU_CPUID(1, a, b, c, d); - vec_CPU_CPUIDFeatures[0] = a; - vec_CPU_CPUIDFeatures[1] = b; - vec_CPU_CPUIDFeatures[2] = c; - vec_CPU_CPUIDFeatures[3] = d; - - // Check to make sure we can call xgetbv - if (c & 0x08000000) { - // Call xgetbv to see if YMM (etc) register state is saved -#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__)) - __asm__(".byte 0x0f, 0x01, 0xd0" - : "=a"(a) - : "c"(0) - : "%edx"); -#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1 - a = (int)_xgetbv(0); -#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) - __asm { - xor ecx, ecx - _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 - mov a, eax - } -#endif - vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0; - vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 
1 : 0; - } - } - } - } -} - -#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) -static jmp_buf vec_jmpbuf; -static void vec_CPU_illegal_instruction(int sig) -{ - longjmp(vec_jmpbuf, 1); -} -#endif - -static int vec_CPU_have_ALTIVEC(void) -{ - volatile int altivec = 0; -#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__)) - int selectors[2] = { -# ifdef __OpenBSD__ - CTL_MACHDEP, CPU_ALTIVEC -# else - CTL_HW, HW_VECTORUNIT -# endif - }; - int hasVectorUnit = 0; - vec_uintsize length = sizeof(hasVectorUnit); - int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); - if (!error) - altivec = (hasVectorUnit != 0); -#elif defined(__FreeBSD__) && defined(__powerpc__) - unsigned long cpufeatures = 0; - elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)); - altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC; -#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) - void (*handler)(int sig); - handler = signal(SIGILL, vec_CPU_illegal_instruction); - if (!setjmp(vec_jmpbuf)) { - vector unsigned char vec; - vec_and(vec, vec); - altivec = 1; - } - signal(SIGILL, handler); -#endif - return altivec; -} - -static int vec_CPU_have_ALTIVEC_VSX(void) -{ - volatile int vsx = 0; -#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__) -# warning Compiling UNTESTED code for VSX. - void (*handler)(int sig); - handler = signal(SIGILL, vec_CPU_illegal_instruction); - if (!setjmp(vec_jmpbuf)) { - // this is completely untested - //__asm__ __volatile__("mtspr 256, %0\n\t" - // "xxland %%v0, %%v0, %%v0" ::"r"(-1)); - //vsx = 1; - } - signal(SIGILL, handler); -#endif - return vsx; -} - -#define vec_CPU_have_MMX() (vec_CPU_CPUIDFeatures[3] & 0x00800000) -#define vec_CPU_have_SSE() (vec_CPU_CPUIDFeatures[3] & 0x02000000) -#define vec_CPU_have_SSE2() (vec_CPU_CPUIDFeatures[3] & 0x04000000) -#define vec_CPU_have_SSE3() (vec_CPU_CPUIDFeatures[2] & 0x00000001) -#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000) -#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000) -#define vec_CPU_have_AVX() (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000)) - -static inline int vec_CPU_have_AVX2(void) -{ - if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { - int a, b, c, d; - VEC_CPU_CPUID(7, a, b, c, d); - return b & 0x00000020; - (void)a, (void)c, (void)d; - } - return 0; -} - -static inline int vec_CPU_have_AVX512F(void) -{ - if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { - int a, b, c, d; - VEC_CPU_CPUID(7, a, b, c, d); - return b & 0x00000020; - (void)a, (void)c, (void)d; - } - return 0; -} - -#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL) -static int readProcAuxvForNeon(void) -{ - int neon = 0; - int fd; - - fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); - if (fd >= 0) { - Elf32_auxv_t aux; - while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) { - if (aux.a_type == AT_HWCAP) { - neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON; - break; - } - } - close(fd); - } - return neon; -} -#endif - -static int vec_CPU_have_NEON(void) -{ -/* The way you detect NEON is a privileged instruction on ARM, so you have - query the OS kernel in a platform-specific way. 
:/ */ -#if defined(SDL_CPUINFO_DISABLED) - return 0; /* disabled */ -#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64)) -/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */ -/* Seems to have been removed */ -#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE -#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19 -#endif - /* All WinRT ARM devices are required to support NEON, but just in case. */ - return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0; -#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__) - return 1; /* ARMv8 always has non-optional NEON support. */ -#elif defined(__VITA__) - return 1; -#elif defined(__3DS__) - return 0; -#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) - /* (note that sysctlbyname("hw.optional.neon") doesn't work!) */ - return 1; /* all Apple ARMv7 chips and later have NEON. */ -#elif defined(__APPLE__) - return 0; /* assume anything else from Apple doesn't have NEON. */ -#elif !defined(__arm__) - return 0; /* not an ARM CPU at all. */ -#elif defined(__OpenBSD__) - return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */ -#elif defined(HAVE_ELF_AUX_INFO) - unsigned long hasneon = 0; - if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0) - return 0; - - return ((hasneon & HWCAP_NEON) == HWCAP_NEON); -#elif defined(__QNXNTO__) - return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON; -#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL) - return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON; -#elif defined(__linux__) - return readProcAuxvForNeon(); -#elif defined(__ANDROID__) - /* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */ - { - AndroidCpuFamily cpu_family = android_getCpuFamily(); - if (cpu_family == ANDROID_CPU_FAMILY_ARM) { - uint64_t cpu_features = android_getCpuFeatures(); - if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) { - return 1; - } - } - return 0; - } -#elif defined(__RISCOS__) - /* Use the VFPSupport_Features SWI to access the MVFR registers */ - { - _kernel_swi_regs regs; - regs.r[0] = 0; - if (_kernel_swi(VFPSupport_Features, ®s, ®s) == NULL) { - if ((regs.r[2] & 0xFFF000) == 0x111000) { - return 1; - } - } - return 0; - } -#else -#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me. 
- return 0; -#endif -} - -#define VEC_CPU_FEATURES_RESET VEC_UINT32_C(0xFFFFFFFF) - -static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET; - -vec_uint32 vec_get_CPU_features(void) -{ - if (vec_CPU_features == VEC_CPU_FEATURES_RESET) { - vec_CPU_get_CPUID_features(); - vec_CPU_features = 0; - if (vec_CPU_have_ALTIVEC()) - vec_CPU_features |= VEC_CPU_HAS_ALTIVEC; - if (vec_CPU_have_ALTIVEC_VSX()) - vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX; - if (vec_CPU_have_MMX()) - vec_CPU_features |= VEC_CPU_HAS_MMX; - if (vec_CPU_have_SSE()) - vec_CPU_features |= VEC_CPU_HAS_SSE; - if (vec_CPU_have_SSE2()) - vec_CPU_features |= VEC_CPU_HAS_SSE2; - if (vec_CPU_have_SSE3()) - vec_CPU_features |= VEC_CPU_HAS_SSE3; - if (vec_CPU_have_SSE41()) - vec_CPU_features |= VEC_CPU_HAS_SSE41; - if (vec_CPU_have_SSE42()) - vec_CPU_features |= VEC_CPU_HAS_SSE42; - if (vec_CPU_have_AVX()) - vec_CPU_features |= VEC_CPU_HAS_AVX; - if (vec_CPU_have_AVX2()) - vec_CPU_features |= VEC_CPU_HAS_AVX2; - if (vec_CPU_have_AVX512F()) - vec_CPU_features |= VEC_CPU_HAS_AVX512F; - if (vec_CPU_have_NEON()) - vec_CPU_features |= VEC_CPU_HAS_NEON; - } - return vec_CPU_features; -} \ No newline at end of file
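
The removed cpu.c probes the hardware once and caches the result in a feature bitmask, which vec_init() then consults. A compact sketch of the same caching idea, using GCC/Clang's __builtin_cpu_supports() on x86 in place of the hand-rolled CPUID/xgetbv assembly; the MY_CPU_* constants and function names are illustrative, not vec's:

/* Sketch only: lazy, cached CPU-feature bitmask (x86, GCC/Clang builtins). */
#include <stdint.h>

#define MY_CPU_HAS_SSE2    (1u << 0)
#define MY_CPU_HAS_AVX2    (1u << 1)
#define MY_CPU_HAS_AVX512F (1u << 2)

static uint32_t my_cpu_features = 0xFFFFFFFFu; /* sentinel: not probed yet */

uint32_t my_get_cpu_features(void)
{
	if (my_cpu_features == 0xFFFFFFFFu) {
		my_cpu_features = 0;
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
		__builtin_cpu_init();
		if (__builtin_cpu_supports("sse2"))    my_cpu_features |= MY_CPU_HAS_SSE2;
		if (__builtin_cpu_supports("avx2"))    my_cpu_features |= MY_CPU_HAS_AVX2;
		if (__builtin_cpu_supports("avx512f")) my_cpu_features |= MY_CPU_HAS_AVX512F;
#endif
	}
	return my_cpu_features;
}

Non-x86 targets (AltiVec, NEON) still need the OS-specific probes shown in the removed code above, since those feature bits cannot be read from unprivileged code on every platform.
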
--- a/src/impl/arm/neon.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,488 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include "vec/impl/arm/neon.h" - -#include <arm_neon.h> - -// There is LOTS of preprocessor hacking here (as if the other files -// weren't bad enough... lol) - -#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vld1_##sign##bits(in); \ - return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - vstore_lane_##bits(sign, vec.neon, out); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = 
vorr_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_neon_load_aligned, \ - v##sign##int##bits##x##size##_neon_load_aligned, \ - v##sign##int##bits##x##size##_neon_store_aligned, \ - v##sign##int##bits##x##size##_neon_store_aligned, \ - v##sign##int##bits##x##size##_neon_add, \ - v##sign##int##bits##x##size##_neon_sub, \ - v##sign##int##bits##x##size##_neon_mul, \ - /* .div = */ NULL, \ - /* .avg = */ NULL, \ - v##sign##int##bits##x##size##_neon_and, \ - v##sign##int##bits##x##size##_neon_or, \ - v##sign##int##bits##x##size##_neon_xor, \ - /* .not = */ NULL, \ - v##sign##int##bits##x##size##_neon_lshift, \ - /* .rshift = */ NULL, \ - /* .lrshift = */ NULL, \ - }; - -#define VEC_DEFINE_OPERATIONS(bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) - -// Ok, we'll start out with the 64-bit types. - -#define vadd_8 vadd_s8 -#define vadd_16 vadd_s16 -#define vadd_32 vadd_s32 -#define vsub_8 vsub_s8 -#define vsub_16 vsub_s16 -#define vsub_32 vsub_s32 -#define vmul_8 vmul_s8 -#define vmul_16 vmul_s16 -#define vmul_32 vmul_s32 -#define vshl_8 vshl_s8 -#define vshl_16 vshl_s16 -#define vshl_32 vshl_s32 -#define veor_8 veor_s8 -#define veor_16 veor_s16 -#define veor_32 veor_s32 -#define vorr_8 vorr_s8 -#define vorr_16 vorr_s16 -#define vorr_32 vorr_s32 -#define vand_8 vand_s8 -#define vand_16 vand_s16 -#define vand_32 vand_s32 -#define vld1_8 vld1_s8 -#define vld1_16 vld1_s16 -#define vld1_32 vld1_s32 -#define vget_lane_8 vget_lane_s8 -#define vget_lane_16 vget_lane_s16 -#define vget_lane_32 vget_lane_s32 -#define vstore_lane_8(sign, vec, out) \ - do { \ - out[0] = vget_lane_##sign##8(vec, 0); \ - out[1] = vget_lane_##sign##8(vec, 1); \ - out[2] = vget_lane_##sign##8(vec, 2); \ - out[3] = vget_lane_##sign##8(vec, 3); \ - out[4] = vget_lane_##sign##8(vec, 4); \ - out[5] = vget_lane_##sign##8(vec, 5); \ - out[6] = vget_lane_##sign##8(vec, 6); \ - out[7] = vget_lane_##sign##8(vec, 7); \ - } while (0) -#define vstore_lane_16(sign, vec, out) \ - do { \ - out[0] = vget_lane_##sign##16(vec, 0); \ - out[1] = vget_lane_##sign##16(vec, 1); \ - out[2] = vget_lane_##sign##16(vec, 2); \ - out[3] = vget_lane_##sign##16(vec, 3); \ - } while (0) -#define vstore_lane_32(sign, vec, out) \ - do { \ - out[0] = vget_lane_##sign##32(vec, 0); \ - out[1] = vget_lane_##sign##32(vec, 1); \ - } while (0) -#define vreinterpret_8_u8(x) vreinterpret_s8_u8(x) -#define vreinterpret_16_u16(x) vreinterpret_s16_u16(x) -#define vreinterpret_32_u32(x) vreinterpret_s32_u32(x) - -VEC_DEFINE_OPERATIONS(8, 8) -VEC_DEFINE_OPERATIONS(16, 4) -VEC_DEFINE_OPERATIONS(32, 2) - -#undef vadd_8 -#undef vadd_16 -#undef vadd_32 -#undef vsub_8 -#undef vsub_16 -#undef vsub_32 -#undef vmul_8 -#undef vmul_16 -#undef vmul_32 -#undef vshl_8 -#undef vshl_16 -#undef vshl_32 -#undef veor_8 -#undef veor_16 -#undef veor_32 -#undef vorr_8 -#undef vorr_16 -#undef vorr_32 -#undef vand_8 -#undef vand_16 -#undef vand_32 -#undef vld1_8 -#undef vld1_16 -#undef vld1_32 -#undef vget_lane_8 -#undef vget_lane_16 -#undef vget_lane_32 -#undef vstore_lane_8 -#undef vstore_lane_16 -#undef 
vstore_lane_32 -#undef vreinterpret_8_u8 -#undef vreinterpret_16_u16 -#undef vreinterpret_32_u32 - -/////////////////////////////////////////////////////////////////////////////// -// 128-bit - -// Now we can go ahead and do the 128-bit ones. - -// NEON doesn't have native 64-bit multiplication, so we have -// to do it ourselves -static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b) -{ - const uint32x2_t ac = vreinterpret_u32_s32(vmovn_s64(a)); - const uint32x2_t pr = vreinterpret_u32_s32(vmovn_s64(b)); - - const int32x4_t hi = vmulq_s32(vreinterpretq_s32_s64(b), vreinterpretq_s32_s64(a)); - - return vreinterpretq_s64_u64(vmlal_u32(vreinterpretq_u64_s64(vshlq_n_s64(vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s32(hi))), 32)), ac, pr)); -} - -static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b) -{ - const uint32x2_t ac = vmovn_u64(a); - const uint32x2_t pr = vmovn_u64(b); - - const uint32x4_t hi = vmulq_u32(vreinterpretq_u32_u64(b), vreinterpretq_u32_u64(a)); - - return vmlal_u32(vshlq_n_u64(vpaddlq_u32(hi), 32), ac, pr); -} - -#define vadd_8 vaddq_s8 -#define vadd_16 vaddq_s16 -#define vadd_32 vaddq_s32 -#define vadd_64 vaddq_s64 -#define vadd_u8 vaddq_u8 -#define vadd_u16 vaddq_u16 -#define vadd_u32 vaddq_u32 -#define vadd_u64 vaddq_u64 -#define vsub_8 vsubq_s8 -#define vsub_16 vsubq_s16 -#define vsub_32 vsubq_s32 -#define vsub_64 vsubq_s64 -#define vsub_u8 vsubq_u8 -#define vsub_u16 vsubq_u16 -#define vsub_u32 vsubq_u32 -#define vsub_u64 vsubq_u64 -#define vmul_8 vmulq_s8 -#define vmul_16 vmulq_s16 -#define vmul_32 vmulq_s32 -#define vmul_64 vmulq_s64 -#define vmul_u8 vmulq_u8 -#define vmul_u16 vmulq_u16 -#define vmul_u32 vmulq_u32 -#define vmul_u64 vmulq_u64 -#define vshl_8 vshlq_s8 -#define vshl_16 vshlq_s16 -#define vshl_32 vshlq_s32 -#define vshl_64 vshlq_s64 -#define vshl_u8 vshlq_u8 -#define vshl_u16 vshlq_u16 -#define vshl_u32 vshlq_u32 -#define vshl_u64 vshlq_u64 -#define veor_8 veorq_s8 -#define veor_16 veorq_s16 -#define veor_32 veorq_s32 -#define veor_64 veorq_s64 -#define veor_u8 veorq_u8 -#define veor_u16 veorq_u16 -#define veor_u32 veorq_u32 -#define veor_u64 veorq_u64 -#define vorr_8 vorrq_s8 -#define vorr_16 vorrq_s16 -#define vorr_32 vorrq_s32 -#define vorr_64 vorrq_s64 -#define vorr_u8 vorrq_u8 -#define vorr_u16 vorrq_u16 -#define vorr_u32 vorrq_u32 -#define vorr_u64 vorrq_u64 -#define vand_8 vandq_s8 -#define vand_16 vandq_s16 -#define vand_32 vandq_s32 -#define vand_64 vandq_s64 -#define vand_u8 vandq_u8 -#define vand_u16 vandq_u16 -#define vand_u32 vandq_u32 -#define vand_u64 vandq_u64 -#define vld1_8 vld1q_s8 -#define vld1_16 vld1q_s16 -#define vld1_32 vld1q_s32 -#define vld1_64 vld1q_s64 -#define vld1_u8 vld1q_u8 -#define vld1_u16 vld1q_u16 -#define vld1_u32 vld1q_u32 -#define vld1_u64 vld1q_u64 -#define vget_lane_8 vgetq_lane_s8 -#define vget_lane_16 vgetq_lane_s16 -#define vget_lane_32 vgetq_lane_s32 -#define vget_lane_64 vgetq_lane_s64 -#define vget_lane_u8 vgetq_lane_u8 -#define vget_lane_u16 vgetq_lane_u16 -#define vget_lane_u32 vgetq_lane_u32 -#define vget_lane_u64 vgetq_lane_u64 -#define vstore_lane_8(sign, vec, out) \ - do { \ - out[0] = vget_lane_##sign##8(vec, 0); \ - out[1] = vget_lane_##sign##8(vec, 1); \ - out[2] = vget_lane_##sign##8(vec, 2); \ - out[3] = vget_lane_##sign##8(vec, 3); \ - out[4] = vget_lane_##sign##8(vec, 4); \ - out[5] = vget_lane_##sign##8(vec, 5); \ - out[6] = vget_lane_##sign##8(vec, 6); \ - out[7] = vget_lane_##sign##8(vec, 7); \ - out[8] = vget_lane_##sign##8(vec, 8); \ - out[9] = 
vget_lane_##sign##8(vec, 9); \ - out[10] = vget_lane_##sign##8(vec, 10); \ - out[11] = vget_lane_##sign##8(vec, 11); \ - out[12] = vget_lane_##sign##8(vec, 12); \ - out[13] = vget_lane_##sign##8(vec, 13); \ - out[14] = vget_lane_##sign##8(vec, 14); \ - out[15] = vget_lane_##sign##8(vec, 15); \ - } while (0) -#define vstore_lane_16(sign, vec, out) \ - do { \ - out[0] = vget_lane_##sign##16(vec, 0); \ - out[1] = vget_lane_##sign##16(vec, 1); \ - out[2] = vget_lane_##sign##16(vec, 2); \ - out[3] = vget_lane_##sign##16(vec, 3); \ - out[4] = vget_lane_##sign##16(vec, 4); \ - out[5] = vget_lane_##sign##16(vec, 5); \ - out[6] = vget_lane_##sign##16(vec, 6); \ - out[7] = vget_lane_##sign##16(vec, 7); \ - } while (0) -#define vstore_lane_32(sign, vec, out) \ - do { \ - out[0] = vget_lane_##sign##32(vec, 0); \ - out[1] = vget_lane_##sign##32(vec, 1); \ - out[2] = vget_lane_##sign##32(vec, 2); \ - out[3] = vget_lane_##sign##32(vec, 3); \ - } while (0) -#define vstore_lane_64(sign, vec, out) \ - do { \ - out[0] = vget_lane_##sign##64(vec, 0); \ - out[1] = vget_lane_##sign##64(vec, 1); \ - } while (0) -#define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x) -#define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x) -#define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x) -#define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x) - -#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vld1_##sign##bits(in); \ - return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - vstore_lane_##bits(sign, vec.neon, out); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_neon_load_aligned, \ - v##sign##int##bits##x##size##_neon_load_aligned, \ - v##sign##int##bits##x##size##_neon_store_aligned, \ - v##sign##int##bits##x##size##_neon_store_aligned, \ - v##sign##int##bits##x##size##_neon_add, \ - v##sign##int##bits##x##size##_neon_sub, \ - v##sign##int##bits##x##size##_neon_mul, \ - /* .div = */ NULL, \ - /* .avg = */ NULL, \ - v##sign##int##bits##x##size##_neon_and, \ - v##sign##int##bits##x##size##_neon_or, \ - v##sign##int##bits##x##size##_neon_xor, \ - /* .not = */ NULL, \ - v##sign##int##bits##x##size##_neon_lshift, \ - /* .rshift = */ NULL, \ - /* .lrshift = */ NULL, \ - }; - -#define VEC_DEFINE_OPERATIONS(bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) - -VEC_DEFINE_OPERATIONS(8, 16) -VEC_DEFINE_OPERATIONS(16, 8) -VEC_DEFINE_OPERATIONS(32, 4) -VEC_DEFINE_OPERATIONS(64, 2) - -#undef vadd_8 -#undef vadd_16 -#undef vadd_32 -#undef vadd_64 -#undef vsub_8 -#undef vsub_16 -#undef vsub_32 -#undef vsub_64 -#undef vmul_8 -#undef vmul_16 -#undef vmul_32 -#undef vmul_64 -#undef vshl_8 -#undef vshl_16 -#undef vshl_32 -#undef vshl_64 -#undef veor_8 -#undef veor_16 -#undef veor_32 -#undef veor_64 -#undef vorr_8 -#undef vorr_16 -#undef vorr_32 -#undef vorr_64 -#undef vand_8 -#undef vand_16 -#undef vand_32 -#undef vand_64 -#undef vld1_8 -#undef vld1_16 -#undef vld1_32 -#undef vld1_64 -#undef vget_lane_8 -#undef vget_lane_16 -#undef vget_lane_32 -#undef vget_lane_64 -#undef vstore_lane_8 -#undef vstore_lane_16 -#undef vstore_lane_32 -#undef vstore_lane_64
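
The removed NEON backend emulates a 64-bit lane multiply out of 32-bit multiplies, since NEON has no native 64x64 multiply. The identity such emulations rely on, restated in scalar C for clarity (a hypothetical helper, not part of vec):

/* With a = ah*2^32 + al and b = bh*2^32 + bl, the low 64 bits of a*b are
 * al*bl + ((al*bh + ah*bl) << 32), with all arithmetic taken mod 2^64. */
#include <stdint.h>

static uint64_t mul64_lo_via_32(uint64_t a, uint64_t b)
{
	uint32_t al = (uint32_t)a, ah = (uint32_t)(a >> 32);
	uint32_t bl = (uint32_t)b, bh = (uint32_t)(b >> 32);

	uint64_t lo    = (uint64_t)al * bl;                     /* widening multiply */
	uint64_t cross = (uint64_t)al * bh + (uint64_t)ah * bl; /* cross products    */

	return lo + (cross << 32); /* wraps mod 2^64, matching 64-bit lane semantics */
}

In the NEON version above, vmlal_u32 supplies the widening al*bl term, while the remaining contribution is assembled from a 32-bit lane multiply, a pairwise add, and a 32-bit left shift.
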
--- a/src/impl/fallback.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,189 +0,0 @@ -#include "vec/impl/fallback.h" - -#include <string.h> - -// Fallback implementations - this is what an implementation should use if it -// doesn't support a specific function *and* the actual representation in -// memory is unknown or yields incorrect results from the generic functions. -// This is *extremely* unlikely; for x86 the layout is exactly the same in -// memory as the generic functions (i.e. it is literally stored as an array of -// integers). -// -// These functions can probably be removed if every current implementation is -// found to have the same - -#define VEC_FALLBACK_OPERATION(op, sign, csign, bits, size) \ - do { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ - v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \ - \ - for (int i = 0; i < size; i++) varr1[i] = (op); \ - \ - return v##sign##int##bits##x##size##_load_aligned(varr1); \ - } while (0) - -#define VEC_FALLBACK_CMP(op, sign, csign, bits, size) \ - VEC_FALLBACK_OPERATION((varr1[i] op varr2[i]) ? VEC_UINT##bits##_MAX : 0, sign, csign, bits, size) - -#define VEC_FALLBACK_SHIFT(op, sign, csign, bits, size) \ - do { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ - VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ - vuint##bits##x##size##_store_aligned(vec2, varr2); \ - \ - for (int i = 0; i < size; i++) varr1[i] = (op); \ - \ - return v##sign##int##bits##x##size##_load_aligned(varr1); \ - } while (0) - -#define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ - for (int i = 0; i < size; i++) arr[i] = x; \ - return v##sign##int##bits##x##size##_load_aligned(arr); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ - memcpy(arr, in, sizeof(vec_##sign##int##bits) * size); \ - return v##sign##int##bits##x##size##_load_aligned(arr); \ - } \ - \ - void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ - v##sign##int##bits##x##size##_store_aligned(vec, arr); \ - memcpy(out, arr, sizeof(vec_##sign##int##bits) * size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - 
VEC_FALLBACK_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION((varr1[i] + varr2[i] + 1) / 2, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \ - { \ - return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)VEC_UINT##bits##_MAX)); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(<, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(<=, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(==, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(>=, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(>, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_FALLBACK_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_FALLBACK_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_FALLBACK_SHIFT(vec_lrshift((vec_uint##bits)varr1[i], varr2[i]), sign, csign, bits, size); \ - } - -#define VEC_DEFINE_FALLBACK_OPERATIONS(bits, size) \ - VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( , , bits, size) \ - VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(u, U, bits, size) - -// 16-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 2) - -// 32-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 4) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 2) - -// 64-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 8) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 4) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 2) - -// 128-bit 
-VEC_DEFINE_FALLBACK_OPERATIONS(8, 16) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 8) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 4) -VEC_DEFINE_FALLBACK_OPERATIONS(64, 2) - -// 256-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 32) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 16) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 8) -VEC_DEFINE_FALLBACK_OPERATIONS(64, 4) - -// 512-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 64) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 32) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 16) -VEC_DEFINE_FALLBACK_OPERATIONS(64, 8) - -#undef VEC_FALLBACK_OPERATION -#undef VEC_FALLBACK_CMP -#undef VEC_FALLBACK_SHIFT -#undef VEC_DEFINE_FALLBACK_OPERATIONS -#undef VEC_DEFINE_FALLBACK_OPERATIONS_SIGN
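The fallback pattern removed above is easier to see without the macros: spill both operands to aligned arrays with the type's store function, evaluate the scalar expression lane by lane, and reload. A minimal sketch for a hypothetical 4-lane 32-bit unsigned type follows (plain arrays stand in for the library's aligned-array machinery); it shows the all-ones/zero mask convention used by the comparison fallbacks and the divide-by-zero guard used by the division fallback.

#include <stdint.h>

typedef struct { uint32_t lane[4]; } example_vuint32x4;  /* illustrative wrapper */

/* comparison fallback: each lane becomes all ones (UINT32_MAX) or zero */
static example_vuint32x4 example_cmplt(example_vuint32x4 a, example_vuint32x4 b)
{
    uint32_t arr1[4], arr2[4];
    example_vuint32x4 r;
    for (int i = 0; i < 4; i++) { arr1[i] = a.lane[i]; arr2[i] = b.lane[i]; } /* "store" */
    for (int i = 0; i < 4; i++) arr1[i] = (arr1[i] < arr2[i]) ? UINT32_MAX : 0;
    for (int i = 0; i < 4; i++) r.lane[i] = arr1[i];                          /* "load" */
    return r;
}

/* division fallback: a zero divisor yields zero instead of trapping */
static example_vuint32x4 example_div(example_vuint32x4 a, example_vuint32x4 b)
{
    example_vuint32x4 r;
    for (int i = 0; i < 4; i++)
        r.lane[i] = b.lane[i] ? (a.lane[i] / b.lane[i]) : 0;
    return r;
}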
--- a/src/impl/generic.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,460 +0,0 @@ -#include "vec/impl/generic.h" - -#include <string.h> - -// ----------------------------------------------------------------- - -#define VEC_GENERIC_OPERATION(op, sign, csign, bits, size) \ - do { \ - for (int i = 0; i < size; i++) \ - ((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] = (op); \ - \ - return vec1; \ - } while (0) - -#define VEC_GENERIC_BUILTIN_OPERATION(op, sign, csign, bits, size) \ - VEC_GENERIC_OPERATION(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] op ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i], sign, csign, bits, size) - -#define VEC_GENERIC_CMP(op, sign, csign, bits, size) \ - VEC_GENERIC_OPERATION((((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] op ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]) ? VEC_UINT##bits##_MAX : 0, sign, csign, bits, size) - -// TODO implement these so we don't waste stack space by doing the -// generics -#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ - union v##sign##int##bits##x##size##_impl_data { \ - v##sign##int##bits##x##size vec; \ - vec_##sign##int##bits impl[size]; \ - }; \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_splat(vec_##sign##int##bits x) \ - { \ - v##sign##int##bits##x##size vec; \ - for (int i = 0; i < size; i++) \ - ((union v##sign##int##bits##x##size##_impl_data *)&vec)->impl[i] = x; \ - return vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - memcpy(&vec, in, sizeof(vec_##sign##int##bits) * size); \ - return vec; \ - } \ - \ - void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - memcpy(out, &vec, sizeof(vec_##sign##int##bits) * size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_BUILTIN_OPERATION(+, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_BUILTIN_OPERATION(-, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_BUILTIN_OPERATION(*, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_OPERATION(((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i] ? 
(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] / ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]) : 0, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_OPERATION((((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i] + ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i] + 1) / 2, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_BUILTIN_OPERATION(&, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_BUILTIN_OPERATION(|, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_BUILTIN_OPERATION(^, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_not(v##sign##int##bits##x##size vec) \ - { \ - return v##sign##int##bits##x##size##_generic_xor(vec, v##sign##int##bits##x##size##_generic_splat((vec_##sign##int##bits)VEC_UINT##bits##_MAX)); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_CMP(<, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_CMP(<=, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_CMP(==, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_CMP(>=, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_GENERIC_CMP(>, sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_GENERIC_OPERATION(vec_##sign##lshift(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i], ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]), sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_GENERIC_OPERATION(vec_##sign##rshift(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i], ((union v##sign##int##bits##x##size##_impl_data *)&vec2)->impl[i]), sign, csign, bits, size); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_GENERIC_OPERATION(vec_lrshift((vec_uint##bits)(((union v##sign##int##bits##x##size##_impl_data *)&vec1)->impl[i]), ((union v##sign##int##bits##x##size##_impl_data 
*)&vec2)->impl[i]), sign, csign, bits, size); \ - } \ - \ - const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ - v##sign##int##bits##x##size##_generic_splat, \ - v##sign##int##bits##x##size##_generic_load_aligned, \ - v##sign##int##bits##x##size##_generic_load_aligned, \ - v##sign##int##bits##x##size##_generic_store_aligned, \ - v##sign##int##bits##x##size##_generic_store_aligned, \ - v##sign##int##bits##x##size##_generic_add, \ - v##sign##int##bits##x##size##_generic_sub, \ - v##sign##int##bits##x##size##_generic_mul, \ - v##sign##int##bits##x##size##_generic_div, \ - v##sign##int##bits##x##size##_generic_avg, \ - v##sign##int##bits##x##size##_generic_and, \ - v##sign##int##bits##x##size##_generic_or, \ - v##sign##int##bits##x##size##_generic_xor, \ - v##sign##int##bits##x##size##_generic_not, \ - v##sign##int##bits##x##size##_generic_lshift, \ - v##sign##int##bits##x##size##_generic_rshift, \ - v##sign##int##bits##x##size##_generic_lrshift, \ - v##sign##int##bits##x##size##_generic_cmplt, \ - v##sign##int##bits##x##size##_generic_cmple, \ - v##sign##int##bits##x##size##_generic_cmpeq, \ - v##sign##int##bits##x##size##_generic_cmpge, \ - v##sign##int##bits##x##size##_generic_cmpgt, \ - }; - -#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size) - -VEC_GENERIC_DEFINE_OPERATIONS(8, 2) -VEC_GENERIC_DEFINE_OPERATIONS(16, 2) -VEC_GENERIC_DEFINE_OPERATIONS(32, 2) -VEC_GENERIC_DEFINE_OPERATIONS(64, 2) - -#undef VEC_GENERIC_DEFINE_OPERATIONS -#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN - -// ----------------------------------------------------------------- -// now we can just keep doubling the same implementation - -#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \ - union v##sign##int##bits##x##size##_impl_data { \ - v##sign##int##bits##x##size vec; \ - v##sign##int##bits##x##halfsize impl[2]; \ - }; \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_splat(vec_##sign##int##bits x) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.impl[0] = v##sign##int##bits##x##halfsize##_splat(x); \ - vec.impl[1] = v##sign##int##bits##x##halfsize##_splat(x); \ - return vec.vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.impl[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \ - vec.impl[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \ - return vec.vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.impl[0] = v##sign##int##bits##x##halfsize##_load(in); \ - vec.impl[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \ - return vec.vec; \ - } \ - \ - void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vecd = (union v##sign##int##bits##x##size##_impl_data *)&vec; \ - \ - v##sign##int##bits##x##halfsize##_store_aligned(vecd->impl[0], out); \ - v##sign##int##bits##x##halfsize##_store_aligned(vecd->impl[1], out + halfsize); \ - } \ - \ - void v##sign##int##bits##x##size##_generic_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits 
out[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vecd = (union v##sign##int##bits##x##size##_impl_data *)&vec; \ - \ - v##sign##int##bits##x##halfsize##_store(vecd->impl[0], out); \ - v##sign##int##bits##x##halfsize##_store(vecd->impl[1], out + halfsize); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_add(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_add(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_sub(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_sub(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_mul(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_mul(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_div(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_div(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_avg(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_avg(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data 
*)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_and(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_and(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_or(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_or(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_xor(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_xor(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_not(v##sign##int##bits##x##size vec1) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_not(vec1d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_not(vec1d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union vuint##bits##x##size##_impl_data *vec2d = (union vuint##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_lshift(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_lshift(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union vuint##bits##x##size##_impl_data *vec2d = (union vuint##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_rshift(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_rshift(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union vuint##bits##x##size##_impl_data *vec2d = (union vuint##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = 
v##sign##int##bits##x##halfsize##_lrshift(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_lrshift(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmplt(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmplt(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmple(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmple(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmpeq(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmpeq(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmpge(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmpge(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->impl[0] = v##sign##int##bits##x##halfsize##_cmpgt(vec1d->impl[0], vec2d->impl[0]); \ - vec1d->impl[1] = v##sign##int##bits##x##halfsize##_cmpgt(vec1d->impl[1], vec2d->impl[1]); \ - \ - return vec1d->vec; \ - } \ - \ - const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ - v##sign##int##bits##x##size##_generic_splat, \ - v##sign##int##bits##x##size##_generic_load_aligned, \ - v##sign##int##bits##x##size##_generic_load, \ - v##sign##int##bits##x##size##_generic_store_aligned, 
\ - v##sign##int##bits##x##size##_generic_store, \ - v##sign##int##bits##x##size##_generic_add, \ - v##sign##int##bits##x##size##_generic_sub, \ - v##sign##int##bits##x##size##_generic_mul, \ - v##sign##int##bits##x##size##_generic_div, \ - v##sign##int##bits##x##size##_generic_avg, \ - v##sign##int##bits##x##size##_generic_and, \ - v##sign##int##bits##x##size##_generic_or, \ - v##sign##int##bits##x##size##_generic_xor, \ - v##sign##int##bits##x##size##_generic_not, \ - v##sign##int##bits##x##size##_generic_lshift, \ - v##sign##int##bits##x##size##_generic_rshift, \ - v##sign##int##bits##x##size##_generic_lrshift, \ - v##sign##int##bits##x##size##_generic_cmplt, \ - v##sign##int##bits##x##size##_generic_cmple, \ - v##sign##int##bits##x##size##_generic_cmpeq, \ - v##sign##int##bits##x##size##_generic_cmpge, \ - v##sign##int##bits##x##size##_generic_cmpgt, \ - }; - -#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size, halfsize) - -// 32-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 4, 2) - -// 64-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 8, 4) -VEC_GENERIC_DEFINE_OPERATIONS(16, 4, 2) - -// 128-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8) -VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4) -VEC_GENERIC_DEFINE_OPERATIONS(32, 4, 2) - -// 256-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 32, 16) -VEC_GENERIC_DEFINE_OPERATIONS(16, 16, 8) -VEC_GENERIC_DEFINE_OPERATIONS(32, 8, 4) -VEC_GENERIC_DEFINE_OPERATIONS(64, 4, 2) - -// 512-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 64, 32) -VEC_GENERIC_DEFINE_OPERATIONS(16, 32, 16) -VEC_GENERIC_DEFINE_OPERATIONS(32, 16, 8) -VEC_GENERIC_DEFINE_OPERATIONS(64, 8, 4) - -#undef VEC_GENERIC_DEFINE_OPERATIONS -#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN
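The second half of the removed generic implementation builds every wider vector out of two halves through a union, so one macro covers everything from 32-bit up to 512-bit widths; only the 2-lane base case uses scalar loops. A stand-alone sketch of the doubling idea, assuming a half-width type with its own add function (the names here are placeholders, not the generated ones):

#include <stdint.h>

typedef struct { uint32_t lane[2]; } half_vuint32x2;   /* half-width type */
typedef struct { uint32_t lane[4]; } full_vuint32x4;   /* doubled type, same total size */

static half_vuint32x2 half_add(half_vuint32x2 a, half_vuint32x2 b)
{
    for (int i = 0; i < 2; i++) a.lane[i] += b.lane[i];
    return a;
}

/* view the full vector as two halves and forward each operation to them */
union full_impl_data {
    full_vuint32x4 vec;
    half_vuint32x2 impl[2];
};

static full_vuint32x4 full_add(full_vuint32x4 vec1, full_vuint32x4 vec2)
{
    union full_impl_data *v1 = (union full_impl_data *)&vec1;
    union full_impl_data *v2 = (union full_impl_data *)&vec2;

    v1->impl[0] = half_add(v1->impl[0], v2->impl[0]);
    v1->impl[1] = half_add(v1->impl[1], v2->impl[1]);

    return v1->vec;
}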
--- a/src/impl/ppc/altivec.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include "vec/impl/ppc/altivec.h" - -#include <altivec.h> - -/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */ -#ifdef vec_mul -# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_mul(vec1.altivec, vec2.altivec); \ - return vec; \ - } -# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \ - v##sign##int##bits##x##size##_altivec_mul -#else -# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) -# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) NULL -#endif - -#ifdef vec_splats -# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(vec_##sign##int##bits x) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_splats(x); \ - return vec; \ - } -# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \ - v##sign##int##bits##x##size##_altivec_splat -#else -# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) -# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) NULL -#endif - -#define VEC_ALTIVEC_uRSHIFT vec_sr -#define VEC_ALTIVEC_RSHIFT vec_sra - -#define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_sr(vec1.altivec, vec2.altivec); \ - return vec; \ - } -#define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \ - v##sign##int##bits##x##size##_altivec_lrshift - -#define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size) -#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) NULL - -#define VEC_ALTIVEC_CAST_BOOL_8 (vector signed char) -#define VEC_ALTIVEC_CAST_BOOL_U8 (vector unsigned char) -#define VEC_ALTIVEC_CAST_BOOL_16 (vector signed short) -#define VEC_ALTIVEC_CAST_BOOL_U16 (vector unsigned short) -#define VEC_ALTIVEC_CAST_BOOL_32 (vector signed int) -#define VEC_ALTIVEC_CAST_BOOL_U32 (vector unsigned int) - -/* Since 
altivec conveniently made their API super user friendly, we can just use - * one giant macro to define literally everything */ -#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_ld(0, in); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_perm(vec_ld(0, in), vec_ld(15, in), vec_lvsl(0, in)); \ - return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - vec_st(vec.altivec, 0, out); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_add(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_sub(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_sl(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_avg(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_and(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_or(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = vec_xor(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmplt(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ 
- static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmplt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpeq(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_or(vec_cmpgt(vec1.altivec, vec2.altivec), vec_cmpeq(vec1.altivec, vec2.altivec)); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.altivec = VEC_ALTIVEC_CAST_BOOL_##csign##bits vec_cmpgt(vec1.altivec, vec2.altivec); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \ - VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size), \ - v##sign##int##bits##x##size##_altivec_load_aligned, \ - v##sign##int##bits##x##size##_altivec_load, \ - v##sign##int##bits##x##size##_altivec_store_aligned, \ - /* .store = */ NULL, \ - v##sign##int##bits##x##size##_altivec_add, \ - v##sign##int##bits##x##size##_altivec_sub, \ - VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size), \ - /* .div = */ NULL, \ - v##sign##int##bits##x##size##_altivec_avg, \ - v##sign##int##bits##x##size##_altivec_and, \ - v##sign##int##bits##x##size##_altivec_or, \ - v##sign##int##bits##x##size##_altivec_xor, \ - /* .not = */ NULL, \ - v##sign##int##bits##x##size##_altivec_lshift, \ - v##sign##int##bits##x##size##_altivec_rshift, \ - VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size), \ - v##sign##int##bits##x##size##_altivec_cmplt, \ - v##sign##int##bits##x##size##_altivec_cmple, \ - v##sign##int##bits##x##size##_altivec_cmpeq, \ - v##sign##int##bits##x##size##_altivec_cmpge, \ - v##sign##int##bits##x##size##_altivec_cmpgt, \ - }; - -#define VEC_DEFINE_OPERATIONS(bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) - -VEC_DEFINE_OPERATIONS(8, 16) -VEC_DEFINE_OPERATIONS(16, 8) -VEC_DEFINE_OPERATIONS(32, 4)
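Two AltiVec details in the removed file are worth a plain illustration: the unaligned load is synthesized from two aligned vec_ld loads blended by a vec_lvsl-selected permute, and cmple/cmpge are built as (cmplt OR cmpeq) and (cmpgt OR cmpeq) because classic AltiVec has no direct <=/>= compares. A hedged sketch for the signed 32-bit case, compiled with -maltivec; the wrapper type and names are illustrative.

#include <altivec.h>

typedef struct { vector signed int altivec; } example_vint32x4;

static example_vint32x4 example_load_unaligned(const signed int in[4])
{
    example_vint32x4 v;
    /* two aligned loads spanning the data, blended by the lvsl permute vector */
    v.altivec = vec_perm(vec_ld(0, in), vec_ld(15, in), vec_lvsl(0, in));
    return v;
}

static example_vint32x4 example_cmple(example_vint32x4 a, example_vint32x4 b)
{
    example_vint32x4 v;
    /* a <= b  ==  (a < b) || (a == b); the bool vector is cast back to signed */
    v.altivec = (vector signed int)vec_or(vec_cmplt(a.altivec, b.altivec),
                                          vec_cmpeq(a.altivec, b.altivec));
    return v;
}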
--- a/src/impl/x86/avx2.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,219 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include "vec/impl/x86/avx2.h" -#include "vec/impl/generic.h" - -#include <immintrin.h> - -// this does NOT handle sign bits properly, use with caution -#define VEC_AVX2_OPERATION_8x32_16x16(op, sign) \ - do { \ - union v##sign##int8x32_impl_data *vec1d = (union v##sign##int8x32_impl_data *)&vec1; \ - union v##sign##int8x32_impl_data *vec2d = (union v##sign##int8x32_impl_data *)&vec2; \ - \ - /* unpack and multiply */ \ - __m256i dst_even = _mm256_##op##_epi16(vec1d->avx2, vec2d->avx2); \ - __m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1d->avx2, 8), _mm256_srli_epi16(vec2d->avx2, 8)); \ - \ - /* repack */ \ - vec1d->avx2 = _mm256_or_si256( \ - _mm256_slli_epi16(dst_odd, 8), \ - _mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \ - ); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX2_OPERATION_16x16(op, sign) \ - do { \ - union v##sign##int16x16_impl_data *vec1d = (union v##sign##int16x16_impl_data *)&vec1; \ - union v##sign##int16x16_impl_data *vec2d = (union v##sign##int16x16_impl_data *)&vec2; \ - \ - /* unpack and multiply */ \ - __m256i dst_even = _mm256_##op##_epi32(vec1d->avx2, vec2d->avx2); \ - __m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1d->avx2, 16), _mm256_srli_epi32(vec2d->avx2, 16)); \ - \ - /* repack */ \ - vec1d->avx2 = _mm256_or_si256( \ - _mm256_slli_epi32(dst_odd, 16), \ - _mm256_srli_epi32(_mm256_slli_epi16(dst_even, 16), 16) \ - ); \ - return vec1d->vec; \ - } while (0) - -// multiplication - -#define VEC_AVX2_MUL_8x32(sign) \ - VEC_AVX2_OPERATION_8x32_16x16(mullo, sign) - -#define VEC_AVX2_MUL_16x16(sign) \ - do { \ - union v##sign##int16x16_impl_data *vec1d = (union v##sign##int16x16_impl_data *)&vec1; \ - union v##sign##int16x16_impl_data *vec2d = (union v##sign##int16x16_impl_data *)&vec2; \ - \ - vec1d->avx2 = _mm256_mullo_epi16(vec1d->avx2, vec2d->avx2); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX2_MUL_32x8(sign) \ - do { \ - union v##sign##int32x8_impl_data *vec1d = (union v##sign##int32x8_impl_data *)&vec1; \ - union v##sign##int32x8_impl_data *vec2d = (union v##sign##int32x8_impl_data *)&vec2; \ - \ - vec1d->avx2 = _mm256_mullo_epi32(vec1d->avx2, vec2d->avx2); \ - return vec1d->vec; \ - } while (0) - -#define 
VEC_AVX2_MUL_64x4(sign) \ - do { \ - union v##sign##int64x4_impl_data *vec1d = (union v##sign##int64x4_impl_data *)&vec1; \ - union v##sign##int64x4_impl_data *vec2d = (union v##sign##int64x4_impl_data *)&vec2; \ - \ - __m256i ac = _mm256_mul_epu32(vec1d->avx2, vec2d->avx2); \ - __m256i b = _mm256_srli_epi64(vec1d->avx2, 32); \ - __m256i bc = _mm256_mul_epu32(b, vec2d->avx2); \ - __m256i d = _mm256_srli_epi64(vec2d->avx2, 32); \ - __m256i ad = _mm256_mul_epu32(vec1d->avx2, d); \ - __m256i hi = _mm256_add_epi64(bc, ad); \ - hi = _mm256_slli_epi64(hi, 32); \ - \ - vec1d->avx2 = _mm256_add_epi64(hi, ac); \ - return vec1d->vec; \ - } while (0) - -// operations - -#define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - union v##sign##int##bits##x##size##_impl_data { \ - v##sign##int##bits##x##size vec; \ - __m256i avx2; \ - }; \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.avx2 = _mm256_load_si256((const __m256i *)in); \ - return vec.vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.avx2 = _mm256_loadu_si256((const __m256i *)in); \ - return vec.vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm256_store_si256((__m256i *)out, ((union v##sign##int##bits##x##size##_impl_data*)&vec)->avx2); \ - } \ - \ - static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm256_storeu_si256((__m256i *)out, ((union v##sign##int##bits##x##size##_impl_data*)&vec)->avx2); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx2 = _mm256_add_epi##bits(vec1d->avx2, vec2d->avx2); \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx2 = _mm256_sub_epi##bits(vec1d->avx2, vec2d->avx2); \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX2_MUL_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx2 = _mm256_and_si256(vec1d->avx2, vec2d->avx2); \ - return vec1d->vec; \ - } \ - \ - static 
v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx2 = _mm256_or_si256(vec1d->avx2, vec2d->avx2); \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx2 = _mm256_xor_si256(vec1d->avx2, vec2d->avx2); \ - return vec1d->vec; \ - } \ - \ - const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \ - v##sign##int##bits##x##size##_generic_splat, \ - v##sign##int##bits##x##size##_avx2_load_aligned, \ - v##sign##int##bits##x##size##_avx2_load, \ - v##sign##int##bits##x##size##_avx2_store_aligned, \ - v##sign##int##bits##x##size##_avx2_store, \ - v##sign##int##bits##x##size##_avx2_add, \ - v##sign##int##bits##x##size##_avx2_sub, \ - v##sign##int##bits##x##size##_avx2_mul, \ - v##sign##int##bits##x##size##_generic_div, \ - v##sign##int##bits##x##size##_generic_avg, \ - v##sign##int##bits##x##size##_avx2_and, \ - v##sign##int##bits##x##size##_avx2_or, \ - v##sign##int##bits##x##size##_avx2_xor, \ - v##sign##int##bits##x##size##_generic_not, \ - v##sign##int##bits##x##size##_generic_lshift, \ - v##sign##int##bits##x##size##_generic_rshift, \ - v##sign##int##bits##x##size##_generic_lrshift, \ - v##sign##int##bits##x##size##_generic_cmplt, \ - v##sign##int##bits##x##size##_generic_cmple, \ - v##sign##int##bits##x##size##_generic_cmpeq, \ - v##sign##int##bits##x##size##_generic_cmpge, \ - v##sign##int##bits##x##size##_generic_cmpgt, \ - }; - -#define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \ - VEC_AVX2_DEFINE_OPERATIONS_SIGN( , bits, size) \ - VEC_AVX2_DEFINE_OPERATIONS_SIGN(u, bits, size) - -VEC_AVX2_DEFINE_OPERATIONS(8, 32) -VEC_AVX2_DEFINE_OPERATIONS(16, 16) -VEC_AVX2_DEFINE_OPERATIONS(32, 8) -VEC_AVX2_DEFINE_OPERATIONS(64, 4)
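The 64-bit multiply removed above deserves spelling out: AVX2 has no 64x64-bit multiply, so the low 64 bits of each product are assembled from 32x32->64 partial products via _mm256_mul_epu32. A self-contained sketch of the same decomposition (the function name is illustrative); because only the low 64 bits are kept, it works for signed and unsigned inputs alike.

#include <immintrin.h>

/* low 64 bits of a*b per lane:
 * lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32) */
static __m256i mul_epi64_lo(__m256i a, __m256i b)
{
    __m256i ac = _mm256_mul_epu32(a, b);                          /* lo(a) * lo(b)        */
    __m256i bc = _mm256_mul_epu32(_mm256_srli_epi64(a, 32), b);   /* hi(a) * lo(b)        */
    __m256i ad = _mm256_mul_epu32(a, _mm256_srli_epi64(b, 32));   /* lo(a) * hi(b)        */
    __m256i hi = _mm256_slli_epi64(_mm256_add_epi64(bc, ad), 32); /* cross terms, shifted */
    return _mm256_add_epi64(ac, hi);
}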
--- a/src/impl/x86/avx512f.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,342 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include "vec/impl/x86/avx512f.h" -#include "vec/impl/generic.h" - -#include <immintrin.h> - -// this is a stupid amount of work just to do these operations, is it really worth it ? -// also same note in avx2.c applies here, these do not handle sign bits properly, which -// isn't that big of a deal for regular arithmetic operations, but matters quite a bit -// when doing things like arithmetic shifts. -#define VEC_AVX512F_OPERATION_8x64(op, sign) \ - do { \ - union v##sign##int8x64_impl_data *vec1d = (union v##sign##int8x64_impl_data *)&vec1; \ - union v##sign##int8x64_impl_data *vec2d = (union v##sign##int8x64_impl_data *)&vec2; \ - \ - /* unpack and operate */ \ - __m512i dst_1 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 24), 24), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 24), 24)); \ - __m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 16), 24), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 16), 24)); \ - __m512i dst_3 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 8), 24), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 8), 24)); \ - __m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1d->avx512f, 24), _mm512_srli_epi32(vec2d->avx512f, 24)); \ - \ - /* repack */ \ - vec1d->avx512f = _mm512_or_si512( \ - _mm512_or_si512( \ - _mm512_srli_epi32(_mm512_slli_epi32(dst_1, 24), 24), \ - _mm512_srli_epi32(_mm512_slli_epi32(dst_2, 24), 16) \ - ), \ - _mm512_or_si512( \ - _mm512_srli_epi32(_mm512_slli_epi32(dst_3, 24), 8), \ - _mm512_slli_epi32(dst_4, 24) \ - ) \ - ); \ - \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_OPERATION_16x32(op, sign) \ - do { \ - union v##sign##int16x32_impl_data *vec1d = (union v##sign##int16x32_impl_data *)&vec1; \ - union v##sign##int16x32_impl_data *vec2d = (union v##sign##int16x32_impl_data *)&vec2; \ - \ - /* unpack and operate; it would be nice if we had an _m512_andi_epi32... 
*/ \ - __m512i dst_1 = _mm512_##op##_epi32(_mm512_srli_epi32(_mm512_slli_epi32(vec1d->avx512f, 16), 16), _mm512_srli_epi32(_mm512_slli_epi32(vec2d->avx512f, 16), 16)); \ - __m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1d->avx512f, 16), _mm512_srli_epi32(vec2d->avx512f, 16)); \ - \ - /* repack */ \ - vec1d->avx512f = _mm512_or_si512( \ - _mm512_srli_epi32(_mm512_slli_epi32(dst_1, 16), 16), \ - _mm512_slli_epi32(dst_2, 16) \ - ); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_ADD_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(add, sign) - -#define VEC_AVX512F_ADD_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(add, sign) - -#define VEC_AVX512F_ADD_32x16(sign) \ - do { \ - union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \ - union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_add_epi32(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_ADD_64x8(sign) \ - do { \ - union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \ - union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_add_epi64(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_SUB_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(sub, sign) - -#define VEC_AVX512F_SUB_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(sub, sign) - -#define VEC_AVX512F_SUB_32x16(sign) \ - do { \ - union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \ - union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_sub_epi32(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_SUB_64x8(sign) \ - do { \ - union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \ - union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_sub_epi64(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_MUL_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(mullo, sign) - -#define VEC_AVX512F_MUL_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(mullo, sign) - -#define VEC_AVX512F_MUL_32x16(sign) \ - do { \ - union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \ - union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_mullo_epi32(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_MUL_64x8(sign) \ - do { \ - union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \ - union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \ - \ - __m512i ac = _mm512_mul_epu32(vec1d->avx512f, vec2d->avx512f); \ - __m512i b = _mm512_srli_epi64(vec1d->avx512f, 32); \ - __m512i bc = _mm512_mul_epu32(b, vec2d->avx512f); \ - __m512i d = _mm512_srli_epi64(vec2d->avx512f, 32); \ - __m512i ad = _mm512_mul_epu32(vec1d->avx512f, d); \ - __m512i hi = _mm512_add_epi64(bc, ad); \ - hi = _mm512_slli_epi64(hi, 32); \ - \ - vec1d->avx512f = _mm512_add_epi64(hi, ac); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_LSHIFT_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(sllv, sign) - -#define VEC_AVX512F_LSHIFT_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(sllv, sign) - -#define 
VEC_AVX512F_LSHIFT_32x16(sign) \ - do { \ - union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \ - union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_sllv_epi32(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_LSHIFT_64x8(sign) \ - do { \ - union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \ - union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_sllv_epi64(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_lRSHIFT_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(srlv, sign) - -#define VEC_AVX512F_lRSHIFT_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(srlv, sign) - -#define VEC_AVX512F_aRSHIFT_8x64(sign) \ - do { \ - return v##sign##int8x64_generic_rshift(vec1, vec2); \ - } while (0) - -#define VEC_AVX512F_aRSHIFT_16x32(sign) \ - do { \ - return v##sign##int16x32_generic_rshift(vec1, vec2); \ - } while (0) - -#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) VEC_AVX512F_##aORl##RSHIFT_8x64(sign) -#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) VEC_AVX512F_##aORl##RSHIFT_16x32(sign) - -#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \ - do { \ - union v##sign##int32x16_impl_data *vec1d = (union v##sign##int32x16_impl_data *)&vec1; \ - union v##sign##int32x16_impl_data *vec2d = (union v##sign##int32x16_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_sr##aORl##v_epi32(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \ - do { \ - union v##sign##int64x8_impl_data *vec1d = (union v##sign##int64x8_impl_data *)&vec1; \ - union v##sign##int64x8_impl_data *vec2d = (union v##sign##int64x8_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_sr##aORl##v_epi64(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } while (0) - -#define VEC_AVX512F_uRSHIFT_8x64(sign, aORl) VEC_AVX512F_RSHIFT_8x64(sign, l) -#define VEC_AVX512F_uRSHIFT_16x32(sign, aORl) VEC_AVX512F_RSHIFT_16x32(sign, l) -#define VEC_AVX512F_uRSHIFT_32x16(sign, aORl) VEC_AVX512F_RSHIFT_32x16(sign, l) -#define VEC_AVX512F_uRSHIFT_64x8(sign, aORl) VEC_AVX512F_RSHIFT_64x8(sign, l) - -#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - union v##sign##int##bits##x##size##_impl_data { \ - v##sign##int##bits##x##size vec; \ - __m512i avx512f; \ - }; \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.avx512f = _mm512_load_si512((const __m512i *)in); \ - return vec.vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.avx512f = _mm512_loadu_si512((const __m512i *)in); \ - return vec.vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm512_store_si512((__m512i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->avx512f); \ - } \ - \ - static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm512_storeu_si512((__m512i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->avx512f); \ - } \ - \ - 
static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX512F_ADD_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX512F_SUB_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX512F_MUL_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_and_si512(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_or_si512(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->avx512f = _mm512_xor_si512(vec1d->avx512f, vec2d->avx512f); \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_AVX512F_LSHIFT_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_AVX512F_##sign##RSHIFT_##bits##x##size(sign, a); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \ - } \ - \ - const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \ - v##sign##int##bits##x##size##_generic_splat, \ - v##sign##int##bits##x##size##_avx512f_load_aligned, \ - v##sign##int##bits##x##size##_avx512f_load, \ - v##sign##int##bits##x##size##_avx512f_store_aligned, \ - v##sign##int##bits##x##size##_avx512f_store, \ - v##sign##int##bits##x##size##_avx512f_add, \ - v##sign##int##bits##x##size##_avx512f_sub, \ - v##sign##int##bits##x##size##_avx512f_mul, \ - v##sign##int##bits##x##size##_generic_div, \ - v##sign##int##bits##x##size##_generic_avg, \ - v##sign##int##bits##x##size##_avx512f_and, \ - v##sign##int##bits##x##size##_avx512f_or, \ - v##sign##int##bits##x##size##_avx512f_xor, \ - v##sign##int##bits##x##size##_generic_not, \ - 
v##sign##int##bits##x##size##_avx512f_lshift, \ - v##sign##int##bits##x##size##_avx512f_rshift, \ - v##sign##int##bits##x##size##_avx512f_lrshift, \ - v##sign##int##bits##x##size##_generic_cmplt, \ - v##sign##int##bits##x##size##_generic_cmple, \ - v##sign##int##bits##x##size##_generic_cmpeq, \ - v##sign##int##bits##x##size##_generic_cmpge, \ - v##sign##int##bits##x##size##_generic_cmpgt, \ - }; - -#define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \ - VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size) \ - VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size) - -VEC_AVX512F_DEFINE_OPERATIONS(8, 64) -VEC_AVX512F_DEFINE_OPERATIONS(16, 32) -VEC_AVX512F_DEFINE_OPERATIONS(32, 16) -VEC_AVX512F_DEFINE_OPERATIONS(64, 8)
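The AVX-512F code removed above emulates 16-bit element arithmetic with 32-bit instructions, since 16-bit (and 8-bit) vector operations such as _mm512_mullo_epi16 belong to AVX-512BW rather than plain AVX-512F. A minimal self-contained sketch of the same split, operate and repack idea used by VEC_AVX512F_OPERATION_16x32 follows; the function name mullo_epi16_avx512f_only is made up for illustration and is not part of vec:

#include <immintrin.h>

/* Multiply 32 lanes of 16-bit integers using only AVX-512F instructions.
 * Each 32-bit lane holds two 16-bit elements: the low halves are isolated
 * with a shift-left/shift-right pair, the high halves with a single right
 * shift, and the two 32-bit products are merged back afterwards. */
static __m512i mullo_epi16_avx512f_only(__m512i a, __m512i b)
{
    __m512i lo = _mm512_mullo_epi32(
        _mm512_srli_epi32(_mm512_slli_epi32(a, 16), 16),
        _mm512_srli_epi32(_mm512_slli_epi32(b, 16), 16));
    __m512i hi = _mm512_mullo_epi32(
        _mm512_srli_epi32(a, 16),
        _mm512_srli_epi32(b, 16));

    /* keep the low 16 bits of each low-half product, put the high-half
     * products back into the upper halves of their lanes */
    return _mm512_or_si512(
        _mm512_srli_epi32(_mm512_slli_epi32(lo, 16), 16),
        _mm512_slli_epi32(hi, 16));
}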
--- a/src/impl/x86/mmx.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,172 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include "vec/vec.h" -#include "vec/impl/x86/mmx.h" -#include "vec/impl/generic.h" - -#include <mmintrin.h> -#include <string.h> - -#define VEC_MMX_OPERATION_8x8(op, sign) \ - do { \ - /* unpack and multiply */ \ - union v##sign##int8x8_impl_data *vec1d = (union v##sign##int8x8_impl_data *)&vec1; \ - union v##sign##int8x8_impl_data *vec2d = (union v##sign##int8x8_impl_data *)&vec2; \ - \ - __m64 dst_even = _mm_##op##_pi16(vec1d->mmx, vec2d->mmx); \ - __m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1d->mmx, 8), _mm_srli_pi16(vec2d->mmx, 8)); \ - \ - /* repack */ \ - vec1d->mmx = _mm_or_si64( \ - _mm_slli_pi16(dst_odd, 8), \ - _mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \ - ); \ - return vec1d->vec; \ - } while (0) - -// shared between MMX variations -#define VEC_MMX_MUL_8x8(sign) \ - VEC_MMX_OPERATION_8x8(mullo, sign) - -#define VEC_MMX_MUL_16x4(sign) \ - do { \ - union v##sign##int16x4_impl_data *vec1d = (union v##sign##int16x4_impl_data *)&vec1; \ - union vuint16x4_impl_data *vec2d = (union vuint16x4_impl_data *)&vec2; \ - \ - vec1d->mmx = _mm_mullo_pi16(vec1d->mmx, vec2d->mmx); \ - return vec1d->vec; \ - } while (0) - -#define VEC_MMX_MUL_32x2(sign) \ - /* TODO implement this for real */ \ - do { \ - return v##sign##int32x2_generic_mul(vec1, vec2); \ - } while (0) - -#define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - union v##sign##int##bits##x##size##_impl_data { \ - v##sign##int##bits##x##size vec; \ - __m64 mmx; \ - }; \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - memcpy(&vec, in, sizeof(vec)); \ - return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - memcpy(out, &vec, sizeof(vec)); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data 
*)&vec2; \ - \ - vec1d->mmx = _mm_add_pi##bits(vec1d->mmx, vec2d->mmx); \ - \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->mmx = _mm_sub_pi##bits(vec1d->mmx, vec2d->mmx); \ - \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_MMX_MUL_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->mmx = _mm_and_si64(vec1d->mmx, vec2d->mmx); \ - \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->mmx = _mm_or_si64(vec1d->mmx, vec2d->mmx); \ - \ - return vec1d->vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->mmx = _mm_xor_si64(vec1d->mmx, vec2d->mmx); \ - \ - return vec1d->vec; \ - } \ - \ - const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \ - v##sign##int##bits##x##size##_generic_splat, \ - v##sign##int##bits##x##size##_mmx_load_aligned, \ - v##sign##int##bits##x##size##_mmx_load_aligned, \ - v##sign##int##bits##x##size##_mmx_store_aligned, \ - v##sign##int##bits##x##size##_mmx_store_aligned, \ - v##sign##int##bits##x##size##_mmx_add, \ - v##sign##int##bits##x##size##_mmx_sub, \ - v##sign##int##bits##x##size##_mmx_mul, \ - v##sign##int##bits##x##size##_generic_div, \ - v##sign##int##bits##x##size##_generic_avg, \ - v##sign##int##bits##x##size##_mmx_and, \ - v##sign##int##bits##x##size##_mmx_or, \ - v##sign##int##bits##x##size##_mmx_xor, \ - v##sign##int##bits##x##size##_generic_not, \ - v##sign##int##bits##x##size##_generic_lshift, \ - v##sign##int##bits##x##size##_generic_rshift, \ - v##sign##int##bits##x##size##_generic_lrshift, \ - v##sign##int##bits##x##size##_generic_cmplt, \ - v##sign##int##bits##x##size##_generic_cmple, \ - v##sign##int##bits##x##size##_generic_cmpeq, \ - v##sign##int##bits##x##size##_generic_cmpge, \ - v##sign##int##bits##x##size##_generic_cmpgt, \ - }; - -#define VEC_MMX_DEFINE_OPERATIONS(bits, size) \ - VEC_MMX_DEFINE_OPERATIONS_SIGN(u, bits, size) \ - VEC_MMX_DEFINE_OPERATIONS_SIGN( , bits, size) - -VEC_MMX_DEFINE_OPERATIONS(8, 8) 
-VEC_MMX_DEFINE_OPERATIONS(16, 4) -VEC_MMX_DEFINE_OPERATIONS(32, 2)
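MMX has no 8-bit multiply either, so the removed VEC_MMX_OPERATION_8x8 applies the same trick one level down: two 16-bit multiplies over the even and odd bytes, then a repack. A rough stand-alone equivalent is sketched below; the name mullo_pi8_mmx is hypothetical, and production code would also want _mm_empty() before handing the FPU state back:

#include <mmintrin.h>

/* Multiply 8 lanes of 8-bit integers with MMX, which only has a 16-bit
 * multiply: the even bytes are multiplied in place (their products land in
 * the low byte of each 16-bit lane), the odd bytes are shifted down,
 * multiplied, and shifted back up, and the two halves are OR-ed together. */
static __m64 mullo_pi8_mmx(__m64 a, __m64 b)
{
    __m64 even = _mm_mullo_pi16(a, b);
    __m64 odd  = _mm_mullo_pi16(_mm_srli_pi16(a, 8), _mm_srli_pi16(b, 8));

    return _mm_or_si64(
        _mm_slli_pi16(odd, 8),
        _mm_srli_pi16(_mm_slli_pi16(even, 8), 8));
}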
--- a/src/impl/x86/sse2.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,263 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include "vec/impl/x86/sse2.h" -#include "vec/impl/generic.h" - -#include <emmintrin.h> - -#define VEC_SSE2_OPERATION_8x16(op, sign) \ - do { \ - /* unpack and multiply */ \ - union v##sign##int8x16_impl_data *vec1d = (union v##sign##int8x16_impl_data *)&vec1; \ - union v##sign##int8x16_impl_data *vec2d = (union v##sign##int8x16_impl_data *)&vec2; \ - \ - __m128i dst_even = _mm_##op##_epi16(vec1d->sse, vec2d->sse); \ - __m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1d->sse, 8), _mm_srli_epi16(vec2d->sse, 8)); \ - \ - /* repack */ \ - vec1d->sse = _mm_or_si128( \ - _mm_slli_epi16(dst_odd, 8), \ - _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ - ); \ - return vec1d->vec; \ - } while (0) - -// shared between SSE2 variations -#define VEC_SSE2_MUL_8x16(sign) \ - VEC_SSE2_OPERATION_8x16(mullo, sign) - -#define VEC_SSE2_MUL_16x8(sign) \ - do { \ - /* we have a real instruction for this */ \ - union v##sign##int16x8_impl_data *vec1d = (union v##sign##int16x8_impl_data *)&vec1; \ - union v##sign##int16x8_impl_data *vec2d = (union v##sign##int16x8_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_mullo_epi16(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } while (0) - -#define VEC_SSE2_MUL_32x4(sign) \ - do { \ - /* this was stolen from... 
somewhere :) */ \ - union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \ - union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \ - \ - __m128i a13 = _mm_shuffle_epi32(vec1d->sse, 0xF5); /* (-,a3,-,a1) */ \ - __m128i b13 = _mm_shuffle_epi32(vec2d->sse, 0xF5); /* (-,b3,-,b1) */ \ - __m128i prod02 = _mm_mul_epu32(vec1d->sse, vec2d->sse); /* (-,a2*b2,-,a0*b0) */ \ - __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ - __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ - __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ - \ - vec1d->sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \ - return vec1d->vec; \ - } while (0) - -#define VEC_SSE2_MUL_64x2(sign) \ - do { \ - union v##sign##int64x2_impl_data *vec1d = (union v##sign##int64x2_impl_data *)&vec1; \ - union v##sign##int64x2_impl_data *vec2d = (union v##sign##int64x2_impl_data *)&vec2; \ - \ - __m128i ac = _mm_mul_epu32(vec1d->sse, vec2d->sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \ - __m128i b = _mm_srli_epi64(vec1d->sse, 32); /* b = vec1 >> 32; */ \ - __m128i bc = _mm_mul_epu32(b, vec2d->sse); /* bc = b * (vec2 & UINT32_MAX); */ \ - __m128i d = _mm_srli_epi64(vec2d->sse, 32); /* d = vec2 >> 32; */ \ - __m128i ad = _mm_mul_epu32(vec1d->sse, d); /* ad = (vec1 & UINT32_MAX) * d; */ \ - __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \ - hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \ - \ - vec1d->sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \ - return vec1d->vec; \ - } while (0) - -#define VEC_SSE2_CMPEQ_8x16(sign) \ - do { \ - union v##sign##int8x16_impl_data *vec1d = (union v##sign##int8x16_impl_data *)&vec1; \ - union v##sign##int8x16_impl_data *vec2d = (union v##sign##int8x16_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_cmpeq_epi8(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } while (0) - -#define VEC_SSE2_CMPEQ_16x8(sign) \ - do { \ - union v##sign##int16x8_impl_data *vec1d = (union v##sign##int16x8_impl_data *)&vec1; \ - union v##sign##int16x8_impl_data *vec2d = (union v##sign##int16x8_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_cmpeq_epi16(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } while (0) - -#define VEC_SSE2_CMPEQ_32x4(sign) \ - do { \ - union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \ - union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_cmpeq_epi32(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } while (0) - -// SSE2 doesn't have an intrinsic for 64x2 equality comparison, -// so how can we take a 32x4 comparison result and turn it into -// a 64x2 comparison result? -// -// well, Intel conveniently provided an operation where we can -// shuffle around 32-bit integers (_mm_shuffle_epi32). -// -// this means all we have to do is simply do the 32-bit operation, -// shuffle the parts, and then return a bitwise AND of the result. 
- -#define VEC_SSE2_CMPEQ_64x2(sign) \ - do { \ - union v##sign##int64x2_impl_data *vec1d = (union v##sign##int64x2_impl_data *)&vec1; \ - union v##sign##int64x2_impl_data *vec2d = (union v##sign##int64x2_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_cmpeq_epi32(vec1d->sse, vec2d->sse); \ - vec2d->sse = _mm_shuffle_epi32(vec1d->sse, _MM_SHUFFLE(1, 1, 3, 3)); \ - vec1d->sse = _mm_shuffle_epi32(vec1d->sse, _MM_SHUFFLE(0, 0, 2, 2)); \ - vec1d->sse = _mm_and_si128(vec1d->sse, vec2d->sse); \ - \ - return vec1d->vec; \ - } while (0) - -#define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - union v##sign##int##bits##x##size##_impl_data { \ - v##sign##int##bits##x##size vec; \ - __m128i sse; \ - }; \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.sse = _mm_load_si128((const __m128i *)in); \ - return vec.vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \ - { \ - union v##sign##int##bits##x##size##_impl_data vec; \ - vec.sse = _mm_loadu_si128((const __m128i *)in); \ - return vec.vec; \ - } \ - \ - void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm_store_si128((__m128i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->sse); \ - } \ - \ - void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm_storeu_si128((__m128i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->sse); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_add_epi##bits(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_sub_epi##bits(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_SSE2_MUL_##bits##x##size(sign); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_and_si128(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data 
*)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_or_si128(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \ - union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_xor_si128(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_SSE2_CMPEQ_##bits##x##size(sign); \ - } \ - \ - const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \ - v##sign##int##bits##x##size##_generic_splat, \ - v##sign##int##bits##x##size##_sse2_load_aligned, \ - v##sign##int##bits##x##size##_sse2_load, \ - v##sign##int##bits##x##size##_sse2_store_aligned, \ - v##sign##int##bits##x##size##_sse2_store, \ - v##sign##int##bits##x##size##_sse2_add, \ - v##sign##int##bits##x##size##_sse2_sub, \ - v##sign##int##bits##x##size##_sse2_mul, \ - v##sign##int##bits##x##size##_generic_div, \ - v##sign##int##bits##x##size##_generic_avg, \ - v##sign##int##bits##x##size##_sse2_and, \ - v##sign##int##bits##x##size##_sse2_or, \ - v##sign##int##bits##x##size##_sse2_xor, \ - v##sign##int##bits##x##size##_generic_not, \ - v##sign##int##bits##x##size##_generic_lshift, \ - v##sign##int##bits##x##size##_generic_rshift, \ - v##sign##int##bits##x##size##_generic_lrshift, \ - v##sign##int##bits##x##size##_generic_cmplt, \ - v##sign##int##bits##x##size##_generic_cmple, \ - v##sign##int##bits##x##size##_sse2_cmpeq, \ - v##sign##int##bits##x##size##_generic_cmpge, \ - v##sign##int##bits##x##size##_generic_cmpgt, \ - }; - -#define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \ - VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size) \ - VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size) - -// SSE is *only* 128-bit -VEC_SSE2_DEFINE_OPERATIONS(8, 16) -VEC_SSE2_DEFINE_OPERATIONS(16, 8) -VEC_SSE2_DEFINE_OPERATIONS(32, 4) -VEC_SSE2_DEFINE_OPERATIONS(64, 2)
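The comment in the removed sse2.c spells out the 64-bit equality trick: do the comparison as 32-bit lanes, then combine each pair of 32-bit results so that a 64-bit lane is all-ones only when both of its halves matched. A compact way to express that pairing as a stand-alone helper is sketched below (cmpeq_epi64_sse2 is a hypothetical name and the shuffle constant differs from the macro above); the important detail is that each 32-bit result gets ANDed with its partner inside the same 64-bit element:

#include <emmintrin.h>

/* SSE2 has no _mm_cmpeq_epi64; build it from the 32-bit comparison.
 * Swapping the 32-bit halves of each 64-bit element and AND-ing gives
 * all-ones in a 64-bit lane exactly when both halves compared equal. */
static __m128i cmpeq_epi64_sse2(__m128i a, __m128i b)
{
    __m128i cmp32   = _mm_cmpeq_epi32(a, b);
    __m128i swapped = _mm_shuffle_epi32(cmp32, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_and_si128(cmp32, swapped);
}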
--- a/src/impl/x86/sse41.c Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include "vec/impl/x86/sse41.h" -#include "vec/impl/x86/sse2.h" -#include "vec/impl/generic.h" - -#include <immintrin.h> - -// SSE 4.1 provides a real _mm_mullo_epi32 -#define VEC_SSE41_DEFINE_OPERATIONS(sign) \ - union v##sign##int32x4_impl_data { \ - v##sign##int32x4 vec; \ - __m128i sse; \ - }; \ - \ - static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \ - { \ - union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \ - union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \ - \ - vec1d->sse = _mm_mullo_epi32(vec1d->sse, vec2d->sse); \ - return vec1d->vec; \ - } \ - \ - const v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \ - v##sign##int32x4_generic_splat, \ - v##sign##int32x4_sse2_load_aligned, \ - v##sign##int32x4_sse2_load, \ - v##sign##int32x4_sse2_store_aligned, \ - v##sign##int32x4_sse2_store, \ - v##sign##int32x4_sse2_add, \ - v##sign##int32x4_sse2_sub, \ - v##sign##int32x4_sse41_mul, \ - v##sign##int32x4_generic_div, \ - v##sign##int32x4_generic_avg, \ - v##sign##int32x4_sse2_and, \ - v##sign##int32x4_sse2_or, \ - v##sign##int32x4_sse2_xor, \ - v##sign##int32x4_generic_not, \ - v##sign##int32x4_generic_lshift, \ - v##sign##int32x4_generic_rshift, \ - v##sign##int32x4_generic_lrshift, \ - v##sign##int32x4_generic_cmplt, \ - v##sign##int32x4_generic_cmple, \ - v##sign##int32x4_sse2_cmpeq, \ - v##sign##int32x4_generic_cmpge, \ - v##sign##int32x4_generic_cmpgt, \ - }; - -VEC_SSE41_DEFINE_OPERATIONS() -VEC_SSE41_DEFINE_OPERATIONS(u)
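The removed sse41.c exists for a single reason: SSE4.1 provides a real _mm_mullo_epi32, so only mul is overridden and every other slot in the table reuses the SSE2 routines. For contrast, a commonly used SSE2-only fallback for the same operation is sketched below; the helper names are hypothetical, this is not the library's exact packing, and building the second function needs -msse4.1 (or the MSVC equivalent):

#include <emmintrin.h>   /* SSE2 */
#include <smmintrin.h>   /* SSE4.1, for _mm_mullo_epi32 */

/* SSE2-only low 32-bit multiply: two widening 32x32->64 multiplies cover
 * lanes 0/2 and 1/3, then a shuffle gathers the low half of each product. */
static __m128i mullo_epi32_sse2(__m128i a, __m128i b)
{
    __m128i prod02 = _mm_mul_epu32(a, b);
    __m128i prod13 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    return _mm_unpacklo_epi32(
        _mm_shuffle_epi32(prod02, _MM_SHUFFLE(0, 0, 2, 0)),
        _mm_shuffle_epi32(prod13, _MM_SHUFFLE(0, 0, 2, 0)));
}

/* With SSE4.1 the whole thing is a single instruction. */
static __m128i mullo_epi32_sse41(__m128i a, __m128i b)
{
    return _mm_mullo_epi32(a, b);
}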
--- a/src/vec.c Fri Apr 25 17:40:51 2025 -0400 +++ b/src/vec.c Fri Apr 25 17:40:55 2025 -0400 @@ -1,286 +1,2 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - +#define VEC_IMPLEMENTATION #include "vec/vec.h" -#include "vec/cpu.h" -#include "vec/impl/generic.h" -#include "vec/impl/fallback.h" -#ifdef VEC_COMPILER_HAS_MMX -# include "vec/impl/x86/mmx.h" -#endif -#ifdef VEC_COMPILER_HAS_SSE2 -# include "vec/impl/x86/sse2.h" -#endif -#ifdef VEC_COMPILER_HAS_SSE41 -# include "vec/impl/x86/sse41.h" -#endif -#ifdef VEC_COMPILER_HAS_AVX2 -# include "vec/impl/x86/avx2.h" -#endif -#ifdef VEC_COMPILER_HAS_AVX512F -# include "vec/impl/x86/avx512f.h" -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC -# include "vec/impl/ppc/altivec.h" -#endif -#ifdef VEC_COMPILER_HAS_NEON -# include "vec/impl/arm/neon.h" -#endif - -extern inline vec_uintmax vec_lrshift(vec_uintmax x, unsigned int y); -extern inline vec_uintmax vec_llshift(vec_uintmax x, unsigned int y); -extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y); -extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y); -extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y); -extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y); - -// 16-bit -const vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; -const vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; - -// 32-bit -const vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; -const vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; -const vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; -const vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; - -// 64-bit -const vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; -const vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; -const vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; -const vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; -const vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; -const vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; - -// 128-bit -const vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; -const vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; -const vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; -const vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; -const vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; -const 
vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; -const vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; -const vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; - -// 256-bit -const vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; -const vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; -const vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; -const vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; -const vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; -const vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; -const vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; -const vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; - -// 512-bit -const vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; -const vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; -const vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; -const vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; -const vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; -const vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; -const vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; -const vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; - -static int vec_init_spinner = 0; - -// returns 0 or a negative error code on failure -int vec_init(void) -{ - // This function is NOT thread safe. However, once vec - // is initialized, all of the vector functions are thread-safe. - // - // In fact, it's possible to use vec without calling - // vec_init() at all, but it would be completely useless since - // it would just use a generic implementation without any - // vectorization whatsoever (unless maybe the compiler is - // smart enough to optimize it into vectors) - - if (vec_init_spinner) - return 0; // already initialized, do nothing - - vec_uint32 cpu = vec_get_CPU_features(); - -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (cpu & VEC_CPU_HAS_ALTIVEC) { - vint8x16_impl_cpu = &vint8x16_impl_altivec; - vuint8x16_impl_cpu = &vuint8x16_impl_altivec; - vint16x8_impl_cpu = &vint16x8_impl_altivec; - vuint16x8_impl_cpu = &vuint16x8_impl_altivec; - vint32x4_impl_cpu = &vint32x4_impl_altivec; - vuint32x4_impl_cpu = &vuint32x4_impl_altivec; -#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX - if (cpu & VEC_CPU_HAS_ALTIVEC_VSX) { - vint64x2_impl_cpu = &vint64x2_impl_altivec; - vuint64x2_impl_cpu = &vuint64x2_impl_altivec; - } -#endif - } -#endif -#ifdef VEC_COMPILER_HAS_AVX512F - if (cpu & VEC_CPU_HAS_AVX512F) { - vint8x64_impl_cpu = &vint8x64_impl_avx512f; - vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; - vint16x32_impl_cpu = &vint16x32_impl_avx512f; - vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; - vint32x16_impl_cpu = &vint32x16_impl_avx512f; - vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; - vint64x8_impl_cpu = &vint64x8_impl_avx512f; - vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; - } -#endif -#ifdef VEC_COMPILER_HAS_AVX2 - if (cpu & VEC_CPU_HAS_AVX2) { - vint8x32_impl_cpu = &vint8x32_impl_avx2; - vuint8x32_impl_cpu = &vuint8x32_impl_avx2; - vint16x16_impl_cpu = &vint16x16_impl_avx2; - vuint16x16_impl_cpu = &vuint16x16_impl_avx2; - vint32x8_impl_cpu = &vint32x8_impl_avx2; - vuint32x8_impl_cpu = &vuint32x8_impl_avx2; - vint64x4_impl_cpu = &vint64x4_impl_avx2; - vuint64x4_impl_cpu = &vuint64x4_impl_avx2; - } -#endif -#ifdef VEC_COMPILER_HAS_SSE2 - if (cpu & VEC_CPU_HAS_SSE2) { - vint8x16_impl_cpu = &vint8x16_impl_sse2; - vuint8x16_impl_cpu = 
&vuint8x16_impl_sse2; - vint16x8_impl_cpu = &vint16x8_impl_sse2; - vuint16x8_impl_cpu = &vuint16x8_impl_sse2; -# ifdef VEC_COMPILER_HAS_SSE41 - if (cpu & VEC_CPU_HAS_SSE41) { - vint32x4_impl_cpu = &vint32x4_impl_sse41; - vuint32x4_impl_cpu = &vuint32x4_impl_sse41; - } else -# endif - { - vint32x4_impl_cpu = &vint32x4_impl_sse2; - vuint32x4_impl_cpu = &vuint32x4_impl_sse2; - } - vint64x2_impl_cpu = &vint64x2_impl_sse2; - vuint64x2_impl_cpu = &vuint64x2_impl_sse2; - } -#endif -#ifdef VEC_COMPILER_HAS_MMX - if (cpu & VEC_CPU_HAS_MMX) { - vint8x8_impl_cpu = &vint8x8_impl_mmx; - vuint8x8_impl_cpu = &vuint8x8_impl_mmx; - vint16x4_impl_cpu = &vint16x4_impl_mmx; - vuint16x4_impl_cpu = &vuint16x4_impl_mmx; - vint32x2_impl_cpu = &vint32x2_impl_mmx; - vuint32x2_impl_cpu = &vuint32x2_impl_mmx; - } -#endif -#ifdef VEC_COMPILER_HAS_NEON - if (cpu & VEC_CPU_HAS_NEON) { - // 64-bit - vint8x8_impl_cpu = &vint8x8_impl_neon; - vuint8x8_impl_cpu = &vuint8x8_impl_neon; - vint16x4_impl_cpu = &vint16x4_impl_neon; - vuint16x4_impl_cpu = &vuint16x4_impl_neon; - vint32x2_impl_cpu = &vint32x2_impl_neon; - vuint32x2_impl_cpu = &vuint32x2_impl_neon; - - // 128-bit - vint8x16_impl_cpu = &vint8x16_impl_neon; - vuint8x16_impl_cpu = &vuint8x16_impl_neon; - vint16x8_impl_cpu = &vint16x8_impl_neon; - vuint16x8_impl_cpu = &vuint16x8_impl_neon; - vint32x4_impl_cpu = &vint32x4_impl_neon; - vuint32x4_impl_cpu = &vuint32x4_impl_neon; - vint64x2_impl_cpu = &vint64x2_impl_neon; - vuint64x2_impl_cpu = &vuint64x2_impl_neon; - } -#endif - { - // do nothing, they're already set to generics - } - - vec_init_spinner++; - - return 0; -} - -/* ---------------------------------------------------------------- */ - -#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \ - extern inline void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - extern inline void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ 
- extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - extern inline v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); - -#define VEC_DEFINE_OPERATIONS(bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) - -// 16-bit -VEC_DEFINE_OPERATIONS(8, 2) - -// 32-bit -VEC_DEFINE_OPERATIONS(8, 4) -VEC_DEFINE_OPERATIONS(16, 2) - -// 64-bit -VEC_DEFINE_OPERATIONS(8, 8) -VEC_DEFINE_OPERATIONS(16, 4) -VEC_DEFINE_OPERATIONS(32, 2) - -// 128-bit -VEC_DEFINE_OPERATIONS(8, 16) -VEC_DEFINE_OPERATIONS(16, 8) -VEC_DEFINE_OPERATIONS(32, 4) -VEC_DEFINE_OPERATIONS(64, 2) - -// 256-bit -VEC_DEFINE_OPERATIONS(8, 32) -VEC_DEFINE_OPERATIONS(16, 16) -VEC_DEFINE_OPERATIONS(32, 8) -VEC_DEFINE_OPERATIONS(64, 4) - -// 512-bit -VEC_DEFINE_OPERATIONS(8, 64) -VEC_DEFINE_OPERATIONS(16, 32) -VEC_DEFINE_OPERATIONS(32, 16) -VEC_DEFINE_OPERATIONS(64, 8) - -#undef VEC_DEFINE_OPERATIONS -#undef VEC_DEFINE_OPERATIONS_SIGN
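The removed vec.c is the core of the backed-out design: every vector type has a function-pointer table (the v*_impl_cpu globals) that starts out pointing at the generic implementation, and vec_init() repoints the tables after querying vec_get_CPU_features(). A minimal sketch of how a caller was expected to use that API, based only on the declarations visible above; double_u32x8 and its parameter are made up for illustration:

#include "vec/vec.h"

/* Doubles eight 32-bit values in place using whichever backend vec_init()
 * selected; without vec_init() the same calls still work, they just stay
 * on the generic implementation. */
int double_u32x8(vec_uint32 data[8])
{
    if (vec_init() != 0)                  /* probe the CPU once */
        return -1;

    vuint32x8 v = vuint32x8_load(data);   /* unaligned load */
    v = vuint32x8_add(v, v);
    vuint32x8_store(v, data);
    return 0;
}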
--- a/test/CMakeLists.txt Fri Apr 25 17:40:51 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ -cmake_minimum_required(VERSION 3.23) - -project(vec-tests) - -# add main vec directory -add_subdirectory(.. vec) - -add_executable(vec-tests test.c) - -target_link_libraries(vec-tests vec)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile.ppc Fri Apr 25 17:40:55 2025 -0400 @@ -0,0 +1,3 @@ +CPPFLAGS += -maltivec + +include Makefile.template \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile.template Fri Apr 25 17:40:55 2025 -0400 @@ -0,0 +1,53 @@ +CPPFLAGS += -g -O2 -I../include -Wall -Wpedantic -Werror=strict-aliasing +CFLAGS += $(CPPFLAGS) -std=c99 +CXXFLAGS += $(CPPFLAGS) -std=c++11 + +HEADERS = ../include/vec/vec.h \ + ../include/vec/impl/ppc/altivec.h \ + ../include/vec/impl/x86/avx2.h \ + ../include/vec/impl/x86/avx512f.h \ + ../include/vec/impl/x86/mmx.h \ + ../include/vec/impl/x86/sse2.h \ + ../include/vec/impl/x86/sse41.h \ + ../include/vec/impl/cpu.h \ + ../include/vec/impl/fallback.h \ + ../include/vec/impl/generic.h \ + test_align.h \ + test_arith.h \ + test_compare.h \ + test_shift.h +BINS = test-generic test-host test-cxx +OBJS = vec-generic.o vec-host.o test.o test-cxx.o + +.PHONY: all clean test + +all: $(BINS) + +vec-generic.o: ../src/vec.c $(HEADERS) + $(CC) $(CFLAGS) -DVEC_SUPPRESS_HW=1 -c -o $@ $< + +vec-host.o: ../src/vec.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +test.o: test.c + $(CC) $(CFLAGS) -c -o $@ $< + +test-cxx.o: test.cc + $(CXX) $(CXXFLAGS) -c -o $@ $< + +test-generic: vec-generic.o test.o + $(CC) $(LDFLAGS) -o $@ $^ + +test-host: vec-host.o test.o + $(CC) $(LDFLAGS) -o $@ $^ + +test-cxx: test-cxx.o $(HEADERS) + $(CXX) $(LDFLAGS) -o $@ $< + +clean: + $(RM) $(BINS) $(OBJS) + +test: clean $(BINS) + ./test-generic + ./test-host + ./test-cxx
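The new Makefile.template builds the same test driver three ways: vec-generic.o compiles ../src/vec.c with -DVEC_SUPPRESS_HW=1, presumably so that only the generic scalar paths are exercised, vec-host.o compiles it with whatever SIMD flags the including makefile adds, and test-cxx compiles the driver as C++11 to catch C++ incompatibilities. Assuming GNU make, the per-architecture wrappers (Makefile.ppc above, Makefile.x86 below) are the intended entry points, for example:

make -f Makefile.x86 test
make -f Makefile.ppc test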
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile.x86 Fri Apr 25 17:40:55 2025 -0400 @@ -0,0 +1,3 @@ +CPPFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f + +include Makefile.template \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test.cc Fri Apr 25 17:40:55 2025 -0400 @@ -0,0 +1,27 @@ +#define VEC_IMPLEMENTATION +#include "vec/vec.h" + +#include <iostream> + +/* this test makes sure that vec can be included under C++ */ +int main(void) +{ + int ret = 0; + + VUINT32x8_ALIGNED_ARRAY(varrin); + VUINT32x8_ALIGNED_ARRAY(varrout); + + for (int i = 0; i < 8; i++) + varrin[i] = i; + + vuint32x8 vec = vuint32x8_load_aligned(varrin); + vec = vuint32x8_add(vec, vec); + + vuint32x8_store_aligned(vec, varrout); + + for (int i = 0; i < 8; i++) + if (varrout[i] != (uint32_t)(varrin[i] + varrin[i])) + ret |= 1; + + return ret; +} \ No newline at end of file
--- a/test/test_arith.h Fri Apr 25 17:40:51 2025 -0400 +++ b/test/test_arith.h Fri Apr 25 17:40:55 2025 -0400 @@ -39,8 +39,8 @@ v##sign##int##bits##x##size##_store_aligned(c, orig_c); \ \ for (int i = 0; i < size; i++) { \ - if ((vec_##sign##int##bits)(equiv) != orig_c[i]) { \ - fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%" PRI ## psign ## bits "] does not equal result [%" PRI ## psign ## bits "]!\n", i, (vec_##sign##int##bits)(equiv), orig_c[i]); \ + if ((sign##int##bits##_t)(equiv) != orig_c[i]) { \ + fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%" PRI ## psign ## bits "] does not equal result [%" PRI ## psign ## bits "]!\n", i, (sign##int##bits##_t)(equiv), orig_c[i]); \ print_v##sign##int##bits##x##size(stderr,a); \ print_vuint##bits##x##size(stderr,b); \ print_v##sign##int##bits##x##size(stderr,c); \ @@ -60,10 +60,10 @@ CREATE_TEST(sign, psign, csign, bits, size, and, orig_a[i] & orig_b[i]) \ CREATE_TEST(sign, psign, csign, bits, size, or, orig_a[i] | orig_b[i]) \ CREATE_TEST(sign, psign, csign, bits, size, xor, orig_a[i] ^ orig_b[i]) \ - CREATE_TEST(sign, psign, csign, bits, size, avg, (orig_a[i] + orig_b[i] + 1) / 2) \ + CREATE_TEST(sign, psign, csign, bits, size, avg, (sign##int##bits##_t)(orig_a[i] + orig_b[i]) / 2) \ CREATE_TEST_SHIFT(sign, psign, csign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \ CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \ - CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_lrshift((vec_uint##bits)orig_a[i], orig_b[i])) + CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i])) #define CREATE_TESTS(bits, size) \ CREATE_TESTS_SIGN(, d, , bits, size) \
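The avg reference changes meaning here, which a quick worked example makes concrete: for a = 3 and b = 4 the old expression (a + b + 1) / 2 evaluates to 4 (round-half-up, the behaviour of hardware averages such as _mm_avg_epu8), while the new (a + b) / 2, with the sum first cast back to the element type, evaluates to 3 (truncating). The cast also keeps the expected value in the element's range when a + b overflows, presumably to match how the lane arithmetic in the vector implementations wraps.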
--- a/test/test_shift.h Fri Apr 25 17:40:51 2025 -0400 +++ b/test/test_shift.h Fri Apr 25 17:40:55 2025 -0400 @@ -2,6 +2,8 @@ { int ret = 0; + ret |= (vec_ulrshift(0xFFFFFFFF, 16) != 0xFFFF); + ret |= (vec_ullshift(0xFFFF, 16) != 0xFFFF0000); ret |= (vec_lrshift(0xFFFFFFFF, 16) != 0xFFFF); ret |= (vec_llshift(0xFFFF, 16) != 0xFFFF0000); ret |= (vec_urshift(0xFFFFFFFF, 16) != 0xFFFF);