vec: changeset 37:4b5a557aa64f
*: turns out extern is a practical joke. rewrite to be always inline again
the sample benchmark performs about 3x as well with optimizations
disabled :)
author   | Paper <paper@tflc.us>
date     | Sat, 26 Apr 2025 01:04:35 -0400
parents  | 677c03c382b8
children | fd42f9b1b95e
files    | CMakeLists.txt include/vec/impl/align.h include/vec/impl/fallback.h include/vec/impl/generic.h include/vec/impl/integer.h.in include/vec/impl/x86/avx2.h include/vec/impl/x86/avx512f.h include/vec/impl/x86/mmx.h include/vec/impl/x86/sse2.h include/vec/impl/x86/sse3.h include/vec/impl/x86/sse41.h include/vec/impl/x86/sse42.h include/vec/vec.h src/vec.c test/Makefile.template test/Makefile.x86 test/test.c test/test.cc test/test_arith.h test/test_benchmark.h test/test_benchmark_simple.c test/test_benchmark_vec.c test/test_compare.h test/test_shift.h utils/gengeneric.c vec.pc.in
diffstat | 26 files changed, 7435 insertions(+), 2351 deletions(-)
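The rewrite drops the separately compiled (extern) dispatch layer and defines every operation directly in the headers; the regenerated generic.h below declares each function with VEC_FUNC_IMPL. A minimal sketch of the usual pattern behind such a macro, assuming the common static-inline approach (the real definition lives in include/vec/vec.h; this is not the library's actual macro):

/* Hypothetical sketch only -- illustrates the "always inline" intent
 * from the commit message, not the library's real VEC_FUNC_IMPL. */
#if defined(__GNUC__)
# define VEC_FUNC_IMPL static inline __attribute__((__always_inline__))
#elif defined(_MSC_VER)
# define VEC_FUNC_IMPL static __forceinline
#else
# define VEC_FUNC_IMPL static inline
#endif

With the operations inlined at the call site, the compiler can fold the per-lane loops into the caller, which is consistent with the sample benchmark reportedly running about 3x faster even with optimizations disabled.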
--- a/CMakeLists.txt Fri Apr 25 17:40:55 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,163 +0,0 @@ -cmake_minimum_required(VERSION 3.23) - -project(vec VERSION 2.0.0 DESCRIPTION "a tiny C99 SIMD vector library") - -add_library(vec SHARED src/vec.c) - -target_sources(vec PUBLIC - $<INSTALL_INTERFACE:vec/vec.h> - $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}/include/vec/vec.h> - $<INSTALL_INTERFACE:vec/impl/integer.h> - $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/vec/impl/integer.h> -) - -include(CheckCCompilerFlag) - -if(MSVC) - # TODO ? -else() - check_c_compiler_flag("-maltivec" COMPILER_HAS_ALTIVEC) - if(COMPILER_HAS_ALTIVEC) - target_compile_options(vec PRIVATE "-maltivec") - endif() - check_c_compiler_flag("-mmmx" COMPILER_HAS_MMX) - if(COMPILER_HAS_MMX) - target_compile_options(vec PRIVATE "-mmmx") - endif() - check_c_compiler_flag("-msse2" COMPILER_HAS_SSE2) - if(COMPILER_HAS_SSE2) - target_compile_options(vec PRIVATE "-msse2") - endif() - check_c_compiler_flag("-msse4.1" COMPILER_HAS_SSE41) - if(COMPILER_HAS_SSE41) - target_compile_options(vec PRIVATE "-msse4.1") - endif() - check_c_compiler_flag("-mavx2" COMPILER_HAS_AVX2) - if(COMPILER_HAS_AVX2) - target_compile_options(vec PRIVATE "-mavx2") - endif() - check_c_compiler_flag("-mavx512f" COMPILER_HAS_AVX512F) - if(COMPILER_HAS_AVX512F) - target_compile_options(vec PRIVATE "-mavx512f") - endif() -endif() - -######################################################################### -# integer types - -include(CheckTypeSize) - -check_type_size("int16_t" INT16_T_SIZE LANGUAGE C) -check_type_size("uint16_t" UINT16_T_SIZE LANGUAGE C) -check_type_size("u_int16_t" U_INT16_T_SIZE LANGUAGE C) -check_type_size("int32_t" INT32_T_SIZE LANGUAGE C) -check_type_size("uint32_t" UINT32_T_SIZE LANGUAGE C) -check_type_size("u_int32_t" U_INT32_T_SIZE LANGUAGE C) -check_type_size("int64_t" INT64_T_SIZE LANGUAGE C) -check_type_size("uint64_t" UINT64_T_SIZE LANGUAGE C) -check_type_size("u_int64_t" U_INT64_T_SIZE LANGUAGE C) -check_type_size("short" SHORT_SIZE LANGUAGE C) -check_type_size("int" INT_SIZE LANGUAGE C) -check_type_size("long" LONG_SIZE LANGUAGE C) -check_type_size("long long" LONG_LONG_SIZE LANGUAGE C) -check_type_size("uintptr_t" UINTPTR_T_SIZE LANGUAGE C) - -if(INT16_T_SIZE EQUAL 2) - set(SIZE16 "int16_t") -elseif(SHORT_SIZE EQUAL 2) - set(SIZE16 "short") -elseif(INT_SIZE EQUAL 2) - set(SIZE16 "int") -endif() - -if(UINT16_T_SIZE EQUAL 2) - set(USIZE16 "uint16_t") -elseif(U_INT16_T_SIZE EQUAL 2) - set(USIZE16 "u_int16_t") -elseif(SHORT_SIZE EQUAL 2) - set(USIZE16 "unsigned short") -elseif(INT_SIZE EQUAL 2) - set(USIZE16 "unsigned int") -endif() - -if(INT32_T_SIZE EQUAL 4) - set(SIZE32 "int32_t") -elseif(SHORT_SIZE EQUAL 4) - set(SIZE32 "short") -elseif(INT_SIZE EQUAL 4) - set(SIZE32 "int") -elseif(LONG_SIZE EQUAL 4) - set(SIZE32 "long") -endif() - -if(UINT32_T_SIZE EQUAL 4) - set(USIZE32 "uint32_t") -elseif(U_INT32_T_SIZE EQUAL 4) - set(USIZE32 "u_int32_t") -elseif(SHORT_SIZE EQUAL 4) - set(USIZE32 "unsigned short") -elseif(INT_SIZE EQUAL 4) - set(USIZE32 "unsigned int") -elseif(LONG_SIZE EQUAL 4) - set(USIZE32 "unsigned long") -endif() - -if(INT64_T_SIZE EQUAL 8) - set(SIZE64 "int64_t") -elseif(SHORT_SIZE EQUAL 8) - set(SIZE64 "short") -elseif(INT_SIZE EQUAL 8) - set(SIZE64 "int") -elseif(LONG_SIZE EQUAL 8) - set(SIZE64 "long") -elseif(LONG_LONG_SIZE EQUAL 8) - set(SIZE64 "long long") -endif() - -if(UINT64_T_SIZE EQUAL 8) - set(USIZE64 "uint64_t") -elseif(U_INT64_T_SIZE EQUAL 8) - set(USIZE64 "u_int64_t") 
-elseif(SHORT_SIZE EQUAL 8) - set(USIZE64 "unsigned short") -elseif(INT_SIZE EQUAL 8) - set(USIZE64 "unsigned int") -elseif(LONG_SIZE EQUAL 8) - set(USIZE64 "unsigned long") -elseif(LONG_LONG_SIZE EQUAL 8) - set(USIZE64 "unsigned long long") -endif() - -if(CMAKE_SIZEOF_VOID_P EQUAL UINTPTR_T_SIZE) - set(USIZEPTR "uintptr_t") -elseif(CMAKE_SIZEOF_VOID_P EQUAL 1) - set(USIZEPTR "unsigned char") -elseif(CMAKE_SIZEOF_VOID_P EQUAL 2) - set(USIZEPTR "${USIZE16}") -elseif(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(USIZEPTR "${USIZE32}") -elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) - set(USIZEPTR "${USIZE64}") -endif() - -configure_file(include/vec/impl/integer.h.in include/vec/impl/integer.h @ONLY) - -target_compile_definitions(vec PRIVATE "VEC_HAVE_IMPL_INTEGER_H") - -######################################################################### - -target_compile_features(vec PRIVATE $<IF:$<COMPILE_FEATURES:c_std_11>,c_std_11,c_std_99>) -target_include_directories(vec PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_BINARY_DIR}/include/vec") - -# Installing - -include(GNUInstallDirs) - -install(TARGETS vec LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) - -install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/include/vec/vec.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/vec") -install(FILES "${CMAKE_CURRENT_BINARY_DIR}/include/vec/impl/integer.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/vec/impl") - -# pkg-config -configure_file(vec.pc.in vec.pc @ONLY) -install(FILES ${CMAKE_BINARY_DIR}/vec.pc DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig)
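The deleted CMake logic above picks a concrete C type for each width (SIZE16/USIZE16 through USIZEPTR) and substitutes it into include/vec/impl/integer.h.in via configure_file(... @ONLY). A hedged sketch of what the configured header plausibly contains on a platform that has <stdint.h>, assuming the template simply typedefs the detected types (the template itself is not shown here):

/* Illustrative result of configuring integer.h.in; the mapping of the
 * @SIZE..@ placeholders to these typedefs is an assumption -- only the
 * vec_* type names come from the library itself. */
typedef int16_t   vec_int16;   /* from @SIZE16@   */
typedef uint16_t  vec_uint16;  /* from @USIZE16@  */
typedef int32_t   vec_int32;   /* from @SIZE32@   */
typedef uint32_t  vec_uint32;  /* from @USIZE32@  */
typedef int64_t   vec_int64;   /* from @SIZE64@   */
typedef uint64_t  vec_uint64;  /* from @USIZE64@  */
typedef uintptr_t vec_uintptr; /* from @USIZEPTR@ */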
--- a/include/vec/impl/align.h Fri Apr 25 17:40:55 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,267 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#ifndef VEC_IMPL_ALIGN_H_ -#define VEC_IMPL_ALIGN_H_ - -// Array alignment macros - -#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) -# define VEC_ALIGNAS(x) alignas(x) -#elif (__STDC_VERSION__ >= 201112L) -# define VEC_ALIGNAS(x) _Alignas(x) -#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0) -# define VEC_ALIGNAS(x) __attribute__((__aligned__(x))) -#endif - -/* the alignment must be specified in bytes and must be a multiple of the - * type size. it is always assumed that the type will be on a boundary of - * its size, which may or may not be true */ -#ifdef VEC_ALIGNAS -# define VEC_ALIGNED_ARRAY(type, var, length, align) \ - VEC_ALIGNAS(align) type var[length] -# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(var)) -#else -// use unions to get an aligned offset without triggering strict aliasing -# define VEC_ALIGNED_ARRAY(type, var, length, align) \ - VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \ - union vec_aligned_union_##var##_ { \ - type arr[length]; \ - unsigned char bytes[sizeof(type) * length]; \ - }; \ - unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \ - type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \ - VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") -# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(vec_unaligned_##var##_) - (align - 1)) -#endif - -#define VEC_ALIGNED_ARRAY_LENGTH(var) \ - (VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var)) - -////////////////////////////////////////////////////////////////////////////////////// -// predefined variants for each vector type - -////////////////////////////////////////////////////////////////////////////////////// -// 16-bit - -#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT) -#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT) -#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT) -#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0) - -#define VUINT8x2_ALIGNED_ARRAY(var) 
VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT) -#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT) -#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT) -#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 32-bit - -#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT) -#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT) -#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT) -#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0) - -#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT) -#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT) -#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT) -#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0) - -#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT) -#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT) -#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT) -#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0) - -#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT) -#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT) -#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT) -#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 64-bit - -#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT) -#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT) -#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT) -#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0) - -#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT) -#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT) -#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT) -#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0) - -#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT) -#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT) -#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT) -#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0) - -#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT) -#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT) -#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT) -#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0) - -#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT) -#define 
VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT) -#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT) -#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0) - -#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT) -#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT) -#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT) -#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 128-bit - -#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) -#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) - -#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) -#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) - -#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) -#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) - -#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT) -#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) -#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) -#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) - -#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) -#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) - -#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) -#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) - -#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) -#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) - -#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) -#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 256-bit - -#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) -#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) - -#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) -#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) - -#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT) -#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) -#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) -#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) - -#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) -#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) - -#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) -#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) - -#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) -#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) - -#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) -#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) - -#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) -#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// -// 512-bit - -#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) -#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) - -#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT) -#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) - -#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) -#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) - -#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) -#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) - -#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) -#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) - -#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT) -#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) - -#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) -#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) - -#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) -#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) - -////////////////////////////////////////////////////////////////////////////////////// - -#endif /* VEC_IMPL_ALIGN_H_ */
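The pre-C11 branch of the deleted VEC_ALIGNED_ARRAY over-allocates by align - 1 bytes and rounds the start address up to the next multiple of align (a power of two, as the macro asserts), going through a union to keep the access well-typed. The rounding step on its own, as a small standalone sketch that omits the union indirection (signed char stands in for vec_int8):

#include <stdint.h>
#include <assert.h>

/* Round a raw buffer address up to the next align-byte boundary.
 * align must be a nonzero power of two. */
static unsigned char *vec_round_up(unsigned char *raw, uintptr_t align)
{
	assert(align && (align & (align - 1)) == 0);
	return (unsigned char *)(((uintptr_t)raw + (align - 1)) & ~(align - 1));
}

static void example(void)
{
	/* usage mirroring VEC_ALIGNED_ARRAY(vec_int8, var, 16, 16):
	 * reserve length * sizeof(type) + align - 1 bytes, then align. */
	unsigned char raw[16 * sizeof(signed char) + 16 - 1];
	signed char *var = (signed char *)vec_round_up(raw, 16);
	(void)var; /* var now sits on a 16-byte boundary */
}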
--- a/include/vec/impl/fallback.h Fri Apr 25 17:40:55 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,211 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#ifndef VEC_IMPL_FALLBACK_H_ -#define VEC_IMPL_FALLBACK_H_ - -#include <string.h> - -// Fallback implementations - this is what an implementation should use if it -// doesn't support a specific function. Note that the load_aligned and -// store_aligned functions are not implemented here - this is on purpose; -// every single implementation *needs* to have one of these. - -#define VEC_FALLBACK_OPERATION(op, sign, csign, bits, size) \ - do { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ - v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \ - \ - for (int i = 0; i < size; i++) varr1[i] = (op); \ - \ - return v##sign##int##bits##x##size##_load_aligned(varr1); \ - } while (0) - -#define VEC_FALLBACK_CMP(op, sign, csign, bits, size) \ - VEC_FALLBACK_OPERATION((varr1[i] op varr2[i]) ? 
UINT##bits##_MAX : 0, sign, csign, bits, size) - -#define VEC_FALLBACK_SHIFT(op, sign, csign, bits, size) \ - do { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ - VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ - vuint##bits##x##size##_store_aligned(vec2, varr2); \ - \ - for (int i = 0; i < size; i++) varr1[i] = (op); \ - \ - return v##sign##int##bits##x##size##_load_aligned(varr1); \ - } while (0) - -#define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ - for (int i = 0; i < size; i++) arr[i] = x; \ - return v##sign##int##bits##x##size##_load_aligned(arr); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ - memcpy(arr, in, sizeof(vec_##sign##int##bits) * size); \ - return v##sign##int##bits##x##size##_load_aligned(arr); \ - } \ - \ - static void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ - v##sign##int##bits##x##size##_store_aligned(vec, arr); \ - memcpy(out, arr, sizeof(vec_##sign##int##bits) * size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr2[i] ? 
(varr1[i] / varr2[i]) : 0, sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \ - { \ - return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)UINT##bits##_MAX)); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(<, sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(<=, sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(==, sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(>=, sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_FALLBACK_CMP(>, sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_FALLBACK_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_FALLBACK_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_FALLBACK_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size); \ - } - -#define VEC_DEFINE_FALLBACK_OPERATIONS(bits, size) \ - VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( , , bits, size) \ - VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(u, U, bits, size) - -// 16-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 2) - -// 32-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 4) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 2) - -// 64-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 8) 
-VEC_DEFINE_FALLBACK_OPERATIONS(16, 4) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 2) - -// 128-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 16) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 8) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 4) -VEC_DEFINE_FALLBACK_OPERATIONS(64, 2) - -// 256-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 32) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 16) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 8) -VEC_DEFINE_FALLBACK_OPERATIONS(64, 4) - -// 512-bit -VEC_DEFINE_FALLBACK_OPERATIONS(8, 64) -VEC_DEFINE_FALLBACK_OPERATIONS(16, 32) -VEC_DEFINE_FALLBACK_OPERATIONS(32, 16) -VEC_DEFINE_FALLBACK_OPERATIONS(64, 8) - -#undef VEC_FALLBACK_OPERATION -#undef VEC_FALLBACK_CMP -#undef VEC_FALLBACK_SHIFT -#undef VEC_DEFINE_FALLBACK_OPERATIONS -#undef VEC_DEFINE_FALLBACK_OPERATIONS_SIGN - -#endif /* VEC_IMPL_FALLBACK_H_ */
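Each deleted fallback spills both operands to aligned scratch arrays, loops over the lanes, and reloads the result; VEC_FALLBACK_OPERATION is the shared skeleton. Expanded by hand for one concrete case (signed 32-bit, 4 lanes), it looks roughly like this:

/* Hand-expanded sketch of the deleted vint32x4_fallback_add(), i.e.
 * VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], , , 32, 4). */
static vint32x4 vint32x4_fallback_add(vint32x4 vec1, vint32x4 vec2)
{
	VINT32x4_ALIGNED_ARRAY(varr1);      /* aligned scratch for vec1 */
	VINT32x4_ALIGNED_ARRAY(varr2);      /* aligned scratch for vec2 */

	vint32x4_store_aligned(vec1, varr1);
	vint32x4_store_aligned(vec2, varr2);

	for (int i = 0; i < 4; i++)
		varr1[i] = varr1[i] + varr2[i]; /* per-lane add */

	return vint32x4_load_aligned(varr1);
}

Every fallback pays this round trip through the stack, which is the overhead the regenerated generic.h below avoids by operating on the vectors' .generic member directly.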
--- a/include/vec/impl/generic.h Fri Apr 25 17:40:55 2025 -0400 +++ b/include/vec/impl/generic.h Sat Apr 26 01:04:35 2025 -0400 @@ -22,7 +22,8 @@ * SOFTWARE. **/ -/* Generic array-based implementation. */ +/* This file is automatically generated! Do not edit it directly! + * Edit the code that generates it in utils/gengeneric.c --paper */ #ifndef VEC_IMPL_GENERIC_H_ #define VEC_IMPL_GENERIC_H_ @@ -31,110 +32,4526 @@ // ----------------------------------------------------------------- -// TODO implement these so we don't waste stack space by doing the -// fallbacks -#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ +#define VEC_GENERIC_OPERATION(op, sign, bits, size) \ + do { \ + int i; \ + \ + for (i = 0; i < size; i++) \ + vec1.generic[i] = (op); \ + \ + return vec1; \ + } while (0) + +#define VEC_GENERIC_BUILTIN_OPERATION(op, sign, bits, size) \ + VEC_GENERIC_OPERATION(vec1.generic[i] op vec2.generic[i], sign, bits, size) + +#define VEC_GENERIC_CMP(op, sign, bits, size) \ + VEC_GENERIC_OPERATION((vec1.generic[i] op vec2.generic[i]) ? (vec_##sign##int##bits)VEC_MAX_OF_TYPE(vec_uint##bits) : 0, sign, bits, size) + +/* okay, now we can do this crap: */ + +#define VEC_GENERIC_SPLAT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ + { \ + v##sign##int##bits##x##size vec; \ + for (int i = 0; i < size; i++) \ + vec.generic[i] = x; \ + return vec; \ + } + +#define VEC_GENERIC_LOAD_EX(name, sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ - memcpy(vec.generic, in, sizeof(vec_##sign##int##bits) * size); \ + memcpy(&vec, in, sizeof(vec_##sign##int##bits) * size); \ return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + } + +#define VEC_GENERIC_LOAD_ALIGNED(sign, bits, size) VEC_GENERIC_LOAD_EX(load_aligned, sign, bits, size) +#define VEC_GENERIC_LOAD(sign, bits, size) VEC_GENERIC_LOAD_EX(load, sign, bits, size) + +#define VEC_GENERIC_STORE_EX(name, sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + memcpy(out, &vec, sizeof(vec_##sign##int##bits) * size); \ + } + +#define VEC_GENERIC_STORE_ALIGNED(sign, bits, size) VEC_GENERIC_STORE_EX(store_aligned, sign, bits, size) +#define VEC_GENERIC_STORE(sign, bits, size) VEC_GENERIC_STORE_EX(store, sign, bits, size) + +#define VEC_GENERIC_ADD(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - memcpy(out, vec.generic, sizeof(vec_##sign##int##bits) * size); \ - } \ + VEC_GENERIC_BUILTIN_OPERATION(+, sign, bits, size); \ + } + +#define VEC_GENERIC_SUB(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_BUILTIN_OPERATION(-, sign, bits, size); \ + } + +#define VEC_GENERIC_MUL(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + 
VEC_GENERIC_BUILTIN_OPERATION(*, sign, bits, size); \ + } + +#define VEC_GENERIC_DIV(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_OPERATION(vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0, sign, bits, size); \ + } + +#define VEC_GENERIC_AVG(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + for (int i = 0; i < size; i++) \ + vec1.generic[i] = vec_##sign##avg(vec1.generic[i], vec2.generic[i]); \ \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_generic_load_aligned, \ - v##sign##int##bits##x##size##_generic_load_aligned, \ - v##sign##int##bits##x##size##_generic_store_aligned, \ - v##sign##int##bits##x##size##_generic_store_aligned, \ - }; - -#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) - -VEC_GENERIC_DEFINE_OPERATIONS(8, 2) -VEC_GENERIC_DEFINE_OPERATIONS(16, 2) -VEC_GENERIC_DEFINE_OPERATIONS(32, 2) -VEC_GENERIC_DEFINE_OPERATIONS(64, 2) - -#undef VEC_GENERIC_DEFINE_OPERATIONS -#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN - -// ----------------------------------------------------------------- -// now we can just keep doubling the same implementation - -#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ + return vec1; \ + } + +#define VEC_GENERIC_AND(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_BUILTIN_OPERATION(&, sign, bits, size); \ + } + +#define VEC_GENERIC_OR(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_BUILTIN_OPERATION(|, sign, bits, size); \ + } + +#define VEC_GENERIC_XOR(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_BUILTIN_OPERATION(^, sign, bits, size); \ + } + +#define VEC_GENERIC_NOT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ + { \ + return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)VEC_MAX_OF_TYPE(vec_uint##bits))); \ + } + +#define VEC_GENERIC_CMPLT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_CMP(<, sign, bits, size); \ + } + +#define VEC_GENERIC_CMPLE(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmpgt(vec1, vec2)); \ + } + +#define VEC_GENERIC_CMPEQ(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_CMP(==, sign, bits, size); \ + } + +#define VEC_GENERIC_CMPGE(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmplt(vec1, vec2)); \ + } + +#define VEC_GENERIC_CMPGT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_GENERIC_CMP(>, sign, bits, size); \ + } + +#define VEC_GENERIC_LSHIFT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_GENERIC_OPERATION(vec_##sign##lshift(vec1.generic[i], vec2.generic[i]), sign, bits, size); \ + } + +#define VEC_GENERIC_RSHIFT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_GENERIC_OPERATION(vec_##sign##rshift(vec1.generic[i], vec2.generic[i]), sign, bits, size); \ + } + +#define VEC_GENERIC_LRSHIFT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_GENERIC_OPERATION(vec_urshift((vec_uint##bits)vec1.generic[i], vec2.generic[i]), sign, bits, size); \ + } + +#define VEC_GENERIC_MIN(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size cmplt = v##sign##int##bits##x##size##_cmplt(vec1, vec2); \ + \ + v##sign##int##bits##x##size a = v##sign##int##bits##x##size##_and(vec1, cmplt); \ + v##sign##int##bits##x##size b = v##sign##int##bits##x##size##_and(vec2, v##sign##int##bits##x##size##_not(cmplt)); \ + \ + return v##sign##int##bits##x##size##_or(a, b); \ + } + +#define VEC_GENERIC_MAX(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size cmplt = v##sign##int##bits##x##size##_cmpgt(vec1, vec2); \ + \ + v##sign##int##bits##x##size a = v##sign##int##bits##x##size##_and(vec1, cmplt); \ + v##sign##int##bits##x##size b = v##sign##int##bits##x##size##_and(vec2, v##sign##int##bits##x##size##_not(cmplt)); \ + \ + return v##sign##int##bits##x##size##_or(a, b); \ + } + +#define VEC_GENERIC_DBL_SPLAT(sign, bits, size, halfsize) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ { \ v##sign##int##bits##x##size vec; \ - vec.generic[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \ - vec.generic[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \ + \ + vec.generic[0] = v##sign##int##bits##x##halfsize##_splat(x); \ + vec.generic[1] = v##sign##int##bits##x##halfsize##_splat(x); \ + \ return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load(const vec_##sign##int##bits in[size]) \ + } + +#define VEC_GENERIC_DBL_LOAD_EX(name, sign, bits, size, halfsize) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_##name(const vec_##sign##int##bits x[size]) \ { \ v##sign##int##bits##x##size vec; \ - vec.generic[0] = v##sign##int##bits##x##halfsize##_load(in); \ - vec.generic[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \ + \ + vec.generic[0] = v##sign##int##bits##x##halfsize##_##name(x); \ + vec.generic[1] = v##sign##int##bits##x##halfsize##_##name(x + halfsize); \ + \ return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + } + +#define VEC_GENERIC_DBL_LOAD(sign, bits, size, halfsize) VEC_GENERIC_DBL_LOAD_EX(load, sign, bits, size, halfsize) +#define VEC_GENERIC_DBL_LOAD_ALIGNED(sign, bits, size, halfsize) VEC_GENERIC_DBL_LOAD_EX(load_aligned, sign, bits, size, halfsize) + +#define VEC_GENERIC_DBL_STORE_EX(name, sign, bits, size, halfsize) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec, vec_##sign##int##bits x[size]) \ + { \ + v##sign##int##bits##x##halfsize##_##name(vec.generic[0], x); \ + v##sign##int##bits##x##halfsize##_##name(vec.generic[1], x + halfsize); \ + } + +#define VEC_GENERIC_DBL_STORE(sign, bits, size, halfsize) VEC_GENERIC_DBL_STORE_EX(store, sign, bits, size, halfsize) +#define VEC_GENERIC_DBL_STORE_ALIGNED(sign, bits, size, halfsize) VEC_GENERIC_DBL_STORE_EX(store_aligned, sign, bits, size, halfsize) + +#define VEC_GENERIC_DBL_OP(name, sign, bits, size, halfsize, secondsign) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##secondsign##int##bits##x##size vec2) \ { \ - v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[0], out); \ - v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[1], out + halfsize); \ - } \ + vec1.generic[0] = v##sign##int##bits##x##halfsize##_##name(vec1.generic[0], vec2.generic[0]); \ + vec1.generic[1] = v##sign##int##bits##x##halfsize##_##name(vec1.generic[1], vec2.generic[1]); \ \ - static void v##sign##int##bits##x##size##_generic_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + return vec1; \ + } + +#define VEC_GENERIC_DBL_ADD(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(add, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_SUB(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(sub, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_MUL(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(mul, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_DIV(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(div, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_AVG(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(avg, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_LSHIFT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(lshift, sign, bits, size, halfsize, u) +#define VEC_GENERIC_DBL_RSHIFT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(rshift, sign, bits, size, halfsize, u) +#define VEC_GENERIC_DBL_LRSHIFT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(lrshift, sign, bits, size, halfsize, u) +#define VEC_GENERIC_DBL_AND(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(and, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_OR(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(or, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_XOR(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(xor, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_MIN(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(min, sign, bits, size, 
halfsize, sign) +#define VEC_GENERIC_DBL_MAX(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(max, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_CMPLT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmplt, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_CMPLE(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmple, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_CMPEQ(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmpeq, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_CMPGE(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmpge, sign, bits, size, halfsize, sign) +#define VEC_GENERIC_DBL_CMPGT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmpgt, sign, bits, size, halfsize, sign) + +#define VEC_GENERIC_DBL_NOT(sign, bits, size, halfsize) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ { \ - v##sign##int##bits##x##halfsize##_store(vec.generic[0], out); \ - v##sign##int##bits##x##halfsize##_store(vec.generic[1], out + halfsize); \ - } \ + vec.generic[0] = v##sign##int##bits##x##halfsize##_not(vec.generic[0]); \ + vec.generic[1] = v##sign##int##bits##x##halfsize##_not(vec.generic[1]); \ \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_generic_load_aligned, \ - v##sign##int##bits##x##size##_generic_load, \ - v##sign##int##bits##x##size##_generic_store_aligned, \ - v##sign##int##bits##x##size##_generic_store, \ - }; - -#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size, halfsize) \ - VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize) - -// 32-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 4, 2) - -// 64-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 8, 4) -VEC_GENERIC_DEFINE_OPERATIONS(16, 4, 2) - -// 128-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8) -VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4) -VEC_GENERIC_DEFINE_OPERATIONS(32, 4, 2) - -// 256-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 32, 16) -VEC_GENERIC_DEFINE_OPERATIONS(16, 16, 8) -VEC_GENERIC_DEFINE_OPERATIONS(32, 8, 4) -VEC_GENERIC_DEFINE_OPERATIONS(64, 4, 2) - -// 512-bit -VEC_GENERIC_DEFINE_OPERATIONS(8, 64, 32) -VEC_GENERIC_DEFINE_OPERATIONS(16, 32, 16) -VEC_GENERIC_DEFINE_OPERATIONS(32, 16, 8) -VEC_GENERIC_DEFINE_OPERATIONS(64, 8, 4) - -#undef VEC_GENERIC_DEFINE_OPERATIONS -#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN + return vec; \ + } + +/* ------------------------------------------------------------------------ */ +/* PREPROCESSOR HELL INCOMING */ + + + +/* vuint8x2 */ + +#ifndef VINT8x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(/* nothing */, 8, 2) +# define VINT8x2_SPLAT_DEFINED +#endif +#ifndef VINT8x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(/* nothing */, 8, 2) +# define VINT8x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VINT8x2_LOAD_DEFINED +VEC_GENERIC_LOAD(/* nothing */, 8, 2) +# define VINT8x2_LOAD_DEFINED +#endif +#ifndef VINT8x2_STORE_ALIGNED_DEFINED +VEC_GENERIC_STORE_ALIGNED(/* nothing */, 8, 2) +# define VINT8x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VINT8x2_STORE_DEFINED +VEC_GENERIC_STORE(/* nothing */, 8, 2) +# define VINT8x2_STORE_DEFINED +#endif +#ifndef VINT8x2_ADD_DEFINED +VEC_GENERIC_ADD(/* nothing */, 8, 2) +# define VINT8x2_ADD_DEFINED +#endif +#ifndef VINT8x2_SUB_DEFINED +VEC_GENERIC_SUB(/* nothing */, 8, 2) +# define VINT8x2_SUB_DEFINED +#endif +#ifndef VINT8x2_MUL_DEFINED +VEC_GENERIC_MUL(/* nothing */, 8, 2) +# define VINT8x2_MUL_DEFINED +#endif +#ifndef 
VINT8x2_DIV_DEFINED +VEC_GENERIC_DIV(/* nothing */, 8, 2) +# define VINT8x2_DIV_DEFINED +#endif +#ifndef VINT8x2_AVG_DEFINED +VEC_GENERIC_AVG(/* nothing */, 8, 2) +# define VINT8x2_AVG_DEFINED +#endif +#ifndef VINT8x2_AND_DEFINED +VEC_GENERIC_AND(/* nothing */, 8, 2) +# define VINT8x2_AND_DEFINED +#endif +#ifndef VINT8x2_OR_DEFINED +VEC_GENERIC_OR(/* nothing */, 8, 2) +# define VINT8x2_OR_DEFINED +#endif +#ifndef VINT8x2_XOR_DEFINED +VEC_GENERIC_XOR(/* nothing */, 8, 2) +# define VINT8x2_XOR_DEFINED +#endif +#ifndef VINT8x2_NOT_DEFINED +VEC_GENERIC_NOT(/* nothing */, 8, 2) +# define VINT8x2_NOT_DEFINED +#endif +#ifndef VINT8x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(/* nothing */, 8, 2) +# define VINT8x2_CMPLT_DEFINED +#endif +#ifndef VINT8x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(/* nothing */, 8, 2) +# define VINT8x2_CMPEQ_DEFINED +#endif +#ifndef VINT8x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(/* nothing */, 8, 2) +# define VINT8x2_CMPGT_DEFINED +#endif +#ifndef VINT8x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(/* nothing */, 8, 2) +# define VINT8x2_CMPLE_DEFINED +#endif +#ifndef VINT8x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(/* nothing */, 8, 2) +# define VINT8x2_CMPGE_DEFINED +#endif +#ifndef VINT8x2_MIN_DEFINED +VEC_GENERIC_MIN(/* nothing */, 8, 2) +# define VINT8x2_MIN_DEFINED +#endif +#ifndef VINT8x2_MAX_DEFINED +VEC_GENERIC_MAX(/* nothing */, 8, 2) +# define VINT8x2_MAX_DEFINED +#endif +#ifndef VINT8x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(/* nothing */, 8, 2) +# define VINT8x2_RSHIFT_DEFINED +#endif +#ifndef VINT8x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(/* nothing */, 8, 2) +# define VINT8x2_LRSHIFT_DEFINED +#endif +#ifndef VINT8x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(/* nothing */, 8, 2) +# define VINT8x2_LSHIFT_DEFINED +#endif + + +/* vint8x2 */ + +#ifndef VUINT8x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(u, 8, 2) +# define VUINT8x2_SPLAT_DEFINED +#endif +#ifndef VUINT8x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(u, 8, 2) +# define VUINT8x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VUINT8x2_LOAD_DEFINED +VEC_GENERIC_LOAD(u, 8, 2) +# define VUINT8x2_LOAD_DEFINED +#endif +#ifndef VUINT8x2_STORE_ALIGNED_DEFINED +VEC_GENERIC_STORE_ALIGNED(u, 8, 2) +# define VUINT8x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VUINT8x2_STORE_DEFINED +VEC_GENERIC_STORE(u, 8, 2) +# define VUINT8x2_STORE_DEFINED +#endif +#ifndef VUINT8x2_ADD_DEFINED +VEC_GENERIC_ADD(u, 8, 2) +# define VUINT8x2_ADD_DEFINED +#endif +#ifndef VUINT8x2_SUB_DEFINED +VEC_GENERIC_SUB(u, 8, 2) +# define VUINT8x2_SUB_DEFINED +#endif +#ifndef VUINT8x2_MUL_DEFINED +VEC_GENERIC_MUL(u, 8, 2) +# define VUINT8x2_MUL_DEFINED +#endif +#ifndef VUINT8x2_DIV_DEFINED +VEC_GENERIC_DIV(u, 8, 2) +# define VUINT8x2_DIV_DEFINED +#endif +#ifndef VUINT8x2_AVG_DEFINED +VEC_GENERIC_AVG(u, 8, 2) +# define VUINT8x2_AVG_DEFINED +#endif +#ifndef VUINT8x2_AND_DEFINED +VEC_GENERIC_AND(u, 8, 2) +# define VUINT8x2_AND_DEFINED +#endif +#ifndef VUINT8x2_OR_DEFINED +VEC_GENERIC_OR(u, 8, 2) +# define VUINT8x2_OR_DEFINED +#endif +#ifndef VUINT8x2_XOR_DEFINED +VEC_GENERIC_XOR(u, 8, 2) +# define VUINT8x2_XOR_DEFINED +#endif +#ifndef VUINT8x2_NOT_DEFINED +VEC_GENERIC_NOT(u, 8, 2) +# define VUINT8x2_NOT_DEFINED +#endif +#ifndef VUINT8x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(u, 8, 2) +# define VUINT8x2_CMPLT_DEFINED +#endif +#ifndef VUINT8x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(u, 8, 2) +# define VUINT8x2_CMPEQ_DEFINED +#endif +#ifndef VUINT8x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(u, 8, 2) +# define VUINT8x2_CMPGT_DEFINED +#endif +#ifndef VUINT8x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(u, 8, 2) +# define VUINT8x2_CMPLE_DEFINED +#endif 
+#ifndef VUINT8x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(u, 8, 2) +# define VUINT8x2_CMPGE_DEFINED +#endif +#ifndef VUINT8x2_MIN_DEFINED +VEC_GENERIC_MIN(u, 8, 2) +# define VUINT8x2_MIN_DEFINED +#endif +#ifndef VUINT8x2_MAX_DEFINED +VEC_GENERIC_MAX(u, 8, 2) +# define VUINT8x2_MAX_DEFINED +#endif +#ifndef VUINT8x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(u, 8, 2) +# define VUINT8x2_RSHIFT_DEFINED +#endif +#ifndef VUINT8x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(u, 8, 2) +# define VUINT8x2_LRSHIFT_DEFINED +#endif +#ifndef VUINT8x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(u, 8, 2) +# define VUINT8x2_LSHIFT_DEFINED +#endif + + +/* vuint8x4 */ + +#ifndef VINT8x4_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 8, 4, 2) +# define VINT8x4_SPLAT_DEFINED +#endif + +#ifndef VINT8x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 8, 4, 2) +# define VINT8x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 8, 4, 2) +# define VINT8x4_LOAD_DEFINED +#endif + +#ifndef VINT8x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 8, 4, 2) +# define VINT8x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 8, 4, 2) +# define VINT8x4_STORE_DEFINED +#endif + +#ifndef VINT8x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 8, 4, 2) +# define VINT8x4_ADD_DEFINED +#endif + +#ifndef VINT8x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 8, 4, 2) +# define VINT8x4_SUB_DEFINED +#endif + +#ifndef VINT8x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 8, 4, 2) +# define VINT8x4_MUL_DEFINED +#endif + +#ifndef VINT8x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 8, 4, 2) +# define VINT8x4_DIV_DEFINED +#endif + +#ifndef VINT8x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 8, 4, 2) +# define VINT8x4_AVG_DEFINED +#endif + +#ifndef VINT8x4_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 8, 4, 2) +# define VINT8x4_AND_DEFINED +#endif + +#ifndef VINT8x4_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 8, 4, 2) +# define VINT8x4_OR_DEFINED +#endif + +#ifndef VINT8x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 8, 4, 2) +# define VINT8x4_XOR_DEFINED +#endif + +#ifndef VINT8x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 8, 4, 2) +# define VINT8x4_NOT_DEFINED +#endif + +#ifndef VINT8x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 8, 4, 2) +# define VINT8x4_CMPLT_DEFINED +#endif + +#ifndef VINT8x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 8, 4, 2) +# define VINT8x4_CMPEQ_DEFINED +#endif + +#ifndef VINT8x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 8, 4, 2) +# define VINT8x4_CMPGT_DEFINED +#endif + +#ifndef VINT8x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 8, 4, 2) +# define VINT8x4_CMPLE_DEFINED +#endif + +#ifndef VINT8x4_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 8, 4, 2) +# define VINT8x4_CMPGE_DEFINED +#endif + +#ifndef VINT8x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 8, 4, 2) +# define VINT8x4_MIN_DEFINED +#endif + +#ifndef VINT8x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 8, 4, 2) +# define VINT8x4_MAX_DEFINED +#endif + +#ifndef VINT8x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 8, 4, 2) +# define VINT8x4_RSHIFT_DEFINED +#endif + +#ifndef VINT8x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 8, 4, 2) +# define VINT8x4_LRSHIFT_DEFINED +#endif + +#ifndef VINT8x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 8, 4, 2) +# define VINT8x4_LSHIFT_DEFINED +#endif + + + +/* vint8x4 */ + +#ifndef VUINT8x4_SPLAT_DEFINED 
+VEC_GENERIC_DBL_SPLAT(u, 8, 4, 2) +# define VUINT8x4_SPLAT_DEFINED +#endif + +#ifndef VUINT8x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 8, 4, 2) +# define VUINT8x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 8, 4, 2) +# define VUINT8x4_LOAD_DEFINED +#endif + +#ifndef VUINT8x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 8, 4, 2) +# define VUINT8x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 8, 4, 2) +# define VUINT8x4_STORE_DEFINED +#endif + +#ifndef VUINT8x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 8, 4, 2) +# define VUINT8x4_ADD_DEFINED +#endif + +#ifndef VUINT8x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 8, 4, 2) +# define VUINT8x4_SUB_DEFINED +#endif + +#ifndef VUINT8x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 8, 4, 2) +# define VUINT8x4_MUL_DEFINED +#endif + +#ifndef VUINT8x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 8, 4, 2) +# define VUINT8x4_DIV_DEFINED +#endif + +#ifndef VUINT8x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 8, 4, 2) +# define VUINT8x4_AVG_DEFINED +#endif + +#ifndef VUINT8x4_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 8, 4, 2) +# define VUINT8x4_AND_DEFINED +#endif + +#ifndef VUINT8x4_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 8, 4, 2) +# define VUINT8x4_OR_DEFINED +#endif + +#ifndef VUINT8x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 8, 4, 2) +# define VUINT8x4_XOR_DEFINED +#endif + +#ifndef VUINT8x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 8, 4, 2) +# define VUINT8x4_NOT_DEFINED +#endif + +#ifndef VUINT8x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 8, 4, 2) +# define VUINT8x4_CMPLT_DEFINED +#endif + +#ifndef VUINT8x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 8, 4, 2) +# define VUINT8x4_CMPEQ_DEFINED +#endif + +#ifndef VUINT8x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 8, 4, 2) +# define VUINT8x4_CMPGT_DEFINED +#endif + +#ifndef VUINT8x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 8, 4, 2) +# define VUINT8x4_CMPLE_DEFINED +#endif + +#ifndef VUINT8x4_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 8, 4, 2) +# define VUINT8x4_CMPGE_DEFINED +#endif + +#ifndef VUINT8x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 8, 4, 2) +# define VUINT8x4_MIN_DEFINED +#endif + +#ifndef VUINT8x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 8, 4, 2) +# define VUINT8x4_MAX_DEFINED +#endif + +#ifndef VUINT8x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 8, 4, 2) +# define VUINT8x4_RSHIFT_DEFINED +#endif + +#ifndef VUINT8x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 8, 4, 2) +# define VUINT8x4_LRSHIFT_DEFINED +#endif + +#ifndef VUINT8x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 8, 4, 2) +# define VUINT8x4_LSHIFT_DEFINED +#endif + + + +/* vuint8x8 */ + +#ifndef VINT8x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 8, 8, 4) +# define VINT8x8_SPLAT_DEFINED +#endif + +#ifndef VINT8x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 8, 8, 4) +# define VINT8x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 8, 8, 4) +# define VINT8x8_LOAD_DEFINED +#endif + +#ifndef VINT8x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 8, 8, 4) +# define VINT8x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 8, 8, 4) +# define VINT8x8_STORE_DEFINED +#endif + +#ifndef VINT8x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 8, 8, 4) +# define VINT8x8_ADD_DEFINED +#endif + +#ifndef VINT8x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 8, 8, 4) +# define VINT8x8_SUB_DEFINED +#endif + +#ifndef VINT8x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 8, 
8, 4) +# define VINT8x8_MUL_DEFINED +#endif + +#ifndef VINT8x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 8, 8, 4) +# define VINT8x8_DIV_DEFINED +#endif + +#ifndef VINT8x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 8, 8, 4) +# define VINT8x8_AVG_DEFINED +#endif + +#ifndef VINT8x8_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 8, 8, 4) +# define VINT8x8_AND_DEFINED +#endif + +#ifndef VINT8x8_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 8, 8, 4) +# define VINT8x8_OR_DEFINED +#endif + +#ifndef VINT8x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 8, 8, 4) +# define VINT8x8_XOR_DEFINED +#endif + +#ifndef VINT8x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 8, 8, 4) +# define VINT8x8_NOT_DEFINED +#endif + +#ifndef VINT8x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 8, 8, 4) +# define VINT8x8_CMPLT_DEFINED +#endif + +#ifndef VINT8x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 8, 8, 4) +# define VINT8x8_CMPEQ_DEFINED +#endif + +#ifndef VINT8x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 8, 8, 4) +# define VINT8x8_CMPGT_DEFINED +#endif + +#ifndef VINT8x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 8, 8, 4) +# define VINT8x8_CMPLE_DEFINED +#endif + +#ifndef VINT8x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 8, 8, 4) +# define VINT8x8_CMPGE_DEFINED +#endif + +#ifndef VINT8x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 8, 8, 4) +# define VINT8x8_MIN_DEFINED +#endif + +#ifndef VINT8x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 8, 8, 4) +# define VINT8x8_MAX_DEFINED +#endif + +#ifndef VINT8x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 8, 8, 4) +# define VINT8x8_RSHIFT_DEFINED +#endif + +#ifndef VINT8x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 8, 8, 4) +# define VINT8x8_LRSHIFT_DEFINED +#endif + +#ifndef VINT8x8_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 8, 8, 4) +# define VINT8x8_LSHIFT_DEFINED +#endif + + + +/* vint8x8 */ + +#ifndef VUINT8x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 8, 8, 4) +# define VUINT8x8_SPLAT_DEFINED +#endif + +#ifndef VUINT8x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 8, 8, 4) +# define VUINT8x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 8, 8, 4) +# define VUINT8x8_LOAD_DEFINED +#endif + +#ifndef VUINT8x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 8, 8, 4) +# define VUINT8x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 8, 8, 4) +# define VUINT8x8_STORE_DEFINED +#endif + +#ifndef VUINT8x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 8, 8, 4) +# define VUINT8x8_ADD_DEFINED +#endif + +#ifndef VUINT8x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 8, 8, 4) +# define VUINT8x8_SUB_DEFINED +#endif + +#ifndef VUINT8x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 8, 8, 4) +# define VUINT8x8_MUL_DEFINED +#endif + +#ifndef VUINT8x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 8, 8, 4) +# define VUINT8x8_DIV_DEFINED +#endif + +#ifndef VUINT8x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 8, 8, 4) +# define VUINT8x8_AVG_DEFINED +#endif + +#ifndef VUINT8x8_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 8, 8, 4) +# define VUINT8x8_AND_DEFINED +#endif + +#ifndef VUINT8x8_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 8, 8, 4) +# define VUINT8x8_OR_DEFINED +#endif + +#ifndef VUINT8x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 8, 8, 4) +# define VUINT8x8_XOR_DEFINED +#endif + +#ifndef VUINT8x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 8, 8, 4) +# define VUINT8x8_NOT_DEFINED +#endif + +#ifndef VUINT8x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 8, 8, 4) +# define 
VUINT8x8_CMPLT_DEFINED +#endif + +#ifndef VUINT8x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 8, 8, 4) +# define VUINT8x8_CMPEQ_DEFINED +#endif + +#ifndef VUINT8x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 8, 8, 4) +# define VUINT8x8_CMPGT_DEFINED +#endif + +#ifndef VUINT8x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 8, 8, 4) +# define VUINT8x8_CMPLE_DEFINED +#endif + +#ifndef VUINT8x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 8, 8, 4) +# define VUINT8x8_CMPGE_DEFINED +#endif + +#ifndef VUINT8x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 8, 8, 4) +# define VUINT8x8_MIN_DEFINED +#endif + +#ifndef VUINT8x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 8, 8, 4) +# define VUINT8x8_MAX_DEFINED +#endif + +#ifndef VUINT8x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 8, 8, 4) +# define VUINT8x8_RSHIFT_DEFINED +#endif + +#ifndef VUINT8x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 8, 8, 4) +# define VUINT8x8_LRSHIFT_DEFINED +#endif + +#ifndef VUINT8x8_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 8, 8, 4) +# define VUINT8x8_LSHIFT_DEFINED +#endif + + + +/* vuint8x16 */ + +#ifndef VINT8x16_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 8, 16, 8) +# define VINT8x16_SPLAT_DEFINED +#endif + +#ifndef VINT8x16_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 8, 16, 8) +# define VINT8x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x16_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 8, 16, 8) +# define VINT8x16_LOAD_DEFINED +#endif + +#ifndef VINT8x16_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 8, 16, 8) +# define VINT8x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x16_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 8, 16, 8) +# define VINT8x16_STORE_DEFINED +#endif + +#ifndef VINT8x16_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 8, 16, 8) +# define VINT8x16_ADD_DEFINED +#endif + +#ifndef VINT8x16_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 8, 16, 8) +# define VINT8x16_SUB_DEFINED +#endif + +#ifndef VINT8x16_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 8, 16, 8) +# define VINT8x16_MUL_DEFINED +#endif + +#ifndef VINT8x16_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 8, 16, 8) +# define VINT8x16_DIV_DEFINED +#endif + +#ifndef VINT8x16_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 8, 16, 8) +# define VINT8x16_AVG_DEFINED +#endif + +#ifndef VINT8x16_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 8, 16, 8) +# define VINT8x16_AND_DEFINED +#endif + +#ifndef VINT8x16_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 8, 16, 8) +# define VINT8x16_OR_DEFINED +#endif + +#ifndef VINT8x16_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 8, 16, 8) +# define VINT8x16_XOR_DEFINED +#endif + +#ifndef VINT8x16_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 8, 16, 8) +# define VINT8x16_NOT_DEFINED +#endif + +#ifndef VINT8x16_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 8, 16, 8) +# define VINT8x16_CMPLT_DEFINED +#endif + +#ifndef VINT8x16_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 8, 16, 8) +# define VINT8x16_CMPEQ_DEFINED +#endif + +#ifndef VINT8x16_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 8, 16, 8) +# define VINT8x16_CMPGT_DEFINED +#endif + +#ifndef VINT8x16_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 8, 16, 8) +# define VINT8x16_CMPLE_DEFINED +#endif + +#ifndef VINT8x16_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 8, 16, 8) +# define VINT8x16_CMPGE_DEFINED +#endif + +#ifndef VINT8x16_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 8, 16, 8) +# define VINT8x16_MIN_DEFINED +#endif + +#ifndef VINT8x16_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 8, 
16, 8) +# define VINT8x16_MAX_DEFINED +#endif + +#ifndef VINT8x16_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 8, 16, 8) +# define VINT8x16_RSHIFT_DEFINED +#endif + +#ifndef VINT8x16_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 8, 16, 8) +# define VINT8x16_LRSHIFT_DEFINED +#endif + +#ifndef VINT8x16_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 8, 16, 8) +# define VINT8x16_LSHIFT_DEFINED +#endif + + + +/* vint8x16 */ + +#ifndef VUINT8x16_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 8, 16, 8) +# define VUINT8x16_SPLAT_DEFINED +#endif + +#ifndef VUINT8x16_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 8, 16, 8) +# define VUINT8x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x16_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 8, 16, 8) +# define VUINT8x16_LOAD_DEFINED +#endif + +#ifndef VUINT8x16_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 8, 16, 8) +# define VUINT8x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x16_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 8, 16, 8) +# define VUINT8x16_STORE_DEFINED +#endif + +#ifndef VUINT8x16_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 8, 16, 8) +# define VUINT8x16_ADD_DEFINED +#endif + +#ifndef VUINT8x16_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 8, 16, 8) +# define VUINT8x16_SUB_DEFINED +#endif + +#ifndef VUINT8x16_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 8, 16, 8) +# define VUINT8x16_MUL_DEFINED +#endif + +#ifndef VUINT8x16_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 8, 16, 8) +# define VUINT8x16_DIV_DEFINED +#endif + +#ifndef VUINT8x16_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 8, 16, 8) +# define VUINT8x16_AVG_DEFINED +#endif + +#ifndef VUINT8x16_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 8, 16, 8) +# define VUINT8x16_AND_DEFINED +#endif + +#ifndef VUINT8x16_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 8, 16, 8) +# define VUINT8x16_OR_DEFINED +#endif + +#ifndef VUINT8x16_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 8, 16, 8) +# define VUINT8x16_XOR_DEFINED +#endif + +#ifndef VUINT8x16_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 8, 16, 8) +# define VUINT8x16_NOT_DEFINED +#endif + +#ifndef VUINT8x16_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 8, 16, 8) +# define VUINT8x16_CMPLT_DEFINED +#endif + +#ifndef VUINT8x16_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 8, 16, 8) +# define VUINT8x16_CMPEQ_DEFINED +#endif + +#ifndef VUINT8x16_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 8, 16, 8) +# define VUINT8x16_CMPGT_DEFINED +#endif + +#ifndef VUINT8x16_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 8, 16, 8) +# define VUINT8x16_CMPLE_DEFINED +#endif + +#ifndef VUINT8x16_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 8, 16, 8) +# define VUINT8x16_CMPGE_DEFINED +#endif + +#ifndef VUINT8x16_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 8, 16, 8) +# define VUINT8x16_MIN_DEFINED +#endif + +#ifndef VUINT8x16_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 8, 16, 8) +# define VUINT8x16_MAX_DEFINED +#endif + +#ifndef VUINT8x16_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 8, 16, 8) +# define VUINT8x16_RSHIFT_DEFINED +#endif + +#ifndef VUINT8x16_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 8, 16, 8) +# define VUINT8x16_LRSHIFT_DEFINED +#endif + +#ifndef VUINT8x16_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 8, 16, 8) +# define VUINT8x16_LSHIFT_DEFINED +#endif + + + +/* vuint8x32 */ + +#ifndef VINT8x32_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 8, 32, 16) +# define VINT8x32_SPLAT_DEFINED +#endif + +#ifndef VINT8x32_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 8, 32, 16) +# define VINT8x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x32_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 8, 32, 16) +# define VINT8x32_LOAD_DEFINED 
+#endif + +#ifndef VINT8x32_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 8, 32, 16) +# define VINT8x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x32_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 8, 32, 16) +# define VINT8x32_STORE_DEFINED +#endif + +#ifndef VINT8x32_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 8, 32, 16) +# define VINT8x32_ADD_DEFINED +#endif + +#ifndef VINT8x32_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 8, 32, 16) +# define VINT8x32_SUB_DEFINED +#endif + +#ifndef VINT8x32_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 8, 32, 16) +# define VINT8x32_MUL_DEFINED +#endif + +#ifndef VINT8x32_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 8, 32, 16) +# define VINT8x32_DIV_DEFINED +#endif + +#ifndef VINT8x32_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 8, 32, 16) +# define VINT8x32_AVG_DEFINED +#endif + +#ifndef VINT8x32_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 8, 32, 16) +# define VINT8x32_AND_DEFINED +#endif + +#ifndef VINT8x32_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 8, 32, 16) +# define VINT8x32_OR_DEFINED +#endif + +#ifndef VINT8x32_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 8, 32, 16) +# define VINT8x32_XOR_DEFINED +#endif + +#ifndef VINT8x32_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 8, 32, 16) +# define VINT8x32_NOT_DEFINED +#endif + +#ifndef VINT8x32_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 8, 32, 16) +# define VINT8x32_CMPLT_DEFINED +#endif + +#ifndef VINT8x32_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 8, 32, 16) +# define VINT8x32_CMPEQ_DEFINED +#endif + +#ifndef VINT8x32_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 8, 32, 16) +# define VINT8x32_CMPGT_DEFINED +#endif + +#ifndef VINT8x32_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 8, 32, 16) +# define VINT8x32_CMPLE_DEFINED +#endif + +#ifndef VINT8x32_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 8, 32, 16) +# define VINT8x32_CMPGE_DEFINED +#endif + +#ifndef VINT8x32_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 8, 32, 16) +# define VINT8x32_MIN_DEFINED +#endif + +#ifndef VINT8x32_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 8, 32, 16) +# define VINT8x32_MAX_DEFINED +#endif + +#ifndef VINT8x32_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 8, 32, 16) +# define VINT8x32_RSHIFT_DEFINED +#endif + +#ifndef VINT8x32_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 8, 32, 16) +# define VINT8x32_LRSHIFT_DEFINED +#endif + +#ifndef VINT8x32_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 8, 32, 16) +# define VINT8x32_LSHIFT_DEFINED +#endif + + + +/* vint8x32 */ + +#ifndef VUINT8x32_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 8, 32, 16) +# define VUINT8x32_SPLAT_DEFINED +#endif + +#ifndef VUINT8x32_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 8, 32, 16) +# define VUINT8x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x32_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 8, 32, 16) +# define VUINT8x32_LOAD_DEFINED +#endif + +#ifndef VUINT8x32_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 8, 32, 16) +# define VUINT8x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x32_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 8, 32, 16) +# define VUINT8x32_STORE_DEFINED +#endif + +#ifndef VUINT8x32_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 8, 32, 16) +# define VUINT8x32_ADD_DEFINED +#endif + +#ifndef VUINT8x32_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 8, 32, 16) +# define VUINT8x32_SUB_DEFINED +#endif + +#ifndef VUINT8x32_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 8, 32, 16) +# define VUINT8x32_MUL_DEFINED +#endif + +#ifndef 
VUINT8x32_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 8, 32, 16) +# define VUINT8x32_DIV_DEFINED +#endif + +#ifndef VUINT8x32_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 8, 32, 16) +# define VUINT8x32_AVG_DEFINED +#endif + +#ifndef VUINT8x32_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 8, 32, 16) +# define VUINT8x32_AND_DEFINED +#endif + +#ifndef VUINT8x32_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 8, 32, 16) +# define VUINT8x32_OR_DEFINED +#endif + +#ifndef VUINT8x32_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 8, 32, 16) +# define VUINT8x32_XOR_DEFINED +#endif + +#ifndef VUINT8x32_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 8, 32, 16) +# define VUINT8x32_NOT_DEFINED +#endif + +#ifndef VUINT8x32_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 8, 32, 16) +# define VUINT8x32_CMPLT_DEFINED +#endif + +#ifndef VUINT8x32_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 8, 32, 16) +# define VUINT8x32_CMPEQ_DEFINED +#endif + +#ifndef VUINT8x32_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 8, 32, 16) +# define VUINT8x32_CMPGT_DEFINED +#endif + +#ifndef VUINT8x32_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 8, 32, 16) +# define VUINT8x32_CMPLE_DEFINED +#endif + +#ifndef VUINT8x32_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 8, 32, 16) +# define VUINT8x32_CMPGE_DEFINED +#endif + +#ifndef VUINT8x32_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 8, 32, 16) +# define VUINT8x32_MIN_DEFINED +#endif + +#ifndef VUINT8x32_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 8, 32, 16) +# define VUINT8x32_MAX_DEFINED +#endif + +#ifndef VUINT8x32_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 8, 32, 16) +# define VUINT8x32_RSHIFT_DEFINED +#endif + +#ifndef VUINT8x32_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 8, 32, 16) +# define VUINT8x32_LRSHIFT_DEFINED +#endif + +#ifndef VUINT8x32_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 8, 32, 16) +# define VUINT8x32_LSHIFT_DEFINED +#endif + + + +/* vuint8x64 */ + +#ifndef VINT8x64_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 8, 64, 32) +# define VINT8x64_SPLAT_DEFINED +#endif + +#ifndef VINT8x64_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 8, 64, 32) +# define VINT8x64_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x64_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 8, 64, 32) +# define VINT8x64_LOAD_DEFINED +#endif + +#ifndef VINT8x64_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 8, 64, 32) +# define VINT8x64_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x64_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 8, 64, 32) +# define VINT8x64_STORE_DEFINED +#endif + +#ifndef VINT8x64_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 8, 64, 32) +# define VINT8x64_ADD_DEFINED +#endif + +#ifndef VINT8x64_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 8, 64, 32) +# define VINT8x64_SUB_DEFINED +#endif + +#ifndef VINT8x64_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 8, 64, 32) +# define VINT8x64_MUL_DEFINED +#endif + +#ifndef VINT8x64_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 8, 64, 32) +# define VINT8x64_DIV_DEFINED +#endif + +#ifndef VINT8x64_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 8, 64, 32) +# define VINT8x64_AVG_DEFINED +#endif + +#ifndef VINT8x64_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 8, 64, 32) +# define VINT8x64_AND_DEFINED +#endif + +#ifndef VINT8x64_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 8, 64, 32) +# define VINT8x64_OR_DEFINED +#endif + +#ifndef VINT8x64_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 8, 64, 32) +# define VINT8x64_XOR_DEFINED +#endif + +#ifndef VINT8x64_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 8, 64, 32) +# define VINT8x64_NOT_DEFINED +#endif + +#ifndef VINT8x64_CMPLT_DEFINED 
+VEC_GENERIC_DBL_CMPLT(/* nothing */, 8, 64, 32) +# define VINT8x64_CMPLT_DEFINED +#endif + +#ifndef VINT8x64_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 8, 64, 32) +# define VINT8x64_CMPEQ_DEFINED +#endif + +#ifndef VINT8x64_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 8, 64, 32) +# define VINT8x64_CMPGT_DEFINED +#endif + +#ifndef VINT8x64_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 8, 64, 32) +# define VINT8x64_CMPLE_DEFINED +#endif + +#ifndef VINT8x64_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 8, 64, 32) +# define VINT8x64_CMPGE_DEFINED +#endif + +#ifndef VINT8x64_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 8, 64, 32) +# define VINT8x64_MIN_DEFINED +#endif + +#ifndef VINT8x64_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 8, 64, 32) +# define VINT8x64_MAX_DEFINED +#endif + +#ifndef VINT8x64_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 8, 64, 32) +# define VINT8x64_RSHIFT_DEFINED +#endif + +#ifndef VINT8x64_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 8, 64, 32) +# define VINT8x64_LRSHIFT_DEFINED +#endif + +#ifndef VINT8x64_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 8, 64, 32) +# define VINT8x64_LSHIFT_DEFINED +#endif + + + +/* vint8x64 */ + +#ifndef VUINT8x64_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 8, 64, 32) +# define VUINT8x64_SPLAT_DEFINED +#endif + +#ifndef VUINT8x64_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 8, 64, 32) +# define VUINT8x64_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x64_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 8, 64, 32) +# define VUINT8x64_LOAD_DEFINED +#endif + +#ifndef VUINT8x64_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 8, 64, 32) +# define VUINT8x64_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x64_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 8, 64, 32) +# define VUINT8x64_STORE_DEFINED +#endif + +#ifndef VUINT8x64_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 8, 64, 32) +# define VUINT8x64_ADD_DEFINED +#endif + +#ifndef VUINT8x64_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 8, 64, 32) +# define VUINT8x64_SUB_DEFINED +#endif + +#ifndef VUINT8x64_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 8, 64, 32) +# define VUINT8x64_MUL_DEFINED +#endif + +#ifndef VUINT8x64_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 8, 64, 32) +# define VUINT8x64_DIV_DEFINED +#endif + +#ifndef VUINT8x64_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 8, 64, 32) +# define VUINT8x64_AVG_DEFINED +#endif + +#ifndef VUINT8x64_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 8, 64, 32) +# define VUINT8x64_AND_DEFINED +#endif + +#ifndef VUINT8x64_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 8, 64, 32) +# define VUINT8x64_OR_DEFINED +#endif + +#ifndef VUINT8x64_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 8, 64, 32) +# define VUINT8x64_XOR_DEFINED +#endif + +#ifndef VUINT8x64_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 8, 64, 32) +# define VUINT8x64_NOT_DEFINED +#endif + +#ifndef VUINT8x64_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 8, 64, 32) +# define VUINT8x64_CMPLT_DEFINED +#endif + +#ifndef VUINT8x64_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 8, 64, 32) +# define VUINT8x64_CMPEQ_DEFINED +#endif + +#ifndef VUINT8x64_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 8, 64, 32) +# define VUINT8x64_CMPGT_DEFINED +#endif + +#ifndef VUINT8x64_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 8, 64, 32) +# define VUINT8x64_CMPLE_DEFINED +#endif + +#ifndef VUINT8x64_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 8, 64, 32) +# define VUINT8x64_CMPGE_DEFINED +#endif + +#ifndef VUINT8x64_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 8, 64, 32) +# define VUINT8x64_MIN_DEFINED +#endif + +#ifndef VUINT8x64_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 8, 64, 32) 
+# define VUINT8x64_MAX_DEFINED +#endif + +#ifndef VUINT8x64_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 8, 64, 32) +# define VUINT8x64_RSHIFT_DEFINED +#endif + +#ifndef VUINT8x64_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 8, 64, 32) +# define VUINT8x64_LRSHIFT_DEFINED +#endif + +#ifndef VUINT8x64_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 8, 64, 32) +# define VUINT8x64_LSHIFT_DEFINED +#endif + + + +/* vuint16x2 */ + +#ifndef VINT16x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(/* nothing */, 16, 2) +# define VINT16x2_SPLAT_DEFINED +#endif +#ifndef VINT16x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(/* nothing */, 16, 2) +# define VINT16x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VINT16x2_LOAD_DEFINED +VEC_GENERIC_LOAD(/* nothing */, 16, 2) +# define VINT16x2_LOAD_DEFINED +#endif +#ifndef VINT16x2_STORE_ALIGNED_DEFINED +VEC_GENERIC_STORE_ALIGNED(/* nothing */, 16, 2) +# define VINT16x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VINT16x2_STORE_DEFINED +VEC_GENERIC_STORE(/* nothing */, 16, 2) +# define VINT16x2_STORE_DEFINED +#endif +#ifndef VINT16x2_ADD_DEFINED +VEC_GENERIC_ADD(/* nothing */, 16, 2) +# define VINT16x2_ADD_DEFINED +#endif +#ifndef VINT16x2_SUB_DEFINED +VEC_GENERIC_SUB(/* nothing */, 16, 2) +# define VINT16x2_SUB_DEFINED +#endif +#ifndef VINT16x2_MUL_DEFINED +VEC_GENERIC_MUL(/* nothing */, 16, 2) +# define VINT16x2_MUL_DEFINED +#endif +#ifndef VINT16x2_DIV_DEFINED +VEC_GENERIC_DIV(/* nothing */, 16, 2) +# define VINT16x2_DIV_DEFINED +#endif +#ifndef VINT16x2_AVG_DEFINED +VEC_GENERIC_AVG(/* nothing */, 16, 2) +# define VINT16x2_AVG_DEFINED +#endif +#ifndef VINT16x2_AND_DEFINED +VEC_GENERIC_AND(/* nothing */, 16, 2) +# define VINT16x2_AND_DEFINED +#endif +#ifndef VINT16x2_OR_DEFINED +VEC_GENERIC_OR(/* nothing */, 16, 2) +# define VINT16x2_OR_DEFINED +#endif +#ifndef VINT16x2_XOR_DEFINED +VEC_GENERIC_XOR(/* nothing */, 16, 2) +# define VINT16x2_XOR_DEFINED +#endif +#ifndef VINT16x2_NOT_DEFINED +VEC_GENERIC_NOT(/* nothing */, 16, 2) +# define VINT16x2_NOT_DEFINED +#endif +#ifndef VINT16x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(/* nothing */, 16, 2) +# define VINT16x2_CMPLT_DEFINED +#endif +#ifndef VINT16x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(/* nothing */, 16, 2) +# define VINT16x2_CMPEQ_DEFINED +#endif +#ifndef VINT16x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(/* nothing */, 16, 2) +# define VINT16x2_CMPGT_DEFINED +#endif +#ifndef VINT16x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(/* nothing */, 16, 2) +# define VINT16x2_CMPLE_DEFINED +#endif +#ifndef VINT16x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(/* nothing */, 16, 2) +# define VINT16x2_CMPGE_DEFINED +#endif +#ifndef VINT16x2_MIN_DEFINED +VEC_GENERIC_MIN(/* nothing */, 16, 2) +# define VINT16x2_MIN_DEFINED +#endif +#ifndef VINT16x2_MAX_DEFINED +VEC_GENERIC_MAX(/* nothing */, 16, 2) +# define VINT16x2_MAX_DEFINED +#endif +#ifndef VINT16x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(/* nothing */, 16, 2) +# define VINT16x2_RSHIFT_DEFINED +#endif +#ifndef VINT16x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(/* nothing */, 16, 2) +# define VINT16x2_LRSHIFT_DEFINED +#endif +#ifndef VINT16x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(/* nothing */, 16, 2) +# define VINT16x2_LSHIFT_DEFINED +#endif + + +/* vint16x2 */ + +#ifndef VUINT16x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(u, 16, 2) +# define VUINT16x2_SPLAT_DEFINED +#endif +#ifndef VUINT16x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(u, 16, 2) +# define VUINT16x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VUINT16x2_LOAD_DEFINED +VEC_GENERIC_LOAD(u, 16, 2) +# define VUINT16x2_LOAD_DEFINED +#endif +#ifndef VUINT16x2_STORE_ALIGNED_DEFINED 
+VEC_GENERIC_STORE_ALIGNED(u, 16, 2) +# define VUINT16x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VUINT16x2_STORE_DEFINED +VEC_GENERIC_STORE(u, 16, 2) +# define VUINT16x2_STORE_DEFINED +#endif +#ifndef VUINT16x2_ADD_DEFINED +VEC_GENERIC_ADD(u, 16, 2) +# define VUINT16x2_ADD_DEFINED +#endif +#ifndef VUINT16x2_SUB_DEFINED +VEC_GENERIC_SUB(u, 16, 2) +# define VUINT16x2_SUB_DEFINED +#endif +#ifndef VUINT16x2_MUL_DEFINED +VEC_GENERIC_MUL(u, 16, 2) +# define VUINT16x2_MUL_DEFINED +#endif +#ifndef VUINT16x2_DIV_DEFINED +VEC_GENERIC_DIV(u, 16, 2) +# define VUINT16x2_DIV_DEFINED +#endif +#ifndef VUINT16x2_AVG_DEFINED +VEC_GENERIC_AVG(u, 16, 2) +# define VUINT16x2_AVG_DEFINED +#endif +#ifndef VUINT16x2_AND_DEFINED +VEC_GENERIC_AND(u, 16, 2) +# define VUINT16x2_AND_DEFINED +#endif +#ifndef VUINT16x2_OR_DEFINED +VEC_GENERIC_OR(u, 16, 2) +# define VUINT16x2_OR_DEFINED +#endif +#ifndef VUINT16x2_XOR_DEFINED +VEC_GENERIC_XOR(u, 16, 2) +# define VUINT16x2_XOR_DEFINED +#endif +#ifndef VUINT16x2_NOT_DEFINED +VEC_GENERIC_NOT(u, 16, 2) +# define VUINT16x2_NOT_DEFINED +#endif +#ifndef VUINT16x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(u, 16, 2) +# define VUINT16x2_CMPLT_DEFINED +#endif +#ifndef VUINT16x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(u, 16, 2) +# define VUINT16x2_CMPEQ_DEFINED +#endif +#ifndef VUINT16x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(u, 16, 2) +# define VUINT16x2_CMPGT_DEFINED +#endif +#ifndef VUINT16x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(u, 16, 2) +# define VUINT16x2_CMPLE_DEFINED +#endif +#ifndef VUINT16x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(u, 16, 2) +# define VUINT16x2_CMPGE_DEFINED +#endif +#ifndef VUINT16x2_MIN_DEFINED +VEC_GENERIC_MIN(u, 16, 2) +# define VUINT16x2_MIN_DEFINED +#endif +#ifndef VUINT16x2_MAX_DEFINED +VEC_GENERIC_MAX(u, 16, 2) +# define VUINT16x2_MAX_DEFINED +#endif +#ifndef VUINT16x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(u, 16, 2) +# define VUINT16x2_RSHIFT_DEFINED +#endif +#ifndef VUINT16x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(u, 16, 2) +# define VUINT16x2_LRSHIFT_DEFINED +#endif +#ifndef VUINT16x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(u, 16, 2) +# define VUINT16x2_LSHIFT_DEFINED +#endif + + +/* vuint16x4 */ + +#ifndef VINT16x4_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 16, 4, 2) +# define VINT16x4_SPLAT_DEFINED +#endif + +#ifndef VINT16x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 16, 4, 2) +# define VINT16x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 16, 4, 2) +# define VINT16x4_LOAD_DEFINED +#endif + +#ifndef VINT16x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 16, 4, 2) +# define VINT16x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 16, 4, 2) +# define VINT16x4_STORE_DEFINED +#endif + +#ifndef VINT16x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 16, 4, 2) +# define VINT16x4_ADD_DEFINED +#endif + +#ifndef VINT16x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 16, 4, 2) +# define VINT16x4_SUB_DEFINED +#endif + +#ifndef VINT16x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 16, 4, 2) +# define VINT16x4_MUL_DEFINED +#endif + +#ifndef VINT16x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 16, 4, 2) +# define VINT16x4_DIV_DEFINED +#endif + +#ifndef VINT16x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 16, 4, 2) +# define VINT16x4_AVG_DEFINED +#endif + +#ifndef VINT16x4_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 16, 4, 2) +# define VINT16x4_AND_DEFINED +#endif + +#ifndef VINT16x4_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing 
*/, 16, 4, 2) +# define VINT16x4_OR_DEFINED +#endif + +#ifndef VINT16x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 16, 4, 2) +# define VINT16x4_XOR_DEFINED +#endif + +#ifndef VINT16x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 16, 4, 2) +# define VINT16x4_NOT_DEFINED +#endif + +#ifndef VINT16x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 16, 4, 2) +# define VINT16x4_CMPLT_DEFINED +#endif + +#ifndef VINT16x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 16, 4, 2) +# define VINT16x4_CMPEQ_DEFINED +#endif + +#ifndef VINT16x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 16, 4, 2) +# define VINT16x4_CMPGT_DEFINED +#endif + +#ifndef VINT16x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 16, 4, 2) +# define VINT16x4_CMPLE_DEFINED +#endif + +#ifndef VINT16x4_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 16, 4, 2) +# define VINT16x4_CMPGE_DEFINED +#endif + +#ifndef VINT16x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 16, 4, 2) +# define VINT16x4_MIN_DEFINED +#endif + +#ifndef VINT16x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 16, 4, 2) +# define VINT16x4_MAX_DEFINED +#endif + +#ifndef VINT16x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 16, 4, 2) +# define VINT16x4_RSHIFT_DEFINED +#endif + +#ifndef VINT16x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 16, 4, 2) +# define VINT16x4_LRSHIFT_DEFINED +#endif + +#ifndef VINT16x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 16, 4, 2) +# define VINT16x4_LSHIFT_DEFINED +#endif + + + +/* vint16x4 */ + +#ifndef VUINT16x4_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 16, 4, 2) +# define VUINT16x4_SPLAT_DEFINED +#endif + +#ifndef VUINT16x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 16, 4, 2) +# define VUINT16x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 16, 4, 2) +# define VUINT16x4_LOAD_DEFINED +#endif + +#ifndef VUINT16x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 16, 4, 2) +# define VUINT16x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 16, 4, 2) +# define VUINT16x4_STORE_DEFINED +#endif + +#ifndef VUINT16x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 16, 4, 2) +# define VUINT16x4_ADD_DEFINED +#endif + +#ifndef VUINT16x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 16, 4, 2) +# define VUINT16x4_SUB_DEFINED +#endif + +#ifndef VUINT16x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 16, 4, 2) +# define VUINT16x4_MUL_DEFINED +#endif + +#ifndef VUINT16x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 16, 4, 2) +# define VUINT16x4_DIV_DEFINED +#endif + +#ifndef VUINT16x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 16, 4, 2) +# define VUINT16x4_AVG_DEFINED +#endif + +#ifndef VUINT16x4_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 16, 4, 2) +# define VUINT16x4_AND_DEFINED +#endif + +#ifndef VUINT16x4_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 16, 4, 2) +# define VUINT16x4_OR_DEFINED +#endif + +#ifndef VUINT16x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 16, 4, 2) +# define VUINT16x4_XOR_DEFINED +#endif + +#ifndef VUINT16x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 16, 4, 2) +# define VUINT16x4_NOT_DEFINED +#endif + +#ifndef VUINT16x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 16, 4, 2) +# define VUINT16x4_CMPLT_DEFINED +#endif + +#ifndef VUINT16x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 16, 4, 2) +# define VUINT16x4_CMPEQ_DEFINED +#endif + +#ifndef VUINT16x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 16, 4, 2) +# define VUINT16x4_CMPGT_DEFINED +#endif + +#ifndef VUINT16x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 16, 4, 2) +# define VUINT16x4_CMPLE_DEFINED 
+#endif + +#ifndef VUINT16x4_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 16, 4, 2) +# define VUINT16x4_CMPGE_DEFINED +#endif + +#ifndef VUINT16x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 16, 4, 2) +# define VUINT16x4_MIN_DEFINED +#endif + +#ifndef VUINT16x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 16, 4, 2) +# define VUINT16x4_MAX_DEFINED +#endif + +#ifndef VUINT16x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 16, 4, 2) +# define VUINT16x4_RSHIFT_DEFINED +#endif + +#ifndef VUINT16x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 16, 4, 2) +# define VUINT16x4_LRSHIFT_DEFINED +#endif + +#ifndef VUINT16x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 16, 4, 2) +# define VUINT16x4_LSHIFT_DEFINED +#endif + + + +/* vuint16x8 */ + +#ifndef VINT16x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 16, 8, 4) +# define VINT16x8_SPLAT_DEFINED +#endif + +#ifndef VINT16x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 16, 8, 4) +# define VINT16x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 16, 8, 4) +# define VINT16x8_LOAD_DEFINED +#endif + +#ifndef VINT16x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 16, 8, 4) +# define VINT16x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 16, 8, 4) +# define VINT16x8_STORE_DEFINED +#endif + +#ifndef VINT16x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 16, 8, 4) +# define VINT16x8_ADD_DEFINED +#endif + +#ifndef VINT16x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 16, 8, 4) +# define VINT16x8_SUB_DEFINED +#endif + +#ifndef VINT16x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 16, 8, 4) +# define VINT16x8_MUL_DEFINED +#endif + +#ifndef VINT16x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 16, 8, 4) +# define VINT16x8_DIV_DEFINED +#endif + +#ifndef VINT16x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 16, 8, 4) +# define VINT16x8_AVG_DEFINED +#endif + +#ifndef VINT16x8_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 16, 8, 4) +# define VINT16x8_AND_DEFINED +#endif + +#ifndef VINT16x8_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 16, 8, 4) +# define VINT16x8_OR_DEFINED +#endif + +#ifndef VINT16x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 16, 8, 4) +# define VINT16x8_XOR_DEFINED +#endif + +#ifndef VINT16x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 16, 8, 4) +# define VINT16x8_NOT_DEFINED +#endif + +#ifndef VINT16x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 16, 8, 4) +# define VINT16x8_CMPLT_DEFINED +#endif + +#ifndef VINT16x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 16, 8, 4) +# define VINT16x8_CMPEQ_DEFINED +#endif + +#ifndef VINT16x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 16, 8, 4) +# define VINT16x8_CMPGT_DEFINED +#endif + +#ifndef VINT16x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 16, 8, 4) +# define VINT16x8_CMPLE_DEFINED +#endif + +#ifndef VINT16x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 16, 8, 4) +# define VINT16x8_CMPGE_DEFINED +#endif + +#ifndef VINT16x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 16, 8, 4) +# define VINT16x8_MIN_DEFINED +#endif + +#ifndef VINT16x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 16, 8, 4) +# define VINT16x8_MAX_DEFINED +#endif + +#ifndef VINT16x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 16, 8, 4) +# define VINT16x8_RSHIFT_DEFINED +#endif + +#ifndef VINT16x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 16, 8, 4) +# define VINT16x8_LRSHIFT_DEFINED +#endif + +#ifndef VINT16x8_LSHIFT_DEFINED 
+VEC_GENERIC_DBL_LSHIFT(/* nothing */, 16, 8, 4) +# define VINT16x8_LSHIFT_DEFINED +#endif + + + +/* vint16x8 */ + +#ifndef VUINT16x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 16, 8, 4) +# define VUINT16x8_SPLAT_DEFINED +#endif + +#ifndef VUINT16x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 16, 8, 4) +# define VUINT16x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 16, 8, 4) +# define VUINT16x8_LOAD_DEFINED +#endif + +#ifndef VUINT16x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 16, 8, 4) +# define VUINT16x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 16, 8, 4) +# define VUINT16x8_STORE_DEFINED +#endif + +#ifndef VUINT16x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 16, 8, 4) +# define VUINT16x8_ADD_DEFINED +#endif + +#ifndef VUINT16x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 16, 8, 4) +# define VUINT16x8_SUB_DEFINED +#endif + +#ifndef VUINT16x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 16, 8, 4) +# define VUINT16x8_MUL_DEFINED +#endif + +#ifndef VUINT16x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 16, 8, 4) +# define VUINT16x8_DIV_DEFINED +#endif + +#ifndef VUINT16x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 16, 8, 4) +# define VUINT16x8_AVG_DEFINED +#endif + +#ifndef VUINT16x8_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 16, 8, 4) +# define VUINT16x8_AND_DEFINED +#endif + +#ifndef VUINT16x8_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 16, 8, 4) +# define VUINT16x8_OR_DEFINED +#endif + +#ifndef VUINT16x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 16, 8, 4) +# define VUINT16x8_XOR_DEFINED +#endif + +#ifndef VUINT16x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 16, 8, 4) +# define VUINT16x8_NOT_DEFINED +#endif + +#ifndef VUINT16x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 16, 8, 4) +# define VUINT16x8_CMPLT_DEFINED +#endif + +#ifndef VUINT16x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 16, 8, 4) +# define VUINT16x8_CMPEQ_DEFINED +#endif + +#ifndef VUINT16x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 16, 8, 4) +# define VUINT16x8_CMPGT_DEFINED +#endif + +#ifndef VUINT16x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 16, 8, 4) +# define VUINT16x8_CMPLE_DEFINED +#endif + +#ifndef VUINT16x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 16, 8, 4) +# define VUINT16x8_CMPGE_DEFINED +#endif + +#ifndef VUINT16x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 16, 8, 4) +# define VUINT16x8_MIN_DEFINED +#endif + +#ifndef VUINT16x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 16, 8, 4) +# define VUINT16x8_MAX_DEFINED +#endif + +#ifndef VUINT16x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 16, 8, 4) +# define VUINT16x8_RSHIFT_DEFINED +#endif + +#ifndef VUINT16x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 16, 8, 4) +# define VUINT16x8_LRSHIFT_DEFINED +#endif + +#ifndef VUINT16x8_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 16, 8, 4) +# define VUINT16x8_LSHIFT_DEFINED +#endif + + + +/* vuint16x16 */ + +#ifndef VINT16x16_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 16, 16, 8) +# define VINT16x16_SPLAT_DEFINED +#endif + +#ifndef VINT16x16_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 16, 16, 8) +# define VINT16x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x16_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 16, 16, 8) +# define VINT16x16_LOAD_DEFINED +#endif + +#ifndef VINT16x16_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 16, 16, 8) +# define VINT16x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x16_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 16, 16, 8) +# define VINT16x16_STORE_DEFINED +#endif + +#ifndef VINT16x16_ADD_DEFINED 
+VEC_GENERIC_DBL_ADD(/* nothing */, 16, 16, 8) +# define VINT16x16_ADD_DEFINED +#endif + +#ifndef VINT16x16_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 16, 16, 8) +# define VINT16x16_SUB_DEFINED +#endif + +#ifndef VINT16x16_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 16, 16, 8) +# define VINT16x16_MUL_DEFINED +#endif + +#ifndef VINT16x16_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 16, 16, 8) +# define VINT16x16_DIV_DEFINED +#endif + +#ifndef VINT16x16_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 16, 16, 8) +# define VINT16x16_AVG_DEFINED +#endif + +#ifndef VINT16x16_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 16, 16, 8) +# define VINT16x16_AND_DEFINED +#endif + +#ifndef VINT16x16_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 16, 16, 8) +# define VINT16x16_OR_DEFINED +#endif + +#ifndef VINT16x16_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 16, 16, 8) +# define VINT16x16_XOR_DEFINED +#endif + +#ifndef VINT16x16_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 16, 16, 8) +# define VINT16x16_NOT_DEFINED +#endif + +#ifndef VINT16x16_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 16, 16, 8) +# define VINT16x16_CMPLT_DEFINED +#endif + +#ifndef VINT16x16_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 16, 16, 8) +# define VINT16x16_CMPEQ_DEFINED +#endif + +#ifndef VINT16x16_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 16, 16, 8) +# define VINT16x16_CMPGT_DEFINED +#endif + +#ifndef VINT16x16_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 16, 16, 8) +# define VINT16x16_CMPLE_DEFINED +#endif + +#ifndef VINT16x16_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 16, 16, 8) +# define VINT16x16_CMPGE_DEFINED +#endif + +#ifndef VINT16x16_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 16, 16, 8) +# define VINT16x16_MIN_DEFINED +#endif + +#ifndef VINT16x16_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 16, 16, 8) +# define VINT16x16_MAX_DEFINED +#endif + +#ifndef VINT16x16_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 16, 16, 8) +# define VINT16x16_RSHIFT_DEFINED +#endif + +#ifndef VINT16x16_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 16, 16, 8) +# define VINT16x16_LRSHIFT_DEFINED +#endif + +#ifndef VINT16x16_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 16, 16, 8) +# define VINT16x16_LSHIFT_DEFINED +#endif + + + +/* vint16x16 */ + +#ifndef VUINT16x16_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 16, 16, 8) +# define VUINT16x16_SPLAT_DEFINED +#endif + +#ifndef VUINT16x16_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 16, 16, 8) +# define VUINT16x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x16_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 16, 16, 8) +# define VUINT16x16_LOAD_DEFINED +#endif + +#ifndef VUINT16x16_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 16, 16, 8) +# define VUINT16x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x16_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 16, 16, 8) +# define VUINT16x16_STORE_DEFINED +#endif + +#ifndef VUINT16x16_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 16, 16, 8) +# define VUINT16x16_ADD_DEFINED +#endif + +#ifndef VUINT16x16_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 16, 16, 8) +# define VUINT16x16_SUB_DEFINED +#endif + +#ifndef VUINT16x16_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 16, 16, 8) +# define VUINT16x16_MUL_DEFINED +#endif + +#ifndef VUINT16x16_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 16, 16, 8) +# define VUINT16x16_DIV_DEFINED +#endif + +#ifndef VUINT16x16_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 16, 16, 8) +# define VUINT16x16_AVG_DEFINED +#endif + +#ifndef VUINT16x16_AND_DEFINED 
+VEC_GENERIC_DBL_AND(u, 16, 16, 8) +# define VUINT16x16_AND_DEFINED +#endif + +#ifndef VUINT16x16_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 16, 16, 8) +# define VUINT16x16_OR_DEFINED +#endif + +#ifndef VUINT16x16_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 16, 16, 8) +# define VUINT16x16_XOR_DEFINED +#endif + +#ifndef VUINT16x16_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 16, 16, 8) +# define VUINT16x16_NOT_DEFINED +#endif + +#ifndef VUINT16x16_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 16, 16, 8) +# define VUINT16x16_CMPLT_DEFINED +#endif + +#ifndef VUINT16x16_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 16, 16, 8) +# define VUINT16x16_CMPEQ_DEFINED +#endif + +#ifndef VUINT16x16_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 16, 16, 8) +# define VUINT16x16_CMPGT_DEFINED +#endif + +#ifndef VUINT16x16_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 16, 16, 8) +# define VUINT16x16_CMPLE_DEFINED +#endif + +#ifndef VUINT16x16_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 16, 16, 8) +# define VUINT16x16_CMPGE_DEFINED +#endif + +#ifndef VUINT16x16_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 16, 16, 8) +# define VUINT16x16_MIN_DEFINED +#endif + +#ifndef VUINT16x16_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 16, 16, 8) +# define VUINT16x16_MAX_DEFINED +#endif + +#ifndef VUINT16x16_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 16, 16, 8) +# define VUINT16x16_RSHIFT_DEFINED +#endif + +#ifndef VUINT16x16_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 16, 16, 8) +# define VUINT16x16_LRSHIFT_DEFINED +#endif + +#ifndef VUINT16x16_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 16, 16, 8) +# define VUINT16x16_LSHIFT_DEFINED +#endif + + + +/* vuint16x32 */ + +#ifndef VINT16x32_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 16, 32, 16) +# define VINT16x32_SPLAT_DEFINED +#endif + +#ifndef VINT16x32_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 16, 32, 16) +# define VINT16x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x32_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 16, 32, 16) +# define VINT16x32_LOAD_DEFINED +#endif + +#ifndef VINT16x32_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 16, 32, 16) +# define VINT16x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x32_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 16, 32, 16) +# define VINT16x32_STORE_DEFINED +#endif + +#ifndef VINT16x32_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 16, 32, 16) +# define VINT16x32_ADD_DEFINED +#endif + +#ifndef VINT16x32_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 16, 32, 16) +# define VINT16x32_SUB_DEFINED +#endif + +#ifndef VINT16x32_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 16, 32, 16) +# define VINT16x32_MUL_DEFINED +#endif + +#ifndef VINT16x32_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 16, 32, 16) +# define VINT16x32_DIV_DEFINED +#endif + +#ifndef VINT16x32_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 16, 32, 16) +# define VINT16x32_AVG_DEFINED +#endif + +#ifndef VINT16x32_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 16, 32, 16) +# define VINT16x32_AND_DEFINED +#endif + +#ifndef VINT16x32_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 16, 32, 16) +# define VINT16x32_OR_DEFINED +#endif + +#ifndef VINT16x32_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 16, 32, 16) +# define VINT16x32_XOR_DEFINED +#endif + +#ifndef VINT16x32_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 16, 32, 16) +# define VINT16x32_NOT_DEFINED +#endif + +#ifndef VINT16x32_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 16, 32, 16) +# define VINT16x32_CMPLT_DEFINED +#endif + +#ifndef VINT16x32_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 16, 
32, 16) +# define VINT16x32_CMPEQ_DEFINED +#endif + +#ifndef VINT16x32_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 16, 32, 16) +# define VINT16x32_CMPGT_DEFINED +#endif + +#ifndef VINT16x32_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 16, 32, 16) +# define VINT16x32_CMPLE_DEFINED +#endif + +#ifndef VINT16x32_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 16, 32, 16) +# define VINT16x32_CMPGE_DEFINED +#endif + +#ifndef VINT16x32_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 16, 32, 16) +# define VINT16x32_MIN_DEFINED +#endif + +#ifndef VINT16x32_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 16, 32, 16) +# define VINT16x32_MAX_DEFINED +#endif + +#ifndef VINT16x32_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 16, 32, 16) +# define VINT16x32_RSHIFT_DEFINED +#endif + +#ifndef VINT16x32_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 16, 32, 16) +# define VINT16x32_LRSHIFT_DEFINED +#endif + +#ifndef VINT16x32_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 16, 32, 16) +# define VINT16x32_LSHIFT_DEFINED +#endif + + + +/* vint16x32 */ + +#ifndef VUINT16x32_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 16, 32, 16) +# define VUINT16x32_SPLAT_DEFINED +#endif + +#ifndef VUINT16x32_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 16, 32, 16) +# define VUINT16x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x32_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 16, 32, 16) +# define VUINT16x32_LOAD_DEFINED +#endif + +#ifndef VUINT16x32_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 16, 32, 16) +# define VUINT16x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x32_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 16, 32, 16) +# define VUINT16x32_STORE_DEFINED +#endif + +#ifndef VUINT16x32_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 16, 32, 16) +# define VUINT16x32_ADD_DEFINED +#endif + +#ifndef VUINT16x32_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 16, 32, 16) +# define VUINT16x32_SUB_DEFINED +#endif + +#ifndef VUINT16x32_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 16, 32, 16) +# define VUINT16x32_MUL_DEFINED +#endif + +#ifndef VUINT16x32_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 16, 32, 16) +# define VUINT16x32_DIV_DEFINED +#endif + +#ifndef VUINT16x32_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 16, 32, 16) +# define VUINT16x32_AVG_DEFINED +#endif + +#ifndef VUINT16x32_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 16, 32, 16) +# define VUINT16x32_AND_DEFINED +#endif + +#ifndef VUINT16x32_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 16, 32, 16) +# define VUINT16x32_OR_DEFINED +#endif + +#ifndef VUINT16x32_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 16, 32, 16) +# define VUINT16x32_XOR_DEFINED +#endif + +#ifndef VUINT16x32_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 16, 32, 16) +# define VUINT16x32_NOT_DEFINED +#endif + +#ifndef VUINT16x32_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 16, 32, 16) +# define VUINT16x32_CMPLT_DEFINED +#endif + +#ifndef VUINT16x32_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 16, 32, 16) +# define VUINT16x32_CMPEQ_DEFINED +#endif + +#ifndef VUINT16x32_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 16, 32, 16) +# define VUINT16x32_CMPGT_DEFINED +#endif + +#ifndef VUINT16x32_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 16, 32, 16) +# define VUINT16x32_CMPLE_DEFINED +#endif + +#ifndef VUINT16x32_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 16, 32, 16) +# define VUINT16x32_CMPGE_DEFINED +#endif + +#ifndef VUINT16x32_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 16, 32, 16) +# define VUINT16x32_MIN_DEFINED +#endif + +#ifndef VUINT16x32_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 16, 32, 16) +# define VUINT16x32_MAX_DEFINED +#endif + +#ifndef 
VUINT16x32_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 16, 32, 16) +# define VUINT16x32_RSHIFT_DEFINED +#endif + +#ifndef VUINT16x32_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 16, 32, 16) +# define VUINT16x32_LRSHIFT_DEFINED +#endif + +#ifndef VUINT16x32_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 16, 32, 16) +# define VUINT16x32_LSHIFT_DEFINED +#endif + + + +/* vuint32x2 */ + +#ifndef VINT32x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(/* nothing */, 32, 2) +# define VINT32x2_SPLAT_DEFINED +#endif +#ifndef VINT32x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(/* nothing */, 32, 2) +# define VINT32x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VINT32x2_LOAD_DEFINED +VEC_GENERIC_LOAD(/* nothing */, 32, 2) +# define VINT32x2_LOAD_DEFINED +#endif +#ifndef VINT32x2_STORE_ALIGNED_DEFINED +VEC_GENERIC_STORE_ALIGNED(/* nothing */, 32, 2) +# define VINT32x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VINT32x2_STORE_DEFINED +VEC_GENERIC_STORE(/* nothing */, 32, 2) +# define VINT32x2_STORE_DEFINED +#endif +#ifndef VINT32x2_ADD_DEFINED +VEC_GENERIC_ADD(/* nothing */, 32, 2) +# define VINT32x2_ADD_DEFINED +#endif +#ifndef VINT32x2_SUB_DEFINED +VEC_GENERIC_SUB(/* nothing */, 32, 2) +# define VINT32x2_SUB_DEFINED +#endif +#ifndef VINT32x2_MUL_DEFINED +VEC_GENERIC_MUL(/* nothing */, 32, 2) +# define VINT32x2_MUL_DEFINED +#endif +#ifndef VINT32x2_DIV_DEFINED +VEC_GENERIC_DIV(/* nothing */, 32, 2) +# define VINT32x2_DIV_DEFINED +#endif +#ifndef VINT32x2_AVG_DEFINED +VEC_GENERIC_AVG(/* nothing */, 32, 2) +# define VINT32x2_AVG_DEFINED +#endif +#ifndef VINT32x2_AND_DEFINED +VEC_GENERIC_AND(/* nothing */, 32, 2) +# define VINT32x2_AND_DEFINED +#endif +#ifndef VINT32x2_OR_DEFINED +VEC_GENERIC_OR(/* nothing */, 32, 2) +# define VINT32x2_OR_DEFINED +#endif +#ifndef VINT32x2_XOR_DEFINED +VEC_GENERIC_XOR(/* nothing */, 32, 2) +# define VINT32x2_XOR_DEFINED +#endif +#ifndef VINT32x2_NOT_DEFINED +VEC_GENERIC_NOT(/* nothing */, 32, 2) +# define VINT32x2_NOT_DEFINED +#endif +#ifndef VINT32x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(/* nothing */, 32, 2) +# define VINT32x2_CMPLT_DEFINED +#endif +#ifndef VINT32x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(/* nothing */, 32, 2) +# define VINT32x2_CMPEQ_DEFINED +#endif +#ifndef VINT32x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(/* nothing */, 32, 2) +# define VINT32x2_CMPGT_DEFINED +#endif +#ifndef VINT32x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(/* nothing */, 32, 2) +# define VINT32x2_CMPLE_DEFINED +#endif +#ifndef VINT32x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(/* nothing */, 32, 2) +# define VINT32x2_CMPGE_DEFINED +#endif +#ifndef VINT32x2_MIN_DEFINED +VEC_GENERIC_MIN(/* nothing */, 32, 2) +# define VINT32x2_MIN_DEFINED +#endif +#ifndef VINT32x2_MAX_DEFINED +VEC_GENERIC_MAX(/* nothing */, 32, 2) +# define VINT32x2_MAX_DEFINED +#endif +#ifndef VINT32x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(/* nothing */, 32, 2) +# define VINT32x2_RSHIFT_DEFINED +#endif +#ifndef VINT32x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(/* nothing */, 32, 2) +# define VINT32x2_LRSHIFT_DEFINED +#endif +#ifndef VINT32x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(/* nothing */, 32, 2) +# define VINT32x2_LSHIFT_DEFINED +#endif + + +/* vint32x2 */ + +#ifndef VUINT32x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(u, 32, 2) +# define VUINT32x2_SPLAT_DEFINED +#endif +#ifndef VUINT32x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(u, 32, 2) +# define VUINT32x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VUINT32x2_LOAD_DEFINED +VEC_GENERIC_LOAD(u, 32, 2) +# define VUINT32x2_LOAD_DEFINED +#endif +#ifndef VUINT32x2_STORE_ALIGNED_DEFINED +VEC_GENERIC_STORE_ALIGNED(u, 32, 2) +# define 
VUINT32x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VUINT32x2_STORE_DEFINED +VEC_GENERIC_STORE(u, 32, 2) +# define VUINT32x2_STORE_DEFINED +#endif +#ifndef VUINT32x2_ADD_DEFINED +VEC_GENERIC_ADD(u, 32, 2) +# define VUINT32x2_ADD_DEFINED +#endif +#ifndef VUINT32x2_SUB_DEFINED +VEC_GENERIC_SUB(u, 32, 2) +# define VUINT32x2_SUB_DEFINED +#endif +#ifndef VUINT32x2_MUL_DEFINED +VEC_GENERIC_MUL(u, 32, 2) +# define VUINT32x2_MUL_DEFINED +#endif +#ifndef VUINT32x2_DIV_DEFINED +VEC_GENERIC_DIV(u, 32, 2) +# define VUINT32x2_DIV_DEFINED +#endif +#ifndef VUINT32x2_AVG_DEFINED +VEC_GENERIC_AVG(u, 32, 2) +# define VUINT32x2_AVG_DEFINED +#endif +#ifndef VUINT32x2_AND_DEFINED +VEC_GENERIC_AND(u, 32, 2) +# define VUINT32x2_AND_DEFINED +#endif +#ifndef VUINT32x2_OR_DEFINED +VEC_GENERIC_OR(u, 32, 2) +# define VUINT32x2_OR_DEFINED +#endif +#ifndef VUINT32x2_XOR_DEFINED +VEC_GENERIC_XOR(u, 32, 2) +# define VUINT32x2_XOR_DEFINED +#endif +#ifndef VUINT32x2_NOT_DEFINED +VEC_GENERIC_NOT(u, 32, 2) +# define VUINT32x2_NOT_DEFINED +#endif +#ifndef VUINT32x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(u, 32, 2) +# define VUINT32x2_CMPLT_DEFINED +#endif +#ifndef VUINT32x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(u, 32, 2) +# define VUINT32x2_CMPEQ_DEFINED +#endif +#ifndef VUINT32x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(u, 32, 2) +# define VUINT32x2_CMPGT_DEFINED +#endif +#ifndef VUINT32x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(u, 32, 2) +# define VUINT32x2_CMPLE_DEFINED +#endif +#ifndef VUINT32x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(u, 32, 2) +# define VUINT32x2_CMPGE_DEFINED +#endif +#ifndef VUINT32x2_MIN_DEFINED +VEC_GENERIC_MIN(u, 32, 2) +# define VUINT32x2_MIN_DEFINED +#endif +#ifndef VUINT32x2_MAX_DEFINED +VEC_GENERIC_MAX(u, 32, 2) +# define VUINT32x2_MAX_DEFINED +#endif +#ifndef VUINT32x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(u, 32, 2) +# define VUINT32x2_RSHIFT_DEFINED +#endif +#ifndef VUINT32x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(u, 32, 2) +# define VUINT32x2_LRSHIFT_DEFINED +#endif +#ifndef VUINT32x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(u, 32, 2) +# define VUINT32x2_LSHIFT_DEFINED +#endif + + +/* vuint32x4 */ + +#ifndef VINT32x4_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 32, 4, 2) +# define VINT32x4_SPLAT_DEFINED +#endif + +#ifndef VINT32x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 32, 4, 2) +# define VINT32x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT32x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 32, 4, 2) +# define VINT32x4_LOAD_DEFINED +#endif + +#ifndef VINT32x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 32, 4, 2) +# define VINT32x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT32x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 32, 4, 2) +# define VINT32x4_STORE_DEFINED +#endif + +#ifndef VINT32x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 32, 4, 2) +# define VINT32x4_ADD_DEFINED +#endif + +#ifndef VINT32x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 32, 4, 2) +# define VINT32x4_SUB_DEFINED +#endif + +#ifndef VINT32x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 32, 4, 2) +# define VINT32x4_MUL_DEFINED +#endif + +#ifndef VINT32x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 32, 4, 2) +# define VINT32x4_DIV_DEFINED +#endif + +#ifndef VINT32x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 32, 4, 2) +# define VINT32x4_AVG_DEFINED +#endif + +#ifndef VINT32x4_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 32, 4, 2) +# define VINT32x4_AND_DEFINED +#endif + +#ifndef VINT32x4_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 32, 4, 2) +# define VINT32x4_OR_DEFINED 
+#endif + +#ifndef VINT32x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 32, 4, 2) +# define VINT32x4_XOR_DEFINED +#endif + +#ifndef VINT32x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 32, 4, 2) +# define VINT32x4_NOT_DEFINED +#endif + +#ifndef VINT32x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 32, 4, 2) +# define VINT32x4_CMPLT_DEFINED +#endif + +#ifndef VINT32x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 32, 4, 2) +# define VINT32x4_CMPEQ_DEFINED +#endif + +#ifndef VINT32x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 32, 4, 2) +# define VINT32x4_CMPGT_DEFINED +#endif + +#ifndef VINT32x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 32, 4, 2) +# define VINT32x4_CMPLE_DEFINED +#endif + +#ifndef VINT32x4_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 32, 4, 2) +# define VINT32x4_CMPGE_DEFINED +#endif + +#ifndef VINT32x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 32, 4, 2) +# define VINT32x4_MIN_DEFINED +#endif + +#ifndef VINT32x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 32, 4, 2) +# define VINT32x4_MAX_DEFINED +#endif + +#ifndef VINT32x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 32, 4, 2) +# define VINT32x4_RSHIFT_DEFINED +#endif + +#ifndef VINT32x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 32, 4, 2) +# define VINT32x4_LRSHIFT_DEFINED +#endif + +#ifndef VINT32x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 32, 4, 2) +# define VINT32x4_LSHIFT_DEFINED +#endif + + + +/* vint32x4 */ + +#ifndef VUINT32x4_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 32, 4, 2) +# define VUINT32x4_SPLAT_DEFINED +#endif + +#ifndef VUINT32x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 32, 4, 2) +# define VUINT32x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 32, 4, 2) +# define VUINT32x4_LOAD_DEFINED +#endif + +#ifndef VUINT32x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 32, 4, 2) +# define VUINT32x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 32, 4, 2) +# define VUINT32x4_STORE_DEFINED +#endif + +#ifndef VUINT32x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 32, 4, 2) +# define VUINT32x4_ADD_DEFINED +#endif + +#ifndef VUINT32x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 32, 4, 2) +# define VUINT32x4_SUB_DEFINED +#endif + +#ifndef VUINT32x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 32, 4, 2) +# define VUINT32x4_MUL_DEFINED +#endif + +#ifndef VUINT32x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 32, 4, 2) +# define VUINT32x4_DIV_DEFINED +#endif + +#ifndef VUINT32x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 32, 4, 2) +# define VUINT32x4_AVG_DEFINED +#endif + +#ifndef VUINT32x4_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 32, 4, 2) +# define VUINT32x4_AND_DEFINED +#endif + +#ifndef VUINT32x4_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 32, 4, 2) +# define VUINT32x4_OR_DEFINED +#endif + +#ifndef VUINT32x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 32, 4, 2) +# define VUINT32x4_XOR_DEFINED +#endif + +#ifndef VUINT32x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 32, 4, 2) +# define VUINT32x4_NOT_DEFINED +#endif + +#ifndef VUINT32x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 32, 4, 2) +# define VUINT32x4_CMPLT_DEFINED +#endif + +#ifndef VUINT32x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 32, 4, 2) +# define VUINT32x4_CMPEQ_DEFINED +#endif + +#ifndef VUINT32x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 32, 4, 2) +# define VUINT32x4_CMPGT_DEFINED +#endif + +#ifndef VUINT32x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 32, 4, 2) +# define VUINT32x4_CMPLE_DEFINED +#endif + +#ifndef VUINT32x4_CMPGE_DEFINED 
+VEC_GENERIC_DBL_CMPGE(u, 32, 4, 2) +# define VUINT32x4_CMPGE_DEFINED +#endif + +#ifndef VUINT32x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 32, 4, 2) +# define VUINT32x4_MIN_DEFINED +#endif + +#ifndef VUINT32x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 32, 4, 2) +# define VUINT32x4_MAX_DEFINED +#endif + +#ifndef VUINT32x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 32, 4, 2) +# define VUINT32x4_RSHIFT_DEFINED +#endif + +#ifndef VUINT32x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 32, 4, 2) +# define VUINT32x4_LRSHIFT_DEFINED +#endif + +#ifndef VUINT32x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 32, 4, 2) +# define VUINT32x4_LSHIFT_DEFINED +#endif + + + +/* vuint32x8 */ + +#ifndef VINT32x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 32, 8, 4) +# define VINT32x8_SPLAT_DEFINED +#endif + +#ifndef VINT32x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 32, 8, 4) +# define VINT32x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT32x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 32, 8, 4) +# define VINT32x8_LOAD_DEFINED +#endif + +#ifndef VINT32x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 32, 8, 4) +# define VINT32x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT32x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 32, 8, 4) +# define VINT32x8_STORE_DEFINED +#endif + +#ifndef VINT32x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 32, 8, 4) +# define VINT32x8_ADD_DEFINED +#endif + +#ifndef VINT32x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 32, 8, 4) +# define VINT32x8_SUB_DEFINED +#endif + +#ifndef VINT32x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 32, 8, 4) +# define VINT32x8_MUL_DEFINED +#endif + +#ifndef VINT32x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 32, 8, 4) +# define VINT32x8_DIV_DEFINED +#endif + +#ifndef VINT32x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 32, 8, 4) +# define VINT32x8_AVG_DEFINED +#endif + +#ifndef VINT32x8_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 32, 8, 4) +# define VINT32x8_AND_DEFINED +#endif + +#ifndef VINT32x8_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 32, 8, 4) +# define VINT32x8_OR_DEFINED +#endif + +#ifndef VINT32x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 32, 8, 4) +# define VINT32x8_XOR_DEFINED +#endif + +#ifndef VINT32x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 32, 8, 4) +# define VINT32x8_NOT_DEFINED +#endif + +#ifndef VINT32x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 32, 8, 4) +# define VINT32x8_CMPLT_DEFINED +#endif + +#ifndef VINT32x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 32, 8, 4) +# define VINT32x8_CMPEQ_DEFINED +#endif + +#ifndef VINT32x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 32, 8, 4) +# define VINT32x8_CMPGT_DEFINED +#endif + +#ifndef VINT32x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 32, 8, 4) +# define VINT32x8_CMPLE_DEFINED +#endif + +#ifndef VINT32x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 32, 8, 4) +# define VINT32x8_CMPGE_DEFINED +#endif + +#ifndef VINT32x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 32, 8, 4) +# define VINT32x8_MIN_DEFINED +#endif + +#ifndef VINT32x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 32, 8, 4) +# define VINT32x8_MAX_DEFINED +#endif + +#ifndef VINT32x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 32, 8, 4) +# define VINT32x8_RSHIFT_DEFINED +#endif + +#ifndef VINT32x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 32, 8, 4) +# define VINT32x8_LRSHIFT_DEFINED +#endif + +#ifndef VINT32x8_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 32, 
8, 4) +# define VINT32x8_LSHIFT_DEFINED +#endif + + + +/* vint32x8 */ + +#ifndef VUINT32x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 32, 8, 4) +# define VUINT32x8_SPLAT_DEFINED +#endif + +#ifndef VUINT32x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 32, 8, 4) +# define VUINT32x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 32, 8, 4) +# define VUINT32x8_LOAD_DEFINED +#endif + +#ifndef VUINT32x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 32, 8, 4) +# define VUINT32x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 32, 8, 4) +# define VUINT32x8_STORE_DEFINED +#endif + +#ifndef VUINT32x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 32, 8, 4) +# define VUINT32x8_ADD_DEFINED +#endif + +#ifndef VUINT32x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 32, 8, 4) +# define VUINT32x8_SUB_DEFINED +#endif + +#ifndef VUINT32x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 32, 8, 4) +# define VUINT32x8_MUL_DEFINED +#endif + +#ifndef VUINT32x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 32, 8, 4) +# define VUINT32x8_DIV_DEFINED +#endif + +#ifndef VUINT32x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 32, 8, 4) +# define VUINT32x8_AVG_DEFINED +#endif + +#ifndef VUINT32x8_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 32, 8, 4) +# define VUINT32x8_AND_DEFINED +#endif + +#ifndef VUINT32x8_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 32, 8, 4) +# define VUINT32x8_OR_DEFINED +#endif + +#ifndef VUINT32x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 32, 8, 4) +# define VUINT32x8_XOR_DEFINED +#endif + +#ifndef VUINT32x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 32, 8, 4) +# define VUINT32x8_NOT_DEFINED +#endif + +#ifndef VUINT32x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 32, 8, 4) +# define VUINT32x8_CMPLT_DEFINED +#endif + +#ifndef VUINT32x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 32, 8, 4) +# define VUINT32x8_CMPEQ_DEFINED +#endif + +#ifndef VUINT32x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 32, 8, 4) +# define VUINT32x8_CMPGT_DEFINED +#endif + +#ifndef VUINT32x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 32, 8, 4) +# define VUINT32x8_CMPLE_DEFINED +#endif + +#ifndef VUINT32x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 32, 8, 4) +# define VUINT32x8_CMPGE_DEFINED +#endif + +#ifndef VUINT32x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 32, 8, 4) +# define VUINT32x8_MIN_DEFINED +#endif + +#ifndef VUINT32x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 32, 8, 4) +# define VUINT32x8_MAX_DEFINED +#endif + +#ifndef VUINT32x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 32, 8, 4) +# define VUINT32x8_RSHIFT_DEFINED +#endif + +#ifndef VUINT32x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 32, 8, 4) +# define VUINT32x8_LRSHIFT_DEFINED +#endif + +#ifndef VUINT32x8_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 32, 8, 4) +# define VUINT32x8_LSHIFT_DEFINED +#endif + + + +/* vuint32x16 */ + +#ifndef VINT32x16_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 32, 16, 8) +# define VINT32x16_SPLAT_DEFINED +#endif + +#ifndef VINT32x16_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 32, 16, 8) +# define VINT32x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT32x16_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 32, 16, 8) +# define VINT32x16_LOAD_DEFINED +#endif + +#ifndef VINT32x16_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 32, 16, 8) +# define VINT32x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT32x16_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 32, 16, 8) +# define VINT32x16_STORE_DEFINED +#endif + +#ifndef VINT32x16_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 32, 16, 8) +# define 
VINT32x16_ADD_DEFINED +#endif + +#ifndef VINT32x16_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 32, 16, 8) +# define VINT32x16_SUB_DEFINED +#endif + +#ifndef VINT32x16_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 32, 16, 8) +# define VINT32x16_MUL_DEFINED +#endif + +#ifndef VINT32x16_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 32, 16, 8) +# define VINT32x16_DIV_DEFINED +#endif + +#ifndef VINT32x16_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 32, 16, 8) +# define VINT32x16_AVG_DEFINED +#endif + +#ifndef VINT32x16_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 32, 16, 8) +# define VINT32x16_AND_DEFINED +#endif + +#ifndef VINT32x16_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 32, 16, 8) +# define VINT32x16_OR_DEFINED +#endif + +#ifndef VINT32x16_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 32, 16, 8) +# define VINT32x16_XOR_DEFINED +#endif + +#ifndef VINT32x16_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 32, 16, 8) +# define VINT32x16_NOT_DEFINED +#endif + +#ifndef VINT32x16_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 32, 16, 8) +# define VINT32x16_CMPLT_DEFINED +#endif + +#ifndef VINT32x16_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 32, 16, 8) +# define VINT32x16_CMPEQ_DEFINED +#endif + +#ifndef VINT32x16_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 32, 16, 8) +# define VINT32x16_CMPGT_DEFINED +#endif + +#ifndef VINT32x16_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 32, 16, 8) +# define VINT32x16_CMPLE_DEFINED +#endif + +#ifndef VINT32x16_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 32, 16, 8) +# define VINT32x16_CMPGE_DEFINED +#endif + +#ifndef VINT32x16_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 32, 16, 8) +# define VINT32x16_MIN_DEFINED +#endif + +#ifndef VINT32x16_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 32, 16, 8) +# define VINT32x16_MAX_DEFINED +#endif + +#ifndef VINT32x16_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 32, 16, 8) +# define VINT32x16_RSHIFT_DEFINED +#endif + +#ifndef VINT32x16_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 32, 16, 8) +# define VINT32x16_LRSHIFT_DEFINED +#endif + +#ifndef VINT32x16_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 32, 16, 8) +# define VINT32x16_LSHIFT_DEFINED +#endif + + + +/* vint32x16 */ + +#ifndef VUINT32x16_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 32, 16, 8) +# define VUINT32x16_SPLAT_DEFINED +#endif + +#ifndef VUINT32x16_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 32, 16, 8) +# define VUINT32x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x16_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 32, 16, 8) +# define VUINT32x16_LOAD_DEFINED +#endif + +#ifndef VUINT32x16_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 32, 16, 8) +# define VUINT32x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x16_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 32, 16, 8) +# define VUINT32x16_STORE_DEFINED +#endif + +#ifndef VUINT32x16_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 32, 16, 8) +# define VUINT32x16_ADD_DEFINED +#endif + +#ifndef VUINT32x16_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 32, 16, 8) +# define VUINT32x16_SUB_DEFINED +#endif + +#ifndef VUINT32x16_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 32, 16, 8) +# define VUINT32x16_MUL_DEFINED +#endif + +#ifndef VUINT32x16_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 32, 16, 8) +# define VUINT32x16_DIV_DEFINED +#endif + +#ifndef VUINT32x16_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 32, 16, 8) +# define VUINT32x16_AVG_DEFINED +#endif + +#ifndef VUINT32x16_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 32, 16, 8) +# define VUINT32x16_AND_DEFINED +#endif + 
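(For context: the VEC_GENERIC_DBL_* forms used throughout this hunk synthesize an operation on a wide vector type out of two calls on the half-width type; for example VEC_GENERIC_DBL_ADD(u, 32, 16, 8) builds vuint32x16_add on top of vuint32x8_add. The macro bodies live in the generated generic header and are not shown in this hunk, so the following is only a sketch of the idea; the struct layout and the .dbl member name are assumptions made for illustration, not the library's actual definitions.)

/* sketch: a "doubled" add, in the spirit of VEC_GENERIC_DBL_ADD(u, 32, 16, 8) */
#include "vec/vec.h"   /* assumed public header providing vuint32x8 and vuint32x8_add */

typedef struct { vuint32x8 dbl[2]; } sketch_vuint32x16;   /* hypothetical layout */

static sketch_vuint32x16 sketch_vuint32x16_add(sketch_vuint32x16 a, sketch_vuint32x16 b)
{
	a.dbl[0] = vuint32x8_add(a.dbl[0], b.dbl[0]);   /* low half  */
	a.dbl[1] = vuint32x8_add(a.dbl[1], b.dbl[1]);   /* high half */
	return a;
}

(The same halving scheme cascades: 32x16 is two 32x8, which in turn may be two 32x4, all the way down to whichever width a real backend implements.)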
+#ifndef VUINT32x16_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 32, 16, 8) +# define VUINT32x16_OR_DEFINED +#endif + +#ifndef VUINT32x16_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 32, 16, 8) +# define VUINT32x16_XOR_DEFINED +#endif + +#ifndef VUINT32x16_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 32, 16, 8) +# define VUINT32x16_NOT_DEFINED +#endif + +#ifndef VUINT32x16_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 32, 16, 8) +# define VUINT32x16_CMPLT_DEFINED +#endif + +#ifndef VUINT32x16_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 32, 16, 8) +# define VUINT32x16_CMPEQ_DEFINED +#endif + +#ifndef VUINT32x16_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 32, 16, 8) +# define VUINT32x16_CMPGT_DEFINED +#endif + +#ifndef VUINT32x16_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 32, 16, 8) +# define VUINT32x16_CMPLE_DEFINED +#endif + +#ifndef VUINT32x16_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 32, 16, 8) +# define VUINT32x16_CMPGE_DEFINED +#endif + +#ifndef VUINT32x16_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 32, 16, 8) +# define VUINT32x16_MIN_DEFINED +#endif + +#ifndef VUINT32x16_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 32, 16, 8) +# define VUINT32x16_MAX_DEFINED +#endif + +#ifndef VUINT32x16_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 32, 16, 8) +# define VUINT32x16_RSHIFT_DEFINED +#endif + +#ifndef VUINT32x16_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 32, 16, 8) +# define VUINT32x16_LRSHIFT_DEFINED +#endif + +#ifndef VUINT32x16_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 32, 16, 8) +# define VUINT32x16_LSHIFT_DEFINED +#endif + + + +/* vuint64x2 */ + +#ifndef VINT64x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(/* nothing */, 64, 2) +# define VINT64x2_SPLAT_DEFINED +#endif +#ifndef VINT64x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(/* nothing */, 64, 2) +# define VINT64x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VINT64x2_LOAD_DEFINED +VEC_GENERIC_LOAD(/* nothing */, 64, 2) +# define VINT64x2_LOAD_DEFINED +#endif +#ifndef VINT64x2_STORE_ALIGNED_DEFINED +VEC_GENERIC_STORE_ALIGNED(/* nothing */, 64, 2) +# define VINT64x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VINT64x2_STORE_DEFINED +VEC_GENERIC_STORE(/* nothing */, 64, 2) +# define VINT64x2_STORE_DEFINED +#endif +#ifndef VINT64x2_ADD_DEFINED +VEC_GENERIC_ADD(/* nothing */, 64, 2) +# define VINT64x2_ADD_DEFINED +#endif +#ifndef VINT64x2_SUB_DEFINED +VEC_GENERIC_SUB(/* nothing */, 64, 2) +# define VINT64x2_SUB_DEFINED +#endif +#ifndef VINT64x2_MUL_DEFINED +VEC_GENERIC_MUL(/* nothing */, 64, 2) +# define VINT64x2_MUL_DEFINED +#endif +#ifndef VINT64x2_DIV_DEFINED +VEC_GENERIC_DIV(/* nothing */, 64, 2) +# define VINT64x2_DIV_DEFINED +#endif +#ifndef VINT64x2_AVG_DEFINED +VEC_GENERIC_AVG(/* nothing */, 64, 2) +# define VINT64x2_AVG_DEFINED +#endif +#ifndef VINT64x2_AND_DEFINED +VEC_GENERIC_AND(/* nothing */, 64, 2) +# define VINT64x2_AND_DEFINED +#endif +#ifndef VINT64x2_OR_DEFINED +VEC_GENERIC_OR(/* nothing */, 64, 2) +# define VINT64x2_OR_DEFINED +#endif +#ifndef VINT64x2_XOR_DEFINED +VEC_GENERIC_XOR(/* nothing */, 64, 2) +# define VINT64x2_XOR_DEFINED +#endif +#ifndef VINT64x2_NOT_DEFINED +VEC_GENERIC_NOT(/* nothing */, 64, 2) +# define VINT64x2_NOT_DEFINED +#endif +#ifndef VINT64x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(/* nothing */, 64, 2) +# define VINT64x2_CMPLT_DEFINED +#endif +#ifndef VINT64x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(/* nothing */, 64, 2) +# define VINT64x2_CMPEQ_DEFINED +#endif +#ifndef VINT64x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(/* nothing */, 64, 2) +# define VINT64x2_CMPGT_DEFINED +#endif +#ifndef VINT64x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(/* nothing */, 64, 2) +# define VINT64x2_CMPLE_DEFINED +#endif 
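(The plain single-width VEC_GENERIC_* macros in this vint64x2/vuint64x2 block are scalar fallbacks that loop over the lanes one element at a time. Their definitions are produced by the generic-header generator and are not part of this hunk; the sketch below only illustrates the shape, and the .generic array member name is an assumption for illustration.)

/* sketch: scalar fallback splat/add for a 64x2 type, assuming a plain array member */
#include "vec/vec.h"   /* assumed public header providing vec_int64 */

typedef struct { vec_int64 generic[2]; } sketch_vint64x2;   /* hypothetical layout */

static sketch_vint64x2 sketch_vint64x2_splat(vec_int64 x)
{
	sketch_vint64x2 v;
	int i;
	for (i = 0; i < 2; i++)
		v.generic[i] = x;   /* broadcast the scalar into every lane */
	return v;
}

static sketch_vint64x2 sketch_vint64x2_add(sketch_vint64x2 a, sketch_vint64x2 b)
{
	int i;
	for (i = 0; i < 2; i++)
		a.generic[i] += b.generic[i];
	return a;
}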
+#ifndef VINT64x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(/* nothing */, 64, 2) +# define VINT64x2_CMPGE_DEFINED +#endif +#ifndef VINT64x2_MIN_DEFINED +VEC_GENERIC_MIN(/* nothing */, 64, 2) +# define VINT64x2_MIN_DEFINED +#endif +#ifndef VINT64x2_MAX_DEFINED +VEC_GENERIC_MAX(/* nothing */, 64, 2) +# define VINT64x2_MAX_DEFINED +#endif +#ifndef VINT64x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(/* nothing */, 64, 2) +# define VINT64x2_RSHIFT_DEFINED +#endif +#ifndef VINT64x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(/* nothing */, 64, 2) +# define VINT64x2_LRSHIFT_DEFINED +#endif +#ifndef VINT64x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(/* nothing */, 64, 2) +# define VINT64x2_LSHIFT_DEFINED +#endif + + +/* vint64x2 */ + +#ifndef VUINT64x2_SPLAT_DEFINED +VEC_GENERIC_SPLAT(u, 64, 2) +# define VUINT64x2_SPLAT_DEFINED +#endif +#ifndef VUINT64x2_LOAD_ALIGNED_DEFINED +VEC_GENERIC_LOAD_ALIGNED(u, 64, 2) +# define VUINT64x2_LOAD_ALIGNED_DEFINED +#endif +#ifndef VUINT64x2_LOAD_DEFINED +VEC_GENERIC_LOAD(u, 64, 2) +# define VUINT64x2_LOAD_DEFINED +#endif +#ifndef VUINT64x2_STORE_ALIGNED_DEFINED +VEC_GENERIC_STORE_ALIGNED(u, 64, 2) +# define VUINT64x2_STORE_ALIGNED_DEFINED +#endif +#ifndef VUINT64x2_STORE_DEFINED +VEC_GENERIC_STORE(u, 64, 2) +# define VUINT64x2_STORE_DEFINED +#endif +#ifndef VUINT64x2_ADD_DEFINED +VEC_GENERIC_ADD(u, 64, 2) +# define VUINT64x2_ADD_DEFINED +#endif +#ifndef VUINT64x2_SUB_DEFINED +VEC_GENERIC_SUB(u, 64, 2) +# define VUINT64x2_SUB_DEFINED +#endif +#ifndef VUINT64x2_MUL_DEFINED +VEC_GENERIC_MUL(u, 64, 2) +# define VUINT64x2_MUL_DEFINED +#endif +#ifndef VUINT64x2_DIV_DEFINED +VEC_GENERIC_DIV(u, 64, 2) +# define VUINT64x2_DIV_DEFINED +#endif +#ifndef VUINT64x2_AVG_DEFINED +VEC_GENERIC_AVG(u, 64, 2) +# define VUINT64x2_AVG_DEFINED +#endif +#ifndef VUINT64x2_AND_DEFINED +VEC_GENERIC_AND(u, 64, 2) +# define VUINT64x2_AND_DEFINED +#endif +#ifndef VUINT64x2_OR_DEFINED +VEC_GENERIC_OR(u, 64, 2) +# define VUINT64x2_OR_DEFINED +#endif +#ifndef VUINT64x2_XOR_DEFINED +VEC_GENERIC_XOR(u, 64, 2) +# define VUINT64x2_XOR_DEFINED +#endif +#ifndef VUINT64x2_NOT_DEFINED +VEC_GENERIC_NOT(u, 64, 2) +# define VUINT64x2_NOT_DEFINED +#endif +#ifndef VUINT64x2_CMPLT_DEFINED +VEC_GENERIC_CMPLT(u, 64, 2) +# define VUINT64x2_CMPLT_DEFINED +#endif +#ifndef VUINT64x2_CMPEQ_DEFINED +VEC_GENERIC_CMPEQ(u, 64, 2) +# define VUINT64x2_CMPEQ_DEFINED +#endif +#ifndef VUINT64x2_CMPGT_DEFINED +VEC_GENERIC_CMPGT(u, 64, 2) +# define VUINT64x2_CMPGT_DEFINED +#endif +#ifndef VUINT64x2_CMPLE_DEFINED +VEC_GENERIC_CMPLE(u, 64, 2) +# define VUINT64x2_CMPLE_DEFINED +#endif +#ifndef VUINT64x2_CMPGE_DEFINED +VEC_GENERIC_CMPGE(u, 64, 2) +# define VUINT64x2_CMPGE_DEFINED +#endif +#ifndef VUINT64x2_MIN_DEFINED +VEC_GENERIC_MIN(u, 64, 2) +# define VUINT64x2_MIN_DEFINED +#endif +#ifndef VUINT64x2_MAX_DEFINED +VEC_GENERIC_MAX(u, 64, 2) +# define VUINT64x2_MAX_DEFINED +#endif +#ifndef VUINT64x2_RSHIFT_DEFINED +VEC_GENERIC_RSHIFT(u, 64, 2) +# define VUINT64x2_RSHIFT_DEFINED +#endif +#ifndef VUINT64x2_LRSHIFT_DEFINED +VEC_GENERIC_LRSHIFT(u, 64, 2) +# define VUINT64x2_LRSHIFT_DEFINED +#endif +#ifndef VUINT64x2_LSHIFT_DEFINED +VEC_GENERIC_LSHIFT(u, 64, 2) +# define VUINT64x2_LSHIFT_DEFINED +#endif + + +/* vuint64x4 */ + +#ifndef VINT64x4_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 64, 4, 2) +# define VINT64x4_SPLAT_DEFINED +#endif + +#ifndef VINT64x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 64, 4, 2) +# define VINT64x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT64x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 64, 4, 2) 
+# define VINT64x4_LOAD_DEFINED +#endif + +#ifndef VINT64x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 64, 4, 2) +# define VINT64x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT64x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 64, 4, 2) +# define VINT64x4_STORE_DEFINED +#endif + +#ifndef VINT64x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 64, 4, 2) +# define VINT64x4_ADD_DEFINED +#endif + +#ifndef VINT64x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 64, 4, 2) +# define VINT64x4_SUB_DEFINED +#endif + +#ifndef VINT64x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 64, 4, 2) +# define VINT64x4_MUL_DEFINED +#endif + +#ifndef VINT64x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 64, 4, 2) +# define VINT64x4_DIV_DEFINED +#endif + +#ifndef VINT64x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 64, 4, 2) +# define VINT64x4_AVG_DEFINED +#endif + +#ifndef VINT64x4_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 64, 4, 2) +# define VINT64x4_AND_DEFINED +#endif + +#ifndef VINT64x4_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 64, 4, 2) +# define VINT64x4_OR_DEFINED +#endif + +#ifndef VINT64x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 64, 4, 2) +# define VINT64x4_XOR_DEFINED +#endif + +#ifndef VINT64x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 64, 4, 2) +# define VINT64x4_NOT_DEFINED +#endif + +#ifndef VINT64x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* nothing */, 64, 4, 2) +# define VINT64x4_CMPLT_DEFINED +#endif + +#ifndef VINT64x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 64, 4, 2) +# define VINT64x4_CMPEQ_DEFINED +#endif + +#ifndef VINT64x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 64, 4, 2) +# define VINT64x4_CMPGT_DEFINED +#endif + +#ifndef VINT64x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 64, 4, 2) +# define VINT64x4_CMPLE_DEFINED +#endif + +#ifndef VINT64x4_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 64, 4, 2) +# define VINT64x4_CMPGE_DEFINED +#endif + +#ifndef VINT64x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 64, 4, 2) +# define VINT64x4_MIN_DEFINED +#endif + +#ifndef VINT64x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 64, 4, 2) +# define VINT64x4_MAX_DEFINED +#endif + +#ifndef VINT64x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 64, 4, 2) +# define VINT64x4_RSHIFT_DEFINED +#endif + +#ifndef VINT64x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 64, 4, 2) +# define VINT64x4_LRSHIFT_DEFINED +#endif + +#ifndef VINT64x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 64, 4, 2) +# define VINT64x4_LSHIFT_DEFINED +#endif + + + +/* vint64x4 */ + +#ifndef VUINT64x4_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 64, 4, 2) +# define VUINT64x4_SPLAT_DEFINED +#endif + +#ifndef VUINT64x4_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 64, 4, 2) +# define VUINT64x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT64x4_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 64, 4, 2) +# define VUINT64x4_LOAD_DEFINED +#endif + +#ifndef VUINT64x4_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 64, 4, 2) +# define VUINT64x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT64x4_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 64, 4, 2) +# define VUINT64x4_STORE_DEFINED +#endif + +#ifndef VUINT64x4_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 64, 4, 2) +# define VUINT64x4_ADD_DEFINED +#endif + +#ifndef VUINT64x4_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 64, 4, 2) +# define VUINT64x4_SUB_DEFINED +#endif + +#ifndef VUINT64x4_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 64, 4, 2) +# define VUINT64x4_MUL_DEFINED +#endif + +#ifndef 
VUINT64x4_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 64, 4, 2) +# define VUINT64x4_DIV_DEFINED +#endif + +#ifndef VUINT64x4_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 64, 4, 2) +# define VUINT64x4_AVG_DEFINED +#endif + +#ifndef VUINT64x4_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 64, 4, 2) +# define VUINT64x4_AND_DEFINED +#endif + +#ifndef VUINT64x4_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 64, 4, 2) +# define VUINT64x4_OR_DEFINED +#endif + +#ifndef VUINT64x4_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 64, 4, 2) +# define VUINT64x4_XOR_DEFINED +#endif + +#ifndef VUINT64x4_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 64, 4, 2) +# define VUINT64x4_NOT_DEFINED +#endif + +#ifndef VUINT64x4_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 64, 4, 2) +# define VUINT64x4_CMPLT_DEFINED +#endif + +#ifndef VUINT64x4_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 64, 4, 2) +# define VUINT64x4_CMPEQ_DEFINED +#endif + +#ifndef VUINT64x4_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 64, 4, 2) +# define VUINT64x4_CMPGT_DEFINED +#endif + +#ifndef VUINT64x4_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 64, 4, 2) +# define VUINT64x4_CMPLE_DEFINED +#endif + +#ifndef VUINT64x4_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 64, 4, 2) +# define VUINT64x4_CMPGE_DEFINED +#endif + +#ifndef VUINT64x4_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 64, 4, 2) +# define VUINT64x4_MIN_DEFINED +#endif + +#ifndef VUINT64x4_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 64, 4, 2) +# define VUINT64x4_MAX_DEFINED +#endif + +#ifndef VUINT64x4_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 64, 4, 2) +# define VUINT64x4_RSHIFT_DEFINED +#endif + +#ifndef VUINT64x4_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 64, 4, 2) +# define VUINT64x4_LRSHIFT_DEFINED +#endif + +#ifndef VUINT64x4_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 64, 4, 2) +# define VUINT64x4_LSHIFT_DEFINED +#endif + + + +/* vuint64x8 */ + +#ifndef VINT64x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(/* nothing */, 64, 8, 4) +# define VINT64x8_SPLAT_DEFINED +#endif + +#ifndef VINT64x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(/* nothing */, 64, 8, 4) +# define VINT64x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT64x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(/* nothing */, 64, 8, 4) +# define VINT64x8_LOAD_DEFINED +#endif + +#ifndef VINT64x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(/* nothing */, 64, 8, 4) +# define VINT64x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT64x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(/* nothing */, 64, 8, 4) +# define VINT64x8_STORE_DEFINED +#endif + +#ifndef VINT64x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(/* nothing */, 64, 8, 4) +# define VINT64x8_ADD_DEFINED +#endif + +#ifndef VINT64x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(/* nothing */, 64, 8, 4) +# define VINT64x8_SUB_DEFINED +#endif + +#ifndef VINT64x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(/* nothing */, 64, 8, 4) +# define VINT64x8_MUL_DEFINED +#endif + +#ifndef VINT64x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(/* nothing */, 64, 8, 4) +# define VINT64x8_DIV_DEFINED +#endif + +#ifndef VINT64x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(/* nothing */, 64, 8, 4) +# define VINT64x8_AVG_DEFINED +#endif + +#ifndef VINT64x8_AND_DEFINED +VEC_GENERIC_DBL_AND(/* nothing */, 64, 8, 4) +# define VINT64x8_AND_DEFINED +#endif + +#ifndef VINT64x8_OR_DEFINED +VEC_GENERIC_DBL_OR(/* nothing */, 64, 8, 4) +# define VINT64x8_OR_DEFINED +#endif + +#ifndef VINT64x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(/* nothing */, 64, 8, 4) +# define VINT64x8_XOR_DEFINED +#endif + +#ifndef VINT64x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(/* nothing */, 64, 8, 4) +# define VINT64x8_NOT_DEFINED +#endif + +#ifndef VINT64x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(/* 
nothing */, 64, 8, 4) +# define VINT64x8_CMPLT_DEFINED +#endif + +#ifndef VINT64x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(/* nothing */, 64, 8, 4) +# define VINT64x8_CMPEQ_DEFINED +#endif + +#ifndef VINT64x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(/* nothing */, 64, 8, 4) +# define VINT64x8_CMPGT_DEFINED +#endif + +#ifndef VINT64x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(/* nothing */, 64, 8, 4) +# define VINT64x8_CMPLE_DEFINED +#endif + +#ifndef VINT64x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(/* nothing */, 64, 8, 4) +# define VINT64x8_CMPGE_DEFINED +#endif + +#ifndef VINT64x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(/* nothing */, 64, 8, 4) +# define VINT64x8_MIN_DEFINED +#endif + +#ifndef VINT64x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(/* nothing */, 64, 8, 4) +# define VINT64x8_MAX_DEFINED +#endif + +#ifndef VINT64x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(/* nothing */, 64, 8, 4) +# define VINT64x8_RSHIFT_DEFINED +#endif + +#ifndef VINT64x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(/* nothing */, 64, 8, 4) +# define VINT64x8_LRSHIFT_DEFINED +#endif + +#ifndef VINT64x8_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(/* nothing */, 64, 8, 4) +# define VINT64x8_LSHIFT_DEFINED +#endif + + + +/* vint64x8 */ + +#ifndef VUINT64x8_SPLAT_DEFINED +VEC_GENERIC_DBL_SPLAT(u, 64, 8, 4) +# define VUINT64x8_SPLAT_DEFINED +#endif + +#ifndef VUINT64x8_LOAD_ALIGNED_DEFINED +VEC_GENERIC_DBL_LOAD_ALIGNED(u, 64, 8, 4) +# define VUINT64x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT64x8_LOAD_DEFINED +VEC_GENERIC_DBL_LOAD(u, 64, 8, 4) +# define VUINT64x8_LOAD_DEFINED +#endif + +#ifndef VUINT64x8_STORE_ALIGNED_DEFINED +VEC_GENERIC_DBL_STORE_ALIGNED(u, 64, 8, 4) +# define VUINT64x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT64x8_STORE_DEFINED +VEC_GENERIC_DBL_STORE(u, 64, 8, 4) +# define VUINT64x8_STORE_DEFINED +#endif + +#ifndef VUINT64x8_ADD_DEFINED +VEC_GENERIC_DBL_ADD(u, 64, 8, 4) +# define VUINT64x8_ADD_DEFINED +#endif + +#ifndef VUINT64x8_SUB_DEFINED +VEC_GENERIC_DBL_SUB(u, 64, 8, 4) +# define VUINT64x8_SUB_DEFINED +#endif + +#ifndef VUINT64x8_MUL_DEFINED +VEC_GENERIC_DBL_MUL(u, 64, 8, 4) +# define VUINT64x8_MUL_DEFINED +#endif + +#ifndef VUINT64x8_DIV_DEFINED +VEC_GENERIC_DBL_DIV(u, 64, 8, 4) +# define VUINT64x8_DIV_DEFINED +#endif + +#ifndef VUINT64x8_AVG_DEFINED +VEC_GENERIC_DBL_AVG(u, 64, 8, 4) +# define VUINT64x8_AVG_DEFINED +#endif + +#ifndef VUINT64x8_AND_DEFINED +VEC_GENERIC_DBL_AND(u, 64, 8, 4) +# define VUINT64x8_AND_DEFINED +#endif + +#ifndef VUINT64x8_OR_DEFINED +VEC_GENERIC_DBL_OR(u, 64, 8, 4) +# define VUINT64x8_OR_DEFINED +#endif + +#ifndef VUINT64x8_XOR_DEFINED +VEC_GENERIC_DBL_XOR(u, 64, 8, 4) +# define VUINT64x8_XOR_DEFINED +#endif + +#ifndef VUINT64x8_NOT_DEFINED +VEC_GENERIC_DBL_NOT(u, 64, 8, 4) +# define VUINT64x8_NOT_DEFINED +#endif + +#ifndef VUINT64x8_CMPLT_DEFINED +VEC_GENERIC_DBL_CMPLT(u, 64, 8, 4) +# define VUINT64x8_CMPLT_DEFINED +#endif + +#ifndef VUINT64x8_CMPEQ_DEFINED +VEC_GENERIC_DBL_CMPEQ(u, 64, 8, 4) +# define VUINT64x8_CMPEQ_DEFINED +#endif + +#ifndef VUINT64x8_CMPGT_DEFINED +VEC_GENERIC_DBL_CMPGT(u, 64, 8, 4) +# define VUINT64x8_CMPGT_DEFINED +#endif + +#ifndef VUINT64x8_CMPLE_DEFINED +VEC_GENERIC_DBL_CMPLE(u, 64, 8, 4) +# define VUINT64x8_CMPLE_DEFINED +#endif + +#ifndef VUINT64x8_CMPGE_DEFINED +VEC_GENERIC_DBL_CMPGE(u, 64, 8, 4) +# define VUINT64x8_CMPGE_DEFINED +#endif + +#ifndef VUINT64x8_MIN_DEFINED +VEC_GENERIC_DBL_MIN(u, 64, 8, 4) +# define VUINT64x8_MIN_DEFINED +#endif + +#ifndef VUINT64x8_MAX_DEFINED +VEC_GENERIC_DBL_MAX(u, 64, 8, 4) +# define VUINT64x8_MAX_DEFINED +#endif + +#ifndef 
VUINT64x8_RSHIFT_DEFINED +VEC_GENERIC_DBL_RSHIFT(u, 64, 8, 4) +# define VUINT64x8_RSHIFT_DEFINED +#endif + +#ifndef VUINT64x8_LRSHIFT_DEFINED +VEC_GENERIC_DBL_LRSHIFT(u, 64, 8, 4) +# define VUINT64x8_LRSHIFT_DEFINED +#endif + +#ifndef VUINT64x8_LSHIFT_DEFINED +VEC_GENERIC_DBL_LSHIFT(u, 64, 8, 4) +# define VUINT64x8_LSHIFT_DEFINED +#endif #endif /* VEC_IMPL_GENERIC_H_ */ +
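(Taken together, the pattern in this header is: every operation sits behind an #ifndef V..._DEFINED guard, so an accelerated backend header included earlier gets first claim on an operation and the generic header only fills in whatever is still missing. Callers never see the difference; user code just calls the unprefixed functions. A small usage sketch, assuming "vec/vec.h" is the public header:)

#include <stddef.h>
#include "vec/vec.h"

/* add two arrays 8 lanes at a time; whichever backend supplied vuint32x8_add is used */
static void add_u32_arrays(vec_uint32 *a, const vec_uint32 *b, size_t n)
{
	size_t i = 0;
	for (; i + 8 <= n; i += 8) {
		vuint32x8 va = vuint32x8_load(&a[i]);   /* unaligned load  */
		vuint32x8 vb = vuint32x8_load(&b[i]);
		va = vuint32x8_add(va, vb);
		vuint32x8_store(va, &a[i]);             /* unaligned store */
	}
	for (; i < n; i++)                          /* scalar tail */
		a[i] += b[i];
}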
--- a/include/vec/impl/integer.h.in	Fri Apr 25 17:40:55 2025 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,58 +0,0 @@
-/**
- * vec - a tiny SIMD vector library in plain C99
- *
- * Copyright (c) 2024 Paper
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
-**/
-
-#ifndef VEC_IMPL_INTEGER_H_
-#define VEC_IMPL_INTEGER_H_
-
-#cmakedefine HAVE_SYS_TYPES_H
-#cmakedefine HAVE_STDDEF_H
-#cmakedefine HAVE_STDINT_H
-
-#ifdef HAVE_SYS_TYPES_H
-# include <sys/types.h>
-#endif
-#ifdef HAVE_STDDEF_H
-# include <stddef.h>
-#endif
-#ifdef HAVE_STDINT_H
-# include <stdint.h>
-#endif
-
-typedef signed char vec_int8;
-typedef @SIZE16@ vec_int16;
-typedef @SIZE32@ vec_int32;
-typedef @SIZE64@ vec_int64;
-
-typedef unsigned char vec_uint8;
-typedef @USIZE16@ vec_uint16;
-typedef @USIZE32@ vec_uint32;
-typedef @USIZE64@ vec_uint64;
-
-/* this is only used for bitshifting right now */
-typedef vec_int64 vec_intmax;
-typedef vec_uint64 vec_uintmax;
-
-typedef @USIZEPTR@ vec_uintptr;
-
-#endif /* VEC_IMPL_INTEGER_H_ */
\ No newline at end of file
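(With the configure-time template gone, the vec_int*/vec_uint* typedefs have to come straight from the headers at compile time. Where the library actually defines them now is not shown in this hunk, so the following is only a guess at the simplest shape such a replacement can take, leaning on C99's <stdint.h>; the typedef names are the ones from the deleted file.)

/* hypothetical stand-in for the deleted configure-time typedefs; not the library's actual code */
#include <stdint.h>

typedef int8_t    vec_int8;
typedef int16_t   vec_int16;
typedef int32_t   vec_int32;
typedef int64_t   vec_int64;

typedef uint8_t   vec_uint8;
typedef uint16_t  vec_uint16;
typedef uint32_t  vec_uint32;
typedef uint64_t  vec_uint64;

typedef int64_t   vec_intmax;
typedef uint64_t  vec_uintmax;
typedef uintptr_t vec_uintptr;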
--- a/include/vec/impl/x86/avx2.h Fri Apr 25 17:40:55 2025 -0400 +++ b/include/vec/impl/x86/avx2.h Sat Apr 26 01:04:35 2025 -0400 @@ -25,130 +25,100 @@ #ifndef VEC_IMPL_X86_AVX2_H_ #define VEC_IMPL_X86_AVX2_H_ -#define VEC_AVX2_OPERATION_8x32_16x16(op, sign) \ - do { \ - /* unpack and multiply */ \ - __m256i dst_even = _mm256_##op##_epi16(vec1.avx2, vec2.avx2); \ - __m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1.avx2, 8), _mm256_srli_epi16(vec2.avx2, 8)); \ - \ - /* repack */ \ - v##sign##int8x32 vec; \ - vec.avx2 = _mm256_or_si256( \ - _mm256_slli_epi16(dst_odd, 8), \ - _mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \ - ); \ - return vec; \ - } while (0) - -#define VEC_AVX2_OPERATION_8x32_32x8(op, sign) \ - do { \ - /* unpack */ \ - __m256i dst_1 = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \ - __m256i dst_2 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 8), _mm256_srli_epi32(vec2.avx2, 8)); \ - __m256i dst_3 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \ - __m256i dst_4 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 24), _mm256_srli_epi32(vec2.avx2, 24)); \ +#define VEC_AVX2_OP(NAME, SIGN, BITS, SIZE, INTLSIGN, OP) \ + VEC_FUNC_IMPL v##SIGN##int##BITS##x##SIZE v##SIGN##int##BITS##x##SIZE##_##NAME(v##SIGN##int##BITS##x##SIZE vec1, v##SIGN##int##BITS##x##SIZE vec2) \ + { \ + vec1.avx2 = _mm256_##OP##_ep##INTLSIGN##BITS(vec1.avx2, vec2.avx2); \ \ - /* repack */ \ - v##sign##int8x32 vec; \ - vec.avx2 = _mm256_or_si256( \ - _mm256_or_si256( \ - _mm256_slli_epi32(dst_4, 8), \ - _mm256_srli_epi32(_mm256_slli_epi32(dst_3, 8), 8) \ - ), \ - _mm256_or_si256( \ - _mm256_slli_epi32(_mm256_slli_epi32(dst_2, 8), 16), \ - _mm256_srli_epi32(_mm256_slli_epi32(dst_1, 8), 24) \ - ) \ - ); \ - return vec; \ - } while (0) + return vec1; \ + } + +#ifndef VINT8x32_MIN_DEFINED +VEC_AVX2_OP(min, /* nothing */, 8, 32, i, min) +# define VINT8x32_MIN_DEFINED +#endif + +#ifndef VINT8x32_MAX_DEFINED +VEC_AVX2_OP(max, /* nothing */, 8, 32, i, max) +# define VINT8x32_MAX_DEFINED +#endif + +#ifndef VUINT8x32_MIN_DEFINED +VEC_AVX2_OP(min, u, 8, 32, u, min) +# define VUINT8x32_MIN_DEFINED +#endif -#define VEC_AVX2_OPERATION_16x16(op, sign) \ - do { \ - /* unpack and multiply */ \ - __m256i dst_even = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \ - __m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \ - \ - /* repack */ \ - v##sign##int16x16 vec; \ - vec.avx2 = _mm256_or_si256( \ - _mm256_slli_epi32(dst_odd, 16), \ - _mm256_srli_epi32(_mm256_slli_epi16(dst_even, 16), 16) \ - ); \ - return vec; \ - } while (0) +#ifndef VUINT8x32_MAX_DEFINED +VEC_AVX2_OP(max, u, 8, 32, u, max) +# define VUINT8x32_MAX_DEFINED +#endif + +#ifndef VINT16x16_MIN_DEFINED +VEC_AVX2_OP(min, /* nothing */, 16, 16, i, min) +# define VINT16x16_MIN_DEFINED +#endif -// shifting +#ifndef VINT16x16_MAX_DEFINED +VEC_AVX2_OP(max, /* nothing */, 16, 16, i, max) +# define VINT16x16_MAX_DEFINED +#endif -#define VEC_AVX2_LSHIFT_8x32(sign) \ - VEC_AVX2_OPERATION_8x32_32x8(sllv, sign) - -#define VEC_AVX2_LSHIFT_16x16(sign) \ - VEC_AVX2_OPERATION_16x16(sllv, sign) +#ifndef VUINT16x16_MIN_DEFINED +VEC_AVX2_OP(min, u, 16, 16, u, min) +# define VUINT16x16_MIN_DEFINED +#endif -#define VEC_AVX2_LSHIFT_32x8(sign) \ - do { \ - v##sign##int32x8 vec; \ - vec.avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2); \ - return vec; \ - } while (0) +#ifndef VUINT16x16_MAX_DEFINED +VEC_AVX2_OP(max, u, 16, 16, u, max) +# define VUINT16x16_MAX_DEFINED +#endif + 
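(VEC_AVX2_OP above is a thin token-pasting wrapper around a single AVX2 intrinsic: NAME names the produced function, and OP/INTLSIGN/BITS select _mm256_<OP>_ep<i|u><BITS>. As a concrete example, VEC_AVX2_OP(min, u, 8, 32, u, min) expands to roughly the following, with VEC_FUNC_IMPL standing in for whatever inline qualifiers the library uses and <immintrin.h> plus the vuint8x32 type assumed to be in scope:)

/* expansion sketch of VEC_AVX2_OP(min, u, 8, 32, u, min) */
VEC_FUNC_IMPL vuint8x32 vuint8x32_min(vuint8x32 vec1, vuint8x32 vec2)
{
	/* per-lane unsigned 8-bit minimum, done in one instruction */
	vec1.avx2 = _mm256_min_epu8(vec1.avx2, vec2.avx2);
	return vec1;
}

(The signed variants resolve to _mm256_min_epi8/_mm256_max_epi8 and the 16/32-bit ones to _epi16/_epu16 and _epi32/_epu32; AVX2 has no 64-bit min/max, which is why no 64x4 min/max appears in this header.)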
+#ifndef VINT32x8_MIN_DEFINED +VEC_AVX2_OP(min, /* nothing */, 32, 8, i, min) +# define VINT32x8_MIN_DEFINED +#endif -#define VEC_AVX2_LSHIFT_64x4(sign) \ - do { \ - v##sign##int64x4 vec; \ - vec.avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2); \ - return vec; \ - } while (0) - -#define VEC_AVX2_RSHIFT_8x32(sign, aORl) \ - VEC_AVX2_OPERATION_8x32_32x8(sr##aORl##v, sign) +#ifndef VINT32x8_MAX_DEFINED +VEC_AVX2_OP(max, /* nothing */, 32, 8, i, max) +# define VINT32x8_MAX_DEFINED +#endif -#define VEC_AVX2_RSHIFT_16x16(sign, aORl) \ - VEC_AVX2_OPERATION_16x16(sr##aORl##v, sign) - -#define VEC_AVX2_RSHIFT_32x8(sign, aORl) \ - do { \ - v##sign##int32x8 vec; \ - vec.avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2); \ - return vec; \ - } while (0) +#ifndef VUINT32x8_MIN_DEFINED +VEC_AVX2_OP(min, u, 32, 8, u, min) +# define VUINT32x8_MIN_DEFINED +#endif -#define VEC_AVX2_aRSHIFT_64x4(sign) \ - do { \ - return v##sign##int64x4_fallback_rshift(vec1, vec2); \ - } while (0) +#ifndef VUINT32x8_MAX_DEFINED +VEC_AVX2_OP(max, u, 32, 8, u, max) +# define VUINT32x8_MAX_DEFINED +#endif -#define VEC_AVX2_lRSHIFT_64x4(sign) \ - do { \ - v##sign##int64x4 vec; \ - vec.avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2); \ - return vec; \ - } while (0) +/* ------------------------------------------------------------------------ */ +/* multiplication */ -#define VEC_AVX2_RSHIFT_64x4(sign, aORl) \ - VEC_AVX2_##aORl##RSHIFT_64x4(sign) - -// multiplication +#ifndef VINT16x16_MUL_DEFINED +VEC_AVX2_OP(mul, /* nothing */, 16, 16, i, mullo) +# define VINT16x16_MUL_DEFINED +#endif -#define VEC_AVX2_MUL_8x32(sign) \ - VEC_AVX2_OPERATION_8x32_16x16(mullo, sign) +#ifndef VUINT16x16_MUL_DEFINED +VEC_AVX2_OP(mul, u, 16, 16, i, mullo) +# define VUINT16x16_MUL_DEFINED +#endif -#define VEC_AVX2_MUL_16x16(sign) \ - do { \ - v##sign##int16x16 vec; \ - vec.avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2); \ - return vec; \ - } while (0) +#ifndef VINT32x8_MUL_DEFINED +VEC_AVX2_OP(mul, /* nothing */, 32, 8, i, mullo) +# define VINT32x8_MUL_DEFINED +#endif -#define VEC_AVX2_MUL_32x8(sign) \ - do { \ - v##sign##int32x8 vec; \ - vec.avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2); \ - return vec; \ - } while (0) +#ifndef VUINT32x8_MUL_DEFINED +VEC_AVX2_OP(mul, u, 32, 8, i, mullo) +# define VUINT32x8_MUL_DEFINED +#endif #define VEC_AVX2_MUL_64x4(sign) \ - do { \ + VEC_FUNC_IMPL v##sign##int64x4 v##sign##int64x4_mul(v##sign##int64x4 vec1, v##sign##int64x4 vec2) \ + { \ __m256i ac = _mm256_mul_epu32(vec1.avx2, vec2.avx2); \ __m256i b = _mm256_srli_epi64(vec1.avx2, 32); \ __m256i bc = _mm256_mul_epu32(b, vec2.avx2); \ @@ -157,140 +127,354 @@ __m256i hi = _mm256_add_epi64(bc, ad); \ hi = _mm256_slli_epi64(hi, 32); \ \ - v##sign##int64x4 vec; \ - vec.avx2 = _mm256_add_epi64(hi, ac); \ - return vec; \ - } while (0) + vec1.avx2 = _mm256_add_epi64(hi, ac); \ + return vec1; \ + } + +#ifndef VINT64x4_MUL_DEFINED +VEC_AVX2_MUL_64x4(/* nothing */) +# define VINT64x4_MUL_DEFINED +#endif + +#ifndef VUINT64x4_MUL_DEFINED +VEC_AVX2_MUL_64x4(u) +# define VUINT64x4_MUL_DEFINED +#endif -// operations +/* -------------------------------------------------------------------- */ +/* avg */ + +#ifndef VUINT8x32_AVG_DEFINED +VEC_AVX2_OP(avg, u, 8, 32, u, avg) +# define VUINT8x32_AVG_DEFINED +#endif -#define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const vec_##sign##int##bits in[size]) \ +#ifndef VUINT16x16_AVG_DEFINED +VEC_AVX2_OP(avg, u, 16, 16, u, avg) +# define 
VUINT16x16_AVG_DEFINED +#endif + +/* -------------------------------------------------------------------- */ +/* generic ops */ + +#define VEC_AVX2_LOAD_ALIGNED(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ vec.avx2 = _mm256_load_si256((const __m256i *)in); \ return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const vec_##sign##int##bits in[size]) \ + } + +#define VEC_AVX2_LOAD(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ vec.avx2 = _mm256_loadu_si256((const __m256i *)in); \ return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + } + +#define VEC_AVX2_STORE_ALIGNED(sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm256_store_si256((__m256i *)out, vec.avx2); \ - } \ - \ - static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + } + +#define VEC_AVX2_STORE(sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm256_storeu_si256((__m256i *)out, vec.avx2); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX2_MUL_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_AVX2_LSHIFT_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + } + +#define VEC_AVX2_BITWISE(op, sign, bits, size) \ + VEC_FUNC_IMPL 
v##sign##int##bits##x##size v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - VEC_AVX2_RSHIFT_##bits##x##size(sign, a); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_AVX2_RSHIFT_##bits##x##size(sign, l); \ - } \ - \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_avx2_load_aligned, \ - v##sign##int##bits##x##size##_avx2_load, \ - v##sign##int##bits##x##size##_avx2_store_aligned, \ - v##sign##int##bits##x##size##_avx2_store, \ - v##sign##int##bits##x##size##_avx2_add, \ - v##sign##int##bits##x##size##_avx2_sub, \ - v##sign##int##bits##x##size##_avx2_mul, \ - /* .div = */ NULL, \ - /* .avg = */ NULL, \ - v##sign##int##bits##x##size##_avx2_and, \ - v##sign##int##bits##x##size##_avx2_or, \ - v##sign##int##bits##x##size##_avx2_xor, \ - /* .not = */ NULL, \ - v##sign##int##bits##x##size##_avx2_lshift, \ - v##sign##int##bits##x##size##_avx2_rshift, \ - v##sign##int##bits##x##size##_avx2_lrshift, \ - }; + vec1.avx2 = _mm256_##op##_si256(vec1.avx2, vec2.avx2); \ + return vec1; \ + } + +/* ------------------------------------------------------------------------ */ +/* 8x32 */ + +#ifndef VINT8x32_LOAD_ALIGNED_DEFINED +VEC_AVX2_LOAD_ALIGNED(/* nothing */, 8, 32) +# define VINT8x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x32_LOAD_DEFINED +VEC_AVX2_LOAD(/* nothing */, 8, 32) +# define VINT8x32_LOAD_DEFINED +#endif + +#ifndef VINT8x32_STORE_ALIGNED_DEFINED +VEC_AVX2_STORE_ALIGNED(/* nothing */, 8, 32) +# define VINT8x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x32_STORE_DEFINED +VEC_AVX2_STORE(/* nothing */, 8, 32) +# define VINT8x32_STORE_DEFINED +#endif + +#ifndef VINT8x32_ADD_DEFINED +VEC_AVX2_OP(add, /* nothing */, 8, 32, i, add) +# define VINT8x32_ADD_DEFINED +#endif + +#ifndef VINT8x32_SUB_DEFINED +VEC_AVX2_OP(sub, /* nothing */, 8, 32, i, sub) +# define VINT8x32_SUB_DEFINED +#endif + +#ifndef VINT8x32_AND_DEFINED +VEC_AVX2_BITWISE(and, /* nothing */, 8, 32) +# define VINT8x32_AND_DEFINED +#endif + +#ifndef VINT8x32_OR_DEFINED +VEC_AVX2_BITWISE(or, /* nothing */, 8, 32) +# define VINT8x32_OR_DEFINED +#endif + +#ifndef VINT8x32_XOR_DEFINED +VEC_AVX2_BITWISE(xor, /* nothing */, 8, 32) +# define VINT8x32_XOR_DEFINED +#endif + +/* u8x32 */ + +#ifndef VUINT8x32_LOAD_ALIGNED_DEFINED +VEC_AVX2_LOAD_ALIGNED(u, 8, 32) +# define VUINT8x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x32_LOAD_DEFINED +VEC_AVX2_LOAD(u, 8, 32) +# define VUINT8x32_LOAD_DEFINED +#endif + +#ifndef VUINT8x32_STORE_ALIGNED_DEFINED +VEC_AVX2_STORE_ALIGNED(u, 8, 32) +# define VUINT8x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x32_STORE_DEFINED +VEC_AVX2_STORE(u, 8, 32) +# define VUINT8x32_STORE_DEFINED +#endif + +#ifndef VUINT8x32_ADD_DEFINED +VEC_AVX2_OP(add, u, 8, 32, i, add) +# define VUINT8x32_ADD_DEFINED +#endif + +#ifndef VUINT8x32_SUB_DEFINED +VEC_AVX2_OP(sub, u, 8, 32, i, sub) +# define VUINT8x32_SUB_DEFINED +#endif + +#ifndef VUINT8x32_AND_DEFINED +VEC_AVX2_BITWISE(and, u, 8, 32) +# define VUINT8x32_AND_DEFINED +#endif + +#ifndef VUINT8x32_OR_DEFINED +VEC_AVX2_BITWISE(or, u, 8, 32) +# define VUINT8x32_OR_DEFINED +#endif + +#ifndef VUINT8x32_XOR_DEFINED +VEC_AVX2_BITWISE(xor, u, 8, 32) +# define VUINT8x32_XOR_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* 16x16 */ + +#ifndef 
VINT16x16_LOAD_ALIGNED_DEFINED +VEC_AVX2_LOAD_ALIGNED(/* nothing */, 16, 16) +# define VINT16x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x16_LOAD_DEFINED +VEC_AVX2_LOAD(/* nothing */, 16, 16) +# define VINT16x16_LOAD_DEFINED +#endif + +#ifndef VINT16x16_STORE_ALIGNED_DEFINED +VEC_AVX2_STORE_ALIGNED(/* nothing */, 16, 16) +# define VINT16x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x16_STORE_DEFINED +VEC_AVX2_STORE(/* nothing */, 16, 16) +# define VINT16x16_STORE_DEFINED +#endif + +#ifndef VINT16x16_ADD_DEFINED +VEC_AVX2_OP(add, /* nothing */, 16, 16, i, add) +# define VINT16x16_ADD_DEFINED +#endif + +#ifndef VINT16x16_SUB_DEFINED +VEC_AVX2_OP(sub, /* nothing */, 16, 16, i, sub) +# define VINT16x16_SUB_DEFINED +#endif + +#ifndef VINT16x16_AND_DEFINED +VEC_AVX2_BITWISE(and, /* nothing */, 16, 16) +# define VINT16x16_AND_DEFINED +#endif + +#ifndef VINT16x16_OR_DEFINED +VEC_AVX2_BITWISE(or, /* nothing */, 16, 16) +# define VINT16x16_OR_DEFINED +#endif -#define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \ - VEC_AVX2_DEFINE_OPERATIONS_SIGN( , bits, size) \ - VEC_AVX2_DEFINE_OPERATIONS_SIGN(u, bits, size) +#ifndef VINT16x16_XOR_DEFINED +VEC_AVX2_BITWISE(xor, /* nothing */, 16, 16) +# define VINT16x16_XOR_DEFINED +#endif + +/* u16x16 */ + +#ifndef VUINT16x16_LOAD_ALIGNED_DEFINED +VEC_AVX2_LOAD_ALIGNED(u, 16, 16) +# define VUINT16x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x16_LOAD_DEFINED +VEC_AVX2_LOAD(u, 16, 16) +# define VUINT16x16_LOAD_DEFINED +#endif + +#ifndef VUINT16x16_STORE_ALIGNED_DEFINED +VEC_AVX2_STORE_ALIGNED(u, 16, 16) +# define VUINT16x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x16_STORE_DEFINED +VEC_AVX2_STORE(u, 16, 16) +# define VUINT16x16_STORE_DEFINED +#endif + +#ifndef VUINT16x16_ADD_DEFINED +VEC_AVX2_OP(add, u, 16, 16, i, add) +# define VUINT16x16_ADD_DEFINED +#endif -VEC_AVX2_DEFINE_OPERATIONS(8, 32) -VEC_AVX2_DEFINE_OPERATIONS(16, 16) -VEC_AVX2_DEFINE_OPERATIONS(32, 8) -VEC_AVX2_DEFINE_OPERATIONS(64, 4) +#ifndef VUINT16x16_SUB_DEFINED +VEC_AVX2_OP(sub, u, 16, 16, i, sub) +# define VUINT16x16_SUB_DEFINED +#endif + +#ifndef VUINT16x16_AND_DEFINED +VEC_AVX2_BITWISE(and, u, 16, 16) +# define VUINT16x16_AND_DEFINED +#endif + +#ifndef VUINT16x16_OR_DEFINED +VEC_AVX2_BITWISE(or, u, 16, 16) +# define VUINT16x16_OR_DEFINED +#endif + +#ifndef VUINT16x16_XOR_DEFINED +VEC_AVX2_BITWISE(xor, u, 16, 16) +# define VUINT16x16_XOR_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* 32x8 */ + +#ifndef VINT32x8_LOAD_ALIGNED_DEFINED +VEC_AVX2_LOAD_ALIGNED(/* nothing */, 32, 8) +# define VINT32x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT32x8_LOAD_DEFINED +VEC_AVX2_LOAD(/* nothing */, 32, 8) +# define VINT32x8_LOAD_DEFINED +#endif + +#ifndef VINT32x8_STORE_ALIGNED_DEFINED +VEC_AVX2_STORE_ALIGNED(/* nothing */, 32, 8) +# define VINT32x8_STORE_ALIGNED_DEFINED +#endif -#undef VEC_AVX2_DEFINE_OPERATIONS -#undef VEC_AVX2_DEFINE_OPERATIONS_SIGN -#undef VEC_AVX2_MUL_8x32 -#undef VEC_AVX2_MUL_16x16 -#undef VEC_AVX2_MUL_32x8 -#undef VEC_AVX2_MUL_64x4 -#undef VEC_AVX2_OPERATION_8x32_16x16 -#undef VEC_AVX2_OPERATION_8x32_32x8 -#undef VEC_AVX2_OPERATION_16x16 -#undef VEC_AVX2_LSHIFT_8x32 -#undef VEC_AVX2_LSHIFT_16x16 -#undef VEC_AVX2_LSHIFT_32x8 -#undef VEC_AVX2_LSHIFT_64x4 -#undef VEC_AVX2_RSHIFT_8x32 -#undef VEC_AVX2_RSHIFT_16x16 -#undef VEC_AVX2_RSHIFT_32x8 -#undef VEC_AVX2_aRSHIFT_64x4 -#undef VEC_AVX2_lRSHIFT_64x4 -#undef VEC_AVX2_RSHIFT_64x4 +#ifndef VINT32x8_STORE_DEFINED +VEC_AVX2_STORE(/* nothing */, 32, 8) +# define 
VINT32x8_STORE_DEFINED +#endif + +#ifndef VINT32x8_ADD_DEFINED +VEC_AVX2_OP(add, /* nothing */, 32, 8, i, add) +# define VINT32x8_ADD_DEFINED +#endif + +#ifndef VINT32x8_SUB_DEFINED +VEC_AVX2_OP(sub, /* nothing */, 32, 8, i, sub) +# define VINT32x8_SUB_DEFINED +#endif + +#ifndef VINT32x8_AND_DEFINED +VEC_AVX2_BITWISE(and, /* nothing */, 32, 8) +# define VINT32x8_AND_DEFINED +#endif + +#ifndef VINT32x8_OR_DEFINED +VEC_AVX2_BITWISE(or, /* nothing */, 32, 8) +# define VINT32x8_OR_DEFINED +#endif + +#ifndef VINT32x8_XOR_DEFINED +VEC_AVX2_BITWISE(xor, /* nothing */, 32, 8) +# define VINT32x8_XOR_DEFINED +#endif + +/* u32x8 */ + +#ifndef VUINT32x8_LOAD_ALIGNED_DEFINED +VEC_AVX2_LOAD_ALIGNED(u, 32, 8) +# define VUINT32x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x8_LOAD_DEFINED +VEC_AVX2_LOAD(u, 32, 8) +# define VUINT32x8_LOAD_DEFINED +#endif + +#ifndef VUINT32x8_STORE_ALIGNED_DEFINED +VEC_AVX2_STORE_ALIGNED(u, 32, 8) +# define VUINT32x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x8_STORE_DEFINED +VEC_AVX2_STORE(u, 32, 8) +# define VUINT32x8_STORE_DEFINED +#endif + +#ifndef VUINT32x8_ADD_DEFINED +VEC_AVX2_OP(add, u, 32, 8, i, add) +# define VUINT32x8_ADD_DEFINED +#endif + +#ifndef VUINT32x8_SUB_DEFINED +VEC_AVX2_OP(sub, u, 32, 8, i, sub) +# define VUINT32x8_SUB_DEFINED +#endif + +#ifndef VUINT32x8_AND_DEFINED +VEC_AVX2_BITWISE(and, u, 32, 8) +# define VUINT32x8_AND_DEFINED +#endif + +#ifndef VUINT32x8_OR_DEFINED +VEC_AVX2_BITWISE(or, u, 32, 8) +# define VUINT32x8_OR_DEFINED +#endif + +#ifndef VUINT32x8_XOR_DEFINED +VEC_AVX2_BITWISE(xor, u, 32, 8) +# define VUINT32x8_XOR_DEFINED +#endif #endif /* VEC_IMPL_X86_AVX2_H_ */
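(A note on the 64-bit multiply defined earlier in this file: AVX2 has no 64-bit "mullo" intrinsic, so VEC_AVX2_MUL_64x4 rebuilds it from _mm256_mul_epu32, which multiplies the low 32 bits of each 64-bit lane. Writing a = ahi*2^32 + alo and b = bhi*2^32 + blo, the low 64 bits of a*b are alo*blo + ((ahi*blo + alo*bhi) << 32); the ahi*bhi term only affects bits above 64 and is dropped. A standalone scalar check of that identity, not part of the library:)

#include <stdint.h>
#include <stdio.h>

/* same decomposition as VEC_AVX2_MUL_64x4, done on scalar 64-bit integers */
static uint64_t mul64_via_32(uint64_t a, uint64_t b)
{
	uint64_t alo = a & 0xffffffffu, ahi = a >> 32;
	uint64_t blo = b & 0xffffffffu, bhi = b >> 32;
	uint64_t ac = alo * blo;        /* ac = _mm256_mul_epu32(vec1, vec2)       */
	uint64_t bc = ahi * blo;        /* bc = _mm256_mul_epu32(vec1 >> 32, vec2) */
	uint64_t ad = alo * bhi;        /* ad = _mm256_mul_epu32(vec1, vec2 >> 32) */
	return ((bc + ad) << 32) + ac;  /* hi = (bc + ad) << 32; result = hi + ac  */
}

int main(void)
{
	uint64_t a = 0x123456789abcdef0u, b = 0xfedcba9876543210u;
	printf("%s\n", (mul64_via_32(a, b) == a * b) ? "ok" : "mismatch");  /* prints ok */
	return 0;
}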
--- a/include/vec/impl/x86/avx512f.h Fri Apr 25 17:40:55 2025 -0400 +++ b/include/vec/impl/x86/avx512f.h Sat Apr 26 01:04:35 2025 -0400 @@ -25,272 +25,347 @@ #ifndef VEC_IMPL_X86_AVX512F_H_ #define VEC_IMPL_X86_AVX512F_H_ -#define VEC_AVX512F_OPERATION_8x64(op, sign) \ - do { \ - /* unpack and add */ \ - __m512i dst_1 = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \ - __m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 8), _mm512_srli_epi32(vec2.avx512f, 8)); \ - __m512i dst_3 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \ - __m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 24), _mm512_srli_epi32(vec2.avx512f, 24)); \ - \ - /* repack */ \ - v##sign##int8x64 vec; \ - vec.avx512f = _mm512_or_si512( \ - _mm512_or_si512( \ - _mm512_slli_epi32(dst_4, 8), \ - _mm512_srli_epi32(_mm512_slli_epi32(dst_3, 8), 8) \ - ), \ - _mm512_or_si512( \ - _mm512_slli_epi32(_mm512_slli_epi32(dst_2, 8), 16), \ - _mm512_srli_epi32(_mm512_slli_epi32(dst_1, 8), 24) \ - ) \ - ); \ - return vec; \ - } while (0) - -#define VEC_AVX512F_OPERATION_16x32(op, sign) \ - do { \ - /* unpack and add */ \ - __m512i dst_even = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \ - __m512i dst_odd = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \ +#define VEC_AVX512F_OPERATION_EX_EX(name, op, sign, bits, size, secondsign, intlsign) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##secondsign##int##bits##x##size vec2) \ + { \ + vec1.avx512f = _mm512_##op##_ep##intlsign##bits(vec1.avx512f, vec2.avx512f); \ \ - /* repack */ \ - v##sign##int16x32 vec; \ - vec.avx512f = _mm512_or_si512( \ - _mm512_slli_epi32(dst_odd, 16), \ - _mm512_srli_epi32(_mm512_slli_epi32(dst_even, 16), 16) \ - ); \ - return vec; \ - } while (0) + return vec1; \ + } + +#define VEC_AVX512F_MINMAX_TEMPLATE(SIGN, BITS, SIZE, INTLSIGN, OP) \ + VEC_AVX512F_OPERATION_EX_EX(OP, OP, SIGN, BITS, SIZE, SIGN, INTLSIGN) -#define VEC_AVX512F_ADD_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(add, sign) +#ifndef VINT32x16_MIN_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE( , 32, 16, i, min) +# define VINT32x16_MIN_DEFINED +#endif -#define VEC_AVX512F_ADD_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(add, sign) +#ifndef VINT32x16_MAX_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE( , 32, 16, i, max) +# define VINT32x16_MAX_DEFINED +#endif + +#ifndef VUINT32x16_MIN_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE(u, 32, 16, u, min) +# define VUINT32x16_MIN_DEFINED +#endif -#define VEC_AVX512F_ADD_32x16(sign) \ - do { \ - v##sign##int32x16 vec; \ - vec.avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) +#ifndef VUINT32x16_MAX_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE(u, 32, 16, u, max) +# define VUINT32x16_MAX_DEFINED +#endif + +#ifndef VINT64x8_MIN_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE( , 64, 8, i, min) +# define VINT64x8_MIN_DEFINED +#endif -#define VEC_AVX512F_ADD_64x8(sign) \ - do { \ - v##sign##int64x8 vec; \ - vec.avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) - -#define VEC_AVX512F_SUB_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(sub, sign) +#ifndef VINT64x8_MAX_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE( , 64, 8, i, max) +# define VINT64x8_MAX_DEFINED +#endif -#define VEC_AVX512F_SUB_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(sub, sign) +#ifndef VUINT64x8_MIN_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE(u, 64, 8, u, min) +# define VUINT64x8_MIN_DEFINED 
+#endif -#define VEC_AVX512F_SUB_32x16(sign) \ - do { \ - v##sign##int32x16 vec; \ - vec.avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) +#ifndef VUINT64x8_MAX_DEFINED +VEC_AVX512F_MINMAX_TEMPLATE(u, 64, 8, u, max) +# define VUINT64x8_MAX_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ -#define VEC_AVX512F_SUB_64x8(sign) \ - do { \ - v##sign##int64x8 vec; \ - vec.avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) +#define VEC_AVX512F_OPERATION_EX(name, op, sign, bits, size, secondsign) \ + VEC_AVX512F_OPERATION_EX_EX(name, op, sign, bits, size, secondsign, i) + +#define VEC_AVX512F_OPERATION(name, op, sign, bits, size) \ + VEC_AVX512F_OPERATION_EX(name, op, sign, bits, size, sign) -#define VEC_AVX512F_MUL_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(mullo, sign) +#define VEC_AVX512F_OPERATION_SHIFT(name, op, sign, bits, size) \ + VEC_AVX512F_OPERATION_EX(name, op, sign, bits, size, u) -#define VEC_AVX512F_MUL_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(mullo, sign) +#define VEC_AVX512F_XRSHIFT(name, bits, size, sign, aORl) \ + VEC_AVX512F_OPERATION_SHIFT(name, sr##aORl##v, sign, bits, size) -#define VEC_AVX512F_MUL_32x16(sign) \ - do { \ - v##sign##int32x16 vec; \ - vec.avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) +/* bitshift */ +#ifndef VINT32x16_LRSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(lrshift, 32, 16, /* nothing */, l) +# define VINT32x16_LRSHIFT_DEFINED +#endif -#define VEC_AVX512F_MUL_64x8(sign) \ - do { \ - __m512i ac = _mm512_mul_epu32(vec1.avx512f, vec2.avx512f); \ - __m512i b = _mm512_srli_epi64(vec1.avx512f, 32); \ - __m512i bc = _mm512_mul_epu32(b, vec2.avx512f); \ - __m512i d = _mm512_srli_epi64(vec2.avx512f, 32); \ - __m512i ad = _mm512_mul_epu32(vec1.avx512f, d); \ - __m512i hi = _mm512_add_epi64(bc, ad); \ - hi = _mm512_slli_epi64(hi, 32); \ - \ - v##sign##int64x8 vec; \ - vec.avx512f = _mm512_add_epi64(hi, ac); \ - return vec; \ - } while (0) +#ifndef VINT64x8_LRSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(lrshift, 64, 8, /* nothing */, l) +# define VINT64x8_LRSHIFT_DEFINED +#endif -#define VEC_AVX512F_LSHIFT_8x64(sign) \ - VEC_AVX512F_OPERATION_8x64(sllv, sign) +#ifndef VUINT32x16_LRSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(lrshift, 32, 16, u, l) +# define VUINT32x16_LRSHIFT_DEFINED +#endif -#define VEC_AVX512F_LSHIFT_16x32(sign) \ - VEC_AVX512F_OPERATION_16x32(sllv, sign) - -#define VEC_AVX512F_LSHIFT_32x16(sign) \ - do { \ - v##sign##int32x16 vec; \ - vec.avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) +#ifndef VUINT64x8_LRSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(lrshift, 64, 8, u, l) +# define VUINT64x8_LRSHIFT_DEFINED +#endif -#define VEC_AVX512F_LSHIFT_64x8(sign) \ - do { \ - v##sign##int64x8 vec; \ - vec.avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) +#ifndef VINT32x16_RSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(rshift, 32, 16, /* nothing */, a) +# define VINT32x16_RSHIFT_DEFINED +#endif -#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) \ - VEC_AVX512F_OPERATION_8x64(sr##aORl##v, sign) +#ifndef VINT64x8_RSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(rshift, 64, 8, /* nothing */, a) +# define VINT64x8_RSHIFT_DEFINED +#endif -#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) \ - VEC_AVX512F_OPERATION_16x32(sr##aORl##v, sign) - -#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \ - do { \ - v##sign##int32x16 vec; \ - vec.avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, 
vec2.avx512f); \ - return vec; \ - } while (0) +#ifndef VUINT32x16_RSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(rshift, 32, 16, u, l) +# define VUINT32x16_RSHIFT_DEFINED +#endif -#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \ - do { \ - v##sign##int64x8 vec; \ - vec.avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } while (0) +#ifndef VUINT64x8_RSHIFT_DEFINED +VEC_AVX512F_XRSHIFT(rshift, 64, 8, u, l) +# define VUINT64x8_RSHIFT_DEFINED +#endif -#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \ +/* ------------------------------------------------------------------------ */ + +#define VEC_AVX512F_LOAD_ALIGNED(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ vec.avx512f = _mm512_load_si512((const __m512i *)in); \ return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \ + } + +#define VEC_AVX512F_LOAD(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ vec.avx512f = _mm512_loadu_si512((const __m512i *)in); \ return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + } + +#define VEC_AVX512F_STORE_ALIGNED(sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm512_store_si512((__m512i *)out, vec.avx512f); \ - } \ - \ - static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + } + +#define VEC_AVX512F_STORE(sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm512_storeu_si512((__m512i *)out, vec.avx512f); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX512F_ADD_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX512F_SUB_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_AVX512F_MUL_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ 
- { \ - v##sign##int##bits##x##size vec; \ - vec.avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_AVX512F_LSHIFT_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + } + +#define VEC_AVX512F_BITWISE(op, sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##op(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - VEC_AVX512F_RSHIFT_##bits##x##size(sign, a); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \ - } \ - \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_avx512f_load_aligned, \ - v##sign##int##bits##x##size##_avx512f_load, \ - v##sign##int##bits##x##size##_avx512f_store_aligned, \ - v##sign##int##bits##x##size##_avx512f_store, \ - v##sign##int##bits##x##size##_avx512f_add, \ - v##sign##int##bits##x##size##_avx512f_sub, \ - v##sign##int##bits##x##size##_avx512f_mul, \ - /* .div = */ NULL, \ - /* .avg = */ NULL, \ - v##sign##int##bits##x##size##_avx512f_and, \ - v##sign##int##bits##x##size##_avx512f_or, \ - v##sign##int##bits##x##size##_avx512f_xor, \ - /* .not = */ NULL, \ - v##sign##int##bits##x##size##_avx512f_lshift, \ - v##sign##int##bits##x##size##_avx512f_rshift, \ - v##sign##int##bits##x##size##_avx512f_lrshift, \ - }; + vec1.avx512f = _mm512_##op##_si512(vec1.avx512f, vec2.avx512f); \ + return vec1; \ + } + +#ifndef VINT32x16_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(, 32, 16) +# define VINT32x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT32x16_LOAD_DEFINED +VEC_AVX512F_LOAD(, 32, 16) +# define VINT32x16_LOAD_DEFINED +#endif + +#ifndef VINT32x16_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(, 32, 16) +# define VINT32x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT32x16_STORE_DEFINED +VEC_AVX512F_STORE(, 32, 16) +# define VINT32x16_STORE_DEFINED +#endif + +#ifndef VINT32x16_ADD_DEFINED +VEC_AVX512F_OPERATION_EX_EX(add, add, , 32, 16, , i) +# define VINT32x16_ADD_DEFINED +#endif + +#ifndef VINT32x16_SUB_DEFINED +VEC_AVX512F_OPERATION_EX_EX(sub, sub, , 32, 16, , i) +# define VINT32x16_SUB_DEFINED +#endif + +#ifndef VINT32x16_MUL_DEFINED +VEC_AVX512F_OPERATION_EX_EX(mul, mullo, , 32, 16, , i) +# define VINT32x16_MUL_DEFINED +#endif + +#ifndef VINT32x16_AND_DEFINED +VEC_AVX512F_BITWISE(and, , 32, 16) +# define VINT32x16_AND_DEFINED +#endif + +#ifndef VINT32x16_OR_DEFINED +VEC_AVX512F_BITWISE(or, , 32, 16) +# define VINT32x16_OR_DEFINED +#endif + +#ifndef VINT32x16_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, , 32, 16) +# define VINT32x16_XOR_DEFINED +#endif + +#ifndef VUINT32x16_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(u, 32, 16) +# define VUINT32x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x16_LOAD_DEFINED +VEC_AVX512F_LOAD(u, 32, 16) +# define VUINT32x16_LOAD_DEFINED +#endif + +#ifndef VUINT32x16_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(u, 32, 16) +# define VUINT32x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x16_STORE_DEFINED +VEC_AVX512F_STORE(u, 32, 16) +# define VUINT32x16_STORE_DEFINED +#endif + +#ifndef 
VUINT32x16_ADD_DEFINED +VEC_AVX512F_OPERATION_EX_EX(add, add, u, 32, 16, u, i) +# define VUINT32x16_ADD_DEFINED +#endif + +#ifndef VUINT32x16_SUB_DEFINED +VEC_AVX512F_OPERATION_EX_EX(sub, sub, u, 32, 16, u, i) +# define VUINT32x16_SUB_DEFINED +#endif + +#ifndef VUINT32x16_MUL_DEFINED +VEC_AVX512F_OPERATION_EX_EX(mul, mullo, u, 32, 16, u, i) +# define VUINT32x16_MUL_DEFINED +#endif + +#ifndef VUINT32x16_AND_DEFINED +VEC_AVX512F_BITWISE(and, u, 32, 16) +# define VUINT32x16_AND_DEFINED +#endif + +#ifndef VUINT32x16_OR_DEFINED +VEC_AVX512F_BITWISE(or, u, 32, 16) +# define VUINT32x16_OR_DEFINED +#endif -#define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \ - VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size) \ - VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size) +#ifndef VUINT32x16_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, u, 32, 16) +# define VUINT32x16_XOR_DEFINED +#endif + +#ifndef VINT64x8_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(, 64, 8) +# define VINT64x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT64x8_LOAD_DEFINED +VEC_AVX512F_LOAD(, 64, 8) +# define VINT64x8_LOAD_DEFINED +#endif + +#ifndef VINT64x8_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(, 64, 8) +# define VINT64x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT64x8_STORE_DEFINED +VEC_AVX512F_STORE(, 64, 8) +# define VINT64x8_STORE_DEFINED +#endif -VEC_AVX512F_DEFINE_OPERATIONS(8, 64) -VEC_AVX512F_DEFINE_OPERATIONS(16, 32) -VEC_AVX512F_DEFINE_OPERATIONS(32, 16) -VEC_AVX512F_DEFINE_OPERATIONS(64, 8) +#ifndef VINT64x8_ADD_DEFINED +VEC_AVX512F_OPERATION_EX_EX(add, add, , 64, 8, , i) +# define VINT64x8_ADD_DEFINED +#endif + +#ifndef VINT64x8_SUB_DEFINED +VEC_AVX512F_OPERATION_EX_EX(sub, sub, , 64, 8, , i) +# define VINT64x8_SUB_DEFINED +#endif + +#ifndef VINT64x8_MUL_DEFINED +VEC_AVX512F_OPERATION_EX_EX(mul, mullox, , 64, 8, , i) +# define VINT64x8_MUL_DEFINED +#endif + +#ifndef VINT64x8_AND_DEFINED +VEC_AVX512F_BITWISE(and, , 64, 8) +# define VINT64x8_AND_DEFINED +#endif + +#ifndef VINT64x8_OR_DEFINED +VEC_AVX512F_BITWISE(or, , 64, 8) +# define VINT64x8_OR_DEFINED +#endif -#undef VEC_AVX512F_DEFINE_OPERATIONS -#undef VEC_AVX512F_DEFINE_OPERATIONS_SIGN -#undef VEC_AVX512F_MUL_8x64 -#undef VEC_AVX512F_MUL_16x32 -#undef VEC_AVX512F_MUL_32x16 -#undef VEC_AVX512F_MUL_64x8 +#ifndef VINT64x8_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, , 64, 8) +# define VINT64x8_XOR_DEFINED +#endif + +#ifndef VUINT64x8_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(u, 64, 8) +# define VUINT64x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT64x8_LOAD_DEFINED +VEC_AVX512F_LOAD(u, 64, 8) +# define VUINT64x8_LOAD_DEFINED +#endif + +#ifndef VUINT64x8_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(u, 64, 8) +# define VUINT64x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT64x8_STORE_DEFINED +VEC_AVX512F_STORE(u, 64, 8) +# define VUINT64x8_STORE_DEFINED +#endif -#undef VEC_AVX512F_LSHIFT_8x64 -#undef VEC_AVX512F_LSHIFT_16x32 -#undef VEC_AVX512F_LSHIFT_32x16 -#undef VEC_AVX512F_LSHIFT_64x8 +#ifndef VUINT64x8_ADD_DEFINED +VEC_AVX512F_OPERATION_EX_EX(add, add, u, 64, 8, u, i) +# define VUINT64x8_ADD_DEFINED +#endif + +#ifndef VUINT64x8_SUB_DEFINED +VEC_AVX512F_OPERATION_EX_EX(sub, sub, u, 64, 8, u, i) +# define VUINT64x8_SUB_DEFINED +#endif + +#ifndef VUINT64x8_MUL_DEFINED +VEC_AVX512F_OPERATION_EX_EX(mul, mullox, u, 64, 8, u, i) +# define VUINT64x8_MUL_DEFINED +#endif -#undef VEC_AVX512F_RSHIFT_8x64 -#undef VEC_AVX512F_RSHIFT_16x32 -#undef VEC_AVX512F_RSHIFT_32x16 -#undef VEC_AVX512F_RSHIFT_64x8 +#ifndef VUINT64x8_AND_DEFINED +VEC_AVX512F_BITWISE(and, u, 64, 8) +# define 
VUINT64x8_AND_DEFINED +#endif + +#ifndef VUINT64x8_OR_DEFINED +VEC_AVX512F_BITWISE(or, u, 64, 8) +# define VUINT64x8_OR_DEFINED +#endif + +#ifndef VUINT64x8_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, u, 64, 8) +# define VUINT64x8_XOR_DEFINED +#endif #endif /* VEC_IMPL_X86_AVX512F_H_ */
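For reference, each guarded template invocation above expands to a plain inline function. A sketch of what VEC_AVX512F_OPERATION_EX_EX(add, add, , 32, 16, , i) generates (VEC_FUNC_IMPL and the vint32x16 union with its .avx512f member are assumed to come from the core vec headers):

/* expansion sketch of VEC_AVX512F_OPERATION_EX_EX(add, add, , 32, 16, , i) */
VEC_FUNC_IMPL vint32x16 vint32x16_add(vint32x16 vec1, vint32x16 vec2)
{
	/* one _mm512_add_epi32 per call; the result reuses vec1's storage */
	vec1.avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f);

	return vec1;
}

The VINT*/VUINT*_DEFINED guards around every instantiation keep an operation from being defined twice when more than one header could provide it.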
--- a/include/vec/impl/x86/mmx.h Fri Apr 25 17:40:55 2025 -0400 +++ b/include/vec/impl/x86/mmx.h Sat Apr 26 01:04:35 2025 -0400 @@ -25,190 +25,516 @@ #ifndef VEC_IMPL_X86_MMX_H_ #define VEC_IMPL_X86_MMX_H_ -#define VEC_MMX_OPERATION_8x8(op, sign) \ - do { \ - /* unpack and multiply */ \ - __m64 dst_even = _mm_##op##_pi16(vec1.mmx, vec2.mmx); \ - __m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1.mmx, 8), _mm_srli_pi16(vec2.mmx, 8)); \ - \ - /* repack */ \ - v##sign##int8x8 vec; \ - vec.mmx = _mm_or_si64( \ - _mm_slli_pi16(dst_odd, 8), \ - _mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \ - ); \ - return vec; \ - } while (0) +/* ------------------------------------------------------------------------ */ -// shifting -#define VEC_MMX_LSHIFT_8x8(sign) \ - VEC_MMX_OPERATION_8x8(sll, sign) +#define VEC_MMX_OP_EX(name, intlsign, op, sign, bits, size, VARS, TRANS1, TRANS2) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VARS \ + \ + TRANS1 \ + \ + vec1.mmx = _mm_##op##_p##intlsign##bits(vec1.mmx, vec2.mmx); \ + \ + TRANS2 \ + \ + return vec1; \ + } -#define VEC_MMX_LSHIFT_16x4(sign) \ - do { \ - v##sign##int16x4 vec; \ - vec.mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx); \ - return vec; \ - } while (0) +#define VEC_MMX_OP(name, intlsign, op, sign, bits, size) \ + VEC_MMX_OP_EX(name, intlsign, op, sign, bits, size, /* nothing */, /* nothing */, /* nothing */) -#define VEC_MMX_LSHIFT_32x2(sign) \ - do { \ - v##sign##int32x2 vec; \ - vec.mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx); \ - return vec; \ - } while (0) - -#define VEC_MMX_RSHIFT_8x8(sign, aORl) \ - VEC_MMX_OPERATION_8x8(sr##aORl, sign) +/* ------------------------------------------------------------------------ */ +/* comparison */ -#define VEC_MMX_RSHIFT_16x4(sign, aORl) \ - do { \ - v##sign##int16x4 vec; \ - vec.mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx); \ - return vec; \ - } while (0) +/* helper funcs */ +#define VEC_xMMX_CMP(name, op, sign, bits, size, first, second, VARS, TRANS1, TRANS2) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VARS \ + \ + TRANS1 \ + \ + vec1.mmx = _mm_##op##_pi##bits(vec##first.mmx, vec##second.mmx); \ + \ + TRANS2 \ + \ + return vec1; \ + } -#define VEC_MMX_RSHIFT_32x2(sign, aORl) \ - do { \ - v##sign##int32x2 vec; \ - vec.mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx); \ - return vec; \ - } while (0) - -// shared between MMX variations -#define VEC_MMX_MUL_8x8(sign) \ - VEC_MMX_OPERATION_8x8(mullo, sign) +#define VEC_MMX_CMP(name, op, bits, size, first, second) \ + VEC_xMMX_CMP(name, op, /* nothing */, bits, size, first, second, /* nothing */, /* nothing */, /* nothing */) -#define VEC_MMX_MUL_16x4(sign) \ - do { \ - /* we have a real instruction for this */ \ - v##sign##int16x4 vec; \ - vec.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \ - return vec; \ - } while (0) +#define VEC_uMMX_CMP(name, op, bits, size, first, second) \ + VEC_xMMX_CMP(name, op, u, bits, size, first, second, \ + __m64 xor_val = _mm_set1_pi##bits((vec_int##bits)(1u << (bits - 1))); \ + , { \ + vec1.mmx = _mm_xor_si64(vec1.mmx, xor_val); \ + vec2.mmx = _mm_xor_si64(vec2.mmx, xor_val); \ + }, \ + { \ + /* nothing */ \ + }) -#define VEC_MMX_MUL_32x2(sign) \ - do { \ - __m64 ac = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \ - __m64 b = _mm_srli_pi32(vec1.mmx, 16); \ - __m64 bc = _mm_mullo_pi16(b, vec2.mmx); \ - __m64 d = 
_mm_srli_pi32(vec2.mmx, 16); \ - __m64 ad = _mm_mullo_pi16(vec1.mmx, d); \ - __m64 hi = _mm_add_pi32(bc, ad); \ - hi = _mm_slli_pi32(hi, 16); \ - \ - v##sign##int32x2 vec; \ - vec.mmx = _mm_add_pi32(hi, ac); \ - return vec; \ - } while (0) +#define VEC_MMX_CMPEQ(sign, bits, size) VEC_xMMX_CMP(cmpeq, cmpeq, sign, bits, size, 1, 2, , ,) +#define VEC_MMX_CMPLT(sign, bits, size) VEC_##sign##MMX_CMP(cmplt, cmpgt, bits, size, 2, 1) +#define VEC_MMX_CMPGT(sign, bits, size) VEC_##sign##MMX_CMP(cmpgt, cmpgt, bits, size, 1, 2) -#define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const vec_##sign##int##bits in[size]) \ +/* ------------------------------------------------------------------------ */ + +#define VEC_MMX_SPLAT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ { \ v##sign##int##bits##x##size vec; \ - memcpy(&vec.mmx, in, sizeof(vec.mmx)); \ + vec.mmx = _mm_set1_pi##bits(x); \ return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - memcpy(out, &vec.mmx, sizeof(vec.mmx)); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + } + +#define VEC_MMX_LOAD_EX(name, sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ - vec.mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_MMX_MUL_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.mmx = _mm_and_si64(vec1.mmx, vec2.mmx); \ + memcpy(&vec, in, sizeof(vec)); \ return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + } + +#define VEC_MMX_LOAD(sign, bits, size) VEC_MMX_LOAD_EX(load, sign, bits, size) +#define VEC_MMX_LOAD_ALIGNED(sign, bits, size) VEC_MMX_LOAD_EX(load_aligned, sign, bits, size) + +#define VEC_MMX_STORE_EX(name, sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ - v##sign##int##bits##x##size vec; \ - vec.mmx = _mm_or_si64(vec1.mmx, vec2.mmx); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + memcpy(out, &vec, sizeof(vec)); \ + } + +#define VEC_MMX_STORE(sign, bits, size) VEC_MMX_STORE_EX(store, sign, bits, size) +#define VEC_MMX_STORE_ALIGNED(sign, bits, size) VEC_MMX_STORE_EX(store_aligned, sign, bits, size) + +#define VEC_MMX_BITWISE(name, sign, bits, size) \ + VEC_FUNC_IMPL 
v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - v##sign##int##bits##x##size vec; \ - vec.mmx = _mm_xor_si64(vec1.mmx, vec2.mmx); \ - return vec; \ - } \ + vec1.mmx = _mm_##name##_si64(vec1.mmx, vec2.mmx); \ \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_MMX_LSHIFT_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + return vec1; \ + } + +/* ------------------------------------------------------------------------ */ + +#ifndef VINT8x8_SPLAT_DEFINED +VEC_MMX_SPLAT(, 8, 8) +# define VINT8x8_SPLAT_DEFINED +#endif + +#ifndef VINT8x8_LOAD_DEFINED +VEC_MMX_LOAD(, 8, 8) +# define VINT8x8_LOAD_DEFINED +#endif + +#ifndef VINT8x8_LOAD_ALIGNED_DEFINED +VEC_MMX_LOAD_ALIGNED(, 8, 8) +# define VINT8x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x8_STORE_DEFINED +VEC_MMX_STORE(, 8, 8) +# define VINT8x8_STORE_DEFINED +#endif + +#ifndef VINT8x8_STORE_ALIGNED_DEFINED +VEC_MMX_STORE_ALIGNED(, 8, 8) +# define VINT8x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x8_ADD_DEFINED +VEC_MMX_OP(add, i, add, /* nothing */, 8, 8) +# define VINT8x8_ADD_DEFINED +#endif + +#ifndef VINT8x8_SUB_DEFINED +VEC_MMX_OP(sub, i, sub, /* nothing */, 8, 8) +# define VINT8x8_SUB_DEFINED +#endif + +#ifndef VINT8x8_AND_DEFINED +VEC_MMX_BITWISE(and, /* nothing */, 8, 8) +# define VINT8x8_AND_DEFINED +#endif + +#ifndef VINT8x8_OR_DEFINED +VEC_MMX_BITWISE(or, /* nothing */, 8, 8) +# define VINT8x8_OR_DEFINED +#endif + +#ifndef VINT8x8_XOR_DEFINED +VEC_MMX_BITWISE(xor, /* nothing */, 8, 8) +# define VINT8x8_XOR_DEFINED +#endif + +#ifndef VINT8x8_CMPEQ_DEFINED +VEC_MMX_CMPEQ(, 8, 8) +# define VINT8x8_CMPEQ_DEFINED +#endif + +#ifndef VINT8x8_CMPLT_DEFINED +VEC_MMX_CMPLT(, 8, 8) +# define VINT8x8_CMPLT_DEFINED +#endif + +#ifndef VINT8x8_CMPGT_DEFINED +VEC_MMX_CMPGT(, 8, 8) +# define VINT8x8_CMPGT_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ + +#ifndef VUINT8x8_SPLAT_DEFINED +VEC_MMX_SPLAT(u, 8, 8) +# define VUINT8x8_SPLAT_DEFINED +#endif + +#ifndef VUINT8x8_LOAD_DEFINED +VEC_MMX_LOAD(u, 8, 8) +# define VUINT8x8_LOAD_DEFINED +#endif + +#ifndef VUINT8x8_LOAD_ALIGNED_DEFINED +VEC_MMX_LOAD_ALIGNED(u, 8, 8) +# define VUINT8x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x8_STORE_DEFINED +VEC_MMX_STORE(u, 8, 8) +# define VUINT8x8_STORE_DEFINED +#endif + +#ifndef VUINT8x8_STORE_ALIGNED_DEFINED +VEC_MMX_STORE_ALIGNED(u, 8, 8) +# define VUINT8x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x8_ADD_DEFINED +VEC_MMX_OP(add, i, add, u, 8, 8) +# define VUINT8x8_ADD_DEFINED +#endif + +#ifndef VUINT8x8_SUB_DEFINED +VEC_MMX_OP(sub, i, sub, u, 8, 8) +# define VUINT8x8_SUB_DEFINED +#endif + +#ifndef VUINT8x8_AND_DEFINED +VEC_MMX_BITWISE(and, u, 8, 8) +# define VUINT8x8_AND_DEFINED +#endif + +#ifndef VUINT8x8_OR_DEFINED +VEC_MMX_BITWISE(or, u, 8, 8) +# define VUINT8x8_OR_DEFINED +#endif + +#ifndef VUINT8x8_XOR_DEFINED +VEC_MMX_BITWISE(xor, u, 8, 8) +# define VUINT8x8_XOR_DEFINED +#endif + +#ifndef VUINT8x8_CMPEQ_DEFINED +VEC_MMX_CMPEQ(u, 8, 8) +# define VUINT8x8_CMPEQ_DEFINED +#endif + +#ifndef VUINT8x8_CMPLT_DEFINED +VEC_MMX_CMPLT(u, 8, 8) +# define VUINT8x8_CMPLT_DEFINED +#endif + +#ifndef VUINT8x8_CMPGT_DEFINED +VEC_MMX_CMPGT(u, 8, 8) +# define VUINT8x8_CMPGT_DEFINED +#endif + +/* 
------------------------------------------------------------------------ */ + +#define VEC_MMX_MUL_16x4(sign) \ + VEC_FUNC_IMPL v##sign##int16x4 v##sign##int16x4_mul(v##sign##int16x4 vec1, v##sign##int16x4 vec2) \ { \ - VEC_MMX_RSHIFT_##bits##x##size(sign, a); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_MMX_RSHIFT_##bits##x##size(sign, l); \ - } \ - \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_mmx_load_aligned, \ - v##sign##int##bits##x##size##_mmx_load_aligned, \ - v##sign##int##bits##x##size##_mmx_store_aligned, \ - v##sign##int##bits##x##size##_mmx_store_aligned, \ - v##sign##int##bits##x##size##_mmx_add, \ - v##sign##int##bits##x##size##_mmx_sub, \ - v##sign##int##bits##x##size##_mmx_mul, \ - /* .div = */ NULL, \ - /* .avg = */ NULL, \ - v##sign##int##bits##x##size##_mmx_and, \ - v##sign##int##bits##x##size##_mmx_or, \ - v##sign##int##bits##x##size##_mmx_xor, \ - /* .not = */ NULL, \ - v##sign##int##bits##x##size##_mmx_lshift, \ - v##sign##int##bits##x##size##_mmx_rshift, \ - v##sign##int##bits##x##size##_mmx_lrshift, \ - }; + vec1.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \ + return vec1; \ + } + +#ifndef VINT16x4_SPLAT_DEFINED +VEC_MMX_SPLAT(, 16, 4) +# define VINT16x4_SPLAT_DEFINED +#endif + +#ifndef VINT16x4_LOAD_DEFINED +VEC_MMX_LOAD(, 16, 4) +# define VINT16x4_LOAD_DEFINED +#endif + +#ifndef VINT16x4_LOAD_ALIGNED_DEFINED +VEC_MMX_LOAD_ALIGNED(, 16, 4) +# define VINT16x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x4_STORE_DEFINED +VEC_MMX_STORE(, 16, 4) +# define VINT16x4_STORE_DEFINED +#endif + +#ifndef VINT16x4_STORE_ALIGNED_DEFINED +VEC_MMX_STORE_ALIGNED(, 16, 4) +# define VINT16x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x4_ADD_DEFINED +VEC_MMX_OP(add, i, add, /* nothing */, 16, 4) +# define VINT16x4_ADD_DEFINED +#endif + +#ifndef VINT16x4_SUB_DEFINED +VEC_MMX_OP(sub, i, sub, /* nothing */, 16, 4) +# define VINT16x4_SUB_DEFINED +#endif + +#ifndef VINT16x4_MUL_DEFINED +VEC_MMX_MUL_16x4(/* nothing */) +# define VINT16x4_MUL_DEFINED +#endif + +#ifndef VINT16x4_AND_DEFINED +VEC_MMX_BITWISE(and, /* nothing */, 16, 4) +# define VINT16x4_AND_DEFINED +#endif + +#ifndef VINT16x4_OR_DEFINED +VEC_MMX_BITWISE(or, /* nothing */, 16, 4) +# define VINT16x4_OR_DEFINED +#endif + +#ifndef VINT16x4_XOR_DEFINED +VEC_MMX_BITWISE(xor, /* nothing */, 16, 4) +# define VINT16x4_XOR_DEFINED +#endif + +#ifndef VINT16x4_CMPEQ_DEFINED +VEC_MMX_CMPEQ(, 16, 4) +# define VINT16x4_CMPEQ_DEFINED +#endif + +#ifndef VINT16x4_CMPLT_DEFINED +VEC_MMX_CMPLT(, 16, 4) +# define VINT16x4_CMPLT_DEFINED +#endif + +#ifndef VINT16x4_CMPGT_DEFINED +VEC_MMX_CMPGT(, 16, 4) +# define VINT16x4_CMPGT_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ + +#ifndef VUINT16x4_SPLAT_DEFINED +VEC_MMX_SPLAT(u, 16, 4) +# define VUINT16x4_SPLAT_DEFINED +#endif + +#ifndef VUINT16x4_LOAD_DEFINED +VEC_MMX_LOAD(u, 16, 4) +# define VUINT16x4_LOAD_DEFINED +#endif + +#ifndef VUINT16x4_LOAD_ALIGNED_DEFINED +VEC_MMX_LOAD_ALIGNED(u, 16, 4) +# define VUINT16x4_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x4_STORE_DEFINED +VEC_MMX_STORE(u, 16, 4) +# define VUINT16x4_STORE_DEFINED +#endif + +#ifndef VUINT16x4_STORE_ALIGNED_DEFINED +VEC_MMX_STORE_ALIGNED(u, 16, 4) +# define VUINT16x4_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x4_ADD_DEFINED +VEC_MMX_OP(add, i, add, u, 
16, 4) +# define VUINT16x4_ADD_DEFINED +#endif + +#ifndef VUINT16x4_SUB_DEFINED +VEC_MMX_OP(sub, i, sub, u, 16, 4) +# define VUINT16x4_SUB_DEFINED +#endif + +#ifndef VUINT16x4_MUL_DEFINED +VEC_MMX_MUL_16x4(u) +# define VUINT16x4_MUL_DEFINED +#endif + +#ifndef VUINT16x4_AND_DEFINED +VEC_MMX_BITWISE(and, u, 16, 4) +# define VUINT16x4_AND_DEFINED +#endif + +#ifndef VUINT16x4_OR_DEFINED +VEC_MMX_BITWISE(or, u, 16, 4) +# define VUINT16x4_OR_DEFINED +#endif + +#ifndef VUINT16x4_XOR_DEFINED +VEC_MMX_BITWISE(xor, u, 16, 4) +# define VUINT16x4_XOR_DEFINED +#endif + +#ifndef VUINT16x4_CMPEQ_DEFINED +VEC_MMX_CMPEQ(u, 16, 4) +# define VUINT16x4_CMPEQ_DEFINED +#endif -#define VEC_MMX_DEFINE_OPERATIONS(bits, size) \ - VEC_MMX_DEFINE_OPERATIONS_SIGN( , bits, size) \ - VEC_MMX_DEFINE_OPERATIONS_SIGN(u, bits, size) +#ifndef VUINT16x4_CMPLT_DEFINED +VEC_MMX_CMPLT(u, 16, 4) +# define VUINT16x4_CMPLT_DEFINED +#endif + +#ifndef VUINT16x4_CMPGT_DEFINED +VEC_MMX_CMPGT(u, 16, 4) +# define VUINT16x4_CMPGT_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ + +#ifndef VINT32x2_SPLAT_DEFINED +VEC_MMX_SPLAT(, 32, 2) +# define VINT32x2_SPLAT_DEFINED +#endif + +#ifndef VINT32x2_LOAD_DEFINED +VEC_MMX_LOAD(, 32, 2) +# define VINT32x2_LOAD_DEFINED +#endif + +#ifndef VINT32x2_LOAD_ALIGNED_DEFINED +VEC_MMX_LOAD_ALIGNED(, 32, 2) +# define VINT32x2_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT32x2_STORE_DEFINED +VEC_MMX_STORE(, 32, 2) +# define VINT32x2_STORE_DEFINED +#endif -VEC_MMX_DEFINE_OPERATIONS(8, 8) -VEC_MMX_DEFINE_OPERATIONS(16, 4) -VEC_MMX_DEFINE_OPERATIONS(32, 2) +#ifndef VINT32x2_STORE_ALIGNED_DEFINED +VEC_MMX_STORE_ALIGNED(, 32, 2) +# define VINT32x2_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT32x2_ADD_DEFINED +VEC_MMX_OP(add, i, add, /* nothing */, 32, 2) +# define VINT32x2_ADD_DEFINED +#endif + +#ifndef VINT32x2_SUB_DEFINED +VEC_MMX_OP(sub, i, sub, /* nothing */, 32, 2) +# define VINT32x2_SUB_DEFINED +#endif + +#ifndef VINT32x2_AND_DEFINED +VEC_MMX_BITWISE(and, /* nothing */, 32, 2) +# define VINT32x2_AND_DEFINED +#endif + +#ifndef VINT32x2_OR_DEFINED +VEC_MMX_BITWISE(or, /* nothing */, 32, 2) +# define VINT32x2_OR_DEFINED +#endif + +#ifndef VINT32x2_XOR_DEFINED +VEC_MMX_BITWISE(xor, /* nothing */, 32, 2) +# define VINT32x2_XOR_DEFINED +#endif + +#ifndef VINT32x2_CMPEQ_DEFINED +VEC_MMX_CMPEQ(, 32, 2) +# define VINT32x2_CMPEQ_DEFINED +#endif + +#ifndef VINT32x2_CMPLT_DEFINED +VEC_MMX_CMPLT(, 32, 2) +# define VINT32x2_CMPLT_DEFINED +#endif -#undef VEC_MMX_DEFINE_OPERATIONS -#undef VEC_MMX_DEFINE_OPERATIONS_SIGN -#undef VEC_MMX_MUL_8x8 -#undef VEC_MMX_MUL_16x4 -#undef VEC_MMX_MUL_32x2 -#undef VEC_MMX_OPERATION_8x8 -#undef VEC_MMX_LSHIFT_8x8 -#undef VEC_MMX_LSHIFT_16x4 -#undef VEC_MMX_LSHIFT_32x2 -#undef VEC_MMX_RSHIFT_8x8 -#undef VEC_MMX_RSHIFT_16x4 -#undef VEC_MMX_RSHIFT_32x2 +#ifndef VINT32x2_CMPGT_DEFINED +VEC_MMX_CMPGT(, 32, 2) +# define VINT32x2_CMPGT_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ + +#ifndef VUINT32x2_SPLAT_DEFINED +VEC_MMX_SPLAT(u, 32, 2) +# define VUINT32x2_SPLAT_DEFINED +#endif + +#ifndef VUINT32x2_LOAD_DEFINED +VEC_MMX_LOAD(u, 32, 2) +# define VUINT32x2_LOAD_DEFINED +#endif + +#ifndef VUINT32x2_LOAD_ALIGNED_DEFINED +VEC_MMX_LOAD_ALIGNED(u, 32, 2) +# define VUINT32x2_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x2_STORE_DEFINED +VEC_MMX_STORE(u, 32, 2) +# define VUINT32x2_STORE_DEFINED +#endif + +#ifndef VUINT32x2_STORE_ALIGNED_DEFINED +VEC_MMX_STORE_ALIGNED(u, 32, 2) +# define 
VUINT32x2_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT32x2_ADD_DEFINED +VEC_MMX_OP(add, i, add, u, 32, 2) +# define VUINT32x2_ADD_DEFINED +#endif + +#ifndef VUINT32x2_SUB_DEFINED +VEC_MMX_OP(sub, i, sub, u, 32, 2) +# define VUINT32x2_SUB_DEFINED +#endif + +#ifndef VUINT32x2_AND_DEFINED +VEC_MMX_BITWISE(and, u, 32, 2) +# define VUINT32x2_AND_DEFINED +#endif + +#ifndef VUINT32x2_OR_DEFINED +VEC_MMX_BITWISE(or, u, 32, 2) +# define VUINT32x2_OR_DEFINED +#endif + +#ifndef VUINT32x2_XOR_DEFINED +VEC_MMX_BITWISE(xor, u, 32, 2) +# define VUINT32x2_XOR_DEFINED +#endif + +#ifndef VUINT32x2_CMPEQ_DEFINED +VEC_MMX_CMPEQ(u, 32, 2) +# define VUINT32x2_CMPEQ_DEFINED +#endif + +#ifndef VUINT32x2_CMPLT_DEFINED +VEC_MMX_CMPLT(u, 32, 2) +# define VUINT32x2_CMPLT_DEFINED +#endif + +#ifndef VUINT32x2_CMPGT_DEFINED +VEC_MMX_CMPGT(u, 32, 2) +# define VUINT32x2_CMPGT_DEFINED +#endif #endif /* VEC_IMPL_X86_MMX_H_ */
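The VEC_uMMX_CMP wrapper above gets unsigned compares out of the signed pcmpgt intrinsics by first XORing both operands with the sign bit (1u << (bits - 1)), and cmplt is simply cmpgt with the operands swapped. A self-contained scalar sketch of that sign-bit trick (the intrinsic names in the comments are what the macro uses; the function itself is only illustrative):

/* scalar sketch of the sign-bit flip used by VEC_uMMX_CMP (and VEC_uSSE2_CMP below):
 * XORing with 0x80 maps unsigned order onto signed order, so a signed
 * greater-than can answer the unsigned question */
#include <stdint.h>
#include <stdio.h>

static int u8_cmpgt_via_signed(uint8_t a, uint8_t b)
{
	int8_t sa = (int8_t)(a ^ 0x80u); /* _mm_xor_si64(vec1, xor_val) */
	int8_t sb = (int8_t)(b ^ 0x80u); /* _mm_xor_si64(vec2, xor_val) */
	return sa > sb;                  /* _mm_cmpgt_pi8 */
}

int main(void)
{
	printf("%d\n", u8_cmpgt_via_signed(200, 100)); /* 1: 200 > 100 unsigned */
	printf("%d\n", u8_cmpgt_via_signed(100, 200)); /* 0 */
	return 0;
}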
--- a/include/vec/impl/x86/sse2.h Fri Apr 25 17:40:55 2025 -0400 +++ b/include/vec/impl/x86/sse2.h Sat Apr 26 01:04:35 2025 -0400 @@ -25,105 +25,497 @@ #ifndef VEC_IMPL_X86_SSE2_H_ #define VEC_IMPL_X86_SSE2_H_ -#define VEC_SSE2_OPERATION_8x16(op, sign) \ - do { \ - /* unpack and multiply */ \ - __m128i dst_even = _mm_##op##_epi16(vec1.sse, vec2.sse); \ - __m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \ +#include <emmintrin.h> + +/* eh */ +#define VEC_SSE2_SET1_8(x) _mm_set1_epi8(x) +#define VEC_SSE2_SET1_16(x) _mm_set1_epi16(x) +#define VEC_SSE2_SET1_32(x) _mm_set1_epi32(x) +#define VEC_SSE2_SET1_64(x) _mm_set1_epi64x(x) + +/* ------------------------------------------------------------------------ */ + +/* despite this macro's name, it's used to basically define every single + * operation :) (with a few exceptions) */ +#define VEC_SSE2_OP_EX(name, op, sign, bits, size, first, second, VARS, TRANS1, TRANS2, INTLSIGN) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VARS \ + \ + TRANS1 \ + \ + vec1.sse = _mm_##op##_ep##INTLSIGN##bits(vec##first.sse, vec##second.sse); \ + \ + TRANS2 \ \ - /* repack */ \ - v##sign##int8x16 vec; \ - vec.sse = _mm_or_si128( \ - _mm_slli_epi16(dst_odd, 8), \ - _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ - ); \ - return vec; \ - } while (0) + return vec1; \ + } + +#define VEC_SSE2_OP_XOR(name, op, sign, bits, size, first, second, intlsign) \ + VEC_SSE2_OP_EX(name, op, sign, bits, size, first, second, \ + __m128i xor_val = VEC_SSE2_SET1_##bits((vec_int##bits)(1u << (bits - 1))); \ + , { \ + vec1.sse = _mm_xor_si128(vec1.sse, xor_val); \ + vec2.sse = _mm_xor_si128(vec2.sse, xor_val); \ + }, \ + { \ + vec1.sse = _mm_xor_si128(vec1.sse, xor_val); \ + }, intlsign) + +#define VEC_SSE2_OP(name, op, sign, bits, size, first, second, intlsign) \ + VEC_SSE2_OP_EX(name, op, sign, bits, size, first, second, /* nothing */, /* nothing */, /* nothing */, intlsign) + +/* ------------------------------------------------------------------------ */ +/* comparison */ + +#define VEC_xSSE2_CMP(name, op, sign, bits, size, first, second, VARS, TRANS1, TRANS2) \ + VEC_SSE2_OP_EX(name, op, sign, bits, size, first, second, VARS, TRANS1, TRANS2, i) + +#define VEC_SSE2_CMP(name, op, bits, size, first, second) \ + VEC_xSSE2_CMP(name, op, /* nothing */, bits, size, first, second, /* nothing */, /* nothing */, /* nothing */) -// shifting -#define VEC_SSE2_LSHIFT_8x16(sign) \ - VEC_SSE2_OPERATION_8x16(sll, sign) +#define VEC_uSSE2_CMP(name, op, bits, size, first, second) \ + VEC_xSSE2_CMP(name, op, u, bits, size, first, second, \ + __m128i xor_val = VEC_SSE2_SET1_##bits((vec_int##bits)(1u << (bits - 1))); \ + , { \ + vec1.sse = _mm_xor_si128(vec1.sse, xor_val); \ + vec2.sse = _mm_xor_si128(vec2.sse, xor_val); \ + }, \ + { \ + /* nothing */ \ + }) + +/* these are the same for unsigned and signed, for obvious reasons. 
*/ +#define VEC_SSE2_CMPEQ_8x16(sign) VEC_xSSE2_CMP(cmpeq, cmpeq, sign, 8, 16, 1, 2, , ,) +#define VEC_SSE2_CMPEQ_16x8(sign) VEC_xSSE2_CMP(cmpeq, cmpeq, sign, 16, 8, 1, 2, , ,) +#define VEC_SSE2_CMPEQ_32x4(sign) VEC_xSSE2_CMP(cmpeq, cmpeq, sign, 32, 4, 1, 2, , ,) -#define VEC_SSE2_LSHIFT_16x8(sign) \ - do { \ - v##sign##int16x8 vec; \ - vec.sse = _mm_sll_epi16(vec1.sse, vec2.sse); \ +/* SSE2 doesn't have an intrinsic for 64x2 equality comparison, + * so how can we take a 32x4 comparison result and turn it into + * a 64x2 comparison result? + * + * well, Intel conveniently provided an operation where we can + * shuffle around 32-bit integers (_mm_shuffle_epi32). + * + * this means all we have to do is simply do the 32-bit operation, + * shuffle the parts, and then return a bitwise AND of the result. */ + +#define VEC_SSE2_CMPEQ_64x2(sign) \ + VEC_FUNC_IMPL v##sign##int64x2 v##sign##int64x2_cmpeq(v##sign##int64x2 vec1, v##sign##int64x2 vec2) \ + { \ + vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \ + vec2.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(1, 1, 3, 3)); \ + vec1.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(0, 0, 2, 2)); \ + vec1.sse = _mm_and_si128(vec1.sse, vec2.sse); \ + \ + return vec1; \ + } + +/* ------------------------------------------------------------------------ */ + +#define VEC_SSE2_SPLAT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = VEC_SSE2_SET1_##bits(x); \ return vec; \ - } while (0) - -#define VEC_SSE2_LSHIFT_32x4(sign) \ - do { \ - v##sign##int32x4 vec; \ - vec.sse = _mm_sll_epi32(vec1.sse, vec2.sse); \ - return vec; \ - } while (0) + } -#define VEC_SSE2_LSHIFT_64x2(sign) \ - do { \ - v##sign##int64x2 vec; \ - vec.sse = _mm_sll_epi64(vec1.sse, vec2.sse); \ +#define VEC_SSE2_LOAD_ALIGNED(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_load_si128((const __m128i *)in); \ return vec; \ - } while (0) + } -#define VEC_SSE2_RSHIFT_8x16(sign, aORl) \ - VEC_SSE2_OPERATION_8x16(sr##aORl, sign) - -#define VEC_SSE2_RSHIFT_16x8(sign, aORl) \ - do { \ - v##sign##int16x8 vec; \ - vec.sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse); \ +#define VEC_SSE2_LOAD(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_loadu_si128((const __m128i *)in); \ return vec; \ - } while (0) + } + +#define VEC_SSE2_STORE_ALIGNED(sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm_store_si128((__m128i *)out, vec.sse); \ + } + +#define VEC_SSE2_STORE(sign, bits, size) \ + VEC_FUNC_IMPL void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + _mm_storeu_si128((__m128i *)out, vec.sse); \ + } + +#define VEC_SSE2_ADD(sign, bits, size) \ + VEC_SSE2_OP(add, add, sign, bits, size, 1, 2, i) + +#define VEC_SSE2_SUB(sign, bits, size) \ + VEC_SSE2_OP(sub, sub, sign, bits, size, 1, 2, i) + +#define VEC_SSE2_AND(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + vec1.sse = 
_mm_and_si128(vec1.sse, vec2.sse); \ + return vec1; \ + } + +#define VEC_SSE2_OR(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + vec1.sse = _mm_or_si128(vec1.sse, vec2.sse); \ + return vec1; \ + } + +#define VEC_SSE2_XOR(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + vec1.sse = _mm_xor_si128(vec1.sse, vec2.sse); \ + return vec1; \ + } + +/* ------------------------------------------------------------------------ */ +/* vint8x16 */ -#define VEC_SSE2_RSHIFT_32x4(sign, aORl) \ - do { \ - v##sign##int32x4 vec; \ - vec.sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse); \ - return vec; \ - } while (0) +#ifndef VINT8x16_SPLAT_DEFINED +VEC_SSE2_SPLAT(/* nothing */, 8, 16) +# define VINT8x16_SPLAT_DEFINED +#endif + +#ifndef VINT8x16_LOAD_ALIGNED_DEFINED +VEC_SSE2_LOAD_ALIGNED(/* nothing */, 8, 16) +# define VINT8x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x16_LOAD_DEFINED +VEC_SSE2_LOAD(/* nothing */, 8, 16) +# define VINT8x16_LOAD_DEFINED +#endif + +#ifndef VINT8x16_STORE_ALIGNED_DEFINED +VEC_SSE2_STORE_ALIGNED(/* nothing */, 8, 16) +# define VINT8x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x16_STORE_DEFINED +VEC_SSE2_STORE(/* nothing */, 8, 16) +# define VINT8x16_STORE_DEFINED +#endif + +#ifndef VINT8x16_ADD_DEFINED +VEC_SSE2_ADD(/* nothing */, 8, 16) +# define VINT8x16_ADD_DEFINED +#endif + +#ifndef VINT8x16_SUB_DEFINED +VEC_SSE2_SUB(/* nothing */, 8, 16) +# define VINT8x16_SUB_DEFINED +#endif + +#ifndef VINT8x16_AND_DEFINED +VEC_SSE2_AND(/* nothing */, 8, 16) +# define VINT8x16_AND_DEFINED +#endif + +#ifndef VINT8x16_OR_DEFINED +VEC_SSE2_OR(/* nothing */, 8, 16) +# define VINT8x16_OR_DEFINED +#endif -#define VEC_SSE2_aRSHIFT_64x2(sign) \ - do { \ - return v##sign##int64x2_fallback_rshift(vec1, vec2); \ - } while (0) +#ifndef VINT8x16_XOR_DEFINED +VEC_SSE2_XOR(/* nothing */, 8, 16) +# define VINT8x16_XOR_DEFINED +#endif + +#ifndef VINT8x16_CMPGT_DEFINED +VEC_SSE2_CMP(cmpgt, cmpgt, 8, 16, 1, 2) +# define VINT8x16_CMPGT_DEFINED +#endif + +#ifndef VINT8x16_CMPLT_DEFINED +VEC_SSE2_CMP(cmplt, cmpgt, 8, 16, 2, 1) +# define VINT8x16_CMPLT_DEFINED +#endif + +#ifndef VINT8x16_CMPEQ_DEFINED +VEC_xSSE2_CMP(cmpeq, cmpeq, /* nothing */, 8, 16, 1, 2, , ,) +# define VINT8x16_CMPEQ_DEFINED +#endif + +#ifndef VINT8x16_MIN_DEFINED +VEC_SSE2_OP_XOR(min, min, /* nothing */, 8, 16, 1, 2, u) +# define VINT8x16_MIN_DEFINED +#endif + +#ifndef VINT8x16_MAX_DEFINED +VEC_SSE2_OP_XOR(max, max, /* nothing */, 8, 16, 1, 2, u) +# define VINT8x16_MAX_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vuint8x16 */ + +#ifndef VUINT8x16_SPLAT_DEFINED +VEC_SSE2_SPLAT(u, 8, 16) +# define VUINT8x16_SPLAT_DEFINED +#endif + +#ifndef VUINT8x16_LOAD_ALIGNED_DEFINED +VEC_SSE2_LOAD_ALIGNED(u, 8, 16) +# define VUINT8x16_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x16_LOAD_DEFINED +VEC_SSE2_LOAD(u, 8, 16) +# define VUINT8x16_LOAD_DEFINED +#endif -#define VEC_SSE2_lRSHIFT_64x2(sign) \ - do { \ - v##sign##int64x2 vec; \ - vec.sse = _mm_srl_epi64(vec1.sse, vec2.sse); \ - return vec; \ - } while (0) +#ifndef VUINT8x16_STORE_ALIGNED_DEFINED +VEC_SSE2_STORE_ALIGNED(u, 8, 16) +# define VUINT8x16_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x16_STORE_DEFINED +VEC_SSE2_STORE(u, 8, 16) +# define VUINT8x16_STORE_DEFINED +#endif + +#ifndef 
VUINT8x16_ADD_DEFINED +VEC_SSE2_ADD(u, 8, 16) +# define VUINT8x16_ADD_DEFINED +#endif + +#ifndef VUINT8x16_SUB_DEFINED +VEC_SSE2_SUB(u, 8, 16) +# define VUINT8x16_SUB_DEFINED +#endif + +#ifndef VUINT8x16_AND_DEFINED +VEC_SSE2_AND(u, 8, 16) +# define VUINT8x16_AND_DEFINED +#endif -#define VEC_SSE2_RSHIFT_64x2(sign, aORl) \ - VEC_SSE2_##aORl##RSHIFT_64x2(sign) +#ifndef VUINT8x16_OR_DEFINED +VEC_SSE2_OR(u, 8, 16) +# define VUINT8x16_OR_DEFINED +#endif + +#ifndef VUINT8x16_XOR_DEFINED +VEC_SSE2_XOR(u, 8, 16) +# define VUINT8x16_XOR_DEFINED +#endif + +#ifndef VUINT8x16_CMPGT_DEFINED +VEC_uSSE2_CMP(cmpgt, cmpgt, 8, 16, 1, 2) +# define VUINT8x16_CMPGT_DEFINED +#endif + +#ifndef VUINT8x16_CMPLT_DEFINED +VEC_uSSE2_CMP(cmplt, cmpgt, 8, 16, 2, 1) +# define VUINT8x16_CMPLT_DEFINED +#endif + +#ifndef VUINT8x16_CMPEQ_DEFINED +VEC_xSSE2_CMP(cmpeq, cmpeq, u, 8, 16, 1, 2, , ,) +# define VUINT8x16_CMPEQ_DEFINED +#endif -// shared between SSE2 variations -#define VEC_SSE2_MUL_8x16(sign) \ - VEC_SSE2_OPERATION_8x16(mullo, sign) +#ifndef VUINT8x16_MIN_DEFINED +VEC_SSE2_OP(min, min, u, 8, 16, 1, 2, u) +# define VUINT8x16_MIN_DEFINED +#endif + +#ifndef VUINT8x16_MAX_DEFINED +VEC_SSE2_OP(max, max, u, 8, 16, 1, 2, u) +# define VUINT8x16_MAX_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vint8x16 */ + +#ifndef VINT16x8_SPLAT_DEFINED +VEC_SSE2_SPLAT(/* nothing */, 16, 8) +# define VINT16x8_SPLAT_DEFINED +#endif + +#ifndef VINT16x8_LOAD_ALIGNED_DEFINED +VEC_SSE2_LOAD_ALIGNED(/* nothing */, 16, 8) +# define VINT16x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x8_LOAD_DEFINED +VEC_SSE2_LOAD(/* nothing */, 16, 8) +# define VINT16x8_LOAD_DEFINED +#endif + +#ifndef VINT16x8_STORE_ALIGNED_DEFINED +VEC_SSE2_STORE_ALIGNED(/* nothing */, 16, 8) +# define VINT16x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x8_STORE_DEFINED +VEC_SSE2_STORE(/* nothing */, 16, 8) +# define VINT16x8_STORE_DEFINED +#endif + +#ifndef VINT16x8_ADD_DEFINED +VEC_SSE2_ADD(/* nothing */, 16, 8) +# define VINT16x8_ADD_DEFINED +#endif -#define VEC_SSE2_MUL_16x8(sign) \ - do { \ - /* we have a real instruction for this */ \ - vec1.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \ - return vec1; \ - } while (0) +#ifndef VINT16x8_SUB_DEFINED +VEC_SSE2_SUB(/* nothing */, 16, 8) +# define VINT16x8_SUB_DEFINED +#endif + +#ifndef VINT16x8_MUL_DEFINED +VEC_SSE2_OP(mul, mullo, /* nothing */, 16, 8, 1, 2, i) +# define VINT16x8_MUL_DEFINED +#endif + +#ifndef VINT16x8_AND_DEFINED +VEC_SSE2_AND(/* nothing */, 16, 8) +# define VINT16x8_AND_DEFINED +#endif + +#ifndef VINT16x8_OR_DEFINED +VEC_SSE2_OR(/* nothing */, 16, 8) +# define VINT16x8_OR_DEFINED +#endif + +#ifndef VINT16x8_XOR_DEFINED +VEC_SSE2_XOR(/* nothing */, 16, 8) +# define VINT16x8_XOR_DEFINED +#endif + +#ifndef VINT16x8_CMPGT_DEFINED +VEC_SSE2_CMP(cmpgt, cmpgt, 16, 8, 1, 2) +# define VINT16x8_CMPGT_DEFINED +#endif + +#ifndef VINT16x8_CMPLT_DEFINED +VEC_SSE2_CMP(cmplt, cmpgt, 16, 8, 2, 1) +# define VINT16x8_CMPLT_DEFINED +#endif + +#ifndef VINT16x8_CMPEQ_DEFINED +VEC_xSSE2_CMP(cmpeq, cmpeq, /* nothing */, 16, 8, 1, 2, , ,) +# define VINT16x8_CMPEQ_DEFINED +#endif + +#ifndef VINT16x8_MIN_DEFINED +VEC_SSE2_OP(min, min, /* nothing */, 16, 8, 1, 2, i) +# define VINT16x8_MIN_DEFINED +#endif + +#ifndef VINT16x8_MAX_DEFINED +VEC_SSE2_OP(max, max, /* nothing */, 16, 8, 1, 2, i) +# define VINT16x8_MAX_DEFINED +#endif -#define VEC_SSE2_MUL_32x4(sign) \ - do { \ - /* this was stolen from... 
somewhere :) */ \ - __m128i a13 = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \ - __m128i b13 = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \ - __m128i prod02 = _mm_mul_epu32(vec1.sse, vec2.sse); /* (-,a2*b2,-,a0*b0) */ \ - __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ - __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ - __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ +/* ------------------------------------------------------------------------ */ +/* vuint8x16 */ + +#ifndef VUINT16x8_SPLAT_DEFINED +VEC_SSE2_SPLAT(u, 16, 8) +# define VUINT16x8_SPLAT_DEFINED +#endif + +#ifndef VUINT16x8_LOAD_ALIGNED_DEFINED +VEC_SSE2_LOAD_ALIGNED(u, 16, 8) +# define VUINT16x8_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x8_LOAD_DEFINED +VEC_SSE2_LOAD(u, 16, 8) +# define VUINT16x8_LOAD_DEFINED +#endif + +#ifndef VUINT16x8_STORE_ALIGNED_DEFINED +VEC_SSE2_STORE_ALIGNED(u, 16, 8) +# define VUINT16x8_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x8_STORE_DEFINED +VEC_SSE2_STORE(u, 16, 8) +# define VUINT16x8_STORE_DEFINED +#endif + +#ifndef VUINT16x8_ADD_DEFINED +VEC_SSE2_ADD(u, 16, 8) +# define VUINT16x8_ADD_DEFINED +#endif + +#ifndef VUINT16x8_SUB_DEFINED +VEC_SSE2_SUB(u, 16, 8) +# define VUINT16x8_SUB_DEFINED +#endif + +#ifndef VUINT16x8_MUL_DEFINED +VEC_SSE2_OP(mul, mullo, u, 16, 8, 1, 2, i) +# define VUINT16x8_MUL_DEFINED +#endif + +#ifndef VUINT16x8_AND_DEFINED +VEC_SSE2_AND(u, 16, 8) +# define VUINT16x8_AND_DEFINED +#endif + +#ifndef VUINT16x8_OR_DEFINED +VEC_SSE2_OR(u, 16, 8) +# define VUINT16x8_OR_DEFINED +#endif + +#ifndef VUINT16x8_XOR_DEFINED +VEC_SSE2_XOR(u, 16, 8) +# define VUINT16x8_XOR_DEFINED +#endif + +#ifndef VUINT16x8_CMPGT_DEFINED +VEC_uSSE2_CMP(cmpgt, cmpgt, 16, 8, 1, 2) +# define VUINT16x8_CMPGT_DEFINED +#endif + +#ifndef VUINT16x8_CMPLT_DEFINED +VEC_uSSE2_CMP(cmplt, cmpgt, 16, 8, 2, 1) +# define VUINT16x8_CMPLT_DEFINED +#endif + +#ifndef VUINT16x8_CMPEQ_DEFINED +VEC_xSSE2_CMP(cmpeq, cmpeq, u, 16, 8, 1, 2, , ,) +# define VUINT16x8_CMPEQ_DEFINED +#endif + +#ifndef VUINT16x8_MIN_DEFINED +VEC_SSE2_OP_XOR(min, min, u, 16, 8, 1, 2, i) +# define VUINT16x8_MIN_DEFINED +#endif + +#ifndef VUINT16x8_MAX_DEFINED +VEC_SSE2_OP_XOR(max, max, u, 16, 8, 1, 2, i) +# define VUINT16x8_MAX_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vint64x2 */ + +/* many things are more difficult with 64-bit values */ +#define VEC_SSE2_CMPEQ_64x2(sign) \ + VEC_FUNC_IMPL v##sign##int64x2 v##sign##int64x2_cmpeq(v##sign##int64x2 vec1, v##sign##int64x2 vec2) \ + { \ + vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \ + vec2.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(1, 1, 3, 3)); \ + vec1.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(0, 0, 2, 2)); \ + vec1.sse = _mm_and_si128(vec1.sse, vec2.sse); \ \ - vec1.sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \ return vec1; \ - } while (0) + } #define VEC_SSE2_MUL_64x2(sign) \ - do { \ + VEC_FUNC_IMPL v##sign##int64x2 v##sign##int64x2_mul(v##sign##int64x2 vec1, v##sign##int64x2 vec2) \ + { \ __m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \ __m128i b = _mm_srli_epi64(vec1.sse, 32); /* b = vec1 >> 32; */ \ __m128i bc = _mm_mul_epu32(b, vec2.sse); /* bc = b * (vec2 & UINT32_MAX); */ \ @@ -134,181 +526,29 @@ \ vec1.sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \ return vec1; \ - } while (0) - -#define VEC_SSE2_CMPEQ_8x16(sign) \ - do { \ - vec1.sse = 
_mm_cmpeq_epi8(vec1.sse, vec2.sse); \ - return vec1; \ - } while (0) - -#define VEC_SSE2_CMPEQ_16x8(sign) \ - do { \ - vec1.sse = _mm_cmpeq_epi16(vec1.sse, vec2.sse); \ - return vec1; \ - } while (0) + } -#define VEC_SSE2_CMPEQ_32x4(sign) \ - do { \ - vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \ - return vec1; \ - } while (0) +#ifndef VINT64x2_MUL_DEFINED +VEC_SSE2_MUL_64x2(/* nothing */) +# define VINT64x2_MUL_DEFINED +#endif -// SSE2 doesn't have an intrinsic for 64x2 equality comparison, -// so how can we take a 32x4 comparison result and turn it into -// a 64x2 comparison result? -// -// well, Intel conveniently provided an operation where we can -// shuffle around 32-bit integers (_mm_shuffle_epi32). -// -// this means all we have to do is simply do the 32-bit operation, -// shuffle the parts, and then return a bitwise AND of the result. - -#define VEC_SSE2_CMPEQ_64x2(sign) \ - do { \ - vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \ - vec2.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(1, 1, 3, 3)); \ - vec1.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(0, 0, 2, 2)); \ - vec1.sse = _mm_and_si128(vec1.sse, vec2.sse); \ - return vec1; \ - } while (0) +#ifndef VINT64x2_CMPEQ_DEFINED +VEC_SSE2_CMPEQ_64x2(/* nothing */) +# define VINT64x2_CMPEQ_DEFINED +#endif -#define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.sse = _mm_load_si128((const __m128i *)in); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.sse = _mm_loadu_si128((const __m128i *)in); \ - return vec; \ - } \ - \ - static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm_store_si128((__m128i *)out, vec.sse); \ - } \ - \ - static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - _mm_storeu_si128((__m128i *)out, vec.sse); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.sse = _mm_add_epi##bits(vec1.sse, vec2.sse); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.sse = _mm_sub_epi##bits(vec1.sse, vec2.sse); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_SSE2_MUL_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.sse = _mm_and_si128(vec1.sse, vec2.sse); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.sse = _mm_or_si128(vec1.sse, vec2.sse); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size 
v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.sse = _mm_xor_si128(vec1.sse, vec2.sse); \ - return vec; \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_SSE2_LSHIFT_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_SSE2_RSHIFT_##bits##x##size(sign, a); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - VEC_SSE2_RSHIFT_##bits##x##size(sign, l); \ - } \ - \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - VEC_SSE2_CMPEQ_##bits##x##size(sign); \ - } \ - \ - static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \ - /* .splat = */ NULL, \ - v##sign##int##bits##x##size##_sse2_load_aligned, \ - v##sign##int##bits##x##size##_sse2_load, \ - v##sign##int##bits##x##size##_sse2_store_aligned, \ - v##sign##int##bits##x##size##_sse2_store, \ - v##sign##int##bits##x##size##_sse2_add, \ - v##sign##int##bits##x##size##_sse2_sub, \ - v##sign##int##bits##x##size##_sse2_mul, \ - /* .div = */ NULL, \ - /* .avg = */ NULL, \ - v##sign##int##bits##x##size##_sse2_and, \ - v##sign##int##bits##x##size##_sse2_or, \ - v##sign##int##bits##x##size##_sse2_xor, \ - /* .not = */ NULL, \ - v##sign##int##bits##x##size##_sse2_lshift, \ - v##sign##int##bits##x##size##_sse2_rshift, \ - v##sign##int##bits##x##size##_sse2_lrshift, \ - /* .cmplt = */ NULL, \ - /* .cmple = */ NULL, \ - v##sign##int##bits##x##size##_sse2_cmpeq, \ - /* .cmpge = */ NULL, \ - /* .cmpgt = */ NULL, \ - }; +/* ------------------------------------------------------------------------ */ +/* vuint64x2 */ -#define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \ - VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size) \ - VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size) - -// SSE is *only* 128-bit -VEC_SSE2_DEFINE_OPERATIONS(8, 16) -VEC_SSE2_DEFINE_OPERATIONS(16, 8) -VEC_SSE2_DEFINE_OPERATIONS(32, 4) -VEC_SSE2_DEFINE_OPERATIONS(64, 2) +#ifndef VUINT64x2_MUL_DEFINED +VEC_SSE2_MUL_64x2(u) +# define VUINT64x2_MUL_DEFINED +#endif -#undef VEC_SSE2_DEFINE_OPERATIONS -#undef VEC_SSE2_DEFINE_OPERATIONS_SIGN -#undef VEC_SSE2_MUL_8x16 -#undef VEC_SSE2_MUL_16x8 -#undef VEC_SSE2_MUL_32x4 -#undef VEC_SSE2_MUL_64x2 -#undef VEC_SSE2_OPERATION_8x16 -#undef VEC_SSE2_LSHIFT_8x16 -#undef VEC_SSE2_LSHIFT_16x8 -#undef VEC_SSE2_LSHIFT_32x4 -#undef VEC_SSE2_LSHIFT_64x2 -#undef VEC_SSE2_RSHIFT_8x16 -#undef VEC_SSE2_RSHIFT_16x8 -#undef VEC_SSE2_RSHIFT_32x4 -#undef VEC_SSE2_aRSHIFT_64x2 -#undef VEC_SSE2_lRSHIFT_64x2 -#undef VEC_SSE2_RSHIFT_64x2 +#ifndef VUINT64x2_CMPEQ_DEFINED +VEC_SSE2_CMPEQ_64x2(u) +# define VUINT64x2_CMPEQ_DEFINED +#endif #endif /* VEC_IMPL_X86_SSE2_H_ */
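VEC_SSE2_MUL_64x2 above only needs the low 64 bits of each product, so it can be assembled from _mm_mul_epu32 (a 32x32 -> 64 multiply on the low half of each lane): lo*lo plus both cross terms shifted up by 32, with the hi*hi term falling off the top. Since everything is mod 2^64, the same bits come out for signed lanes too, which is presumably why one macro serves both vint64x2 and vuint64x2. A self-contained scalar sketch of the decomposition:

/* scalar sketch of the decomposition in VEC_SSE2_MUL_64x2; the comments name
 * the intrinsic that performs the corresponding step per 64-bit lane */
#include <stdint.h>
#include <stdio.h>

static uint64_t mul64_via_32(uint64_t a, uint64_t b)
{
	uint64_t ac = (a & 0xFFFFFFFFu) * (b & 0xFFFFFFFFu); /* _mm_mul_epu32(vec1, vec2) */
	uint64_t bc = (a >> 32) * (b & 0xFFFFFFFFu);         /* _mm_mul_epu32(vec1 >> 32, vec2) */
	uint64_t ad = (a & 0xFFFFFFFFu) * (b >> 32);         /* _mm_mul_epu32(vec1, vec2 >> 32) */
	return ac + ((bc + ad) << 32);                       /* _mm_slli_epi64 + _mm_add_epi64 */
}

int main(void)
{
	uint64_t a = 0x123456789ABCDEFull, b = 0xFEDCBA987654321ull;
	printf("%d\n", mul64_via_32(a, b) == a * b); /* prints 1 */
	return 0;
}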
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/sse3.h Sat Apr 26 01:04:35 2025 -0400 @@ -0,0 +1,102 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_SSE3_H_ +#define VEC_IMPL_X86_SSE3_H_ + +/* SSE3 provides a slightly more optimized load function */ + +#define VEC_SSE3_LOAD(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_loadu_si128((const __m128i *)in); \ + return vec; \ + } + +/* ------------------------------------------------------------------------ */ +/* vint8x16 */ + +#ifndef VINT8x16_LOAD_DEFINED +VEC_SSE3_LOAD(/* nothing */, 8, 16) +# define VINT8x16_LOAD_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vuint8x16 */ + +#ifndef VUINT8x16_LOAD_DEFINED +VEC_SSE3_LOAD(u, 8, 16) +# define VUINT8x16_LOAD_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vint16x8 */ + +#ifndef VINT16x8_LOAD_DEFINED +VEC_SSE3_LOAD(/* nothing */, 16, 8) +# define VINT16x8_LOAD_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vuint32x4 */ + +#ifndef VUINT16x8_LOAD_DEFINED +VEC_SSE3_LOAD(u, 16, 8) +# define VUINT16x8_LOAD_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vint32x4 */ + +#ifndef VINT32x4_LOAD_DEFINED +VEC_SSE3_LOAD(/* nothing */, 32, 4) +# define VINT32x4_LOAD_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vuint32x4 */ + +#ifndef VUINT32x4_LOAD_DEFINED +VEC_SSE3_LOAD(u, 32, 4) +# define VUINT32x4_LOAD_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vint64x2 */ + +#ifndef VINT64x2_LOAD_DEFINED +VEC_SSE3_LOAD(/* nothing */, 64, 2) +# define VINT64x2_LOAD_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* vuint64x2 */ + +#ifndef VUINT64x2_LOAD_DEFINED +VEC_SSE3_LOAD(u, 64, 2) +# define VUINT64x2_LOAD_DEFINED +#endif + +#endif /* VEC_IMPL_X86_SSE3_H_ */
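sse3.h only overrides the unaligned loads. The "slightly more optimized load" SSE3 brings is the LDDQU instruction (_mm_lddqu_si128 in <pmmintrin.h>), which can beat MOVDQU on some older cores when a load straddles a cache line; note that VEC_SSE3_LOAD as committed still calls _mm_loadu_si128. A sketch of the lddqu form, assuming that intrinsic is what the comment alludes to:

#include <pmmintrin.h>	/* SSE3 */

/* Unaligned 128-bit load via LDDQU rather than SSE2's MOVDQU. */
static __m128i load_unaligned_sse3(const void *in)
{
	return _mm_lddqu_si128((const __m128i *)in);
}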
--- a/include/vec/impl/x86/sse41.h Fri Apr 25 17:40:55 2025 -0400 +++ b/include/vec/impl/x86/sse41.h Sat Apr 26 01:04:35 2025 -0400 @@ -25,43 +25,107 @@ #ifndef VEC_IMPL_X86_SSE41_H_ #define VEC_IMPL_X86_SSE41_H_ -// SSE 4.1 provides a real _mm_mullo_epi32 -#define VEC_SSE41_DEFINE_OPERATIONS(sign) \ - static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \ +#define VEC_SSE41_OP(NAME, SIGN, BITS, SIZE, INTLSIGN, OP) \ + VEC_FUNC_IMPL v##SIGN##int##BITS##x##SIZE v##SIGN##int##BITS##x##SIZE##_##NAME(v##SIGN##int##BITS##x##SIZE vec1, v##SIGN##int##BITS##x##SIZE vec2) \ { \ - v##sign##int32x4 vec; \ - vec.sse = _mm_mullo_epi32(vec1.sse, vec2.sse); \ - return vec; \ - } \ - \ - static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \ - /* .splat = */ NULL, \ - v##sign##int32x4_sse2_load_aligned, \ - v##sign##int32x4_sse2_load, \ - v##sign##int32x4_sse2_store_aligned, \ - v##sign##int32x4_sse2_store, \ - v##sign##int32x4_sse2_add, \ - v##sign##int32x4_sse2_sub, \ - v##sign##int32x4_sse41_mul, \ - /* .div = */ NULL, \ - /* .avg = */ NULL, \ - v##sign##int32x4_sse2_and, \ - v##sign##int32x4_sse2_or, \ - v##sign##int32x4_sse2_xor, \ - /* .not = */ NULL, \ - v##sign##int32x4_sse2_lshift, \ - v##sign##int32x4_sse2_rshift, \ - v##sign##int32x4_sse2_lrshift, \ - /* .cmplt = */ NULL, \ - /* .cmple = */ NULL, \ - v##sign##int32x4_sse2_cmpeq, \ - /* .cmpge = */ NULL, \ - /* .cmpgt = */ NULL, \ - }; + vec1.sse = _mm_##OP##_ep##INTLSIGN##BITS(vec1.sse, vec2.sse); \ + return vec1; \ + } + +/* vint8x16 */ + +#ifndef VINT8x16_MIN_DEFINED +VEC_SSE41_OP(min, /* nothing */, 8, 16, i, min) +# define VINT8x16_MIN_DEFINED +#endif + +#ifndef VINT8x16_MAX_DEFINED +VEC_SSE41_OP(max, /* nothing */, 8, 16, i, max) +# define VINT8x16_MAX_DEFINED +#endif + +/* vuint8x16 */ + +#ifndef VUINT8x16_MIN_DEFINED +VEC_SSE41_OP(min, u, 8, 16, u, min) +# define VUINT8x16_MIN_DEFINED +#endif + +#ifndef VUINT8x16_MAX_DEFINED +VEC_SSE41_OP(max, u, 8, 16, u, max) +# define VUINT8x16_MAX_DEFINED +#endif + +/* vint16x8 */ + +#ifndef VINT16x8_MIN_DEFINED +VEC_SSE41_OP(min, /* nothing */, 16, 8, i, min) +# define VINT16x8_MIN_DEFINED +#endif + +#ifndef VINT16x8_MAX_DEFINED +VEC_SSE41_OP(max, /* nothing */, 16, 8, i, max) +# define VINT16x8_MAX_DEFINED +#endif + +/* vuint8x16 */ + +#ifndef VUINT16x8_MIN_DEFINED +VEC_SSE41_OP(min, u, 16, 8, u, min) +# define VUINT16x8_MIN_DEFINED +#endif -VEC_SSE41_DEFINE_OPERATIONS() -VEC_SSE41_DEFINE_OPERATIONS(u) +#ifndef VUINT16x8_MAX_DEFINED +VEC_SSE41_OP(max, u, 16, 8, u, max) +# define VUINT16x8_MAX_DEFINED +#endif + +/* vint32x4 */ + +#ifndef VINT32x4_MUL_DEFINED +VEC_SSE41_OP(mul, /* nothing */, 32, 4, i, mullo) +# define VINT32x4_MUL_DEFINED +#endif + +#ifndef VINT32x4_MIN_DEFINED +VEC_SSE41_OP(min, /* nothing */, 32, 4, i, min) +# define VINT32x4_MIN_DEFINED +#endif + +#ifndef VINT32x4_MAX_DEFINED +VEC_SSE41_OP(max, /* nothing */, 32, 4, i, max) +# define VINT32x4_MAX_DEFINED +#endif + +/* vuint32x4 */ -#undef VEC_SSE41_DEFINE_OPERATIONS +#ifndef VUINT32x4_MUL_DEFINED +VEC_SSE41_OP(mul, u, 32, 4, i, mullo) +# define VUINT32x4_MUL_DEFINED +#endif + +#ifndef VUINT32x4_MIN_DEFINED +VEC_SSE41_OP(min, u, 32, 4, u, min) +# define VUINT32x4_MIN_DEFINED +#endif + +#ifndef VUINT32x4_MAX_DEFINED +VEC_SSE41_OP(max, u, 32, 4, u, max) +# define VUINT32x4_MAX_DEFINED +#endif + +/* vint64x2 */ + +#ifndef VINT64x2_CMPEQ_DEFINED +VEC_SSE41_OP(cmpeq, /* nothing */, 64, 2, i, cmpeq) +# define VINT64x2_CMPEQ_DEFINED +#endif + +/* vuint64x2 */ + +#ifndef 
VUINT64x2_CMPEQ_DEFINED +VEC_SSE41_OP(cmpeq, u, 64, 2, i, cmpeq) +# define VUINT64x2_CMPEQ_DEFINED +#endif #endif /* VEC_IMPL_X86_SSE41_H_ */
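SSE4.1 supplies the operations the SSE2 path has to emulate: a true lane-wise 32-bit multiply (PMULLD), 8/16/32-bit signed and unsigned min/max, and a native 64-bit equality compare (PCMPEQQ), which is what the VEC_SSE41_OP expansions above hook up. A small usage sketch with the raw intrinsics (names are illustrative):

#include <smmintrin.h>	/* SSE4.1 */

/* Per-lane product, min and max of four signed 32-bit integers. */
static void mul_min_max_i32x4(const int a[4], const int b[4],
                              int prod[4], int lo[4], int hi[4])
{
	__m128i va = _mm_loadu_si128((const __m128i *)a);
	__m128i vb = _mm_loadu_si128((const __m128i *)b);

	_mm_storeu_si128((__m128i *)prod, _mm_mullo_epi32(va, vb)); /* PMULLD */
	_mm_storeu_si128((__m128i *)lo,   _mm_min_epi32(va, vb));   /* PMINSD */
	_mm_storeu_si128((__m128i *)hi,   _mm_max_epi32(va, vb));   /* PMAXSD */
}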
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/sse42.h Sat Apr 26 01:04:35 2025 -0400 @@ -0,0 +1,94 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_SSE42_H_ +#define VEC_IMPL_X86_SSE42_H_ + +/* helper funcs */ + +#define MM_SET1_64(x) _mm_set1_epi64x(x) + +#define VEC_xSSE42_CMP(name, op, sign, bits, size, first, second, VARS, TRANS1, TRANS2) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VARS \ + \ + TRANS1 \ + \ + vec1.sse = _mm_##op##_epi##bits(vec##first.sse, vec##second.sse); \ + \ + TRANS2 \ + \ + return vec1; \ + } + +#define VEC_SSE42_CMP(name, op, bits, size, first, second) \ + VEC_xSSE42_CMP(name, op, /* nothing */, bits, size, first, second, /* nothing */, /* nothing */, /* nothing */) + +#define VEC_uSSE42_CMP(name, op, bits, size, first, second) \ + VEC_xSSE42_CMP(name, op, u, bits, size, first, second, \ + __m128i xor_val = MM_SET1_##bits(UINT64_C(1) << (bits - 1)); \ + , { \ + vec1.sse = _mm_xor_si128(vec1.sse, xor_val); \ + vec2.sse = _mm_xor_si128(vec2.sse, xor_val); \ + }, \ + { \ + /* nothing */ \ + }) + +/* vint64x2 */ + +#ifndef VINT64x2_CMPEQ_DEFINED +VEC_xSSE42_CMP(cmpeq, cmpeq, /* nothing */, 64, 2, 1, 2, /* nothing */, /* nothing */, /* nothing */) +# define VINT64x2_CMPEQ_DEFINED +#endif + +#ifndef VINT64x2_CMPLT_DEFINED +VEC_SSE42_CMP(cmplt, cmpgt, 64, 2, 2, 1) +# define VINT64x2_CMPLT_DEFINED +#endif + +#ifndef VINT64x2_CMPGT_DEFINED +VEC_SSE42_CMP(cmpgt, cmpgt, 64, 2, 1, 2) +# define VINT64x2_CMPGT_DEFINED +#endif + +/* vuint64x2 */ + +#ifndef VUINT64x2_CMPEQ_DEFINED +VEC_xSSE42_CMP(cmpeq, cmpeq, u, 64, 2, 1, 2, /* nothing */, /* nothing */, /* nothing */) +# define VUINT64x2_CMPEQ_DEFINED +#endif + +#ifndef VUINT64x2_CMPLT_DEFINED +VEC_uSSE42_CMP(cmplt, cmpgt, 64, 2, 2, 1) +# define VUINT64x2_CMPLT_DEFINED +#endif + +#ifndef VUINT64x2_CMPGT_DEFINED +VEC_uSSE42_CMP(cmpgt, cmpgt, 64, 2, 1, 2) +# define VUINT64x2_CMPGT_DEFINED +#endif + +#endif /* VEC_IMPL_X86_SSE42_H_ */
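VEC_uSSE42_CMP gets unsigned 64-bit ordering out of SSE4.2's signed PCMPGTQ by XORing both operands with 0x8000000000000000 first: flipping the sign bit shifts every value down by 2^63, which maps unsigned order exactly onto signed order. A minimal sketch of the identity (helper name illustrative):

#include <stdint.h>
#include <nmmintrin.h>	/* SSE4.2 */

/* a >u b  ==  (a ^ 2^63) >s (b ^ 2^63): rebasing by the sign bit turns an
 * unsigned comparison into a signed one that PCMPGTQ can answer. */
static __m128i cmpgt_epu64_sse42(__m128i a, __m128i b)
{
	const __m128i bias = _mm_set1_epi64x(INT64_MIN); /* 0x8000000000000000 per lane */
	return _mm_cmpgt_epi64(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
}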
--- a/include/vec/vec.h Fri Apr 25 17:40:55 2025 -0400 +++ b/include/vec/vec.h Sat Apr 26 01:04:35 2025 -0400 @@ -70,6 +70,9 @@ # endif #endif +#include <string.h> +#include <stdlib.h> + #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ (((a) >= (x)) && \ ((a) > x || (b) >= (y)) && \ @@ -108,6 +111,31 @@ # endif #endif +#if VEC_GNUC_HAS_ATTRIBUTE(__always_inline__, 4, 0, 0) +# define VEC_ALWAYS_INLINE __attribute__((__always_inline__)) +#else +# define VEC_ALWAYS_INLINE +#endif + +#define VEC_FUNC_IMPL static inline VEC_ALWAYS_INLINE + +/* --------------------------------------------------------------- */ +/* Get maximum value for type */ + +#define VEC_TYPE_SIGNED(t) (((t)(-1)) < ((t)0)) + +#define VEC_MAX_EX(t, TOPBIT) \ + (((0x1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL) | \ + ((TOPBIT) << ((sizeof(t) * 8ULL) - 4ULL))) + +#define VEC_MAX_OF_UNSIGNED(t) VEC_MAX_EX(t, 0xFULL) +#define VEC_MAX_OF_SIGNED(t) VEC_MAX_EX(t, 0x7ULL) + +#define VEC_MAX_OF_TYPE(t) \ + ((unsigned long long)(VEC_TYPE_SIGNED(t) \ + ? VEC_MAX_OF_SIGNED(t) \ + : VEC_MAX_OF_UNSIGNED(t))) + /* --------------------------------------------------------------- */ /* Detect compiler SIMD support */ @@ -273,9 +301,18 @@ #ifdef __SSE2__ # include <emmintrin.h> # define VEC_COMPILER_HAS_SSE2 +# ifdef __SSE3__ +# include <pmmintrin.h> +# define VEC_COMPILER_HAS_SSE3 +# endif # ifdef __SSE4_1__ +# include <smmintrin.h> # define VEC_COMPILER_HAS_SSE41 # endif +# ifdef __SSE4_2__ +# include <nmmintrin.h> +# define VEC_COMPILER_HAS_SSE42 +# endif # if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT # undef VINT8x16_ALIGNMENT # define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT @@ -389,32 +426,23 @@ /* --------------------------------------------------------------- */ /* bit shift */ -inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y) +VEC_FUNC_IMPL vec_uintmax vec_urshift(vec_uintmax x, unsigned int y) { return x >> y; } -inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y) +VEC_FUNC_IMPL vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y) { return x << y; } -inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y) +VEC_FUNC_IMPL vec_intmax vec_rshift(vec_intmax x, unsigned int y) { - // reinterpret as unsigned integer and then shift - union { - vec_intmax d; - vec_uintmax u; - } xx; - - xx.d = x; - xx.u >>= y; - return xx.d; + return (x < 0) ? 
(~(~x >> y)) : (x >> y); } -inline vec_intmax vec_llshift(vec_intmax x, unsigned int y) +VEC_FUNC_IMPL vec_intmax vec_lshift(vec_intmax x, unsigned int y) { - // reinterpret as unsigned integer and then shift union { vec_intmax d; vec_uintmax u; @@ -425,92 +453,258 @@ return xx.d; } -inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y) +VEC_FUNC_IMPL vec_intmax vec_avg(vec_intmax x, vec_intmax y) { - return x >> y; + vec_intmax x_d_rem = (x % 2); + vec_intmax y_d_rem = (y % 2); + vec_intmax rem_d_quot = ((x_d_rem + y_d_rem) / 2); + vec_intmax rem_d_rem = ((x_d_rem + y_d_rem) % 2); + + return ((x / 2) + (y / 2)) + (rem_d_quot) + (rem_d_rem == 1); } -inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y) +VEC_FUNC_IMPL vec_uintmax vec_uavg(vec_uintmax x, vec_uintmax y) { - return x << y; + return (x >> 1) + (y >> 1) + ((x | y) & 1); } -/** - * Arithmetic shifts; based off code from OpenMPT, which is under - * the Boost Software License: - * - * Permission is hereby granted, free of charge, to any person or organization - * obtaining a copy of the software and accompanying documentation covered by - * this license (the "Software") to use, reproduce, display, distribute, - * execute, and transmit the Software, and to prepare derivative works of the - * Software, and to permit third-parties to whom the Software is furnished to - * do so, all subject to the following: - * - * The copyright notices in the Software and this entire statement, including - * the above license grant, this restriction and the following disclaimer, - * must be included in all copies of the Software, in whole or in part, and - * all derivative works of the Software, unless such copies or derivative - * works are solely in the form of machine-executable object code generated by - * a source language processor. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT - * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE - * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. 
-**/ -inline vec_intmax vec_rshift(vec_intmax x, unsigned int y) -{ - static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1); +/* --------------------------------------------------------------- */ +/* Array alignment macros */ - union { - vec_intmax d; - vec_uintmax u; - } xx; +#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) +# define VEC_ALIGNAS(x) alignas(x) +#elif (__STDC_VERSION__ >= 201112L) +# define VEC_ALIGNAS(x) _Alignas(x) +#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0) +# define VEC_ALIGNAS(x) __attribute__((__aligned__(x))) +#endif - xx.d = x; - - xx.u += roffset; - xx.u >>= y; - xx.u -= roffset >> y; - - return xx.d; -} - -inline vec_intmax vec_lshift(vec_intmax x, unsigned int y) -{ - static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1); - - union { - vec_intmax d; - vec_uintmax u; - } xx; - - xx.d = x; - - xx.u += roffset; - xx.u <<= y; - xx.u -= roffset << y; - - return xx.d; -} - -#ifdef VEC_IMPLEMENTATION -extern inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y); -extern inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y); -extern inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y); -extern inline vec_intmax vec_llshift(vec_intmax x, unsigned int y); -extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y); -extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y); -extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y); -extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y); +/* the alignment must be specified in bytes and must be a multiple of the + * type size. it is always assumed that the type will be on a boundary of + * its size, which may or may not be true */ +#ifdef VEC_ALIGNAS +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_ALIGNAS(align) type var[length] +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(var)) +#else +// use unions to get an aligned offset without triggering strict aliasing +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \ + union vec_aligned_union_##var##_ { \ + type arr[length]; \ + unsigned char bytes[sizeof(type) * length]; \ + }; \ + unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \ + type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \ + VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(vec_unaligned_##var##_) - (align - 1)) #endif -/* --------------------------------------------------------------- */ +#define VEC_ALIGNED_ARRAY_LENGTH(var) \ + (VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var)) + +////////////////////////////////////////////////////////////////////////////////////// +// predefined variants for each vector type + +////////////////////////////////////////////////////////////////////////////////////// +// 16-bit + +#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT) +#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT) +#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT) +#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0) + +#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, 
VUINT8x2_ALIGNMENT) +#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT) +#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT) +#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 32-bit + +#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT) +#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT) +#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT) +#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0) + +#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT) +#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT) +#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT) +#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0) + +#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT) +#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT) +#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT) +#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0) + +#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT) +#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT) +#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT) +#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 64-bit + +#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT) +#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0) + +#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT) +#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0) + +#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT) +#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0) + +#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0) + +#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT) +#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) 
VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0) + +#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 128-bit + +#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) +#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) + +#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) +#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) + +#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) +#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) + +#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) +#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) -#include "impl/align.h" +#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) + +#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) + +#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) + +#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 256-bit + +#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) +#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) + +#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) +#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) + +#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) +#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) + +#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) +#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) + +#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) +#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) + +#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) +#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) + +#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) +#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) + +#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) +#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 512-bit + +#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) +#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) + +#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT) +#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) + +#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) +#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) + +#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) +#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) + +#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) +#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) + +#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT) +#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) + +#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) +#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) + +#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) +#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) /* --------------------------------------------------------------- */ /* Defines the structures for each vector type */ @@ -827,150 +1021,8 @@ vint64x4 generic[2]; } vint64x8; -// --------------------------------------------------------------------------------- -// function declarations - -int vec_init(void); - -#define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \ - void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ 
- void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); - -#define VEC_DECLARE_OPERATIONS(bits, size) \ - VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \ - VEC_DECLARE_OPERATIONS_SIGN(u, bits, size) - -// 16-bit -VEC_DECLARE_OPERATIONS(8, 2) - -// 32-bit -VEC_DECLARE_OPERATIONS(8, 4) -VEC_DECLARE_OPERATIONS(16, 2) - -// 64-bit -VEC_DECLARE_OPERATIONS(8, 8) -VEC_DECLARE_OPERATIONS(16, 4) -VEC_DECLARE_OPERATIONS(32, 2) - -// 128-bit -VEC_DECLARE_OPERATIONS(8, 16) -VEC_DECLARE_OPERATIONS(16, 8) -VEC_DECLARE_OPERATIONS(32, 4) -VEC_DECLARE_OPERATIONS(64, 2) - -// 256-bit -VEC_DECLARE_OPERATIONS(8, 32) -VEC_DECLARE_OPERATIONS(16, 16) -VEC_DECLARE_OPERATIONS(32, 8) -VEC_DECLARE_OPERATIONS(64, 4) - -// 512-bit -VEC_DECLARE_OPERATIONS(8, 64) -VEC_DECLARE_OPERATIONS(16, 32) -VEC_DECLARE_OPERATIONS(32, 16) -VEC_DECLARE_OPERATIONS(64, 8) - -#undef VEC_DECLARE_OPERATIONS -#undef VEC_DECLARE_OPERATIONS_SIGN - -// --------------------------------------------------------------------------------- -// okay, now we can actually implement the functions - -#ifdef VEC_IMPLEMENTATION - -// Fallback functions, need to be defined before everything else. -#include "impl/fallback.h" - -// okay, these are filled in for each supported backend. 
-// `and', `or', `xor', and `nor' have to be prefixed with -// `b' because of <iso646.h> -#define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \ - typedef struct { \ - v##sign##int##bits##x##size (*splat)(vec_##sign##int##bits x); \ - v##sign##int##bits##x##size (*load_aligned)(const vec_##sign##int##bits in[size]); \ - v##sign##int##bits##x##size (*load)(const vec_##sign##int##bits in[size]); \ - void (*store_aligned)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - void (*store)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ - v##sign##int##bits##x##size (*add)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*sub)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*mul)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*div)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*avg)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*band)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*bor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*bxor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*bnot)(v##sign##int##bits##x##size vec); \ - v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size (*cmplt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*cmple)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*cmpeq)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*cmpge)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*cmpgt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - } v##sign##int##bits##x##size##_impl; - -#define VEC_DEFINE_IMPL_STRUCT(bits, size) \ - VEC_DEFINE_IMPL_STRUCT_SIGN( , bits, size) \ - VEC_DEFINE_IMPL_STRUCT_SIGN(u, bits, size) - -// 16-bit -VEC_DEFINE_IMPL_STRUCT(8, 2) - -// 32-bit -VEC_DEFINE_IMPL_STRUCT(8, 4) -VEC_DEFINE_IMPL_STRUCT(16, 2) - -// 64-bit -VEC_DEFINE_IMPL_STRUCT(8, 8) -VEC_DEFINE_IMPL_STRUCT(16, 4) -VEC_DEFINE_IMPL_STRUCT(32, 2) - -// 128-bit -VEC_DEFINE_IMPL_STRUCT(8, 16) -VEC_DEFINE_IMPL_STRUCT(16, 8) -VEC_DEFINE_IMPL_STRUCT(32, 4) -VEC_DEFINE_IMPL_STRUCT(64, 2) - -// 256-bit -VEC_DEFINE_IMPL_STRUCT(8, 32) -VEC_DEFINE_IMPL_STRUCT(16, 16) -VEC_DEFINE_IMPL_STRUCT(32, 8) -VEC_DEFINE_IMPL_STRUCT(64, 4) - -// 512-bit -VEC_DEFINE_IMPL_STRUCT(8, 64) -VEC_DEFINE_IMPL_STRUCT(16, 32) -VEC_DEFINE_IMPL_STRUCT(32, 16) -VEC_DEFINE_IMPL_STRUCT(64, 8) - -#undef VEC_DEFINE_IMPL_STRUCT -#undef VEC_DEFINE_IMPL_STRUCT_SIGN - -// ------------------------------------------------------------------------ - -#ifdef VEC_COMPILER_HAS_ALTIVEC -# include "impl/ppc/altivec.h" -#endif +/* ------------------------------------------------------------------------ */ +/* finally; we can import the real implementations 
*/ #ifdef VEC_COMPILER_HAS_AVX512F # include "impl/x86/avx512f.h" @@ -980,415 +1032,116 @@ # include "impl/x86/avx2.h" #endif +#ifdef VEC_COMPILER_HAS_SSE42 +# include "impl/x86/sse42.h" +#endif +#ifdef VEC_COMPILER_HAS_SSE41 +# include "impl/x86/sse41.h" +#endif +#ifdef VEC_COMPILER_HAS_SSE3 +# include "impl/x86/sse3.h" +#endif #ifdef VEC_COMPILER_HAS_SSE2 # include "impl/x86/sse2.h" #endif - -// depends on SSE2 functions; the only thing SSE4.1 provides for us -// is a native 32-bit multiply -#ifdef VEC_COMPILER_HAS_SSE41 -# include "impl/x86/sse41.h" -#endif - #ifdef VEC_COMPILER_HAS_MMX # include "impl/x86/mmx.h" #endif -#ifdef VEC_COMPILER_HAS_NEON -# include "impl/arm/neon.h" -#endif - #include "impl/generic.h" -/* ---------------------------------------------------------------- */ - -#include "impl/cpu.h" // CPU detection crap - -// 16-bit -static vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; -static vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; - -// 32-bit -static vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; -static vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; -static vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; -static vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; +/* ------------------------------------------------------------------------ */ +/* very minimal aligned malloc */ -// 64-bit -static vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; -static vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; -static vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; -static vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; -static vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; -static vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; +#define VEC_MALLOC_ALIGNMENT (64) -// 128-bit -static vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; -static vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; -static vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; -static vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; -static vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; -static vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; -static vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; -static vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; +VEC_STATIC_ASSERT(!(VEC_MALLOC_ALIGNMENT & (VEC_MALLOC_ALIGNMENT - 1)) + && (VEC_MALLOC_ALIGNMENT > 0), + "VEC_MALLOC_ALIGNMENT must be a power of two"); -// 256-bit -static vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; -static vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; -static vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; -static vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; -static vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; -static vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; -static vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; -static vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; +typedef unsigned char vec_alignment_type; -// 512-bit -static vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; -static vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; -static vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; -static vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; -static vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; -static vuint32x16_impl 
*vuint32x16_impl_cpu = &vuint32x16_impl_generic; -static vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; -static vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; +#define VEC_MALLOC_ADDITIONAL_SIZE (sizeof(vec_alignment_type) + (VEC_MALLOC_ALIGNMENT - 1)) +#define VEC_MALLOC_MAX_SIZE (SIZE_MAX - VEC_MALLOC_ADDITIONAL_SIZE) -// returns 0 or a negative error code on failure -int vec_init(void) +VEC_FUNC_IMPL void *vec_internal_align_ptr_(void *q) { - // This function is NOT thread safe. However, once vec - // is initialized, all of the vector functions are thread-safe. - // - // In fact, it's possible to use vec without calling - // vec_init() at all, but it would be completely useless since - // it would just use a generic implementation without any - // vectorization whatsoever (unless maybe the compiler is - // smart enough to optimize it into vectors) - - vec_get_CPU_features(); + vec_alignment_type diff; -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (vec_CPU_have_ALTIVEC()) { - vint8x16_impl_cpu = &vint8x16_impl_altivec; - vuint8x16_impl_cpu = &vuint8x16_impl_altivec; - vint16x8_impl_cpu = &vint16x8_impl_altivec; - vuint16x8_impl_cpu = &vuint16x8_impl_altivec; - vint32x4_impl_cpu = &vint32x4_impl_altivec; - vuint32x4_impl_cpu = &vuint32x4_impl_altivec; -#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX - if (vec_CPU_have_ALTIVEC_VSX()) { - vint64x2_impl_cpu = &vint64x2_impl_altivec; - vuint64x2_impl_cpu = &vuint64x2_impl_altivec; - } -#endif - } -#endif -#ifdef VEC_COMPILER_HAS_AVX512F - if (vec_CPU_have_AVX512F()) { - vint8x64_impl_cpu = &vint8x64_impl_avx512f; - vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; - vint16x32_impl_cpu = &vint16x32_impl_avx512f; - vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; - vint32x16_impl_cpu = &vint32x16_impl_avx512f; - vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; - vint64x8_impl_cpu = &vint64x8_impl_avx512f; - vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; - } -#endif -#ifdef VEC_COMPILER_HAS_AVX2 - if (vec_CPU_have_AVX2()) { - vint8x32_impl_cpu = &vint8x32_impl_avx2; - vuint8x32_impl_cpu = &vuint8x32_impl_avx2; - vint16x16_impl_cpu = &vint16x16_impl_avx2; - vuint16x16_impl_cpu = &vuint16x16_impl_avx2; - vint32x8_impl_cpu = &vint32x8_impl_avx2; - vuint32x8_impl_cpu = &vuint32x8_impl_avx2; - vint64x4_impl_cpu = &vint64x4_impl_avx2; - vuint64x4_impl_cpu = &vuint64x4_impl_avx2; - } -#endif -#ifdef VEC_COMPILER_HAS_SSE2 - if (vec_CPU_have_SSE2()) { - vint8x16_impl_cpu = &vint8x16_impl_sse2; - vuint8x16_impl_cpu = &vuint8x16_impl_sse2; - vint16x8_impl_cpu = &vint16x8_impl_sse2; - vuint16x8_impl_cpu = &vuint16x8_impl_sse2; -# ifdef VEC_COMPILER_HAS_SSE41 - if (vec_CPU_have_SSE41()) { - vint32x4_impl_cpu = &vint32x4_impl_sse41; - vuint32x4_impl_cpu = &vuint32x4_impl_sse41; - } else -# endif - { - vint32x4_impl_cpu = &vint32x4_impl_sse2; - vuint32x4_impl_cpu = &vuint32x4_impl_sse2; - } - vint64x2_impl_cpu = &vint64x2_impl_sse2; - vuint64x2_impl_cpu = &vuint64x2_impl_sse2; - } -#endif -#ifdef VEC_COMPILER_HAS_MMX - if (vec_CPU_have_MMX()) { - vint8x8_impl_cpu = &vint8x8_impl_mmx; - vuint8x8_impl_cpu = &vuint8x8_impl_mmx; - vint16x4_impl_cpu = &vint16x4_impl_mmx; - vuint16x4_impl_cpu = &vuint16x4_impl_mmx; - vint32x2_impl_cpu = &vint32x2_impl_mmx; - vuint32x2_impl_cpu = &vuint32x2_impl_mmx; - } -#endif -#ifdef VEC_COMPILER_HAS_NEON - if (vec_CPU_have_NEON()) { - // 64-bit - vint8x8_impl_cpu = &vint8x8_impl_neon; - vuint8x8_impl_cpu = &vuint8x8_impl_neon; - vint16x4_impl_cpu = &vint16x4_impl_neon; - vuint16x4_impl_cpu = &vuint16x4_impl_neon; - 
vint32x2_impl_cpu = &vint32x2_impl_neon; - vuint32x2_impl_cpu = &vuint32x2_impl_neon; + diff = (((uintptr_t)q + (VEC_MALLOC_ALIGNMENT - 1)) & ~(VEC_MALLOC_ALIGNMENT - 1)) - (uintptr_t)q; + q = (char *)q + diff; + + memcpy((char *)q - sizeof(diff), &diff, sizeof(diff)); + + return q; +} - // 128-bit - vint8x16_impl_cpu = &vint8x16_impl_neon; - vuint8x16_impl_cpu = &vuint8x16_impl_neon; - vint16x8_impl_cpu = &vint16x8_impl_neon; - vuint16x8_impl_cpu = &vuint16x8_impl_neon; - vint32x4_impl_cpu = &vint32x4_impl_neon; - vuint32x4_impl_cpu = &vuint32x4_impl_neon; - vint64x2_impl_cpu = &vint64x2_impl_neon; - vuint64x2_impl_cpu = &vuint64x2_impl_neon; - } -#endif - { - // do nothing, they're already set to generics - } +/* reverses vec_align_ptr */ +VEC_FUNC_IMPL void *vec_internal_unalign_ptr_(void *q) +{ + vec_alignment_type diff; - return 0; + memcpy(&diff, (char *)q - sizeof(diff), sizeof(diff)); + q = (char *)q - diff; + + return q; } -/* ---------------------------------------------------------------- */ +VEC_FUNC_IMPL void *vec_malloc(size_t size) +{ + void *q; + + if (size > VEC_MALLOC_MAX_SIZE) + return NULL; + + /* allocate space for the diff (we have to do this, + * for realloc has no way of knowing the original ptr) */ + q = malloc(size + VEC_MALLOC_ADDITIONAL_SIZE); + if (!q) + return NULL; + + return vec_internal_align_ptr_(q); +} + +VEC_FUNC_IMPL void *vec_calloc(size_t count, size_t nmemb) +{ + size_t size; + void *q; + + size = count * nmemb; + if (size && size / count != nmemb) + return NULL; /* nope */ + + q = vec_malloc(size); -#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->splat) \ - return v##sign##int##bits##x##size##_impl_cpu->splat(x); \ - \ - return v##sign##int##bits##x##size##_fallback_splat(x); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ - { \ - v##sign##int##bits##x##size err = {0}; \ - \ - if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \ - return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \ - \ - VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \ - \ - return err; \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->load) \ - return v##sign##int##bits##x##size##_impl_cpu->load(in); \ - \ - return v##sign##int##bits##x##size##_fallback_load(in); \ - } \ - \ - void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \ - v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \ - return; \ - } \ - \ - VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \ - } \ - \ - void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->store) { \ - v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \ - return; \ - } \ - \ - v##sign##int##bits##x##size##_fallback_store(vec, out); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->add) \ - v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \ 
- \ - return v##sign##int##bits##x##size##_fallback_add(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->sub) \ - v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_sub(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->mul) \ - v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_mul(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->div) \ - v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_div(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->avg) \ - v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_avg(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->band) \ - v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->bor) \ - v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->bxor) \ - v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->bnot) \ - v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \ - \ - return v##sign##int##bits##x##size##_fallback_not(vec); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->cmplt) \ - v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_cmplt(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->cmple) \ - v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_cmple(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if 
(v##sign##int##bits##x##size##_impl_cpu->cmpeq) \ - v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_cmpeq(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->cmpge) \ - v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_cmpge(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->cmpgt) \ - v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_cmpgt(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->lshift) \ - v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_lshift(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->rshift) \ - v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_rshift(vec1, vec2); \ - } \ - \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ - { \ - if (v##sign##int##bits##x##size##_impl_cpu->lrshift) \ - v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \ - \ - return v##sign##int##bits##x##size##_fallback_lrshift(vec1, vec2); \ - } + if (q) + memset(q, 0, size); + + return q; +} + +VEC_FUNC_IMPL void *vec_realloc(void *ptr, size_t newsize) +{ + void *q; + + if (!ptr) + return vec_malloc(newsize); -#define VEC_DEFINE_OPERATIONS(bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ - VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) - -// 16-bit -VEC_DEFINE_OPERATIONS(8, 2) + if (newsize > VEC_MALLOC_MAX_SIZE) + return NULL; -// 32-bit -VEC_DEFINE_OPERATIONS(8, 4) -VEC_DEFINE_OPERATIONS(16, 2) - -// 64-bit -VEC_DEFINE_OPERATIONS(8, 8) -VEC_DEFINE_OPERATIONS(16, 4) -VEC_DEFINE_OPERATIONS(32, 2) + q = realloc(vec_internal_unalign_ptr_(ptr), VEC_MALLOC_ADDITIONAL_SIZE); + if (!q) + return NULL; -// 128-bit -VEC_DEFINE_OPERATIONS(8, 16) -VEC_DEFINE_OPERATIONS(16, 8) -VEC_DEFINE_OPERATIONS(32, 4) -VEC_DEFINE_OPERATIONS(64, 2) + return vec_internal_align_ptr_(q); +} -// 256-bit -VEC_DEFINE_OPERATIONS(8, 32) -VEC_DEFINE_OPERATIONS(16, 16) -VEC_DEFINE_OPERATIONS(32, 8) -VEC_DEFINE_OPERATIONS(64, 4) - -// 512-bit -VEC_DEFINE_OPERATIONS(8, 64) -VEC_DEFINE_OPERATIONS(16, 32) -VEC_DEFINE_OPERATIONS(32, 16) -VEC_DEFINE_OPERATIONS(64, 8) - -#undef VEC_DEFINE_OPERATIONS -#undef VEC_DEFINE_OPERATIONS_SIGN - -#endif /* VEC_IMPLEMENTATION */ +VEC_FUNC_IMPL void vec_free(void *ptr) +{ + if (ptr) + free(vec_internal_unalign_ptr_(ptr)); +} #ifdef __cplusplus }
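The new allocator over-allocates by VEC_MALLOC_ADDITIONAL_SIZE, rounds the pointer up to VEC_MALLOC_ALIGNMENT, and records the rounding offset just below the pointer it hands out, so vec_free/vec_realloc can recover the block malloc actually returned. A self-contained sketch of that bookkeeping trick, not the library's exact code (this version always reserves at least one offset byte, so the store below the aligned pointer stays inside the allocation; overflow checks are omitted):

#include <stdlib.h>
#include <stdint.h>

#define ALIGNMENT 64 /* mirrors VEC_MALLOC_ALIGNMENT; must be a power of two */

/* Over-allocate, round up, and stash the distance back to the raw pointer
 * in the byte immediately below the aligned pointer. */
static void *aligned_malloc_sketch(size_t size)
{
	unsigned char *raw, *p;

	raw = malloc(size + 1 + (ALIGNMENT - 1));
	if (!raw)
		return NULL;

	p = raw + 1; /* always leave room for the offset byte */
	p += (ALIGNMENT - ((uintptr_t)p & (ALIGNMENT - 1))) & (ALIGNMENT - 1);
	p[-1] = (unsigned char)(p - raw); /* 1..ALIGNMENT */

	return p;
}

static void aligned_free_sketch(void *q)
{
	if (q)
		free((unsigned char *)q - ((unsigned char *)q)[-1]);
}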
--- a/src/vec.c Fri Apr 25 17:40:55 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -#define VEC_IMPLEMENTATION -#include "vec/vec.h"
--- a/test/Makefile.template Fri Apr 25 17:40:55 2025 -0400 +++ b/test/Makefile.template Sat Apr 26 01:04:35 2025 -0400 @@ -1,4 +1,4 @@ -CPPFLAGS += -g -O2 -I../include -Wall -Wpedantic -Werror=strict-aliasing +CPPFLAGS += -O2 -I../include -Wall -Wpedantic -Werror=strict-aliasing CFLAGS += $(CPPFLAGS) -std=c99 CXXFLAGS += $(CPPFLAGS) -std=c++11 @@ -10,35 +10,29 @@ ../include/vec/impl/x86/sse2.h \ ../include/vec/impl/x86/sse41.h \ ../include/vec/impl/cpu.h \ - ../include/vec/impl/fallback.h \ ../include/vec/impl/generic.h \ test_align.h \ test_arith.h \ test_compare.h \ - test_shift.h + test_shift.h \ + test_benchmark.h BINS = test-generic test-host test-cxx -OBJS = vec-generic.o vec-host.o test.o test-cxx.o +OBJS = test.o test-cxx.o test_benchmark_simple.o test_benchmark_vec.o .PHONY: all clean test all: $(BINS) -vec-generic.o: ../src/vec.c $(HEADERS) - $(CC) $(CFLAGS) -DVEC_SUPPRESS_HW=1 -c -o $@ $< - -vec-host.o: ../src/vec.c $(HEADERS) - $(CC) $(CFLAGS) -c -o $@ $< - -test.o: test.c +test.o: test.c test_benchmark_simple.o test_benchmark_vec.o $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< test-cxx.o: test.cc $(CXX) $(CXXFLAGS) -c -o $@ $< -test-generic: vec-generic.o test.o +test-generic: test.o test_benchmark_simple.o test_benchmark_vec.o $(CC) $(LDFLAGS) -o $@ $^ -test-host: vec-host.o test.o +test-host: test.o test_benchmark_simple.o test_benchmark_vec.o $(CC) $(LDFLAGS) -o $@ $^ test-cxx: test-cxx.o $(HEADERS)
--- a/test/Makefile.x86 Fri Apr 25 17:40:55 2025 -0400 +++ b/test/Makefile.x86 Sat Apr 26 01:04:35 2025 -0400 @@ -1,3 +1,3 @@ -CPPFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f +CPPFLAGS += -mmmx -msse2 -mavx512f include Makefile.template \ No newline at end of file
--- a/test/test.c Fri Apr 25 17:40:55 2025 -0400 +++ b/test/test.c Sat Apr 26 01:04:35 2025 -0400 @@ -3,6 +3,7 @@ #include <stdio.h> #include <string.h> #include <inttypes.h> +#include <time.h> #define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) @@ -113,6 +114,7 @@ #include "test_arith.h" #include "test_compare.h" #include "test_shift.h" +#include "test_benchmark.h" // ------------------------------------------------------------ @@ -120,12 +122,14 @@ { int ret = 0; - vec_init(); + srand(time(NULL)); ret |= test_align(); ret |= test_arith(); ret |= test_compare(); ret |= test_shift(); + test_benchmark(); + return ret; }
--- a/test/test.cc Fri Apr 25 17:40:55 2025 -0400 +++ b/test/test.cc Sat Apr 26 01:04:35 2025 -0400 @@ -1,4 +1,3 @@ -#define VEC_IMPLEMENTATION #include "vec/vec.h" #include <iostream> @@ -24,4 +23,4 @@ ret |= 1; return ret; -} \ No newline at end of file +}
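With the always-inline rewrite there is no VEC_IMPLEMENTATION translation unit and no libvec to link; the C++ test above now only includes the header. A plain C consumer looks the same. A minimal sketch using the 32x4 operations:

#include "vec/vec.h"

int main(void)
{
    vec_int32 in[4] = { 1, 2, 3, 4 }, out[4];

    vint32x4 v = vint32x4_load(in);
    v = vint32x4_add(v, vint32x4_splat(10));
    vint32x4_store(v, out);

    return (out[0] == 11 && out[3] == 14) ? 0 : 1;
}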
--- a/test/test_arith.h Fri Apr 25 17:40:55 2025 -0400 +++ b/test/test_arith.h Sat Apr 26 01:04:35 2025 -0400 @@ -60,10 +60,12 @@ CREATE_TEST(sign, psign, csign, bits, size, and, orig_a[i] & orig_b[i]) \ CREATE_TEST(sign, psign, csign, bits, size, or, orig_a[i] | orig_b[i]) \ CREATE_TEST(sign, psign, csign, bits, size, xor, orig_a[i] ^ orig_b[i]) \ - CREATE_TEST(sign, psign, csign, bits, size, avg, (sign##int##bits##_t)(orig_a[i] + orig_b[i]) / 2) \ + CREATE_TEST(sign, psign, csign, bits, size, avg, (vec_##sign##int##bits)vec_##sign##avg(orig_a[i], orig_b[i])) \ CREATE_TEST_SHIFT(sign, psign, csign, bits, size, rshift, vec_##sign##rshift(orig_a[i], orig_b[i])) \ CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lshift, vec_##sign##lshift(orig_a[i], orig_b[i])) \ - CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_##sign##lrshift(orig_a[i], orig_b[i])) + CREATE_TEST_SHIFT(sign, psign, csign, bits, size, lrshift, vec_urshift((vec_uint##bits)orig_a[i], orig_b[i])) \ + CREATE_TEST(sign, psign, csign, bits, size, min, (orig_a[i] < orig_b[i]) ? orig_a[i] : orig_b[i]) \ + CREATE_TEST(sign, psign, csign, bits, size, max, (orig_a[i] > orig_b[i]) ? orig_a[i] : orig_b[i]) #define CREATE_TESTS(bits, size) \ CREATE_TESTS_SIGN(, d, , bits, size) \ @@ -115,6 +117,8 @@ ret |= test_arith_v##sign##int##bits##x##size##_or(a, b); \ ret |= test_arith_v##sign##int##bits##x##size##_xor(a, b); \ ret |= test_arith_v##sign##int##bits##x##size##_avg(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_min(a, b); \ + ret |= test_arith_v##sign##int##bits##x##size##_max(a, b); \ } \ } \ \
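The new min/max cases use the plain ternary as the per-lane reference. The same idea written out by hand against the public API, for one width (vint16x8 and its load/store/min are assumed to come from the generic fallback when no SIMD backend defines them):

#include <stdio.h>
#include "vec/vec.h"

static int check_vint16x8_min(void)
{
    vec_int16 a[8] = { -5, 3,  7, -100, 0, 32767, -32768, 12 };
    vec_int16 b[8] = {  4, 3, -7,  100, 1,    -1,     12, 12 };
    vec_int16 out[8];
    int i, ret = 0;

    vint16x8 va = vint16x8_load(a);
    vint16x8 vb = vint16x8_load(b);
    vint16x8_store(vint16x8_min(va, vb), out);

    for (i = 0; i < 8; i++) {
        vec_int16 ref = (a[i] < b[i]) ? a[i] : b[i];
        if (out[i] != ref) {
            fprintf(stderr, "lane %d: got %d, want %d\n", i, (int)out[i], (int)ref);
            ret |= 1;
        }
    }

    return ret;
}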
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_benchmark.h Sat Apr 26 01:04:35 2025 -0400 @@ -0,0 +1,54 @@ + +/* ------------------------------------------------------------------------ */ +/* simple benchmark for getting the min/max range of an audio sample. */ + +/* prevent GCC from optimizing these function calls away - i think there's + * probably a better way to do this, but I haven't found it yet :) */ + + +extern void test_benchmark_sample_minmax_simple_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax); +extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax); + +VEC_FUNC_IMPL void test_benchmark_sample_minmax(void) +{ + int32_t min, max; + clock_t start, end; + int i; + int16_t *q = vec_malloc(16000000u * 2u); + + printf("\nsigned 16-bit audio sample min/max - 1 thousand passes - 16000000 samples\n\n"); + + /* generate random sample values */ + for (i = 0; i < 16000000; i++) + q[i] = rand(); + + start = clock(); + for (i = 0; i < 1000; i++) { + min = INT32_MAX; + max = INT32_MIN; + test_benchmark_sample_minmax_vec_impl(q, 16000000u, &min, &max); + } + end = clock(); + + printf("- vec: took %f secs\n", (double)(end - start) / CLOCKS_PER_SEC); + + start = clock(); + for (i = 0; i < 1000; i++) { + min = INT32_MAX; + max = INT32_MIN; + test_benchmark_sample_minmax_simple_impl(q, 16000000u, &min, &max); + } + end = clock(); + + printf("- simple: took %f secs\n", (double)(end - start) / CLOCKS_PER_SEC); + + printf("\n"); + + vec_free(q); +} + +static void test_benchmark(void) +{ + printf("------- BENCHMARK --------\n"); + test_benchmark_sample_minmax(); +}
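The two implementations are kept extern in separate translation units purely so the compiler cannot fold the timed calls away. A possible alternative (an assumption, not something vec or this test suite provides) is an explicit optimization barrier; sketched for GCC/Clang, with a volatile fallback:

/* hypothetical helper: makes the compiler assume the pointed-to data is
 * read and written, so work feeding into it cannot be optimized out even
 * when everything is inlined into the timing loop */
static inline void benchmark_clobber(void *p)
{
#if defined(__GNUC__)
    __asm__ __volatile__("" : : "g"(p) : "memory");
#else
    *(volatile unsigned char *)p = *(volatile unsigned char *)p;
#endif
}

Calling benchmark_clobber(&min) and benchmark_clobber(&max) after each pass would keep the results live without needing the separate .c files.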
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_benchmark_simple.c Sat Apr 26 01:04:35 2025 -0400 @@ -0,0 +1,18 @@ +#include <stdint.h> + +extern void test_benchmark_sample_minmax_simple_impl(int16_t *smpl, + uint32_t length, int32_t *pmin, int32_t *pmax) +{ + int32_t min = *pmin; + int32_t max = *pmax; + + while (length--) { + if (*smpl < min) min = *smpl; + if (*smpl > max) max = *smpl; + + smpl++; + } + + *pmin = min; + *pmax = max; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_benchmark_vec.c Sat Apr 26 01:04:35 2025 -0400 @@ -0,0 +1,43 @@ +#include "vec/vec.h" + +extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl, + uint32_t length, int32_t *pmin, int32_t *pmax) +{ + int32_t smin = INT32_MAX, smax = INT32_MIN; + uint32_t len32; + int i; + vint16x32 min = vint16x32_splat(*pmin); + vint16x32 max = vint16x32_splat(*pmax); + VINT16x32_ALIGNED_ARRAY(mins); + VINT16x32_ALIGNED_ARRAY(maxs); + + len32 = length / 32; + while (len32--) { + vint16x32 vec = vint16x32_load_aligned(smpl); + + min = vint16x32_min(vec, min); + max = vint16x32_max(vec, max); + + smpl += 32; + } + + vint16x32_store_aligned(min, mins); + vint16x32_store_aligned(max, maxs); + + /* get the lowest minimum of what we have left */ + for (i = 0; i < 32; i++) { + if (mins[i] < smin) smin = mins[i]; + if (maxs[i] > smax) smax = maxs[i]; + } + + len32 = length % 32; + while (len32--) { + if (*smpl < smin) smin = *smpl; + if (*smpl > smax) smax = *smpl; + + smpl++; + } + + *pmin = smin; + *pmax = smax; +}
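Linked together with the simple implementation above, a tiny harness can first sanity-check that the vectorized reduction agrees with the scalar one before any timing. A sketch (the sample count is arbitrary; it is deliberately not a multiple of 32 so the scalar tail path runs too, and the buffer comes from vec_malloc so the aligned loads are safe):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

#include "vec/vec.h"

extern void test_benchmark_sample_minmax_simple_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax);
extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax);

int main(void)
{
    uint32_t n = 1000, i;
    int16_t *buf = vec_malloc(n * sizeof(int16_t));
    int32_t min1 = INT32_MAX, max1 = INT32_MIN;
    int32_t min2 = INT32_MAX, max2 = INT32_MIN;

    if (!buf)
        return 1;

    for (i = 0; i < n; i++)
        buf[i] = (int16_t)rand();

    test_benchmark_sample_minmax_simple_impl(buf, n, &min1, &max1);
    test_benchmark_sample_minmax_vec_impl(buf, n, &min2, &max2);

    printf("simple: [%" PRId32 ", %" PRId32 "]  vec: [%" PRId32 ", %" PRId32 "]\n",
        min1, max1, min2, max2);

    vec_free(buf);
    return (min1 == min2 && max1 == max2) ? 0 : 1;
}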
--- a/test/test_compare.h Fri Apr 25 17:40:55 2025 -0400 +++ b/test/test_compare.h Sat Apr 26 01:04:35 2025 -0400 @@ -10,7 +10,8 @@ v##sign##int##bits##x##size##_store(c, orig_c); \ \ for (int i = 0; i < size; i++) { \ - if ((sign##int##bits##_t)(((equiv) ? UINT##bits##_MAX : 0)) != orig_c[i]) { \ + if ((vec_##sign##int##bits)(((equiv) ? UINT##bits##_MAX : 0)) != orig_c[i]) { \ + printf("%lld %lld\n", (long long)(vec_##sign##int##bits)(((equiv) ? UINT##bits##_MAX : 0)), (long long)orig_c[i]); \ fprintf(stderr, "v" #sign "int" #bits "x" #size "_" #op " test FAILED at index %d: (" #equiv ") [%d] does not equal result [%" PRI ## psign ## bits "]!\n", i, equiv, orig_c[i]); \ print_v##sign##int##bits##x##size(stderr,a); \ print_v##sign##int##bits##x##size(stderr,b); \
--- a/test/test_shift.h Fri Apr 25 17:40:55 2025 -0400 +++ b/test/test_shift.h Sat Apr 26 01:04:35 2025 -0400 @@ -2,10 +2,6 @@ { int ret = 0; - ret |= (vec_ulrshift(0xFFFFFFFF, 16) != 0xFFFF); - ret |= (vec_ullshift(0xFFFF, 16) != 0xFFFF0000); - ret |= (vec_lrshift(0xFFFFFFFF, 16) != 0xFFFF); - ret |= (vec_llshift(0xFFFF, 16) != 0xFFFF0000); ret |= (vec_urshift(0xFFFFFFFF, 16) != 0xFFFF); ret |= (vec_ulshift(0xFFFF, 16) != 0xFFFF0000); ret |= (vec_rshift(-0xFFFF, 8) != -0x100);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/gengeneric.c Sat Apr 26 01:04:35 2025 -0400 @@ -0,0 +1,429 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +/* Use this file to generate include/vec/impl/generic.h !! + * + * `gcc -o gengeneric gengeneric.c` */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) + +/* XXX: would it be faster to unroll literally everything instead of defining everything, + * and then unpacking it all? */ +static const char *header = + "/**\n" + " * vec - a tiny SIMD vector library in plain C99\n" + " * \n" + " * Copyright (c) 2024 Paper\n" + " * \n" + " * Permission is hereby granted, free of charge, to any person obtaining a copy\n" + " * of this software and associated documentation files (the \"Software\"), to deal\n" + " * in the Software without restriction, including without limitation the rights\n" + " * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n" + " * copies of the Software, and to permit persons to whom the Software is\n" + " * furnished to do so, subject to the following conditions:\n" + " * \n" + " * The above copyright notice and this permission notice shall be included in all\n" + " * copies or substantial portions of the Software.\n" + " * \n" + " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n" + " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" + " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n" + " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n" + " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n" + " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n" + " * SOFTWARE.\n" + "**/\n" + "\n" + "/* This file is automatically generated! 
Do not edit it directly!\n" + " * Edit the code that generates it in utils/gengeneric.c --paper */\n" + "\n" + "#ifndef VEC_IMPL_GENERIC_H_\n" + "#define VEC_IMPL_GENERIC_H_\n" + "\n" + "#include <string.h>\n" + "\n" + "// -----------------------------------------------------------------\n" + "\n" + "#define VEC_GENERIC_OPERATION(op, sign, bits, size) \\\n" + " do { \\\n" + " int i; \\\n" + " \\\n" + " for (i = 0; i < size; i++) \\\n" + " vec1.generic[i] = (op); \\\n" + " \\\n" + " return vec1; \\\n" + " } while (0)\n" + "\n" + "#define VEC_GENERIC_BUILTIN_OPERATION(op, sign, bits, size) \\\n" + " VEC_GENERIC_OPERATION(vec1.generic[i] op vec2.generic[i], sign, bits, size)\n" + "\n" + "#define VEC_GENERIC_CMP(op, sign, bits, size) \\\n" + " VEC_GENERIC_OPERATION((vec1.generic[i] op vec2.generic[i]) ? (vec_##sign##int##bits)VEC_MAX_OF_TYPE(vec_uint##bits) : 0, sign, bits, size)\n" + "\n" + "/* okay, now we can do this crap: */\n" + "\n" + "#define VEC_GENERIC_SPLAT(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \\\n" + " { \\\n" + " v##sign##int##bits##x##size vec; \\\n" + " for (int i = 0; i < size; i++) \\\n" + " vec.generic[i] = x; \\\n" + " return vec; \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_LOAD_EX(name, sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(const vec_##sign##int##bits in[size]) \\\n" + " { \\\n" + " v##sign##int##bits##x##size vec; \\\n" + " memcpy(&vec, in, sizeof(vec_##sign##int##bits) * size); \\\n" + " return vec; \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_LOAD_ALIGNED(sign, bits, size) VEC_GENERIC_LOAD_EX(load_aligned, sign, bits, size)\n" + "#define VEC_GENERIC_LOAD(sign, bits, size) VEC_GENERIC_LOAD_EX(load, sign, bits, size)\n" + "\n" + "#define VEC_GENERIC_STORE_EX(name, sign, bits, size) \\\n" + " VEC_FUNC_IMPL void v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \\\n" + " { \\\n" + " memcpy(out, &vec, sizeof(vec_##sign##int##bits) * size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_STORE_ALIGNED(sign, bits, size) VEC_GENERIC_STORE_EX(store_aligned, sign, bits, size)\n" + "#define VEC_GENERIC_STORE(sign, bits, size) VEC_GENERIC_STORE_EX(store, sign, bits, size)\n" + "\n" + "#define VEC_GENERIC_ADD(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_BUILTIN_OPERATION(+, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_SUB(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_BUILTIN_OPERATION(-, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_MUL(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_BUILTIN_OPERATION(*, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_DIV(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_OPERATION(vec2.generic[i] ? 
(vec1.generic[i] / vec2.generic[i]) : 0, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_AVG(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " for (int i = 0; i < size; i++) \\\n" + " vec1.generic[i] = vec_##sign##avg(vec1.generic[i], vec2.generic[i]); \\\n" + " \\\n" + " return vec1; \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_AND(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_BUILTIN_OPERATION(&, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_OR(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_BUILTIN_OPERATION(|, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_XOR(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_BUILTIN_OPERATION(^, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_NOT(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \\\n" + " { \\\n" + " return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)VEC_MAX_OF_TYPE(vec_uint##bits))); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_CMPLT(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_CMP(<, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_CMPLE(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmpgt(vec1, vec2)); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_CMPEQ(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_CMP(==, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_CMPGE(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " return v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size##_cmplt(vec1, vec2)); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_CMPGT(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_CMP(>, sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_LSHIFT(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_OPERATION(vec_##sign##lshift(vec1.generic[i], vec2.generic[i]), sign, bits, size); \\\n" + 
" }\n" + "\n" + "#define VEC_GENERIC_RSHIFT(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_OPERATION(vec_##sign##rshift(vec1.generic[i], vec2.generic[i]), sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_LRSHIFT(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \\\n" + " { \\\n" + " VEC_GENERIC_OPERATION(vec_urshift((vec_uint##bits)vec1.generic[i], vec2.generic[i]), sign, bits, size); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_MIN(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_min(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " v##sign##int##bits##x##size cmplt = v##sign##int##bits##x##size##_cmplt(vec1, vec2); \\\n" + " \\\n" + " v##sign##int##bits##x##size a = v##sign##int##bits##x##size##_and(vec1, cmplt); \\\n" + " v##sign##int##bits##x##size b = v##sign##int##bits##x##size##_and(vec2, v##sign##int##bits##x##size##_not(cmplt)); \\\n" + " \\\n" + " return v##sign##int##bits##x##size##_or(a, b); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_MAX(sign, bits, size) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_max(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \\\n" + " { \\\n" + " v##sign##int##bits##x##size cmplt = v##sign##int##bits##x##size##_cmpgt(vec1, vec2); \\\n" + " \\\n" + " v##sign##int##bits##x##size a = v##sign##int##bits##x##size##_and(vec1, cmplt); \\\n" + " v##sign##int##bits##x##size b = v##sign##int##bits##x##size##_and(vec2, v##sign##int##bits##x##size##_not(cmplt)); \\\n" + " \\\n" + " return v##sign##int##bits##x##size##_or(a, b); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_DBL_SPLAT(sign, bits, size, halfsize) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \\\n" + " { \\\n" + " v##sign##int##bits##x##size vec; \\\n" + " \\\n" + " vec.generic[0] = v##sign##int##bits##x##halfsize##_splat(x); \\\n" + " vec.generic[1] = v##sign##int##bits##x##halfsize##_splat(x); \\\n" + " \\\n" + " return vec; \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_DBL_LOAD_EX(name, sign, bits, size, halfsize) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(const vec_##sign##int##bits x[size]) \\\n" + " { \\\n" + " v##sign##int##bits##x##size vec; \\\n" + " \\\n" + " vec.generic[0] = v##sign##int##bits##x##halfsize##_##name(x); \\\n" + " vec.generic[1] = v##sign##int##bits##x##halfsize##_##name(x + halfsize); \\\n" + " \\\n" + " return vec; \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_DBL_LOAD(sign, bits, size, halfsize) VEC_GENERIC_DBL_LOAD_EX(load, sign, bits, size, halfsize)\n" + "#define VEC_GENERIC_DBL_LOAD_ALIGNED(sign, bits, size, halfsize) VEC_GENERIC_DBL_LOAD_EX(load_aligned, sign, bits, size, halfsize)\n" + "\n" + "#define VEC_GENERIC_DBL_STORE_EX(name, sign, bits, size, halfsize) \\\n" + " VEC_FUNC_IMPL void v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec, vec_##sign##int##bits x[size]) \\\n" + " { \\\n" + " v##sign##int##bits##x##halfsize##_##name(vec.generic[0], x); \\\n" + " v##sign##int##bits##x##halfsize##_##name(vec.generic[1], x + halfsize); \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_DBL_STORE(sign, bits, size, 
halfsize) VEC_GENERIC_DBL_STORE_EX(store, sign, bits, size, halfsize)\n" + "#define VEC_GENERIC_DBL_STORE_ALIGNED(sign, bits, size, halfsize) VEC_GENERIC_DBL_STORE_EX(store_aligned, sign, bits, size, halfsize)\n" + "\n" + "#define VEC_GENERIC_DBL_OP(name, sign, bits, size, halfsize, secondsign) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##secondsign##int##bits##x##size vec2) \\\n" + " { \\\n" + " vec1.generic[0] = v##sign##int##bits##x##halfsize##_##name(vec1.generic[0], vec2.generic[0]); \\\n" + " vec1.generic[1] = v##sign##int##bits##x##halfsize##_##name(vec1.generic[1], vec2.generic[1]); \\\n" + " \\\n" + " return vec1; \\\n" + " }\n" + "\n" + "#define VEC_GENERIC_DBL_ADD(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(add, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_SUB(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(sub, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_MUL(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(mul, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_DIV(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(div, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_AVG(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(avg, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_LSHIFT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(lshift, sign, bits, size, halfsize, u)\n" + "#define VEC_GENERIC_DBL_RSHIFT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(rshift, sign, bits, size, halfsize, u)\n" + "#define VEC_GENERIC_DBL_LRSHIFT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(lrshift, sign, bits, size, halfsize, u)\n" + "#define VEC_GENERIC_DBL_AND(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(and, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_OR(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(or, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_XOR(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(xor, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_MIN(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(min, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_MAX(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(max, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_CMPLT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmplt, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_CMPLE(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmple, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_CMPEQ(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmpeq, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_CMPGE(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmpge, sign, bits, size, halfsize, sign)\n" + "#define VEC_GENERIC_DBL_CMPGT(sign, bits, size, halfsize) VEC_GENERIC_DBL_OP(cmpgt, sign, bits, size, halfsize, sign)\n" + "\n" + "#define VEC_GENERIC_DBL_NOT(sign, bits, size, halfsize) \\\n" + " VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \\\n" + " { \\\n" + " vec.generic[0] = v##sign##int##bits##x##halfsize##_not(vec.generic[0]); \\\n" + " vec.generic[1] = v##sign##int##bits##x##halfsize##_not(vec.generic[1]); \\\n" + " \\\n" + " return vec; \\\n" + " }\n" + "\n" + "/* ------------------------------------------------------------------------ */\n" + "/* PREPROCESSOR HELL INCOMING */\n"; + +static const char *footer = + "#endif /* VEC_IMPL_GENERIC_H_ */\n"; + +/* 
------------------------------------------------------------------------ */ + +static void print_generic_op(const char *op, int is_signed, int bits, int size) +{ + printf( + "#ifndef V%sINT%dx%d_%s_DEFINED\n" + "VEC_GENERIC_%s(%s, %d, %d)\n" + "# define V%sINT%dx%d_%s_DEFINED\n" + "#endif\n", + (is_signed ? "" : "U"), bits, size, op, op, (is_signed ? "/* nothing */" : "u"), bits, size, (is_signed ? "" : "U"), bits, size, op); +} + +static void print_generic_dbl_op(const char *op, int is_signed, int bits, int size) +{ + printf( + "#ifndef V%sINT%dx%d_%s_DEFINED\n" + "VEC_GENERIC_DBL_%s(%s, %d, %d, %d)\n" + "# define V%sINT%dx%d_%s_DEFINED\n" + "#endif\n\n", + (is_signed ? "" : "U"), bits, size, op, op, (is_signed ? "/* nothing */" : "u"), bits, size, size / 2, (is_signed ? "" : "U"), bits, size, op); +} + +typedef void (*print_op_spec)(const char *op, int is_signed, int bits, int size); + +static inline void print_ops(int is_signed, int bits, int size, print_op_spec print_op) +{ + /* all supported operations here */ + static const char *ops[] = { + "SPLAT", + "LOAD_ALIGNED", + "LOAD", + "STORE_ALIGNED", + "STORE", + "ADD", + "SUB", + "MUL", + "DIV", + "AVG", + "AND", + "OR", + "XOR", + "NOT", + "CMPLT", + "CMPEQ", + "CMPGT", + "CMPLE", /* these two must be after CMPLT and CMPGT respectfully, */ + "CMPGE", /* because their definitions call those functions */ + "MIN", + "MAX", + "RSHIFT", + "LRSHIFT", + "LSHIFT", + NULL, + }; + int i; + + printf("\n\n/* v%sint%dx%d */\n\n", (is_signed ? "u" : ""), bits, size); + + for (i = 0; ops[i]; i++) + print_op(ops[i], is_signed, bits, size); +} + +int main(void) +{ + static struct { + int bits, size; + print_op_spec print_op; + } defs[] = { + /* -- 8-bit */ + {8, 2, print_generic_op}, + {8, 4, print_generic_dbl_op}, + {8, 8, print_generic_dbl_op}, + {8, 16, print_generic_dbl_op}, + {8, 32, print_generic_dbl_op}, + {8, 64, print_generic_dbl_op}, + + /* -- 16-bit */ + {16, 2, print_generic_op}, + {16, 4, print_generic_dbl_op}, + {16, 8, print_generic_dbl_op}, + {16, 16, print_generic_dbl_op}, + {16, 32, print_generic_dbl_op}, + + /* -- 32-bit */ + {32, 2, print_generic_op}, + {32, 4, print_generic_dbl_op}, + {32, 8, print_generic_dbl_op}, + {32, 16, print_generic_dbl_op}, + + /* -- 64-bit */ + {64, 2, print_generic_op}, + {64, 4, print_generic_dbl_op}, + {64, 8, print_generic_dbl_op}, + }; + int i; + + puts(header); + + for (i = 0; i < ARRAY_SIZE(defs); i++) { + print_ops(1, defs[i].bits, defs[i].size, defs[i].print_op); + print_ops(0, defs[i].bits, defs[i].size, defs[i].print_op); + } + + puts(footer); +}
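Built as the comment at the top of the file says (gcc -o gengeneric gengeneric.c), the generator writes the header to stdout, so redirecting its output into include/vec/impl/generic.h regenerates the file. Every operation is emitted behind an #ifndef guard so a definition supplied elsewhere is not overridden; for the signed 8x2 add, the emitted block and (roughly) what the macros above expand it into look like this:

/* emitted by gengeneric: */
#ifndef VINT8x2_ADD_DEFINED
VEC_GENERIC_ADD(/* nothing */, 8, 2)
# define VINT8x2_ADD_DEFINED
#endif

/* ...which VEC_GENERIC_ADD / VEC_GENERIC_BUILTIN_OPERATION / VEC_GENERIC_OPERATION
 * boil down to a plain lane loop: */
VEC_FUNC_IMPL vint8x2 vint8x2_add(vint8x2 vec1, vint8x2 vec2)
{
    do {
        int i;

        for (i = 0; i < 2; i++)
            vec1.generic[i] = (vec1.generic[i] + vec2.generic[i]);

        return vec1;
    } while (0);
}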
--- a/vec.pc.in Fri Apr 25 17:40:55 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -prefix=@CMAKE_INSTALL_PREFIX@ -exec_prefix=@CMAKE_INSTALL_PREFIX@ -libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ -includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ - -Name: @PROJECT_NAME@ -Description: @PROJECT_DESCRIPTION@ -Version: @PROJECT_VERSION@ - -Requires: -Libs: -L${libdir} -lvec -Cflags: -I${includedir}