# HG changeset patch
# User Paper
# Date 1732122135 18000
# Node ID 41dd962abdd1ad9a323fab3c10acdf8ffd414664
# Parent 9da2aba90c8766b9c7ced7700f53375f2f16fb7e
*: allow compiling vec in a C++ translation unit

this is stupid, but whatever

diff -r 9da2aba90c87 -r 41dd962abdd1 CMakeLists.txt
--- a/CMakeLists.txt	Wed Nov 20 04:16:56 2024 -0500
+++ b/CMakeLists.txt	Wed Nov 20 12:02:15 2024 -0500
@@ -1,9 +1,16 @@
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.23)
 
 project(vec VERSION 2.0.0 DESCRIPTION "a tiny C99 SIMD vector library")
 
 add_library(vec SHARED src/vec.c)
 
+target_sources(vec PUBLIC
+	$
+	$
+	$
+	$
+)
+
 include(CheckCCompilerFlag)
 
 if(MSVC)
@@ -35,18 +42,121 @@
 endif()
 endif()
+#########################################################################
+# integer types
-set_target_properties(vec PROPERTIES PUBLIC_HEADER include/vec/vec.h C_STANDARD 99)
+include(CheckTypeSize)
+
+check_type_size("int16_t" INT16_T_SIZE LANGUAGE C)
+check_type_size("uint16_t" UINT16_T_SIZE LANGUAGE C)
+check_type_size("u_int16_t" U_INT16_T_SIZE LANGUAGE C)
+check_type_size("int32_t" INT32_T_SIZE LANGUAGE C)
+check_type_size("uint32_t" UINT32_T_SIZE LANGUAGE C)
+check_type_size("u_int32_t" U_INT32_T_SIZE LANGUAGE C)
+check_type_size("int64_t" INT64_T_SIZE LANGUAGE C)
+check_type_size("uint64_t" UINT64_T_SIZE LANGUAGE C)
+check_type_size("u_int64_t" U_INT64_T_SIZE LANGUAGE C)
+check_type_size("short" SHORT_SIZE LANGUAGE C)
+check_type_size("int" INT_SIZE LANGUAGE C)
+check_type_size("long" LONG_SIZE LANGUAGE C)
+check_type_size("long long" LONG_LONG_SIZE LANGUAGE C)
+check_type_size("uintptr_t" UINTPTR_T_SIZE LANGUAGE C)
+
+if(INT16_T_SIZE EQUAL 2)
+	set(SIZE16 "int16_t")
+elseif(SHORT_SIZE EQUAL 2)
+	set(SIZE16 "short")
+elseif(INT_SIZE EQUAL 2)
+	set(SIZE16 "int")
+endif()
+
+if(UINT16_T_SIZE EQUAL 2)
+	set(USIZE16 "uint16_t")
+elseif(U_INT16_T_SIZE EQUAL 2)
+	set(USIZE16 "u_int16_t")
+elseif(SHORT_SIZE EQUAL 2)
+	set(USIZE16 "unsigned short")
+elseif(INT_SIZE EQUAL 2)
+	set(USIZE16 "unsigned int")
+endif()
+
+if(INT32_T_SIZE EQUAL 4)
+	set(SIZE32 "int32_t")
+elseif(SHORT_SIZE EQUAL 4)
+	set(SIZE32 "short")
+elseif(INT_SIZE EQUAL 4)
+	set(SIZE32 "int")
+elseif(LONG_SIZE EQUAL 4)
+	set(SIZE32 "long")
+endif()
-target_include_directories(vec PRIVATE include)
+if(UINT32_T_SIZE EQUAL 4)
+	set(USIZE32 "uint32_t")
+elseif(U_INT32_T_SIZE EQUAL 4)
+	set(USIZE32 "u_int32_t")
+elseif(SHORT_SIZE EQUAL 4)
+	set(USIZE32 "unsigned short")
+elseif(INT_SIZE EQUAL 4)
+	set(USIZE32 "unsigned int")
+elseif(LONG_SIZE EQUAL 4)
+	set(USIZE32 "unsigned long")
+endif()
+
+if(INT64_T_SIZE EQUAL 8)
+	set(SIZE64 "int64_t")
+elseif(SHORT_SIZE EQUAL 8)
+	set(SIZE64 "short")
+elseif(INT_SIZE EQUAL 8)
+	set(SIZE64 "int")
+elseif(LONG_SIZE EQUAL 8)
+	set(SIZE64 "long")
+elseif(LONG_LONG_SIZE EQUAL 8)
+	set(SIZE64 "long long")
+endif()
+
+if(UINT64_T_SIZE EQUAL 8)
+	set(USIZE64 "uint64_t")
+elseif(U_INT64_T_SIZE EQUAL 8)
+	set(USIZE64 "u_int64_t")
+elseif(SHORT_SIZE EQUAL 8)
+	set(USIZE64 "unsigned short")
+elseif(INT_SIZE EQUAL 8)
+	set(USIZE64 "unsigned int")
+elseif(LONG_SIZE EQUAL 8)
+	set(USIZE64 "unsigned long")
+elseif(LONG_LONG_SIZE EQUAL 8)
+	set(USIZE64 "unsigned long long")
+endif()
+
+if(CMAKE_SIZEOF_VOID_P EQUAL UINTPTR_T_SIZE)
+	set(USIZEPTR "uintptr_t")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 1)
+	set(USIZEPTR "unsigned char")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 2)
+	set(USIZEPTR "${USIZE16}")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+	set(USIZEPTR "${USIZE32}")
+elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
+	set(USIZEPTR "${USIZE64}")
+endif()
+
+configure_file(include/vec/impl/integer.h.in include/vec/impl/integer.h @ONLY)
+
+target_compile_definitions(vec PRIVATE "VEC_HAVE_IMPL_INTEGER_H")
+
+#########################################################################
+
+target_compile_features(vec PRIVATE $,c_std_11,c_std_99>)
+target_include_directories(vec PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include;${CMAKE_CURRENT_BINARY_DIR}/include/vec")
 # Installing
 include(GNUInstallDirs)
-install(TARGETS vec
-	LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-	PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(TARGETS vec LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/include/vec/vec.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/vec")
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/include/vec/impl/integer.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/vec/impl")
 # pkg-config
 configure_file(vec.pc.in vec.pc @ONLY)

diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/align.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/vec/impl/align.h	Wed Nov 20 12:02:15 2024 -0500
@@ -0,0 +1,267 @@
+/**
+ * vec - a tiny SIMD vector library in C99
+ *
+ * Copyright (c) 2024 Paper
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+**/
+
+#ifndef VEC_IMPL_ALIGN_H_
+#define VEC_IMPL_ALIGN_H_
+
+// Array alignment macros
+
+#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L)
+# define VEC_ALIGNAS(x) alignas(x)
+#elif (__STDC_VERSION__ >= 201112L)
+# define VEC_ALIGNAS(x) _Alignas(x)
+#elif VEC_GNUC_HAS_ATTRIBUTE(aligned, 2, 7, 0)
+# define VEC_ALIGNAS(x) __attribute__((__aligned__(x)))
+#endif
+
+/* the alignment must be specified in bytes and must be a multiple of the
+ * type size.
it is always assumed that the type will be on a boundary of + * its size, which may or may not be true */ +#ifdef VEC_ALIGNAS +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_ALIGNAS(align) type var[length] +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(var)) +#else +// use unions to get an aligned offset without triggering strict aliasing +# define VEC_ALIGNED_ARRAY(type, var, length, align) \ + VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \ + union vec_aligned_union_##var##_ { \ + type arr[length]; \ + unsigned char bytes[sizeof(type) * length]; \ + }; \ + unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \ + type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \ + VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") +# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ + (sizeof(vec_unaligned_##var##_) - (align - 1)) +#endif + +#define VEC_ALIGNED_ARRAY_LENGTH(var) \ + (VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var)) + +////////////////////////////////////////////////////////////////////////////////////// +// predefined variants for each vector type + +////////////////////////////////////////////////////////////////////////////////////// +// 16-bit + +#define VINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 2, VINT8x2_ALIGNMENT) +#define VINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x2_ALIGNMENT) +#define VINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x2_ALIGNMENT) +#define VINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x2_ALIGNMENT == 0) + +#define VUINT8x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 2, VUINT8x2_ALIGNMENT) +#define VUINT8x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x2_ALIGNMENT) +#define VUINT8x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x2_ALIGNMENT) +#define VUINT8x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 32-bit + +#define VINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 4, VINT8x4_ALIGNMENT) +#define VINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x4_ALIGNMENT) +#define VINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x4_ALIGNMENT) +#define VINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x4_ALIGNMENT == 0) + +#define VINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 2, VINT16x2_ALIGNMENT) +#define VINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x2_ALIGNMENT) +#define VINT16x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x2_ALIGNMENT) +#define VINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x2_ALIGNMENT == 0) + +#define VUINT8x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 4, VUINT8x4_ALIGNMENT) +#define VUINT8x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x4_ALIGNMENT) +#define VUINT8x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x4_ALIGNMENT) +#define VUINT8x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x4_ALIGNMENT == 0) + +#define VUINT16x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 2, VUINT16x2_ALIGNMENT) +#define VUINT16x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x2_ALIGNMENT) +#define VUINT16x2_ALIGNED_ARRAY_LENGTH(var) 
VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x2_ALIGNMENT) +#define VUINT16x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 64-bit + +#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 8, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT) +#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0) + +#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 4, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT) +#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0) + +#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 2, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT) +#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0) + +#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 8, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0) + +#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 4, VUINT16x4_ALIGNMENT) +#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0) + +#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 2, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 128-bit + +#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 16, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) +#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) + +#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 8, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) +#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) + +#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 4, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) +#define 
VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) + +#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 2, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) +#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) + +#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 16, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) + +#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 8, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) + +#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 4, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) + +#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 2, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 256-bit + +#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 32, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) +#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) + +#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 16, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) +#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) + +#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 8, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) +#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) + +#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 4, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) +#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) + +#define VUINT8x32_ALIGNED_ARRAY(var) 
VEC_ALIGNED_ARRAY(vec_uint8, var, 32, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) +#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) + +#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 16, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) +#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) + +#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 8, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) +#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) + +#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 4, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) +#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// +// 512-bit + +#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int8, var, 64, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) +#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) + +#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int16, var, 32, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT) +#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) + +#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int32, var, 16, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) +#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) + +#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_int64, var, 8, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) +#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) + +#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint8, var, 64, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) +#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) + +#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint16, var, 32, VUINT16x32_ALIGNMENT) +#define 
VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT) +#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) + +#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint32, var, 16, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) +#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) + +#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(vec_uint64, var, 8, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) +#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) + +////////////////////////////////////////////////////////////////////////////////////// + +#endif /* VEC_IMPL_ALIGN_H_ */ diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/cpu.h --- a/include/vec/impl/cpu.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/cpu.h Wed Nov 20 12:02:15 2024 -0500 @@ -280,7 +280,7 @@ # endif }; int hasVectorUnit = 0; - size_t length = sizeof(hasVectorUnit); + vec_uintsize length = sizeof(hasVectorUnit); int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); if (!error) altivec = (hasVectorUnit != 0); @@ -289,14 +289,14 @@ elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)); altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC; #elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) - void (*handler)(int sig); - handler = signal(SIGILL, vec_CPU_illegal_instruction); - if (!setjmp(vec_jmpbuf)) { - asm volatile("mtspr 256, %0\n\t" - "vand %%v0, %%v0, %%v0" ::"r"(-1)); - altivec = 1; - } - signal(SIGILL, handler); + void (*handler)(int sig); + handler = signal(SIGILL, vec_CPU_illegal_instruction); + if (!setjmp(vec_jmpbuf)) { + asm volatile("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" ::"r"(-1)); + altivec = 1; + } + signal(SIGILL, handler); #endif return altivec; } @@ -364,7 +364,7 @@ #define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF) -static uint32_t vec_CPU_features = VEC_CPU_FEATURES_RESET; +static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET; static void vec_get_CPU_features(void) { @@ -374,8 +374,8 @@ vec_CPU_features |= VEC_CPU_HAS_ALTIVEC; if (vec_CPU_have_ALTIVEC_VSX()) vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX; - if (vec_CPU_have_MMX()) - vec_CPU_features |= VEC_CPU_HAS_MMX; + if (vec_CPU_have_MMX()) + vec_CPU_features |= VEC_CPU_HAS_MMX; if (vec_CPU_have_SSE()) vec_CPU_features |= VEC_CPU_HAS_SSE; if (vec_CPU_have_SSE2()) diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/fallback.h --- a/include/vec/impl/fallback.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/fallback.h Wed Nov 20 12:02:15 2024 -0500 @@ -25,6 +25,8 @@ #ifndef VEC_IMPL_FALLBACK_H_ #define VEC_IMPL_FALLBACK_H_ +#include + // Fallback implementations - this is what an implementation should use if it // doesn't support a specific function. 
Note that the load_aligned and // store_aligned functions are not implemented here - this is on purpose; @@ -60,25 +62,25 @@ } while (0) #define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(sign##int##bits##_t x) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(vec_##sign##int##bits x) \ { \ V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ for (int i = 0; i < size; i++) arr[i] = x; \ return v##sign##int##bits##x##size##_load_aligned(arr); \ } \ \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const vec_##sign##int##bits in[size]) \ { \ V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ - memcpy(arr, in, sizeof(sign##int##bits##_t) * size); \ + memcpy(arr, in, sizeof(vec_##sign##int##bits) * size); \ return v##sign##int##bits##x##size##_load_aligned(arr); \ } \ \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ v##sign##int##bits##x##size##_store_aligned(vec, arr); \ - memcpy(out, arr, sizeof(sign##int##bits##_t) * size); \ + memcpy(out, arr, sizeof(vec_##sign##int##bits) * size); \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ @@ -123,7 +125,7 @@ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \ { \ - return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((sign##int##bits##_t)UINT##bits##_MAX)); \ + return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((vec_##sign##int##bits)UINT##bits##_MAX)); \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ @@ -170,6 +172,13 @@ VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( , , bits, size) \ VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(u, U, bits, size) +// 16-bit +VEC_DEFINE_FALLBACK_OPERATIONS(8, 2) + +// 32-bit +VEC_DEFINE_FALLBACK_OPERATIONS(8, 4) +VEC_DEFINE_FALLBACK_OPERATIONS(16, 2) + // 64-bit VEC_DEFINE_FALLBACK_OPERATIONS(8, 8) VEC_DEFINE_FALLBACK_OPERATIONS(16, 4) diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/generic.h --- a/include/vec/impl/generic.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/generic.h Wed Nov 20 12:02:15 2024 -0500 @@ -27,7 +27,6 @@ #ifndef VEC_IMPL_GENERIC_H_ #define VEC_IMPL_GENERIC_H_ -#include #include // ----------------------------------------------------------------- @@ -35,29 +34,32 @@ // TODO implement these so we don't waste stack space by doing the // fallbacks #define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ - memcpy(vec.generic, in, sizeof(sign##int##bits##_t) * size); \ + memcpy(vec.generic, in, 
sizeof(vec_##sign##int##bits) * size); \ return vec; \ } \ \ - static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ - memcpy(out, vec.generic, sizeof(sign##int##bits##_t) * size); \ + memcpy(out, vec.generic, sizeof(vec_##sign##int##bits) * size); \ } \ \ static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ - .load_aligned = v##sign##int##bits##x##size##_generic_load_aligned, \ - .store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_generic_load_aligned, \ + v##sign##int##bits##x##size##_generic_load_aligned, \ + v##sign##int##bits##x##size##_generic_store_aligned, \ + v##sign##int##bits##x##size##_generic_store_aligned, \ }; #define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \ VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) -VEC_GENERIC_DEFINE_OPERATIONS(8, 8) -VEC_GENERIC_DEFINE_OPERATIONS(16, 4) +VEC_GENERIC_DEFINE_OPERATIONS(8, 2) +VEC_GENERIC_DEFINE_OPERATIONS(16, 2) VEC_GENERIC_DEFINE_OPERATIONS(32, 2) VEC_GENERIC_DEFINE_OPERATIONS(64, 2) @@ -68,7 +70,7 @@ // now we can just keep doubling the same implementation #define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ vec.generic[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \ @@ -76,21 +78,31 @@ return vec; \ } \ \ - static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[0], out); \ v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[1], out + halfsize); \ } \ \ static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ - .load_aligned = v##sign##int##bits##x##size##_generic_load_aligned, \ - .store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_generic_load_aligned, \ + v##sign##int##bits##x##size##_generic_load_aligned, \ + v##sign##int##bits##x##size##_generic_store_aligned, \ + v##sign##int##bits##x##size##_generic_store_aligned, \ }; #define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \ VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size, halfsize) \ VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize) +// 32-bit +VEC_GENERIC_DEFINE_OPERATIONS(8, 4, 2) + +// 64-bit +VEC_GENERIC_DEFINE_OPERATIONS(8, 8, 4) +VEC_GENERIC_DEFINE_OPERATIONS(16, 4, 2) + // 128-bit VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8) VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4) diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/integer.h.in --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/integer.h.in Wed Nov 20 12:02:15 2024 -0500 @@ -0,0 +1,58 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * 
Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_INTEGER_H_ +#define VEC_IMPL_INTEGER_H_ + +#cmakedefine HAVE_SYS_TYPES_H +#cmakedefine HAVE_STDDEF_H +#cmakedefine HAVE_STDINT_H + +#ifdef HAVE_SYS_TYPES_H +# include +#endif +#ifdef HAVE_STDDEF_H +# include +#endif +#ifdef HAVE_STDINT_H +# include +#endif + +typedef signed char vec_int8; +typedef @SIZE16@ vec_int16; +typedef @SIZE32@ vec_int32; +typedef @SIZE64@ vec_int64; + +typedef unsigned char vec_uint8; +typedef @USIZE16@ vec_uint16; +typedef @USIZE32@ vec_uint32; +typedef @USIZE64@ vec_uint64; + +/* this is only used for bitshifting right now */ +typedef vec_int64 vec_intmax; +typedef vec_uint64 vec_uintmax; + +typedef @USIZEPTR@ vec_uintptr; + +#endif /* VEC_IMPL_INTEGER_H_ */ \ No newline at end of file diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/ppc/altivec.h --- a/include/vec/impl/ppc/altivec.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/ppc/altivec.h Wed Nov 20 12:02:15 2024 -0500 @@ -27,7 +27,6 @@ #ifndef VEC_IMPL_PPC_ALTIVEC_H_ #define VEC_IMPL_PPC_ALTIVEC_H_ -#include #include #include @@ -39,26 +38,30 @@ # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_mul(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_mul(vec1.altivec, vec2.altivec); \ + return vec; \ } # define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \ - .mul = v##sign##int##bits##x##size##_altivec_mul, + v##sign##int##bits##x##size##_altivec_mul #else # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) -# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) +# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) NULL #endif #ifdef vec_splats # define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(sign##int##bits##_t x) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(vec_##sign##int##bits x) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_splats(x) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_splats(x); \ + return vec; \ } # define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \ - .splat = v##sign##int##bits##x##size##_altivec_splat, + v##sign##int##bits##x##size##_altivec_splat #else # 
define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) -# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) +# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) NULL #endif #define VEC_ALTIVEC_uRSHIFT vec_sr @@ -67,93 +70,118 @@ #define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_sr(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_sr(vec1.altivec, vec2.altivec); \ + return vec; \ } #define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \ - .lrshift = v##sign##int##bits##x##size##_altivec_lrshift, + v##sign##int##bits##x##size##_altivec_lrshift #define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size) -#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) +#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) NULL /* Since altivec conveniently made their API super user friendly, we can just use * one giant macro to define literally everything */ #define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_ld(0, in) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_ld(0, in); \ + return vec; \ } \ \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)); \ + return vec; \ } \ \ - static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ vec_st(vec.altivec, 0, out); \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_add(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_add(vec1.altivec, vec2.altivec); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_sub(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_sub(vec1.altivec, vec2.altivec); \ + return vec; \ } \ \ VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_sl(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_sl(vec1.altivec, vec2.altivec); \ + return vec; \ } \ 
\ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec); \ + return vec; \ } \ \ VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_avg(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_avg(vec1.altivec, vec2.altivec); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_and(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_and(vec1.altivec, vec2.altivec); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_or(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_or(vec1.altivec, vec2.altivec); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .altivec = vec_xor(vec1.altivec, vec2.altivec) }; \ + v##sign##int##bits##x##size vec; \ + vec.altivec = vec_xor(vec1.altivec, vec2.altivec); \ + return vec; \ } \ \ VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ \ static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \ - .load_aligned = v##sign##int##bits##x##size##_altivec_load_aligned, \ - .load = v##sign##int##bits##x##size##_altivec_load, \ - .store_aligned = v##sign##int##bits##x##size##_altivec_store_aligned, \ - .add = v##sign##int##bits##x##size##_altivec_add, \ - .sub = v##sign##int##bits##x##size##_altivec_sub, \ - VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \ - .lshift = v##sign##int##bits##x##size##_altivec_lshift, \ - .rshift = v##sign##int##bits##x##size##_altivec_rshift, \ - VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size) \ - .avg = v##sign##int##bits##x##size##_altivec_avg, \ - .and = v##sign##int##bits##x##size##_altivec_and, \ - .or = v##sign##int##bits##x##size##_altivec_or, \ - .xor = v##sign##int##bits##x##size##_altivec_xor, \ - VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \ + VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size), \ + v##sign##int##bits##x##size##_altivec_load_aligned, \ + v##sign##int##bits##x##size##_altivec_load, \ + v##sign##int##bits##x##size##_altivec_store_aligned, \ + /* .store = */ NULL, \ + v##sign##int##bits##x##size##_altivec_add, \ + v##sign##int##bits##x##size##_altivec_sub, \ + VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size), \ + /* .div = */ NULL, \ + v##sign##int##bits##x##size##_altivec_avg, \ + v##sign##int##bits##x##size##_altivec_and, \ + v##sign##int##bits##x##size##_altivec_or, \ + v##sign##int##bits##x##size##_altivec_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_altivec_lshift, \ + 
v##sign##int##bits##x##size##_altivec_rshift, \ + VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size), \ }; #define VEC_DEFINE_OPERATIONS(bits, size) \ diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/x86/avx2.h --- a/include/vec/impl/x86/avx2.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/x86/avx2.h Wed Nov 20 12:02:15 2024 -0500 @@ -32,10 +32,12 @@ __m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1.avx2, 8), _mm256_srli_epi16(vec2.avx2, 8)); \ \ /* repack */ \ - return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \ + v##sign##int8x32 vec; \ + vec.avx2 = _mm256_or_si256( \ _mm256_slli_epi16(dst_odd, 8), \ _mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \ - )}; \ + ); \ + return vec; \ } while (0) #define VEC_AVX2_OPERATION_8x32_32x8(op, sign) \ @@ -47,7 +49,8 @@ __m256i dst_4 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 24), _mm256_srli_epi32(vec2.avx2, 24)); \ \ /* repack */ \ - return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \ + v##sign##int8x32 vec; \ + vec.avx2 = _mm256_or_si256( \ _mm256_or_si256( \ _mm256_slli_epi32(dst_4, 8), \ _mm256_srli_epi32(_mm256_slli_epi32(dst_3, 8), 8) \ @@ -56,7 +59,8 @@ _mm256_slli_epi32(_mm256_slli_epi32(dst_2, 8), 16), \ _mm256_srli_epi32(_mm256_slli_epi32(dst_1, 8), 24) \ ) \ - )}; \ + ); \ + return vec; \ } while (0) #define VEC_AVX2_OPERATION_16x16(op, sign) \ @@ -66,10 +70,12 @@ __m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \ \ /* repack */ \ - return (v##sign##int16x16){ .avx2 = _mm256_or_si256( \ + v##sign##int16x16 vec; \ + vec.avx2 = _mm256_or_si256( \ _mm256_slli_epi32(dst_odd, 16), \ _mm256_srli_epi32(_mm256_slli_epi16(dst_even, 16), 16) \ - )}; \ + ); \ + return vec; \ } while (0) // shifting @@ -82,12 +88,16 @@ #define VEC_AVX2_LSHIFT_32x8(sign) \ do { \ - return (v##sign##int32x8){ .avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2) }; \ + v##sign##int32x8 vec; \ + vec.avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2); \ + return vec; \ } while (0) #define VEC_AVX2_LSHIFT_64x4(sign) \ do { \ - return (v##sign##int64x4){ .avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2) }; \ + v##sign##int64x4 vec; \ + vec.avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2); \ + return vec; \ } while (0) #define VEC_AVX2_RSHIFT_8x32(sign, aORl) \ @@ -98,7 +108,9 @@ #define VEC_AVX2_RSHIFT_32x8(sign, aORl) \ do { \ - return (v##sign##int32x8){ .avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2) }; \ + v##sign##int32x8 vec; \ + vec.avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2); \ + return vec; \ } while (0) #define VEC_AVX2_aRSHIFT_64x4(sign) \ @@ -108,7 +120,9 @@ #define VEC_AVX2_lRSHIFT_64x4(sign) \ do { \ - return (v##sign##int64x4){ .avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2) }; \ + v##sign##int64x4 vec; \ + vec.avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2); \ + return vec; \ } while (0) #define VEC_AVX2_RSHIFT_64x4(sign, aORl) \ @@ -121,12 +135,16 @@ #define VEC_AVX2_MUL_16x16(sign) \ do { \ - return (v##sign##int16x16){ .avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2) }; \ + v##sign##int16x16 vec; \ + vec.avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2); \ + return vec; \ } while (0) #define VEC_AVX2_MUL_32x8(sign) \ do { \ - return (v##sign##int32x8) { .avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2) }; \ + v##sign##int32x8 vec; \ + vec.avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2); \ + return vec; \ } while (0) #define VEC_AVX2_MUL_64x4(sign) \ @@ -138,40 +156,51 @@ __m256i ad = _mm256_mul_epu32(vec1.avx2, d); \ __m256i hi = 
_mm256_add_epi64(bc, ad); \ hi = _mm256_slli_epi64(hi, 32); \ - return (v##sign##int64x4) { .avx2 = _mm256_add_epi64(hi, ac) }; \ + \ + v##sign##int64x4 vec; \ + vec.avx2 = _mm256_add_epi64(hi, ac); \ + return vec; \ } while (0) // operations #define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { .avx2 = _mm256_load_si256((const __m256i *)in) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_load_si256((const __m256i *)in); \ + return vec; \ } \ \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { .avx2 = _mm256_loadu_si256((const __m256i *)in) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_loadu_si256((const __m256i *)in); \ + return vec; \ } \ \ - static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm256_store_si256((__m256i *)out, vec.avx2); \ } \ \ - static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm256_storeu_si256((__m256i *)out, vec.avx2); \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ @@ -181,17 +210,23 @@ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, 
v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ @@ -210,19 +245,23 @@ } \ \ static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \ - .load_aligned = v##sign##int##bits##x##size##_avx2_load_aligned, \ - .load = v##sign##int##bits##x##size##_avx2_load, \ - .store_aligned = v##sign##int##bits##x##size##_avx2_store_aligned, \ - .store = v##sign##int##bits##x##size##_avx2_store, \ - .add = v##sign##int##bits##x##size##_avx2_add, \ - .sub = v##sign##int##bits##x##size##_avx2_sub, \ - .mul = v##sign##int##bits##x##size##_avx2_mul, \ - .and = v##sign##int##bits##x##size##_avx2_and, \ - .or = v##sign##int##bits##x##size##_avx2_or, \ - .xor = v##sign##int##bits##x##size##_avx2_xor, \ - .lshift = v##sign##int##bits##x##size##_avx2_lshift, \ - .rshift = v##sign##int##bits##x##size##_avx2_rshift, \ - .lrshift = v##sign##int##bits##x##size##_avx2_lrshift, \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_avx2_load_aligned, \ + v##sign##int##bits##x##size##_avx2_load, \ + v##sign##int##bits##x##size##_avx2_store_aligned, \ + v##sign##int##bits##x##size##_avx2_store, \ + v##sign##int##bits##x##size##_avx2_add, \ + v##sign##int##bits##x##size##_avx2_sub, \ + v##sign##int##bits##x##size##_avx2_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_avx2_and, \ + v##sign##int##bits##x##size##_avx2_or, \ + v##sign##int##bits##x##size##_avx2_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_avx2_lshift, \ + v##sign##int##bits##x##size##_avx2_rshift, \ + v##sign##int##bits##x##size##_avx2_lrshift, \ }; #define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \ diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/x86/avx512f.h --- a/include/vec/impl/x86/avx512f.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/x86/avx512f.h Wed Nov 20 12:02:15 2024 -0500 @@ -34,7 +34,8 @@ __m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 24), _mm512_srli_epi32(vec2.avx512f, 24)); \ \ /* repack */ \ - return (v##sign##int8x64){ .avx512f = _mm512_or_si512( \ + v##sign##int8x64 vec; \ + vec.avx512f = _mm512_or_si512( \ _mm512_or_si512( \ _mm512_slli_epi32(dst_4, 8), \ _mm512_srli_epi32(_mm512_slli_epi32(dst_3, 8), 8) \ @@ -43,7 +44,8 @@ _mm512_slli_epi32(_mm512_slli_epi32(dst_2, 8), 16), \ _mm512_srli_epi32(_mm512_slli_epi32(dst_1, 8), 24) \ ) \ - )}; \ + ); \ + return vec; \ } while (0) #define VEC_AVX512F_OPERATION_16x32(op, sign) \ @@ -53,10 +55,12 @@ __m512i dst_odd = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \ \ /* repack */ \ - return (v##sign##int16x32){ .avx512f = _mm512_or_si512( \ + v##sign##int16x32 vec; \ + vec.avx512f = _mm512_or_si512( \ _mm512_slli_epi32(dst_odd, 16), \ _mm512_srli_epi32(_mm512_slli_epi32(dst_even, 16), 16) \ - )}; \ + ); \ + return vec; \ } while (0) #define VEC_AVX512F_ADD_8x64(sign) \ @@ -67,12 +71,16 @@ #define VEC_AVX512F_ADD_32x16(sign) \ do { \ - return (v##sign##int32x16) { .avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_ADD_64x8(sign) \ do { \ - return 
(v##sign##int64x8) { .avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_SUB_8x64(sign) \ @@ -83,12 +91,16 @@ #define VEC_AVX512F_SUB_32x16(sign) \ do { \ - return (v##sign##int32x16) { .avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_SUB_64x8(sign) \ do { \ - return (v##sign##int64x8) { .avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_MUL_8x64(sign) \ @@ -99,7 +111,9 @@ #define VEC_AVX512F_MUL_32x16(sign) \ do { \ - return (v##sign##int32x16) { .avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_MUL_64x8(sign) \ @@ -111,7 +125,10 @@ __m512i ad = _mm512_mul_epu32(vec1.avx512f, d); \ __m512i hi = _mm512_add_epi64(bc, ad); \ hi = _mm512_slli_epi64(hi, 32); \ - return (v##sign##int64x8) { .avx512f = _mm512_add_epi64(hi, ac) }; \ + \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_add_epi64(hi, ac); \ + return vec; \ } while (0) #define VEC_AVX512F_LSHIFT_8x64(sign) \ @@ -122,12 +139,16 @@ #define VEC_AVX512F_LSHIFT_32x16(sign) \ do { \ - return (v##sign##int32x16){ .avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_LSHIFT_64x8(sign) \ do { \ - return (v##sign##int64x8){ .avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_RSHIFT_8x64(sign, aORl) \ @@ -138,31 +159,39 @@ #define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \ do { \ - return (v##sign##int32x16){ .avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int32x16 vec; \ + vec.avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \ do { \ - return (v##sign##int64x8){ .avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int64x8 vec; \ + vec.avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f); \ + return vec; \ } while (0) #define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { .avx512f = _mm512_load_si512((const __m512i *)in) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_load_si512((const __m512i *)in); \ + return vec; \ } \ \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { .avx512f = _mm512_loadu_si512((const __m512i *)in) }; \ + v##sign##int##bits##x##size vec; \ + 
vec.avx512f = _mm512_loadu_si512((const __m512i *)in); \ + return vec; \ } \ \ - static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm512_store_si512((__m512i *)out, vec.avx512f); \ } \ \ - static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm512_storeu_si512((__m512i *)out, vec.avx512f); \ } \ @@ -184,17 +213,23 @@ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f) }; \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ @@ -213,16 +248,23 @@ } \ \ static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \ - .load_aligned = v##sign##int##bits##x##size##_avx512f_load_aligned, \ - .load = v##sign##int##bits##x##size##_avx512f_load, \ - .store_aligned = v##sign##int##bits##x##size##_avx512f_store_aligned, \ - .store = v##sign##int##bits##x##size##_avx512f_store, \ - .add = v##sign##int##bits##x##size##_avx512f_add, \ - .sub = v##sign##int##bits##x##size##_avx512f_sub, \ - .mul = v##sign##int##bits##x##size##_avx512f_mul, \ - .and = v##sign##int##bits##x##size##_avx512f_and, \ - .or = v##sign##int##bits##x##size##_avx512f_or, \ - .xor = v##sign##int##bits##x##size##_avx512f_xor, \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_avx512f_load_aligned, \ + v##sign##int##bits##x##size##_avx512f_load, \ + v##sign##int##bits##x##size##_avx512f_store_aligned, \ + v##sign##int##bits##x##size##_avx512f_store, \ + v##sign##int##bits##x##size##_avx512f_add, \ + v##sign##int##bits##x##size##_avx512f_sub, \ + v##sign##int##bits##x##size##_avx512f_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_avx512f_and, \ + v##sign##int##bits##x##size##_avx512f_or, \ + v##sign##int##bits##x##size##_avx512f_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_avx512f_lshift, \ + v##sign##int##bits##x##size##_avx512f_rshift, \ + v##sign##int##bits##x##size##_avx512f_lrshift, \ }; #define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \ diff -r 9da2aba90c87 -r 41dd962abdd1 
include/vec/impl/x86/mmx.h --- a/include/vec/impl/x86/mmx.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/x86/mmx.h Wed Nov 20 12:02:15 2024 -0500 @@ -32,10 +32,12 @@ __m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1.mmx, 8), _mm_srli_pi16(vec2.mmx, 8)); \ \ /* repack */ \ - return (v##sign##int8x8){ .mmx = _mm_or_si64( \ + v##sign##int8x8 vec; \ + vec.mmx = _mm_or_si64( \ _mm_slli_pi16(dst_odd, 8), \ _mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \ - )}; \ + ); \ + return vec; \ } while (0) // shifting @@ -44,12 +46,16 @@ #define VEC_MMX_LSHIFT_16x4(sign) \ do { \ - return (v##sign##int16x4){ .mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx) }; \ + v##sign##int16x4 vec; \ + vec.mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx); \ + return vec; \ } while (0) #define VEC_MMX_LSHIFT_32x2(sign) \ do { \ - return (v##sign##int32x2){ .mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx) }; \ + v##sign##int32x2 vec; \ + vec.mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx); \ + return vec; \ } while (0) #define VEC_MMX_RSHIFT_8x8(sign, aORl) \ @@ -57,12 +63,16 @@ #define VEC_MMX_RSHIFT_16x4(sign, aORl) \ do { \ - return (v##sign##int16x4){ .mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx) }; \ + v##sign##int16x4 vec; \ + vec.mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx); \ + return vec; \ } while (0) #define VEC_MMX_RSHIFT_32x2(sign, aORl) \ do { \ - return (v##sign##int32x2){ .mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx) }; \ + v##sign##int32x2 vec; \ + vec.mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx); \ + return vec; \ } while (0) // shared between MMX variations @@ -72,7 +82,9 @@ #define VEC_MMX_MUL_16x4(sign) \ do { \ /* we have a real instruction for this */ \ - return (v##sign##int16x4){ .mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx) }; \ + v##sign##int16x4 vec; \ + vec.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \ + return vec; \ } while (0) #define VEC_MMX_MUL_32x2(sign) \ @@ -84,30 +96,37 @@ __m64 ad = _mm_mullo_pi16(vec1.mmx, d); \ __m64 hi = _mm_add_pi32(bc, ad); \ hi = _mm_slli_pi32(hi, 16); \ - return (v##sign##int32x2) { .mmx = _mm_add_pi32(hi, ac) }; /* return ac + hi; */ \ + \ + v##sign##int32x2 vec; \ + vec.mmx = _mm_add_pi32(hi, ac); \ + return vec; \ } while (0) #define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const vec_##sign##int##bits in[size]) \ { \ v##sign##int##bits##x##size vec; \ memcpy(&vec.mmx, in, sizeof(vec.mmx)); \ return vec; \ } \ \ - static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ memcpy(out, &vec.mmx, sizeof(vec.mmx)); \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx) }; \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx) }; \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx); 
\ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ @@ -117,17 +136,23 @@ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .mmx = _mm_and_si64(vec1.mmx, vec2.mmx) }; \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_and_si64(vec1.mmx, vec2.mmx); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .mmx = _mm_or_si64(vec1.mmx, vec2.mmx) }; \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_or_si64(vec1.mmx, vec2.mmx); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .mmx = _mm_xor_si64(vec1.mmx, vec2.mmx) }; \ + v##sign##int##bits##x##size vec; \ + vec.mmx = _mm_xor_si64(vec1.mmx, vec2.mmx); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ @@ -146,19 +171,23 @@ } \ \ static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \ - .load_aligned = v##sign##int##bits##x##size##_mmx_load_aligned, \ - .load = v##sign##int##bits##x##size##_mmx_load_aligned, \ - .store_aligned = v##sign##int##bits##x##size##_mmx_store_aligned, \ - .store = v##sign##int##bits##x##size##_mmx_store_aligned, \ - .add = v##sign##int##bits##x##size##_mmx_add, \ - .sub = v##sign##int##bits##x##size##_mmx_sub, \ - .mul = v##sign##int##bits##x##size##_mmx_mul, \ - .and = v##sign##int##bits##x##size##_mmx_and, \ - .or = v##sign##int##bits##x##size##_mmx_or, \ - .xor = v##sign##int##bits##x##size##_mmx_xor, \ - .lshift = v##sign##int##bits##x##size##_mmx_lshift, \ - .rshift = v##sign##int##bits##x##size##_mmx_rshift, \ - .lrshift = v##sign##int##bits##x##size##_mmx_lrshift, \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_mmx_load_aligned, \ + v##sign##int##bits##x##size##_mmx_load_aligned, \ + v##sign##int##bits##x##size##_mmx_store_aligned, \ + v##sign##int##bits##x##size##_mmx_store_aligned, \ + v##sign##int##bits##x##size##_mmx_add, \ + v##sign##int##bits##x##size##_mmx_sub, \ + v##sign##int##bits##x##size##_mmx_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_mmx_and, \ + v##sign##int##bits##x##size##_mmx_or, \ + v##sign##int##bits##x##size##_mmx_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_mmx_lshift, \ + v##sign##int##bits##x##size##_mmx_rshift, \ + v##sign##int##bits##x##size##_mmx_lrshift, \ }; #define VEC_MMX_DEFINE_OPERATIONS(bits, size) \ diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/x86/sse2.h --- a/include/vec/impl/x86/sse2.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/x86/sse2.h Wed Nov 20 12:02:15 2024 -0500 @@ -32,10 +32,12 @@ __m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \ \ /* repack */ \ - return (v##sign##int8x16){ .sse = _mm_or_si128( \ + v##sign##int8x16 vec; \ + vec.sse = _mm_or_si128( \ _mm_slli_epi16(dst_odd, 8), \ _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ - )}; \ + ); \ + return vec; \ } while (0) // shifting @@ -44,17 +46,23 @@ #define 
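The `_impl` function tables get the same C++ treatment: designated member initializers (`.load_aligned = ...`) become positional initializers, with the member names preserved as comments and explicit NULLs for slots that have no accelerated implementation (splat, div, avg, not). Designated initializers for aggregates only reached C++ in C++20, so positional initialization is the portable choice; the cost is that every field must now be listed in declaration order. A small illustration, using a hypothetical three-slot table rather than vec's real layout:

    #include <stddef.h>

    typedef struct {
        int (*load)(const int *in);
        int (*store)(int *out, int v);
        int (*add)(int a, int b);
    } ops_impl; /* illustrative; the real tables have many more slots */

    static int my_add(int a, int b) { return a + b; }

    /* C99 only:  static ops_impl impl = { .add = my_add }; */
    /* C and C++: positional, in declaration order, NULL for unused slots */
    static ops_impl impl = {
        /* .load  = */ NULL,
        /* .store = */ NULL,
        /* .add   = */ my_add,
    };

    int main(void) { return impl.add(2, 2) == 4 ? 0 : 1; }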
VEC_SSE2_LSHIFT_16x8(sign) \ do { \ - return (v##sign##int16x8){ .sse = _mm_sll_epi16(vec1.sse, vec2.sse) }; \ + v##sign##int16x8 vec; \ + vec.sse = _mm_sll_epi16(vec1.sse, vec2.sse); \ + return vec; \ } while (0) #define VEC_SSE2_LSHIFT_32x4(sign) \ do { \ - return (v##sign##int32x4){ .sse = _mm_sll_epi32(vec1.sse, vec2.sse) }; \ + v##sign##int32x4 vec; \ + vec.sse = _mm_sll_epi32(vec1.sse, vec2.sse); \ + return vec; \ } while (0) #define VEC_SSE2_LSHIFT_64x2(sign) \ do { \ - return (v##sign##int64x2){ .sse = _mm_sll_epi64(vec1.sse, vec2.sse) }; \ + v##sign##int64x2 vec; \ + vec.sse = _mm_sll_epi64(vec1.sse, vec2.sse); \ + return vec; \ } while (0) #define VEC_SSE2_RSHIFT_8x16(sign, aORl) \ @@ -62,12 +70,16 @@ #define VEC_SSE2_RSHIFT_16x8(sign, aORl) \ do { \ - return (v##sign##int16x8){ .sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse) }; \ + v##sign##int16x8 vec; \ + vec.sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse); \ + return vec; \ } while (0) #define VEC_SSE2_RSHIFT_32x4(sign, aORl) \ do { \ - return (v##sign##int32x4){ .sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse) }; \ + v##sign##int32x4 vec; \ + vec.sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse); \ + return vec; \ } while (0) #define VEC_SSE2_aRSHIFT_64x2(sign) \ @@ -77,7 +89,9 @@ #define VEC_SSE2_lRSHIFT_64x2(sign) \ do { \ - return (v##sign##int64x2){ .sse = _mm_srl_epi64(vec1.sse, vec2.sse) }; \ + v##sign##int64x2 vec; \ + vec.sse = _mm_srl_epi64(vec1.sse, vec2.sse); \ + return vec; \ } while (0) #define VEC_SSE2_RSHIFT_64x2(sign, aORl) \ @@ -90,7 +104,9 @@ #define VEC_SSE2_MUL_16x8(sign) \ do { \ /* we have a real instruction for this */ \ - return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \ + v##sign##int16x8 vec; \ + vec.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \ + return vec; \ } while (0) #define VEC_SSE2_MUL_32x4(sign) \ @@ -102,7 +118,10 @@ __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ - return (v##sign##int32x4) { .sse = _mm_unpacklo_epi64(prod01, prod23) }; /* (ab3,ab2,ab1,ab0) */ \ + \ + v##sign##int32x4 vec; \ + vec.sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \ + return vec; \ } while (0) #define VEC_SSE2_MUL_64x2(sign) \ @@ -114,38 +133,49 @@ __m128i ad = _mm_mul_epu32(vec1.sse, d); /* ad = (vec1 & UINT32_MAX) * d; */ \ __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \ hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \ - return (v##sign##int64x2) { .sse = _mm_add_epi64(hi, ac) }; /* return ac + hi; */ \ + \ + v##sign##int64x2 vec; \ + vec.sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \ + return vec; \ } while (0) #define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { .sse = _mm_load_si128((const __m128i *)in) }; \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_load_si128((const __m128i *)in); \ + return vec; \ } \ \ - static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const sign##int##bits##_t in[size]) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \ { \ - return (v##sign##int##bits##x##size) { 
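The VEC_SSE2_MUL_32x4 hunk above emulates a lane-wise 32-bit multiply on plain SSE2, which only offers `_mm_mul_epu32` on the even lanes: the odd lanes are shuffled down, the even and odd pairs are multiplied as 64-bit products, and the low 32 bits are interleaved back into place. Note that the removed line performed that final repack with `_mm_unpacklo_epi64(prod01, prod23)`, while the replacement calls `_mm_srl_epi64` with the same arguments, which shifts rather than interleaves and does not match the retained (ab3,ab2,ab1,ab0) comment, so the unpack intrinsic is presumably still what is intended. A scalar model of the even/odd strategy (names are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* scalar model of the SSE2 even/odd trick for a 4 x 32-bit multiply */
    static void mul32x4(const uint32_t *a, const uint32_t *b, uint32_t *out)
    {
        /* _mm_mul_epu32 multiplies lanes 0 and 2 ... */
        uint64_t p0 = (uint64_t)a[0] * b[0];
        uint64_t p2 = (uint64_t)a[2] * b[2];
        /* ... so lanes 1 and 3 are handled through the shuffled copies */
        uint64_t p1 = (uint64_t)a[1] * b[1];
        uint64_t p3 = (uint64_t)a[3] * b[3];

        /* keep the low 32 bits and interleave them back (the unpack steps) */
        out[0] = (uint32_t)p0;
        out[1] = (uint32_t)p1;
        out[2] = (uint32_t)p2;
        out[3] = (uint32_t)p3;
    }

    int main(void)
    {
        uint32_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, out[4];
        mul32x4(a, b, out);
        printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
        return 0;
    }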
.sse = _mm_loadu_si128((const __m128i *)in) }; \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_loadu_si128((const __m128i *)in); \ + return vec; \ } \ \ - static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm_store_si128((__m128i *)out, vec.sse); \ } \ \ - static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ _mm_storeu_si128((__m128i *)out, vec.sse); \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .sse = _mm_add_epi##bits(vec1.sse, vec2.sse) }; \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_add_epi##bits(vec1.sse, vec2.sse); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .sse = _mm_sub_epi##bits(vec1.sse, vec2.sse) }; \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_sub_epi##bits(vec1.sse, vec2.sse); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ @@ -155,17 +185,23 @@ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_and_si128(vec1.sse, vec2.sse); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_or_si128(vec1.sse, vec2.sse); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - return (v##sign##int##bits##x##size) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; \ + v##sign##int##bits##x##size vec; \ + vec.sse = _mm_xor_si128(vec1.sse, vec2.sse); \ + return vec; \ } \ \ static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ @@ -184,19 +220,23 @@ } \ \ static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \ - .load_aligned = v##sign##int##bits##x##size##_sse2_load_aligned, \ - .load = v##sign##int##bits##x##size##_sse2_load, \ - .store_aligned = v##sign##int##bits##x##size##_sse2_store_aligned, \ - .store = v##sign##int##bits##x##size##_sse2_store, \ - .add = v##sign##int##bits##x##size##_sse2_add, \ - .sub = v##sign##int##bits##x##size##_sse2_sub, \ - .mul = v##sign##int##bits##x##size##_sse2_mul, \ - .and = v##sign##int##bits##x##size##_sse2_and, \ - .or = v##sign##int##bits##x##size##_sse2_or, \ - .xor = v##sign##int##bits##x##size##_sse2_xor, \ - .lshift = v##sign##int##bits##x##size##_sse2_lshift, \ - .rshift = 
v##sign##int##bits##x##size##_sse2_rshift, \ - .lrshift = v##sign##int##bits##x##size##_sse2_lrshift, \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_sse2_load_aligned, \ + v##sign##int##bits##x##size##_sse2_load, \ + v##sign##int##bits##x##size##_sse2_store_aligned, \ + v##sign##int##bits##x##size##_sse2_store, \ + v##sign##int##bits##x##size##_sse2_add, \ + v##sign##int##bits##x##size##_sse2_sub, \ + v##sign##int##bits##x##size##_sse2_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_sse2_and, \ + v##sign##int##bits##x##size##_sse2_or, \ + v##sign##int##bits##x##size##_sse2_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_sse2_lshift, \ + v##sign##int##bits##x##size##_sse2_rshift, \ + v##sign##int##bits##x##size##_sse2_lrshift, \ }; #define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \ diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/impl/x86/sse41.h --- a/include/vec/impl/x86/sse41.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/impl/x86/sse41.h Wed Nov 20 12:02:15 2024 -0500 @@ -28,23 +28,29 @@ #define VEC_SSE41_DEFINE_OPERATIONS(sign) \ static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \ { \ - return (v##sign##int32x4){ .sse = _mm_mullo_epi32(vec1.sse, vec2.sse) }; \ + v##sign##int32x4 vec; \ + vec.sse = _mm_mullo_epi32(vec1.sse, vec2.sse); \ + return vec; \ } \ \ static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \ - .load_aligned = v##sign##int32x4_sse2_load_aligned, \ - .load = v##sign##int32x4_sse2_load, \ - .store_aligned = v##sign##int32x4_sse2_store_aligned, \ - .store = v##sign##int32x4_sse2_store, \ - .add = v##sign##int32x4_sse2_add, \ - .sub = v##sign##int32x4_sse2_sub, \ - .mul = v##sign##int32x4_sse41_mul, \ - .and = v##sign##int32x4_sse2_and, \ - .or = v##sign##int32x4_sse2_or, \ - .xor = v##sign##int32x4_sse2_xor, \ - .lshift = v##sign##int32x4_sse2_lshift, \ - .rshift = v##sign##int32x4_sse2_rshift, \ - .lrshift = v##sign##int32x4_sse2_lrshift, \ + /* .splat = */ NULL, \ + v##sign##int32x4##_sse2_load_aligned, \ + v##sign##int32x4##_sse2_load, \ + v##sign##int32x4##_sse2_store_aligned, \ + v##sign##int32x4##_sse2_store, \ + v##sign##int32x4##_sse2_add, \ + v##sign##int32x4##_sse2_sub, \ + v##sign##int32x4##_sse41_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int32x4##_sse2_and, \ + v##sign##int32x4##_sse2_or, \ + v##sign##int32x4##_sse2_xor, \ + /* .not = */ NULL, \ + v##sign##int32x4##_sse2_lshift, \ + v##sign##int32x4##_sse2_rshift, \ + v##sign##int32x4##_sse2_lrshift, \ }; VEC_SSE41_DEFINE_OPERATIONS() diff -r 9da2aba90c87 -r 41dd962abdd1 include/vec/vec.h --- a/include/vec/vec.h Wed Nov 20 04:16:56 2024 -0500 +++ b/include/vec/vec.h Wed Nov 20 12:02:15 2024 -0500 @@ -25,42 +25,75 @@ #ifndef VEC_VEC_H_ #define VEC_VEC_H_ -#include -#include -#include +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef VEC_HAVE_IMPL_INTEGER_H +# include "impl/integer.h" +#else +# if __cplusplus >= (201103L) +# include +# include +typedef std::size_t vec_uintsize; + +typedef std::uint8_t vec_uint8; +typedef std::uint16_t vec_uint16; +typedef std::uint32_t vec_uint32; +typedef std::uint64_t vec_uint64; +typedef std::uintmax_t vec_uintmax; +typedef std::uintptr_t vec_uintptr; -#define VEC_MAX(a, b) (((a) > (b)) ? (a) : (b)) -#define VEC_MIN(a, b) (((a) < (b)) ? 
(a) : (b)) -#define VEC_CLAMP(x, min, max) (VEC_MIN(VEC_MAX((x), (min)), (max))) +typedef std::int8_t vec_int8; +typedef std::int16_t vec_int16; +typedef std::int32_t vec_int32; +typedef std::int64_t vec_int64; +typedef std::intmax_t vec_intmax; +# elif __STDC_VERSION__ >= 199901L +# include +# include +typedef uint8_t vec_uint8; +typedef uint16_t vec_uint16; +typedef uint32_t vec_uint32; +typedef uint64_t vec_uint64; +typedef uintmax_t vec_uintmax; +typedef uintptr_t vec_uintptr; +typedef size_t vec_uintsize; +typedef int8_t vec_int8; +typedef int16_t vec_int16; +typedef int32_t vec_int32; +typedef int64_t vec_int64; +typedef intmax_t vec_intmax; +# else +# error Unable to find integer types with known size. +# endif +#endif #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ (((a) >= (x)) && \ ((a) > x || (b) >= (y)) && \ ((a) > x || (b) > (y) || (c) >= (z))) -#define VEC_GNUC_ATLEAST(x, y, z) \ +#ifdef __GNUC__ +# define VEC_GNUC_ATLEAST(x, y, z) \ VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z) +#else +# define VEC_GNUC_ATLEAST(x, y, z) (0) +#endif /* GCC/clang attributes */ #if defined(__has_attribute) -# if __has_attribute(__aligned__) -# define VEC_ALIGNED(x) __attribute__((__aligned__(x))) -# endif -# if __has_attribute(__vector_size__) -# define VEC_COMPILER_HAS_GNUC_VECTORS -# endif +# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x) +#else +# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch) #endif -#ifndef VEC_ALIGNED -# if VEC_GNUC_ATLEAST(2, 7, 0) -# define VEC_ALIGNED(x) __attribute__((__aligned__(x))) -# endif -#endif - -#if (__STDC_VERSION__ >= 201112L) +#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) +# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg) +#elif (__STDC_VERSION__ >= 201112L) # define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg) #else -// C99 static assertion # define VEC_STATIC_ASSERT(x, msg) \ extern int (*vec_impl_Static_assert_function_(void)) \ [!!sizeof (struct { int __error_if_negative: (x) ? 
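The vec.h front matter above drops the direct standard-header includes in favour of library-owned integer typedefs: vec_int8/vec_uint8 through vec_intmax/vec_uintptr come from the std:: fixed-width types under C++11, from <stdint.h> under C99, or from a user-supplied impl/integer.h, and the header body is wrapped in extern "C" so the same symbols can be called from C++ translation units. A short header-style sketch of the same dual-language selection; my_uint32 and my_add_u32 are illustrative names, not part of vec:

    /* sketch: pick a fixed-width type per language, expose C-linkage symbols */
    #ifdef __cplusplus
    # include <cstdint>
    typedef std::uint32_t my_uint32;
    #else
    # include <stdint.h>
    typedef uint32_t my_uint32;
    #endif

    #ifdef __cplusplus
    extern "C" {
    #endif

    static inline my_uint32 my_add_u32(my_uint32 a, my_uint32 b)
    {
        return a + b; /* one definition usable from both C and C++ callers */
    }

    #ifdef __cplusplus
    } /* extern "C" */
    #endif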
2 : -1; })] @@ -78,28 +111,35 @@ /* --------------------------------------------------------------- */ /* Detect compiler SIMD support */ -#define VEC_GENERIC_ALIGNMENT 1 #define VEC_ALTIVEC_ALIGNMENT 16 #define VEC_SSE2_ALIGNMENT 16 #define VEC_AVX2_ALIGNMENT 32 #define VEC_AVX512F_ALIGNMENT 64 -// for the generic implementation, 64-bit -#define VINT8x8_ALIGNMENT VEC_GENERIC_ALIGNMENT -#define VINT16x4_ALIGNMENT VEC_GENERIC_ALIGNMENT -#define VINT32x2_ALIGNMENT VEC_GENERIC_ALIGNMENT -#define VUINT8x8_ALIGNMENT VEC_GENERIC_ALIGNMENT -#define VUINT16x4_ALIGNMENT VEC_GENERIC_ALIGNMENT -#define VUINT32x2_ALIGNMENT VEC_GENERIC_ALIGNMENT +// for the generic implementation +#define VINT8x2_ALIGNMENT 1 +#define VUINT8x2_ALIGNMENT 1 + +#define VINT8x4_ALIGNMENT VINT8x2_ALIGNMENT +#define VINT16x2_ALIGNMENT 2 +#define VUINT8x4_ALIGNMENT VUINT8x2_ALIGNMENT +#define VUINT16x2_ALIGNMENT 2 + +#define VINT8x8_ALIGNMENT VINT8x4_ALIGNMENT +#define VINT16x4_ALIGNMENT VINT16x2_ALIGNMENT +#define VINT32x2_ALIGNMENT 4 +#define VUINT8x8_ALIGNMENT VUINT8x4_ALIGNMENT +#define VUINT16x4_ALIGNMENT VUINT16x2_ALIGNMENT +#define VUINT32x2_ALIGNMENT 4 #define VINT8x16_ALIGNMENT VINT8x8_ALIGNMENT #define VINT16x8_ALIGNMENT VINT16x4_ALIGNMENT #define VINT32x4_ALIGNMENT VINT32x2_ALIGNMENT -#define VINT64x2_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VINT64x2_ALIGNMENT 8 #define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT #define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT #define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT -#define VUINT64x2_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VUINT64x2_ALIGNMENT 8 #define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT #define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT @@ -287,48 +327,48 @@ /* --------------------------------------------------------------- */ /* bit shift */ -inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y) +inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y) { return x >> y; } -inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y) +inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y) { return x << y; } -inline intmax_t vec_lrshift(intmax_t x, unsigned int y) +inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y) { // reinterpret as unsigned integer and then shift union { - intmax_t d; - uintmax_t u; + vec_intmax d; + vec_uintmax u; } xx; xx.d = x; - xx.u >> y; + xx.u >>= y; return xx.d; } -inline intmax_t vec_llshift(intmax_t x, unsigned int y) +inline vec_intmax vec_llshift(vec_intmax x, unsigned int y) { // reinterpret as unsigned integer and then shift union { - intmax_t d; - uintmax_t u; + vec_intmax d; + vec_uintmax u; } xx; xx.d = x; - xx.u << y; + xx.u <<= y; return xx.d; } -inline uintmax_t vec_urshift(uintmax_t x, unsigned int y) +inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y) { return x >> y; } -inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y) +inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y) { return x << y; } @@ -359,13 +399,13 @@ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
**/ -inline intmax_t vec_rshift(intmax_t x, unsigned int y) +inline vec_intmax vec_rshift(vec_intmax x, unsigned int y) { - static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1); union { - intmax_t d; - uintmax_t u; + vec_intmax d; + vec_uintmax u; } xx; xx.d = x; @@ -378,13 +418,13 @@ return xx.d; } -inline intmax_t vec_lshift(intmax_t x, unsigned int y) +inline vec_intmax vec_lshift(vec_intmax x, unsigned int y) { - static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); + static const vec_uintmax roffset = ((vec_uintmax)1) << ((sizeof(vec_intmax) * 8) - 1); union { - intmax_t d; - uintmax_t u; + vec_intmax d; + vec_uintmax u; } xx; xx.d = x; @@ -397,203 +437,56 @@ } #ifdef VEC_IMPLEMENTATION -extern inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y); -extern inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y); -extern inline intmax_t vec_lrshift(intmax_t x, unsigned int y); -extern inline intmax_t vec_llshift(intmax_t x, unsigned int y); -extern inline uintmax_t vec_urshift(uintmax_t x, unsigned int y); -extern inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y); -extern inline intmax_t vec_rshift(intmax_t x, unsigned int y); -extern inline intmax_t vec_lshift(intmax_t x, unsigned int y); +extern inline vec_uintmax vec_ulrshift(vec_uintmax x, unsigned int y); +extern inline vec_uintmax vec_ullshift(vec_uintmax x, unsigned int y); +extern inline vec_intmax vec_lrshift(vec_intmax x, unsigned int y); +extern inline vec_intmax vec_llshift(vec_intmax x, unsigned int y); +extern inline vec_uintmax vec_urshift(vec_uintmax x, unsigned int y); +extern inline vec_uintmax vec_ulshift(vec_uintmax x, unsigned int y); +extern inline vec_intmax vec_rshift(vec_intmax x, unsigned int y); +extern inline vec_intmax vec_lshift(vec_intmax x, unsigned int y); #endif /* --------------------------------------------------------------- */ -/* Array alignment macros */ -/* the alignment must be specified in bytes and must be a multiple of the - * type size. 
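Two things are worth noting in the portable shift helpers above: vec_lrshift and vec_llshift now actually store the shifted value (`xx.u >>= y` rather than the old no-op expression statement `xx.u >> y`, which returned x unchanged), and vec_rshift/vec_lshift keep a sign-bit bias constant (roffset), now computed with sizeof(vec_intmax) * 8 instead of CHAR_BIT (equivalent wherever bytes are 8 bits). The full bodies of vec_rshift/vec_lshift are not shown here, but a bias of that shape is the usual way to get a well-defined arithmetic right shift out of unsigned shifts; a 32-bit sketch of the idea, not necessarily vec's exact formulation:

    #include <stdint.h>
    #include <stdio.h>

    /* arithmetic right shift built from unsigned shifts with a sign-bit bias */
    static int32_t asr32(int32_t x, unsigned int y)
    {
        uint32_t bias = UINT32_C(1) << 31;    /* plays the role of roffset */
        uint32_t u = (uint32_t)x;
        u = ((u ^ bias) >> y) - (bias >> y);  /* shift a non-negative value */
        return (int32_t)u; /* relies on two's complement, like vec's union cast */
    }

    int main(void)
    {
        printf("%d %d\n", asr32(-8, 1), asr32(8, 1)); /* prints -4 4 */
        return 0;
    }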
it is always assumed that the type will be on a boundary of - * its size, which may or may not be true */ -#ifdef VEC_ALIGNED -# define VEC_ALIGNED_ARRAY(type, var, length, align) \ - VEC_ALIGNED(align) type var[length] -# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(var)) -#else -# define VEC_ALIGNED_ARRAY(type, var, length, align) \ - VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \ - type vec_##var##_unaligned_[(length) + (align / sizeof(type))]; \ - type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + (align - 1)) & ~(align - 1)); \ - VEC_ASSERT(((uintptr_t)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") -# define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(vec_##var##_unaligned_) - (align - 1)) -#endif - -#define VEC_ALIGNED_ARRAY_LENGTH(var) \ - (VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var)) - -// ------------------------------------------------------------ -// predefined variants for each vector type - -#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 8, VINT8x8_ALIGNMENT) -#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT) -#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT) -#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0) - -#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 4, VINT16x4_ALIGNMENT) -#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT) -#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT) -#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0) - -#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 2, VINT32x2_ALIGNMENT) -#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT) -#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT) -#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0) - -#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 8, VUINT8x8_ALIGNMENT) -#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT) -#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT) -#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0) - -#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 4, VUINT16x4_ALIGNMENT) -#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT) -#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT) -#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0) - -#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 2, VUINT32x2_ALIGNMENT) -#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT) -#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT) -#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0) - -#define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) -#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) -#define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % 
VINT8x16_ALIGNMENT == 0) - -#define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) -#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) -#define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) - -#define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) -#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) -#define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) - -#define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT) -#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) -#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) -#define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) - -#define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) -#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) -#define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) - -#define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) -#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) -#define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) - -#define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) -#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) -#define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) - -#define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) -#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) -#define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) - -#define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) -#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) -#define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) - -#define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) -#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) -#define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) - -#define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT) -#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) -#define 
VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) -#define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) - -#define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) -#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) -#define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) - -#define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) -#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) -#define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) - -#define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) -#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) -#define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) - -#define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) -#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) -#define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) - -#define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) -#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) -#define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) - -#define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) -#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) -#define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) - -#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x32_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT) -#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT) -#define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) - -#define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) -#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) -#define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) - -#define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) -#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) -#define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) - -#define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, 
var, 64, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) -#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) -#define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) - -#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x32_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT) -#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT) -#define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) - -#define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) -#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) -#define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) - -#define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) -#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) -#define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) +#include "impl/align.h" /* --------------------------------------------------------------- */ /* Defines the structures for each vector type */ +// 16-bit +typedef union { + vec_uint8 generic[2]; +} vuint8x2; + +typedef union { + vec_int8 generic[2]; +} vint8x2; + +// 32-bit +typedef union { + vuint8x2 generic[2]; +} vuint8x4; + +typedef union { + vec_uint16 generic[2]; +} vuint16x2; + +typedef union { + vint8x2 generic[2]; +} vint8x4; + +typedef union { + vec_int16 generic[2]; +} vint16x2; + // 64-bit typedef union { #ifdef VEC_COMPILER_HAS_MMX __m64 mmx; #endif - uint8_t generic[8]; + vuint8x4 generic[2]; } vuint8x8; typedef union { @@ -601,7 +494,7 @@ __m64 mmx; #endif - uint16_t generic[4]; + vuint16x2 generic[2]; } vuint16x4; typedef union { @@ -609,7 +502,7 @@ __m64 mmx; #endif - uint32_t generic[2]; + vec_uint32 generic[2]; } vuint32x2; typedef union { @@ -617,7 +510,7 @@ __m64 mmx; #endif - int8_t generic[8]; + vint8x4 generic[2]; } vint8x8; typedef union { @@ -625,7 +518,7 @@ __m64 mmx; #endif - int16_t generic[4]; + vint16x2 generic[2]; } vint16x4; typedef union { @@ -633,7 +526,7 @@ __m64 mmx; #endif - int32_t generic[2]; + vec_int32 generic[2]; } vint32x2; // 128-bit @@ -674,7 +567,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX vector unsigned long long altivec; #endif - uint64_t generic[2]; + vec_uint64 generic[2]; } vuint64x2; typedef union { @@ -714,7 +607,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX vector signed long long altivec; #endif - int64_t generic[2]; + vec_int64 generic[2]; } vint64x2; // 256-bit @@ -837,11 +730,11 @@ int vec_init(void); #define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]); \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]); \ - void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ - void 
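The long run of per-type VEC_ALIGNED_ARRAY / _ALIGNED_ARRAY_SIZEOF / _ALIGNED_ARRAY_LENGTH / _PTR_ALIGNED macros removed above is consolidated behind the new #include "impl/align.h". The fallback technique they encoded, for compilers without an alignment attribute, is the classic one: over-allocate by one alignment's worth of elements and round the pointer up to the next multiple of the (power-of-two) alignment. A standalone illustration of that rounding step; ALIGN and the backing array here are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define ALIGN 16 /* must be a power of two, as the removed static assert enforced */

    int main(void)
    {
        /* over-allocate so an aligned 4-element window always fits */
        uint32_t backing[4 + ALIGN / sizeof(uint32_t)];
        uint32_t *aligned = (uint32_t *)
            (((uintptr_t)backing + (ALIGN - 1)) & ~(uintptr_t)(ALIGN - 1));

        printf("aligned: %d\n", (int)(((uintptr_t)aligned % ALIGN) == 0));
        return 0;
    }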
v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]); \ + void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ + void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ @@ -863,6 +756,13 @@ VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \ VEC_DECLARE_OPERATIONS_SIGN(u, bits, size) +// 16-bit +VEC_DECLARE_OPERATIONS(8, 2) + +// 32-bit +VEC_DECLARE_OPERATIONS(8, 4) +VEC_DECLARE_OPERATIONS(16, 2) + // 64-bit VEC_DECLARE_OPERATIONS(8, 8) VEC_DECLARE_OPERATIONS(16, 4) @@ -897,37 +797,46 @@ // Fallback functions, need to be defined before everything else. #include "impl/fallback.h" -// okay, these are filled in for each supported backend +// okay, these are filled in for each supported backend. +// `and', `or', `xor', and `nor' have to be prefixed with +// `b' because of #define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \ typedef struct { \ - v##sign##int##bits##x##size (*splat)(sign##int##bits##_t x); \ - v##sign##int##bits##x##size (*load_aligned)(const sign##int##bits##_t in[size]); \ - v##sign##int##bits##x##size (*load)(const sign##int##bits##_t in[size]); \ - void (*store_aligned)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ - void (*store)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ + v##sign##int##bits##x##size (*splat)(vec_##sign##int##bits x); \ + v##sign##int##bits##x##size (*load_aligned)(const vec_##sign##int##bits in[size]); \ + v##sign##int##bits##x##size (*load)(const vec_##sign##int##bits in[size]); \ + void (*store_aligned)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ + void (*store)(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]); \ v##sign##int##bits##x##size (*add)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*sub)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*mul)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*div)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*avg)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*and)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*or)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*xor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*not)(v##sign##int##bits##x##size vec); \ + v##sign##int##bits##x##size (*band)(v##sign##int##bits##x##size vec1, 
v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*bor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*bxor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*bnot)(v##sign##int##bits##x##size vec); \ + v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ v##sign##int##bits##x##size (*cmplt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*cmple)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*cmpeq)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*cmpge)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ v##sign##int##bits##x##size (*cmpgt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ - v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ - v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ } v##sign##int##bits##x##size##_impl; #define VEC_DEFINE_IMPL_STRUCT(bits, size) \ VEC_DEFINE_IMPL_STRUCT_SIGN( , bits, size) \ VEC_DEFINE_IMPL_STRUCT_SIGN(u, bits, size) +// 16-bit +VEC_DEFINE_IMPL_STRUCT(8, 2) + +// 32-bit +VEC_DEFINE_IMPL_STRUCT(8, 4) +VEC_DEFINE_IMPL_STRUCT(16, 2) + // 64-bit VEC_DEFINE_IMPL_STRUCT(8, 8) VEC_DEFINE_IMPL_STRUCT(16, 4) @@ -988,6 +897,16 @@ #include "impl/cpu.h" // CPU detection crap +// 16-bit +static vint8x2_impl *vint8x2_impl_cpu = &vint8x2_impl_generic; +static vuint8x2_impl *vuint8x2_impl_cpu = &vuint8x2_impl_generic; + +// 32-bit +static vint8x4_impl *vint8x4_impl_cpu = &vint8x4_impl_generic; +static vuint8x4_impl *vuint8x4_impl_cpu = &vuint8x4_impl_generic; +static vint16x2_impl *vint16x2_impl_cpu = &vint16x2_impl_generic; +static vuint16x2_impl *vuint16x2_impl_cpu = &vuint16x2_impl_generic; + // 64-bit static vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; static vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; @@ -1026,6 +945,7 @@ static vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; static vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; +// returns 0 or a negative error code on failure int vec_init(void) { // This function is NOT thread safe. 
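One more C++ accommodation appears in the `_impl` struct definition above: the bitwise slots are renamed from and/or/xor/not to band/bor/bxor/bnot, which is what the truncated comment about the `b' prefix refers to. In C++, `and', `or', `xor' and `not' are alternative operator tokens (and <iso646.h> macros in C), so they cannot be used as member names in a header that must also compile as C++. A minimal illustration with a hypothetical table, not vec's real one:

    /* `and', `or', `xor', `not' are operator tokens in C++ (and <iso646.h>
     * macros in C), so a member literally named `and' will not parse there:
     *
     *     struct bitops_bad { int (*and)(int, int); };   // C-only
     *
     * a `b' prefix keeps the field names legal in both languages: */
    struct bitops_ok {
        int (*band)(int, int);
        int (*bor)(int, int);
        int (*bxor)(int, int);
        int (*bnot)(int);
    };

    int main(void)
    {
        struct bitops_ok ops = { 0, 0, 0, 0 };
        return ops.band ? 1 : 0;
    }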
However, once vec @@ -1112,12 +1032,14 @@ { // do nothing, they're already set to generics } + + return 0; } /* ---------------------------------------------------------------- */ #define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ { \ if (v##sign##int##bits##x##size##_impl_cpu->splat) \ return v##sign##int##bits##x##size##_impl_cpu->splat(x); \ @@ -1125,16 +1047,19 @@ return v##sign##int##bits##x##size##_fallback_splat(x); \ } \ \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ { \ + v##sign##int##bits##x##size err = {0}; \ + \ if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \ return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \ \ VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \ - return (v##sign##int##bits##x##size){0}; \ + \ + return err; \ } \ \ - v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const vec_##sign##int##bits in[size]) \ { \ if (v##sign##int##bits##x##size##_impl_cpu->load) \ return v##sign##int##bits##x##size##_impl_cpu->load(in); \ @@ -1142,7 +1067,7 @@ return v##sign##int##bits##x##size##_fallback_load(in); \ } \ \ - void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \ v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \ @@ -1152,7 +1077,7 @@ VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \ } \ \ - void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ { \ if (v##sign##int##bits##x##size##_impl_cpu->store) { \ v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \ @@ -1204,32 +1129,32 @@ \ v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - if (v##sign##int##bits##x##size##_impl_cpu->and) \ - v##sign##int##bits##x##size##_impl_cpu->and(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->band) \ + v##sign##int##bits##x##size##_impl_cpu->band(vec1, vec2); \ \ return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \ } \ \ v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - if (v##sign##int##bits##x##size##_impl_cpu->or) \ - v##sign##int##bits##x##size##_impl_cpu->or(vec1, vec2); \ + if (v##sign##int##bits##x##size##_impl_cpu->bor) \ + v##sign##int##bits##x##size##_impl_cpu->bor(vec1, vec2); \ \ return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \ } \ \ v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ { \ - if (v##sign##int##bits##x##size##_impl_cpu->xor) \ - v##sign##int##bits##x##size##_impl_cpu->xor(vec1, vec2); \ + if 
(v##sign##int##bits##x##size##_impl_cpu->bxor) \ + v##sign##int##bits##x##size##_impl_cpu->bxor(vec1, vec2); \ \ return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \ } \ \ v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ { \ - if (v##sign##int##bits##x##size##_impl_cpu->not) \ - v##sign##int##bits##x##size##_impl_cpu->not(vec); \ + if (v##sign##int##bits##x##size##_impl_cpu->bnot) \ + v##sign##int##bits##x##size##_impl_cpu->bnot(vec); \ \ return v##sign##int##bits##x##size##_fallback_not(vec); \ } \ @@ -1302,6 +1227,13 @@ VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) +// 16-bit +VEC_DEFINE_OPERATIONS(8, 2) + +// 32-bit +VEC_DEFINE_OPERATIONS(8, 4) +VEC_DEFINE_OPERATIONS(16, 2) + // 64-bit VEC_DEFINE_OPERATIONS(8, 8) VEC_DEFINE_OPERATIONS(16, 4) @@ -1330,4 +1262,8 @@ #endif /* VEC_IMPLEMENTATION */ +#ifdef __cplusplus +} +#endif + #endif /* VEC_VEC_H_ */ diff -r 9da2aba90c87 -r 41dd962abdd1 test/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile Wed Nov 20 12:02:15 2024 -0500 @@ -0,0 +1,230 @@ +# CMAKE generated file: DO NOT EDIT! +# Generated by "Unix Makefiles" Generator, CMake Version 3.25 + +# Default target executed when no arguments are given to make. +default_target: all +.PHONY : default_target + +# Allow only one "make -f Makefile2" at a time, but pass parallelism. +.NOTPARALLEL: + +#============================================================================= +# Special targets provided by cmake. + +# Disable implicit rules so canonical targets will work. +.SUFFIXES: + +# Disable VCS-based implicit rules. +% : %,v + +# Disable VCS-based implicit rules. +% : RCS/% + +# Disable VCS-based implicit rules. +% : RCS/%,v + +# Disable VCS-based implicit rules. +% : SCCS/s.% + +# Disable VCS-based implicit rules. +% : s.% + +.SUFFIXES: .hpux_make_needs_suffix_list + +# Command-line flag to silence nested $(MAKE). +$(VERBOSE)MAKESILENT = -s + +#Suppress display of executed commands. +$(VERBOSE).SILENT: + +# A target that is always out of date. +cmake_force: +.PHONY : cmake_force + +#============================================================================= +# Set environment variables for the build. + +# The shell in which to execute make rules. +SHELL = /bin/sh + +# The CMake executable. +CMAKE_COMMAND = /usr/bin/cmake + +# The command to remove a file. +RM = /usr/bin/cmake -E rm -f + +# Escaping for special characters. +EQUALS = = + +# The top-level source directory on which CMake was run. +CMAKE_SOURCE_DIR = /home/paper/Documents/src/hg/vec + +# The top-level build directory on which CMake was run. +CMAKE_BINARY_DIR = /home/paper/Documents/src/hg/vec/test + +#============================================================================= +# Targets provided globally by CMake. + +# Special rule for the target edit_cache +edit_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." + /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. +.PHONY : edit_cache + +# Special rule for the target edit_cache +edit_cache/fast: edit_cache +.PHONY : edit_cache/fast + +# Special rule for the target rebuild_cache +rebuild_cache: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." 
+ /usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) +.PHONY : rebuild_cache + +# Special rule for the target rebuild_cache +rebuild_cache/fast: rebuild_cache +.PHONY : rebuild_cache/fast + +# Special rule for the target list_install_components +list_install_components: + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\"" +.PHONY : list_install_components + +# Special rule for the target list_install_components +list_install_components/fast: list_install_components +.PHONY : list_install_components/fast + +# Special rule for the target install +install: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install + +# Special rule for the target install +install/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." + /usr/bin/cmake -P cmake_install.cmake +.PHONY : install/fast + +# Special rule for the target install/local +install/local: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local + +# Special rule for the target install/local +install/local/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." + /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake +.PHONY : install/local/fast + +# Special rule for the target install/strip +install/strip: preinstall + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip + +# Special rule for the target install/strip +install/strip/fast: preinstall/fast + @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." + /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake +.PHONY : install/strip/fast + +# The main all target +all: cmake_check_build_system + $(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles /home/paper/Documents/src/hg/vec/test//CMakeFiles/progress.marks + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all + $(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles 0 +.PHONY : all + +# The main clean target +clean: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean +.PHONY : clean + +# The main clean target +clean/fast: clean +.PHONY : clean/fast + +# Prepare targets for installation. +preinstall: all + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall + +# Prepare targets for installation. +preinstall/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall +.PHONY : preinstall/fast + +# clear depends +depend: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 +.PHONY : depend + +#============================================================================= +# Target rules for targets named vec + +# Build rule for target. +vec: cmake_check_build_system + $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 vec +.PHONY : vec + +# fast build rule for target. 
+vec/fast: + $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/build +.PHONY : vec/fast + +src/vec.o: src/vec.c.o +.PHONY : src/vec.o + +# target to build an object file +src/vec.c.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.o +.PHONY : src/vec.c.o + +src/vec.i: src/vec.c.i +.PHONY : src/vec.i + +# target to preprocess a source file +src/vec.c.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.i +.PHONY : src/vec.c.i + +src/vec.s: src/vec.c.s +.PHONY : src/vec.s + +# target to generate assembly for a file +src/vec.c.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.s +.PHONY : src/vec.c.s + +# Help Target +help: + @echo "The following are some of the valid targets for this Makefile:" + @echo "... all (the default if no target is provided)" + @echo "... clean" + @echo "... depend" + @echo "... edit_cache" + @echo "... install" + @echo "... install/local" + @echo "... install/strip" + @echo "... list_install_components" + @echo "... rebuild_cache" + @echo "... vec" + @echo "... src/vec.o" + @echo "... src/vec.i" + @echo "... src/vec.s" +.PHONY : help + + + +#============================================================================= +# Special targets to cleanup operation of make. + +# Special rule to run CMake to check the build system integrity. +# No rule that depends on this can have commands that come from listfiles +# because they might be regenerated. +cmake_check_build_system: + $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 +.PHONY : cmake_check_build_system + diff -r 9da2aba90c87 -r 41dd962abdd1 test/Makefile.ppc --- a/test/Makefile.ppc Wed Nov 20 04:16:56 2024 -0500 +++ b/test/Makefile.ppc Wed Nov 20 12:02:15 2024 -0500 @@ -1,3 +1,3 @@ -CFLAGS += -maltivec +CPPFLAGS += -maltivec include Makefile.template \ No newline at end of file diff -r 9da2aba90c87 -r 41dd962abdd1 test/Makefile.template --- a/test/Makefile.template Wed Nov 20 04:16:56 2024 -0500 +++ b/test/Makefile.template Wed Nov 20 12:02:15 2024 -0500 @@ -1,4 +1,6 @@ -CFLAGS += -g -O2 -std=c99 -I../include +CPPFLAGS += -g -O2 -I../include -Wall -Wpedantic -Werror=strict-aliasing +CFLAGS += $(CPPFLAGS) -std=c99 +CXXFLAGS += $(CPPFLAGS) -std=c++11 HEADERS = ../include/vec/vec.h \ ../include/vec/impl/ppc/altivec.h \ @@ -9,9 +11,12 @@ ../include/vec/impl/x86/sse41.h \ ../include/vec/impl/cpu.h \ ../include/vec/impl/fallback.h \ - ../include/vec/impl/generic.h -BINS = test-generic test-host -OBJS = vec-generic.o vec-host.o test.o + ../include/vec/impl/generic.h \ + test_align.h \ + test_arith.h \ + test_compare.h +BINS = test-generic test-host test-cxx +OBJS = vec-generic.o vec-host.o test.o test-cxx.o .PHONY: all clean test @@ -26,15 +31,22 @@ test.o: test.c $(CC) $(CFLAGS) -c -o $@ $< +test-cxx.o: test.cc + $(CXX) $(CXXFLAGS) -c -o $@ $< + test-generic: vec-generic.o test.o $(CC) $(LDFLAGS) -o $@ $^ test-host: vec-host.o test.o $(CC) $(LDFLAGS) -o $@ $^ +test-cxx: test-cxx.o + $(CXX) $(LDFLAGS) -o $@ $^ + clean: $(RM) $(BINS) $(OBJS) test: clean $(BINS) ./test-generic ./test-host + ./test-cxx diff -r 9da2aba90c87 -r 41dd962abdd1 test/Makefile.x86 --- a/test/Makefile.x86 Wed Nov 20 04:16:56 2024 -0500 +++ b/test/Makefile.x86 Wed Nov 20 12:02:15 2024 -0500 @@ -1,3 +1,3 @@ -CFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f +CPPFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f include Makefile.template \ No newline at end of 
file diff -r 9da2aba90c87 -r 41dd962abdd1 test/test.c --- a/test/test.c Wed Nov 20 04:16:56 2024 -0500 +++ b/test/test.c Wed Nov 20 12:02:15 2024 -0500 @@ -1,6 +1,7 @@ #include "vec/vec.h" #include +#include #include #define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) @@ -78,6 +79,11 @@ VTEST(, , bits, size) VTEST(u, U, bits, size) \ VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size) +DEF_VEC_TEST_FUNCS(8, 2) + +DEF_VEC_TEST_FUNCS(8, 4) +DEF_VEC_TEST_FUNCS(16, 2) + DEF_VEC_TEST_FUNCS(8, 8) DEF_VEC_TEST_FUNCS(16, 4) DEF_VEC_TEST_FUNCS(32, 2) diff -r 9da2aba90c87 -r 41dd962abdd1 test/test.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test.cc Wed Nov 20 12:02:15 2024 -0500 @@ -0,0 +1,27 @@ +#define VEC_IMPLEMENTATION +#include "vec/vec.h" + +#include + +/* this test makes sure that vec can be included under C++ */ +int main(void) +{ + int ret = 0; + + VUINT32x8_ALIGNED_ARRAY(varrin); + VUINT32x8_ALIGNED_ARRAY(varrout); + + for (int i = 0; i < 8; i++) + varrin[i] = i; + + vuint32x8 vec = vuint32x8_load_aligned(varrin); + vec = vuint32x8_add(vec, vec); + + vuint32x8_store_aligned(vec, varrout); + + for (int i = 0; i < 8; i++) + if (varrout[i] != (uint32_t)(varrin[i] + varrin[i])) + ret |= 1; + + return ret; +} \ No newline at end of file diff -r 9da2aba90c87 -r 41dd962abdd1 test/test_align.h --- a/test/test_align.h Wed Nov 20 04:16:56 2024 -0500 +++ b/test/test_align.h Wed Nov 20 12:02:15 2024 -0500 @@ -31,6 +31,11 @@ RUN_TEST( , , bits, size) \ RUN_TEST(u, U, bits, size) + RUN_TESTS(8, 2) + + RUN_TESTS(8, 4) + RUN_TESTS(16, 2) + RUN_TESTS(8, 8) RUN_TESTS(16, 4) RUN_TESTS(32, 2) diff -r 9da2aba90c87 -r 41dd962abdd1 test/test_arith.h --- a/test/test_arith.h Wed Nov 20 04:16:56 2024 -0500 +++ b/test/test_arith.h Wed Nov 20 12:02:15 2024 -0500 @@ -69,6 +69,11 @@ CREATE_TESTS_SIGN(, d, , bits, size) \ CREATE_TESTS_SIGN(u, u, U, bits, size) +CREATE_TESTS(8, 2) + +CREATE_TESTS(8, 4) +CREATE_TESTS(16, 2) + CREATE_TESTS(8, 8) CREATE_TESTS(16, 4) CREATE_TESTS(32, 2) @@ -91,6 +96,7 @@ #undef CREATE_TESTS_SIGN #undef CREATE_TESTS #undef CREATE_TEST +#undef CREATE_TEST_SHIFT static int test_arith(void) { @@ -126,6 +132,11 @@ RUN_TESTS_SIGN( , bits, size) \ RUN_TESTS_SIGN(u, bits, size) + RUN_TESTS(8, 2) + + RUN_TESTS(8, 4) + RUN_TESTS(16, 2) + RUN_TESTS(8, 8) RUN_TESTS(16, 4) RUN_TESTS(32, 2) diff -r 9da2aba90c87 -r 41dd962abdd1 test/test_compare.h --- a/test/test_compare.h Wed Nov 20 04:16:56 2024 -0500 +++ b/test/test_compare.h Wed Nov 20 12:02:15 2024 -0500 @@ -32,6 +32,11 @@ #define CREATE_TESTS(bits, size) CREATE_TESTS_SIGN(, d, bits, size) CREATE_TESTS_SIGN(u, u, bits, size) +CREATE_TESTS(8, 2) + +CREATE_TESTS(8, 4) +CREATE_TESTS(16, 2) + CREATE_TESTS(8, 8) CREATE_TESTS(16, 4) CREATE_TESTS(32, 2) @@ -76,6 +81,11 @@ RUN_TESTS_SIGN( , bits, size) \ RUN_TESTS_SIGN(u, bits, size) + RUN_TESTS(8, 2) + + RUN_TESTS(8, 4) + RUN_TESTS(16, 2) + RUN_TESTS(8, 8) RUN_TESTS(16, 4) RUN_TESTS(32, 2) diff -r 9da2aba90c87 -r 41dd962abdd1 test/vec.pc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/vec.pc Wed Nov 20 12:02:15 2024 -0500 @@ -0,0 +1,12 @@ +prefix=/usr/local +exec_prefix=/usr/local +libdir=${exec_prefix}/lib +includedir=${prefix}/include + +Name: vec +Description: a tiny C99 SIMD vector library +Version: 2.0.0 + +Requires: +Libs: -L${libdir} -lvec +Cflags: -I${includedir}
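Mirroring the new test/test.cc smoke test, the same round trip can be exercised from C99. This sketch uses only entry points visible in this patch (vec_init, the VUINT32x8_ALIGNED_ARRAY macro, and the vuint32x8 load/add/store front ends); treating a negative vec_init result as failure follows the new "0 or a negative error code" comment and is otherwise an assumption:

    /* C99 counterpart to test/test.cc */
    #define VEC_IMPLEMENTATION
    #include "vec/vec.h"

    #include <stdio.h>

    int main(void)
    {
        VUINT32x8_ALIGNED_ARRAY(in);
        VUINT32x8_ALIGNED_ARRAY(out);

        if (vec_init() < 0) /* selects a CPU backend; the generics stay otherwise */
            return 1;

        for (int i = 0; i < 8; i++)
            in[i] = (vec_uint32)i;

        vuint32x8 v = vuint32x8_load_aligned(in);
        v = vuint32x8_add(v, v);
        vuint32x8_store_aligned(v, out);

        for (int i = 0; i < 8; i++)
            printf("%u\n", (unsigned)out[i]);

        return 0;
    }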