# HG changeset patch # User Paper # Date 1732093837 18000 # Node ID e05c257c6a23092dac82b5bdb8ccd19277fb59fa # Parent 981cf0bc7f3a398b6129614e6b879f5684aea407 *: huge refactor, add many new x86 intrinsics and the like ALSO!! now intrinsics are enabled at runtime, depending on what is detected. altivec *should* still work but I only tested compiling it. the major version has been updated to 2.0 for this... diff -r 981cf0bc7f3a -r e05c257c6a23 CMakeLists.txt --- a/CMakeLists.txt Tue Nov 19 15:55:01 2024 -0500 +++ b/CMakeLists.txt Wed Nov 20 04:10:37 2024 -0500 @@ -1,10 +1,42 @@ cmake_minimum_required(VERSION 3.5) -project(vec VERSION 1.0.0 DESCRIPTION "a tiny C99 SIMD vector library") +project(vec VERSION 2.0.0 DESCRIPTION "a tiny C99 SIMD vector library") add_library(vec SHARED src/vec.c) -set_target_properties(vec PROPERTIES PUBLIC_HEADER include/vec/vec.h) +include(CheckCCompilerFlag) + +if(MSVC) + # TODO ? +else() + check_c_compiler_flag("-maltivec" COMPILER_HAS_ALTIVEC) + if(COMPILER_HAS_ALTIVEC) + target_compile_options(vec PRIVATE "-maltivec") + endif() + check_c_compiler_flag("-mmmx" COMPILER_HAS_MMX) + if(COMPILER_HAS_MMX) + target_compile_options(vec PRIVATE "-mmmx") + endif() + check_c_compiler_flag("-msse2" COMPILER_HAS_SSE2) + if(COMPILER_HAS_SSE2) + target_compile_options(vec PRIVATE "-msse2") + endif() + check_c_compiler_flag("-msse4.1" COMPILER_HAS_SSE41) + if(COMPILER_HAS_SSE41) + target_compile_options(vec PRIVATE "-msse4.1") + endif() + check_c_compiler_flag("-mavx2" COMPILER_HAS_AVX2) + if(COMPILER_HAS_AVX2) + target_compile_options(vec PRIVATE "-mavx2") + endif() + check_c_compiler_flag("-mavx512f" COMPILER_HAS_AVX512F) + if(COMPILER_HAS_AVX512F) + target_compile_options(vec PRIVATE "-mavx512f") + endif() +endif() + + +set_target_properties(vec PROPERTIES PUBLIC_HEADER include/vec/vec.h C_STANDARD 99) target_include_directories(vec PRIVATE include) @@ -13,9 +45,9 @@ include(GNUInstallDirs) install(TARGETS vec - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) # pkg-config configure_file(vec.pc.in vec.pc @ONLY) -install(FILES ${CMAKE_BINARY_DIR}/vec.pc DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig) \ No newline at end of file +install(FILES ${CMAKE_BINARY_DIR}/vec.pc DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig) diff -r 981cf0bc7f3a -r e05c257c6a23 README --- a/README Tue Nov 19 15:55:01 2024 -0500 +++ b/README Wed Nov 20 04:10:37 2024 -0500 @@ -1,19 +1,19 @@ vec - a tiny SIMD vector header-only library written in C99 -it comes with an extremely basic (and somewhat lacking) API, -where there are eight supported vector types, all 128-bit: +it comes with an extremely basic API that is similar to other intrinsics +libraries; each type is in the exact same format: - vint8x16 - 16 signed 8-bit integers - vint16x8 - 8 signed 16-bit integers - vint32x4 - 4 signed 32-bit integers - vint64x2 - 2 signed 64-bit integers - vuint8x16 - 16 unsigned 8-bit integers - vuint16x8 - 8 unsigned 16-bit integers - vuint32x4 - 4 unsigned 32-bit integers - vuint32x4 - 2 unsigned 64-bit integers + v[sign][bits]x[size] + where `sign' is either nothing (for signed) or `u' (for unsigned), + `bits' is the bit size of the integer format, + and `size' is the how many integers are in the vector -all of these have many operations that are prefixed with the -name of the type and an underscore, for example: +vec provides types for 64-bit, 128-bit, 256-bit, and 512-bit SIMD intrinsics +on processors where vec has an implementation and falls back to array-based +implementations where they are not. + +all of these have many operations that are prefixed with the name of the +type and an underscore, for example: vint8x16 vint8x16_splat(uint8_t x) - creates a vint8x16 where all of the values are filled @@ -106,3 +106,10 @@ the result vector if the value in `vec1' is greater than or equal to the corresponding value in `vec2', else all of the bits are turned off. + +to initialize vec, you MUST call `vec_init()' when your programs starts up. + +note that `vec_init()' is NOT thread-safe, and things can and will +blow up if you call it simultaneously from different threads (i.e. you +try to only initialize it when you need to... please just initialize +it on startup so you don't have to worry about that!!!) diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/altivec.h --- a/include/vec/impl/altivec.h Tue Nov 19 15:55:01 2024 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,219 +0,0 @@ -/** - * vec - a tiny SIMD vector library in plain C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -/* Altivec vector support. */ - -#include -#include - -#include - -#define VEC_ALTIVEC_ALIGNMENT 16 - -/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */ -#ifdef vec_mul -# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \ - VEC_DECL_MUL(sign, csign, bits, size) \ - { \ - return vec_mul(vec1, vec2); \ - } -#else -# define VEC_ALTIVEC_MUL(sign, csign, bits, size) \ - VEC_GENERIC_MULTIPLY(sign, csign, bits, size) -#endif - -#ifdef vec_splats -# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \ - VEC_DECL_SPLAT(sign, bits, size) \ - { \ - return vec_splats(x); \ - } -#else -# define VEC_ALTIVEC_SPLAT(sign, csign, bits, size) \ - VEC_GENERIC_SPLAT(sign, csign, bits, size) -#endif - -#define VEC_ALTIVEC_uRSHIFT vec_sr -#define VEC_ALTIVEC_RSHIFT vec_sra - -#define VEC_ALTIVEC_uLRSHIFT(sign, csign, bits, size) \ - VEC_DECL_SHIFT(sign, bits, size, l, r) \ - { \ - return vec_sr(vec1, vec2); \ - } -#define VEC_ALTIVEC_LRSHIFT(sign, csign, bits, size) \ - VEC_GENERIC_SHIFT(sign, csign, bits, size, l, r) - -/* Since altivec conveniently made their API super user friendly, we can just use - * one giant macro to define literally everything */ -#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \ - VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ - { \ - return vec_ld(0, in); \ - } \ - \ - VEC_DECL_LOAD(sign, bits, size) \ - { \ - return vec_perm(vec_ld(0, in), vec_ld(VEC_ALTIVEC_ALIGNMENT, in), vec_lvsl(0, in)); \ - } \ - \ - VEC_DECL_STORE_ALIGNED(sign, bits, size) \ - { \ - vec_st(vec, 0, out); \ - } \ - \ - VEC_DECL_STORE(sign, bits, size) \ - { \ - VEC_ALIGNED_ARRAY(sign##int##bits##_t, aligned_out, size, VEC_ALTIVEC_ALIGNMENT); \ - vec_st(vec, 0, aligned_out); \ - memcpy(out, aligned_out, size * sizeof(*aligned_out)); \ - } \ - \ - VEC_DECL_ADD(sign, bits, size) \ - { \ - return vec_add(vec1, vec2); \ - } \ - \ - VEC_DECL_SUB(sign, bits, size) \ - { \ - return vec_sub(vec1, vec2); \ - } \ - \ - VEC_ALTIVEC_MUL(sign, csign, bits, size) \ - \ - VEC_DECL_SHIFT(sign, bits, size, , l) \ - { \ - return vec_sl(vec1, vec2); \ - } \ - \ - VEC_DECL_SHIFT(sign, bits, size, , r) \ - { \ - return VEC_ALTIVEC_##sign##RSHIFT(vec1, vec2); \ - } \ - \ - VEC_ALTIVEC_##sign##LRSHIFT(sign, csign, bits, size) \ - \ - VEC_DECL_AVG(sign, bits, size) \ - { \ - return vec_avg(vec1, vec2); \ - } \ - \ - VEC_DECL_AND(sign, bits, size) \ - { \ - return vec_and(vec1, vec2); \ - } \ - \ - VEC_DECL_OR(sign, bits, size) \ - { \ - return vec_or(vec1, vec2); \ - } \ - \ - VEC_DECL_XOR(sign, bits, size) \ - { \ - return vec_xor(vec1, vec2); \ - } \ - \ - VEC_GENERIC_COMPARISONS(sign, csign, bits, size) \ - VEC_GENERIC_DIVIDE(sign, csign, bits, size) \ - VEC_ALTIVEC_SPLAT(sign, csign, bits, size) - -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -typedef vector unsigned char vuint8x16; -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 8, 16) -#endif /* VEC_VUINT8X16 */ - -#ifndef VEC_VINT8X16 -# define VEC_VINT8X16 -typedef vector signed char vint8x16; -# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 8, 16) -#endif /* VEC_VINT8X16 */ - -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -typedef vector unsigned short vuint16x8; -# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vuint16x8){ a, b, c, d, e, f, g, h } -# define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 16, 8) -#endif /* VEC_VUINT16X8 */ - -#ifndef VEC_VINT16X8 -# define VEC_VINT16X8 -typedef vector signed short vint16x8; -# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vint16x8){ a, b, c, d, e, f, g, h } -# define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 16, 8) -#endif /* VEC_VINT16X8 */ - -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -typedef vector unsigned int vuint32x4; -# define VUINT32x4_CONSTANT(a, b, c, d) \ - (vuint32x4){ a, b, c, d } -# define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 32, 4) -#endif /* VEC_VUINT32X4 */ - -#ifndef VEC_VINT32X4 -# define VEC_VINT32X4 -typedef vector signed int vint32x4; -# define VINT32x4_CONSTANT(a, b, c, d) \ - (vint32x4){ a, b, c, d } -# define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 32, 4) -#endif /* VEC_VINT32X4 */ - -#if defined(__POWER8__) && defined(__VSX__) - -# ifndef VEC_VUINT64X2 -# define VEC_VUINT64X2 -typedef vector unsigned long long vuint64x2; -# define VUINT64x2_CONSTANT(a, b) \ - (vuint64x2){ a, b } -# define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 64, 2) -# endif /* VEC_VUINT64X2 */ - -# ifndef VEC_VINT64X2 -# define VEC_VINT64X2 -typedef vector signed long long vint64x2; -# define VINT64x2_CONSTANT(a, b) \ - (vint64x2){ a, b } -# define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 64, 2) -# endif /* VEC_VINT64X2 */ - -#endif /* defined(__POWER8__) && defined(__VSX__) */ - -#undef VEC_DEFINE_OPERATIONS -#undef VEC_ALTIVEC_MUL -#undef VEC_ALTIVEC_SPLAT diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/cpu.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/cpu.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,397 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_CPU_H_ +#define VEC_IMPL_CPU_H_ + +/* Detect CPU SIMD support. Much of this code was stolen from SDL. + * + * Simple DirectMedia Layer + * Copyright (C) 1997-2024 Sam Lantinga + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. +*/ + +# if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) +# include // For AltiVec check +# elif defined(__OpenBSD__) && defined(__powerpc__) +# include +# include // For AltiVec check +# include +# elif defined(__FreeBSD__) && defined(__powerpc__) +# include +# include +# elif defined(__ALTIVEC__) +# include +# include +# endif + +# ifdef __FreeBSD__ +# include +# endif + +static inline int vec_CPU_have_CPUID(void) +{ + int has_CPUID = 0; + +#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) + __asm__ ( +" pushfl # Get original EFLAGS \n" +" popl %%eax \n" +" movl %%eax,%%ecx \n" +" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" +" pushl %%eax # Save new EFLAGS value on stack \n" +" popfl # Replace current EFLAGS value \n" +" pushfl # Get new EFLAGS \n" +" popl %%eax # Store new EFLAGS in EAX \n" +" xorl %%ecx,%%eax # Can not toggle ID bit, \n" +" jz 1f # Processor=80486 \n" +" movl $1,%0 # We have CPUID support \n" +"1: \n" + : "=m" (has_CPUID) + : + : "%eax", "%ecx" + ); +#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) +/* Technically, if this is being compiled under __x86_64__ then it has + CPUid by definition. But it's nice to be able to prove it. :) */ + __asm__ ( +" pushfq # Get original EFLAGS \n" +" popq %%rax \n" +" movq %%rax,%%rcx \n" +" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" +" pushq %%rax # Save new EFLAGS value on stack \n" +" popfq # Replace current EFLAGS value \n" +" pushfq # Get new EFLAGS \n" +" popq %%rax # Store new EFLAGS in EAX \n" +" xorl %%ecx,%%eax # Can not toggle ID bit, \n" +" jz 1f # Processor=80486 \n" +" movl $1,%0 # We have CPUID support \n" +"1: \n" + : "=m" (has_CPUID) + : + : "%rax", "%rcx" + ); +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) + __asm { + pushfd ; Get original EFLAGS + pop eax + mov ecx, eax + xor eax, 200000h ; Flip ID bit in EFLAGS + push eax ; Save new EFLAGS value on stack + popfd ; Replace current EFLAGS value + pushfd ; Get new EFLAGS + pop eax ; Store new EFLAGS in EAX + xor eax, ecx ; Can not toggle ID bit, + jz done ; Processor=80486 + mov has_CPUID,1 ; We have CPUID support +done: + } +#elif defined(_MSC_VER) && defined(_M_X64) + has_CPUID = 1; +#elif defined(__sun) && defined(__i386) + __asm ( +" pushfl \n" +" popl %eax \n" +" movl %eax,%ecx \n" +" xorl $0x200000,%eax \n" +" pushl %eax \n" +" popfl \n" +" pushfl \n" +" popl %eax \n" +" xorl %ecx,%eax \n" +" jz 1f \n" +" movl $1,-8(%ebp) \n" +"1: \n" + ); +#elif defined(__sun) && defined(__amd64) + __asm ( +" pushfq \n" +" popq %rax \n" +" movq %rax,%rcx \n" +" xorl $0x200000,%eax \n" +" pushq %rax \n" +" popfq \n" +" pushfq \n" +" popq %rax \n" +" xorl %ecx,%eax \n" +" jz 1f \n" +" movl $1,-8(%rbp) \n" +"1: \n" + ); +#endif + + return has_CPUID; +} + +#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm__ __volatile__( \ + " pushl %%ebx \n" \ + " xorl %%ecx,%%ecx \n" \ + " cpuid \n" \ + " movl %%ebx, %%esi \n" \ + " popl %%ebx \n" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "a"(func)) +#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm__ __volatile__( \ + " pushq %%rbx \n" \ + " xorq %%rcx,%%rcx \n" \ + " cpuid \n" \ + " movq %%rbx, %%rsi \n" \ + " popq %%rbx \n" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "a"(func)) +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm { \ + __asm mov eax, func \ + __asm xor ecx, ecx \ + __asm cpuid \ + __asm mov a, eax \ + __asm mov b, ebx \ + __asm mov c, ecx \ + __asm mov d, edx \ + } +#elif (defined(_MSC_VER) && defined(_M_X64)) +// Use __cpuidex instead of __cpuid because ICL does not clear ecx register +# define VEC_CPU_CPUID(func, a, b, c, d) \ + do { \ + int CPUInfo[4]; \ + __cpuidex(CPUInfo, func, 0); \ + a = CPUInfo[0]; \ + b = CPUInfo[1]; \ + c = CPUInfo[2]; \ + d = CPUInfo[3]; \ + } while (0) +#else +# define VEC_CPU_CPUID(func, a, b, c, d) \ + do { \ + a = b = c = d = 0; \ + (void)a; \ + (void)b; \ + (void)c; \ + (void)d; \ + } while (0) +#endif + +// --------------------------------------------------------------- + +static int vec_CPU_CPUIDFeatures[4]; +static int vec_CPU_CPUIDMaxFunction = 0; +static int vec_CPU_OSSavesYMM = 0; +static int vec_CPU_OSSavesZMM = 0; + +static inline void vec_CPU_get_CPUID_features(void) +{ + static int checked = 0; + if (!checked) { + checked = 1; + if (vec_CPU_have_CPUID()) { + int a, b, c, d; + VEC_CPU_CPUID(0, a, b, c, d); + vec_CPU_CPUIDMaxFunction = a; + if (vec_CPU_CPUIDMaxFunction >= 1) { + VEC_CPU_CPUID(1, a, b, c, d); + vec_CPU_CPUIDFeatures[0] = a; + vec_CPU_CPUIDFeatures[1] = b; + vec_CPU_CPUIDFeatures[2] = c; + vec_CPU_CPUIDFeatures[3] = d; + + // Check to make sure we can call xgetbv + if (c & 0x08000000) { + // Call xgetbv to see if YMM (etc) register state is saved +#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__)) + __asm__(".byte 0x0f, 0x01, 0xd0" + : "=a"(a) + : "c"(0) + : "%edx"); +#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1 + a = (int)_xgetbv(0); +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) + __asm { + xor ecx, ecx + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 + mov a, eax + } +#endif + vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0; + vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0; + } + } + } + } +} + +#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) +static jmp_buf vec_jmpbuf; +static void vec_CPU_illegal_instruction(int sig) +{ + longjmp(vec_jmpbuf, 1); +} +#endif + +static int vec_CPU_have_ALTIVEC(void) +{ + volatile int altivec = 0; +#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__)) + int selectors[2] = { +# ifdef __OpenBSD__ + CTL_MACHDEP, CPU_ALTIVEC +# else + CTL_HW, HW_VECTORUNIT +# endif + }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if (!error) + altivec = (hasVectorUnit != 0); +#elif defined(__FreeBSD__) && defined(__powerpc__) + unsigned long cpufeatures = 0; + elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)); + altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC; +#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) + void (*handler)(int sig); + handler = signal(SIGILL, vec_CPU_illegal_instruction); + if (!setjmp(vec_jmpbuf)) { + asm volatile("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" ::"r"(-1)); + altivec = 1; + } + signal(SIGILL, handler); +#endif + return altivec; +} + +static int vec_CPU_have_ALTIVEC_VSX(void) +{ + volatile int vsx = 0; +#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__) + void (*handler)(int sig); + handler = signal(SIGILL, vec_CPU_illegal_instruction); + if (!setjmp(vec_jmpbuf)) { + // this is completely untested + asm volatile("mtspr 256, %0\n\t" + "xxland %%v0, %%v0, %%v0" ::"r"(-1)); + vsx = 1; + } + signal(SIGILL, handler); +#endif + return vsx; +} + +#define vec_CPU_have_MMX() (vec_CPU_CPUIDFeatures[3] & 0x00800000) +#define vec_CPU_have_SSE() (vec_CPU_CPUIDFeatures[3] & 0x02000000) +#define vec_CPU_have_SSE2() (vec_CPU_CPUIDFeatures[3] & 0x04000000) +#define vec_CPU_have_SSE3() (vec_CPU_CPUIDFeatures[2] & 0x00000001) +#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000) +#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000) +#define vec_CPU_have_AVX() (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000)) + +static inline int vec_CPU_have_AVX2(void) +{ + if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { + int a, b, c, d; + VEC_CPU_CPUID(7, a, b, c, d); + return b & 0x00000020; + (void)a, (void)c, (void)d; + } + return 0; +} + +static inline int vec_CPU_have_AVX512F(void) +{ + if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { + int a, b, c, d; + VEC_CPU_CPUID(7, a, b, c, d); + return b & 0x00000020; + (void)a, (void)c, (void)d; + } + return 0; +} + +enum { + VEC_CPU_HAS_ALTIVEC = (1 << 0), + VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1), + VEC_CPU_HAS_MMX = (1 << 2), + VEC_CPU_HAS_SSE = (1 << 3), + VEC_CPU_HAS_SSE2 = (1 << 4), + VEC_CPU_HAS_SSE3 = (1 << 5), + VEC_CPU_HAS_SSE41 = (1 << 6), + VEC_CPU_HAS_SSE42 = (1 << 7), + VEC_CPU_HAS_AVX = (1 << 8), + VEC_CPU_HAS_AVX2 = (1 << 9), + VEC_CPU_HAS_AVX512F = (1 << 10), +}; + +#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF) + +static uint32_t vec_CPU_features = VEC_CPU_FEATURES_RESET; + +static void vec_get_CPU_features(void) +{ + vec_CPU_get_CPUID_features(); + vec_CPU_features = 0; + if (vec_CPU_have_ALTIVEC()) + vec_CPU_features |= VEC_CPU_HAS_ALTIVEC; + if (vec_CPU_have_ALTIVEC_VSX()) + vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX; + if (vec_CPU_have_MMX()) + vec_CPU_features |= VEC_CPU_HAS_MMX; + if (vec_CPU_have_SSE()) + vec_CPU_features |= VEC_CPU_HAS_SSE; + if (vec_CPU_have_SSE2()) + vec_CPU_features |= VEC_CPU_HAS_SSE2; + if (vec_CPU_have_SSE3()) + vec_CPU_features |= VEC_CPU_HAS_SSE3; + if (vec_CPU_have_SSE41()) + vec_CPU_features |= VEC_CPU_HAS_SSE41; + if (vec_CPU_have_SSE42()) + vec_CPU_features |= VEC_CPU_HAS_SSE42; + if (vec_CPU_have_AVX()) + vec_CPU_features |= VEC_CPU_HAS_AVX; + if (vec_CPU_have_AVX2()) + vec_CPU_features |= VEC_CPU_HAS_AVX2; + if (vec_CPU_have_AVX512F()) + vec_CPU_features |= VEC_CPU_HAS_AVX512F; +} + +#endif /* VEC_IMPL_CPU_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/fallback.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/fallback.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,202 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_FALLBACK_H_ +#define VEC_IMPL_FALLBACK_H_ + +// Fallback implementations - this is what an implementation should use if it +// doesn't support a specific function. Note that the load_aligned and +// store_aligned functions are not implemented here - this is on purpose; +// every single implementation *needs* to have one of these. + +#define VEC_FALLBACK_OPERATION(op, sign, csign, bits, size) \ + do { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \ + \ + v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ + v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \ + \ + for (int i = 0; i < size; i++) varr1[i] = (op); \ + \ + return v##sign##int##bits##x##size##_load_aligned(varr1); \ + } while (0) + +#define VEC_FALLBACK_CMP(op, sign, csign, bits, size) \ + VEC_FALLBACK_OPERATION((varr1[i] op varr2[i]) ? UINT##bits##_MAX : 0, sign, csign, bits, size) + +#define VEC_FALLBACK_SHIFT(op, sign, csign, bits, size) \ + do { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ + VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \ + \ + v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ + vuint##bits##x##size##_store_aligned(vec2, varr2); \ + \ + for (int i = 0; i < size; i++) varr1[i] = (op); \ + \ + return v##sign##int##bits##x##size##_load_aligned(varr1); \ + } while (0) + +#define VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_splat(sign##int##bits##_t x) \ + { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ + for (int i = 0; i < size; i++) arr[i] = x; \ + return v##sign##int##bits##x##size##_load_aligned(arr); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_load(const sign##int##bits##_t in[size]) \ + { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ + memcpy(arr, in, sizeof(sign##int##bits##_t) * size); \ + return v##sign##int##bits##x##size##_load_aligned(arr); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(arr); \ + v##sign##int##bits##x##size##_store_aligned(vec, arr); \ + memcpy(out, arr, sizeof(sign##int##bits##_t) * size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size##_add(vec1, vec2), v##sign##int##bits##x##size##_splat(2)); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_not(v##sign##int##bits##x##size vec) \ + { \ + return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((sign##int##bits##_t)UINT##bits##_MAX)); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(<, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(<=, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(==, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(>=, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_FALLBACK_CMP(>, sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_FALLBACK_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_FALLBACK_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_fallback_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_FALLBACK_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size); \ + } + +#define VEC_DEFINE_FALLBACK_OPERATIONS(bits, size) \ + VEC_DEFINE_FALLBACK_OPERATIONS_SIGN( , , bits, size) \ + VEC_DEFINE_FALLBACK_OPERATIONS_SIGN(u, U, bits, size) + +// 64-bit +VEC_DEFINE_FALLBACK_OPERATIONS(8, 8) +VEC_DEFINE_FALLBACK_OPERATIONS(16, 4) +VEC_DEFINE_FALLBACK_OPERATIONS(32, 2) + +// 128-bit +VEC_DEFINE_FALLBACK_OPERATIONS(8, 16) +VEC_DEFINE_FALLBACK_OPERATIONS(16, 8) +VEC_DEFINE_FALLBACK_OPERATIONS(32, 4) +VEC_DEFINE_FALLBACK_OPERATIONS(64, 2) + +// 256-bit +VEC_DEFINE_FALLBACK_OPERATIONS(8, 32) +VEC_DEFINE_FALLBACK_OPERATIONS(16, 16) +VEC_DEFINE_FALLBACK_OPERATIONS(32, 8) +VEC_DEFINE_FALLBACK_OPERATIONS(64, 4) + +// 512-bit +VEC_DEFINE_FALLBACK_OPERATIONS(8, 64) +VEC_DEFINE_FALLBACK_OPERATIONS(16, 32) +VEC_DEFINE_FALLBACK_OPERATIONS(32, 16) +VEC_DEFINE_FALLBACK_OPERATIONS(64, 8) + +#undef VEC_FALLBACK_OPERATION +#undef VEC_FALLBACK_CMP +#undef VEC_FALLBACK_SHIFT +#undef VEC_DEFINE_FALLBACK_OPERATIONS +#undef VEC_DEFINE_FALLBACK_OPERATIONS_SIGN + +#endif /* VEC_IMPL_FALLBACK_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/gcc.h --- a/include/vec/impl/gcc.h Tue Nov 19 15:55:01 2024 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,221 +0,0 @@ -/** - * vec - a tiny SIMD vector library in plain C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -/* GCC built in vectors */ - -#include -#include - -#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \ - VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ - { \ - v##sign##int##bits##x##size vec; \ - memcpy(&vec, in, sizeof(*in) * size); \ - return vec; \ - } \ - \ - VEC_DECL_LOAD(sign, bits, size) \ - { \ - return v##sign##int##bits##x##size##_load_aligned(in); \ - } \ - \ - VEC_DECL_STORE_ALIGNED(sign, bits, size) \ - { \ - memcpy(out, &vec, sizeof(vec)); \ - } \ - \ - VEC_DECL_STORE(sign, bits, size) \ - { \ - return v##sign##int##bits##x##size##_store_aligned(vec, out); \ - } \ - \ - VEC_DECL_ADD(sign, bits, size) \ - { \ - return vec1 + vec2; \ - } \ - \ - VEC_DECL_SUB(sign, bits, size) \ - { \ - return vec1 - vec2; \ - } \ - \ - VEC_DECL_MUL(sign, bits, size) \ - { \ - return vec1 * vec2; \ - } \ - \ - VEC_DECL_AND(sign, bits, size) \ - { \ - return vec1 & vec2; \ - } \ - \ - VEC_DECL_OR(sign, bits, size) \ - { \ - return vec1 | vec2; \ - } \ - \ - VEC_DECL_XOR(sign, bits, size) \ - { \ - return vec1 ^ vec2; \ - } \ - VEC_DECL_CMPLT(sign, bits, size) \ - { \ - return vec1 < vec2; \ - } \ - VEC_DECL_CMPGT(sign, bits, size) \ - { \ - return vec1 > vec2; \ - } \ - VEC_DECL_CMPEQ(sign, bits, size) \ - { \ - return vec1 == vec2; \ - } \ - VEC_DECL_CMPLE(sign, bits, size) \ - { \ - return vec1 <= vec2; \ - } \ - VEC_DECL_CMPGE(sign, bits, size) \ - { \ - return vec1 >= vec2; \ - } \ - \ - VEC_GENERIC_DIVIDE(sign, csign, bits, size) \ - VEC_GENERIC_SPLAT(sign, csign, bits, size) \ - VEC_GENERIC_SHIFTS(sign, csign, bits, size) \ - VEC_GENERIC_AVG(sign, bits, size) - -// ----------------------------------------------------------------------------------- -// 128-bit vector types - -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -typedef uint8_t vuint8x16 __attribute__((__vector_size__(16))); -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VUINT8x16_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 8, 16) -#endif - -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -typedef uint16_t vuint16x8 __attribute__((__vector_size__(16))); -# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vuint16x8){ a, b, c, d, e, f, g, h } -# define VUINT16x8_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 16, 8) -#endif - -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -typedef uint32_t vuint32x4 __attribute__((__vector_size__(16))); -# define VUINT32x4_CONSTANT(a, b, c, d) \ - (vuint32x4){ a, b, c, d } -# define VUINT32x4_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 32, 4) -#endif - -#ifndef VEC_VUINT64X2 -# define VEC_VUINT64X2 -typedef uint64_t vuint64x2 __attribute__((__vector_size__(16))); -# define VUINT64x2_CONSTANT(a, b) \ - (vuint64x2){ a, b } -# define VUINT64x2_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 64, 2) -#endif - -#ifndef VEC_VINT8X16 -# define VEC_VINT8X16 -typedef int8_t vint8x16 __attribute__((__vector_size__(16))); -# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VINT8x16_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(, , 8, 16) -#endif - -#ifndef VEC_VINT16X8 -# define VEC_VINT16X8 -typedef int16_t vint16x8 __attribute__((__vector_size__(16))); -# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vint16x8){ a, b, c, d, e, f, g, h } -# define VINT16x8_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(, , 16, 8) -#endif - -#ifndef VEC_VINT32X4 -# define VEC_VINT32X4 -typedef int32_t vint32x4 __attribute__((__vector_size__(16))); -# define VINT32x4_CONSTANT(a, b, c, d) \ - (vint32x4){ a, b, c, d } -# define VINT32x4_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(, , 32, 4) -#endif - -#ifndef VEC_VINT64X2 -# define VEC_VINT64X2 -typedef int64_t vint64x2 __attribute__((__vector_size__(16))); -# define VINT64x2_CONSTANT(a, b) \ - (vint64x2){ a, b } -# define VINT64x2_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(, , 64, 2) -#endif - -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -typedef uint8_t vuint8x16 __attribute__((__vector_size__(16))); -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } -# define VUINT8x16_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 8, 16) -#endif - -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -typedef uint16_t vuint16x8 __attribute__((__vector_size__(16))); -# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (vuint16x8){ a, b, c, d, e, f, g, h } -# define VUINT16x8_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 16, 8) -#endif - -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -typedef uint32_t vuint32x4 __attribute__((__vector_size__(16))); -# define VUINT32x4_CONSTANT(a, b, c, d) \ - (vuint32x4){ a, b, c, d } -# define VUINT32x4_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 32, 4) -#endif - -#ifndef VEC_VUINT64X2 -# define VEC_VUINT64X2 -typedef uint64_t vuint64x2 __attribute__((__vector_size__(16))); -# define VUINT64x2_CONSTANT(a, b) \ - (vuint64x2){ a, b } -# define VUINT64x2_ALIGNMENT 16 -VEC_DEFINE_OPERATIONS(u, U, 64, 2) -#endif - -// ---------------------------------------------------------- - -#undef VEC_DEFINE_OPERATIONS diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/generic.h --- a/include/vec/impl/generic.h Tue Nov 19 15:55:01 2024 -0500 +++ b/include/vec/impl/generic.h Wed Nov 20 04:10:37 2024 -0500 @@ -24,362 +24,91 @@ /* Generic array-based implementation. */ +#ifndef VEC_IMPL_GENERIC_H_ +#define VEC_IMPL_GENERIC_H_ + #include #include -#define VEC_DEFINE_STRUCT(sign, bits, size) \ - typedef struct { \ - sign##int##bits##_t arr[size]; \ - } v##sign##int##bits##x##size; +// ----------------------------------------------------------------- -#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \ - VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ +// TODO implement these so we don't waste stack space by doing the +// fallbacks +#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \ { \ v##sign##int##bits##x##size vec; \ - memcpy(vec.arr, in, sizeof(vec.arr)); \ - return vec; \ - } \ - \ - VEC_DECL_LOAD(sign, bits, size) \ - { \ - return v##sign##int##bits##x##size##_load_aligned(in); \ - } \ - \ - VEC_DECL_STORE_ALIGNED(sign, bits, size) \ - { \ - memcpy(out, vec.arr, sizeof(vec.arr)); \ - } \ - \ - VEC_DECL_STORE(sign, bits, size) \ - { \ - return v##sign##int##bits##x##size##_store_aligned(vec, out); \ - } \ - \ - VEC_DECL_ADD(sign, bits, size) \ - { \ - for (int i = 0; i < size; i++) vec1.arr[i] += vec2.arr[i]; \ - return vec1; \ - } \ - \ - VEC_DECL_SUB(sign, bits, size) \ - { \ - for (int i = 0; i < size; i++) vec1.arr[i] -= vec2.arr[i]; \ - return vec1; \ - } \ - \ - VEC_DECL_MUL(sign, bits, size) \ - { \ - for (int i = 0; i < size; i++) vec1.arr[i] *= vec2.arr[i]; \ - return vec1; \ - } \ - \ - VEC_DECL_AND(sign, bits, size) \ - { \ - for (int i = 0; i < size; i++) vec1.arr[i] &= vec2.arr[i]; \ - return vec1; \ - } \ - \ - VEC_DECL_OR(sign, bits, size) \ - { \ - for (int i = 0; i < size; i++) vec1.arr[i] |= vec2.arr[i]; \ - return vec1; \ - } \ - \ - VEC_DECL_XOR(sign, bits, size) \ - { \ - for (int i = 0; i < size; i++) vec1.arr[i] ^= vec2.arr[i]; \ - return vec1; \ - } \ - \ - VEC_GENERIC_SPLAT(sign, csign, bits, size) \ - VEC_GENERIC_SHIFTS(sign, csign, bits, size) \ - VEC_GENERIC_DIVIDE(sign, csign, bits, size) \ - VEC_GENERIC_AVG(sign, bits, size) \ - VEC_GENERIC_COMPARISONS(sign, csign, bits, size) - -#ifndef VEC_VUINT8X16 -# define VEC_VUINT8X16 -VEC_DEFINE_STRUCT(u, 8, 16) -# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - ((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) -# define VUINT8x16_ALIGNMENT 1 -VEC_DEFINE_OPERATIONS(u, U, 8, 16) -#endif - -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -VEC_DEFINE_STRUCT(u, 16, 8) -# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - ((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } }) -# define VUINT16x8_ALIGNMENT 2 -VEC_DEFINE_OPERATIONS(u, U, 16, 8) -#endif - -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -VEC_DEFINE_STRUCT(u, 32, 4) -# define VUINT32x4_CONSTANT(a, b, c, d) \ - ((vuint32x4){ .arr = { a, b, c, d } }) -# define VUINT32x4_ALIGNMENT 4 -VEC_DEFINE_OPERATIONS(u, U, 32, 4) -#endif - -#ifndef VEC_VUINT64X2 -# define VEC_VUINT64X2 -VEC_DEFINE_STRUCT(u, 64, 2) -# define VUINT64x2_CONSTANT(a, b) \ - ((vuint64x2){ .arr = { a, b } }) -# define VUINT64x2_ALIGNMENT 8 -VEC_DEFINE_OPERATIONS(u, U, 64, 2) -#endif - -#ifndef VEC_VINT16X8 -# define VEC_VINT16X8 -VEC_DEFINE_STRUCT(, 16, 8) -# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - ((vint16x8){ .arr = { a, b, c, d, e, f, g, h } }) -# define VINT16x8_ALIGNMENT 2 -VEC_DEFINE_OPERATIONS(, , 16, 8) -#endif - -#ifndef VEC_VINT32X4 -# define VEC_VINT32X4 -VEC_DEFINE_STRUCT(, 32, 4) -# define VINT32x4_CONSTANT(a, b, c, d) \ - ((vint32x4){ .arr = { a, b, c, d } }) -# define VINT32x4_ALIGNMENT 4 -VEC_DEFINE_OPERATIONS(, , 32, 4) -#endif - -#ifndef VEC_VINT64X2 -# define VEC_VINT64X2 -VEC_DEFINE_STRUCT(, 64, 2) -# define VINT64x2_CONSTANT(a, b) \ - ((vint64x2){ .arr = { a, b } }) -# define VINT64x2_ALIGNMENT 8 -VEC_DEFINE_OPERATIONS(, , 64, 2) -#endif - -#undef VEC_DEFINE_STRUCT -#undef VEC_DEFINE_OPERATIONS - -// ----------------------------------------------------------------- -// Okay, now we can implement our "double" structures. -// These use existing structures that are 128 bits in -// size to provide 256-bit or even 512-bit data types. - -#define VEC_DEFINE_STRUCT(sign, bits, size, halfsize) \ - typedef struct { \ - v##sign##int##bits##x##halfsize vecs[2]; \ - } v##sign##int##bits##x##size; - -#define VEC_DEFINE_OP(opcap, op, sign, bits, size, halfsize) \ - VEC_DECL_##opcap(sign, bits, size) \ - { \ - vec1.vecs[0] = v##sign##int##bits##x##halfsize##_##op(vec1.vecs[0], vec2.vecs[0]); \ - vec1.vecs[1] = v##sign##int##bits##x##halfsize##_##op(vec1.vecs[1], vec2.vecs[1]); \ - return vec1; \ - } - -// This could be in way fewer lines, but whatever -#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size, halfsize) \ - VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.vecs[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \ - vec.vecs[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \ - return vec; \ - } \ - \ - VEC_DECL_LOAD(sign, bits, size) \ - { \ - v##sign##int##bits##x##size vec; \ - vec.vecs[0] = v##sign##int##bits##x##halfsize##_load(in); \ - vec.vecs[1] = v##sign##int##bits##x##halfsize##_load(in + halfsize); \ + memcpy(vec.generic, in, sizeof(sign##int##bits##_t) * size); \ return vec; \ } \ \ - VEC_DECL_SPLAT(sign, bits, size) \ + static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + memcpy(out, vec.generic, sizeof(sign##int##bits##_t) * size); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ + .load_aligned = v##sign##int##bits##x##size##_generic_load_aligned, \ + .store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \ + }; + +#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +VEC_GENERIC_DEFINE_OPERATIONS(8, 8) +VEC_GENERIC_DEFINE_OPERATIONS(16, 4) +VEC_GENERIC_DEFINE_OPERATIONS(32, 2) +VEC_GENERIC_DEFINE_OPERATIONS(64, 2) + +#undef VEC_GENERIC_DEFINE_OPERATIONS +#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN + +// ----------------------------------------------------------------- +// now we can just keep doubling the same implementation + +#define VEC_GENERIC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size, halfsize) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_generic_load_aligned(const sign##int##bits##_t in[size]) \ { \ v##sign##int##bits##x##size vec; \ - vec.vecs[0] = v##sign##int##bits##x##halfsize##_splat(x); \ - vec.vecs[1] = v##sign##int##bits##x##halfsize##_splat(x); \ + vec.generic[0] = v##sign##int##bits##x##halfsize##_load_aligned(in); \ + vec.generic[1] = v##sign##int##bits##x##halfsize##_load_aligned(in + halfsize); \ return vec; \ } \ \ - VEC_DECL_STORE_ALIGNED(sign, bits, size) \ + static void v##sign##int##bits##x##size##_generic_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ { \ - v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[0], out); \ - v##sign##int##bits##x##halfsize##_store_aligned(vec.vecs[1], out + halfsize); \ - } \ - \ - VEC_DECL_STORE(sign, bits, size) \ - { \ - v##sign##int##bits##x##halfsize##_store(vec.vecs[0], out); \ - v##sign##int##bits##x##halfsize##_store(vec.vecs[1], out + halfsize); \ + v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[0], out); \ + v##sign##int##bits##x##halfsize##_store_aligned(vec.generic[1], out + halfsize); \ } \ \ - VEC_DEFINE_OP(ADD, add, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(SUB, sub, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(MUL, mul, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(AND, and, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(OR, or, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(XOR, xor, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(LSHIFT, lshift, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(RSHIFT, rshift, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(LRSHIFT, lrshift, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(DIV, div, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(AVG, avg, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(CMPLT, cmplt, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(CMPGT, cmpgt, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(CMPEQ, cmpeq, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(CMPGE, cmpge, sign, bits, size, halfsize) \ - VEC_DEFINE_OP(CMPLE, cmple, sign, bits, size, halfsize) - -#ifndef VEC_VUINT8X32 -# define VEC_VUINT8X32 -VEC_DEFINE_STRUCT(u, 8, 32, 16) -# define VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vuint8x32){ .vecs = { VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } }) -# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 8, 32, 16) -#endif - -#ifndef VEC_VUINT16X16 -# define VEC_VUINT16X16 -VEC_DEFINE_STRUCT(u, 16, 16, 8) -# define VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - ((vuint16x16){ .vecs = { VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } }) -# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 16, 16, 8) -#endif + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_generic = { \ + .load_aligned = v##sign##int##bits##x##size##_generic_load_aligned, \ + .store_aligned = v##sign##int##bits##x##size##_generic_store_aligned, \ + }; -#ifndef VEC_VUINT32X8 -# define VEC_VUINT32X8 -VEC_DEFINE_STRUCT(u, 32, 8, 4) -# define VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \ - ((vuint32x8){ .vecs = { VUINT32x4_CONSTANT(a, b, c, d), VUINT32x4_CONSTANT(e, f, g, h) } }) -# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 32, 8, 4) -#endif - -#ifndef VEC_VUINT64X4 -# define VEC_VUINT64X4 -VEC_DEFINE_STRUCT(u, 64, 4, 2) -# define VUINT64x4_CONSTANT(a, b, c, d) \ - ((vuint64x4){ .vecs = { VUINT64x2_CONSTANT(a, b), VUINT64x2_CONSTANT(c, d) } }) -# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 64, 4, 2) -#endif +#define VEC_GENERIC_DEFINE_OPERATIONS(bits, size, halfsize) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN( , , bits, size, halfsize) \ + VEC_GENERIC_DEFINE_OPERATIONS_SIGN(u, U, bits, size, halfsize) -#ifndef VEC_VINT8X32 -# define VEC_VINT8X32 -VEC_DEFINE_STRUCT(, 8, 32, 16) -# define VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vint8x32){ .vecs = { VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT8x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } }) -# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 8, 32, 16) -#endif - -#ifndef VEC_VINT16X16 -# define VEC_VINT16X16 -VEC_DEFINE_STRUCT(, 16, 16, 8) -# define VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - ((vint16x16){ .vecs = { VINT16x8_CONSTANT(a, b, c, d, e, f, g, h), VINT16x8_CONSTANT(i, j, k, l, m, n, o, p) } }) -# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 16, 16, 8) -#endif - -#ifndef VEC_VINT32X8 -# define VEC_VINT32X8 -VEC_DEFINE_STRUCT(, 32, 8, 4) -# define VINT32x8_CONSTANT(a, b, c, d, e, f, g, h) \ - ((vuint32x8){ .vecs = { VINT32x4_CONSTANT(a, b, c, d), VINT32x4_CONSTANT(e, f, g, h) } }) -# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 32, 8, 4) -#endif +// 128-bit +VEC_GENERIC_DEFINE_OPERATIONS(8, 16, 8) +VEC_GENERIC_DEFINE_OPERATIONS(16, 8, 4) +VEC_GENERIC_DEFINE_OPERATIONS(32, 4, 2) -#ifndef VEC_VINT64X4 -# define VEC_VINT64X4 -VEC_DEFINE_STRUCT(, 64, 4, 2) -# define VINT64x4_CONSTANT(a, b, c, d) \ - ((vint64x4){ .vecs = { VINT64x2_CONSTANT(a, b), VINT64x2_CONSTANT(c, d) } }) -# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 64, 4, 2) -#endif - -#ifndef VEC_VUINT8X64 -# define VEC_VUINT8X64 -VEC_DEFINE_STRUCT(u, 8, 64, 32) -# define VUINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \ - ((vuint8x64){ .vecs = { VUINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VUINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } }) -# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 8, 64, 32) -#endif - -#ifndef VEC_VUINT16X32 -# define VEC_VUINT16X32 -VEC_DEFINE_STRUCT(u, 16, 32, 16) -# define VUINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vuint16x32){ .vecs = { VUINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VUINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } }) -# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 16, 32, 16) -#endif - -#ifndef VEC_VUINT32X16 -# define VEC_VUINT32X16 -VEC_DEFINE_STRUCT(u, 32, 16, 8) -# define VUINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - ((vuint32x16){ .vecs = { VUINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VUINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } }) -# define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 32, 16, 8) -#endif +// 256-bit +VEC_GENERIC_DEFINE_OPERATIONS(8, 32, 16) +VEC_GENERIC_DEFINE_OPERATIONS(16, 16, 8) +VEC_GENERIC_DEFINE_OPERATIONS(32, 8, 4) +VEC_GENERIC_DEFINE_OPERATIONS(64, 4, 2) -#ifndef VEC_VUINT64X8 -# define VEC_VUINT64X8 -VEC_DEFINE_STRUCT(u, 64, 8, 4) -# define VUINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \ - ((vuint64x8){ .vecs = { VUINT64x4_CONSTANT(a, b, c, d), VUINT64x4_CONSTANT(e, f, g, h) } }) -# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 64, 8, 4) -#endif - -#ifndef VEC_VINT8X64 -# define VEC_VINT8X64 -VEC_DEFINE_STRUCT(, 8, 64, 32) -# define VINT8x64_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) \ - ((vint8x64){ .vecs = { VINT8x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af), VINT8x32_CONSTANT(ag, ah, ai, aj, ak, al, am, an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl) } }) -# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 8, 64, 32) -#endif +// 512-bit +VEC_GENERIC_DEFINE_OPERATIONS(8, 64, 32) +VEC_GENERIC_DEFINE_OPERATIONS(16, 32, 16) +VEC_GENERIC_DEFINE_OPERATIONS(32, 16, 8) +VEC_GENERIC_DEFINE_OPERATIONS(64, 8, 4) -#ifndef VEC_VINT16X32 -# define VEC_VINT16X32 -VEC_DEFINE_STRUCT(, 16, 32, 16) -# define VINT16x32_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) \ - ((vint16x32){ .vecs = { VINT16x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p), VINT16x16_CONSTANT(q, r, s, t, u, v, w, x, y, z, aa, ab, ac, ad, ae, af) } }) -# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 16, 32, 16) -#endif +#undef VEC_GENERIC_DEFINE_OPERATIONS +#undef VEC_GENERIC_DEFINE_OPERATIONS_SIGN -#ifndef VEC_VINT32X16 -# define VEC_VINT32X16 -VEC_DEFINE_STRUCT(, 32, 16, 8) -# define VINT32x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - ((vint32x16){ .vecs = { VINT32x8_CONSTANT(a, b, c, d, e, f, g, h), VINT32x8_CONSTANT(i, j, k, l, m, n, o, p) } }) -# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 32, 16, 8) -#endif - -#ifndef VEC_VINT64X8 -# define VEC_VINT64X8 -VEC_DEFINE_STRUCT(, 64, 8, 4) -# define VINT64x8_CONSTANT(a, b, c, d, e, f, g, h) \ - ((vint64x8){ .vecs = { VINT64x4_CONSTANT(a, b, c, d), VINT64x4_CONSTANT(e, f, g, h) } }) -# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 64, 8, 4) -#endif - -#undef VEC_DEFINE_STRUCT -#undef VEC_DEFINE_OPERATIONS -#undef VEC_DEFINE_OP +#endif /* VEC_IMPL_GENERIC_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/ppc/altivec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/ppc/altivec.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,183 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +/* Altivec vector support. */ + +#ifndef VEC_IMPL_PPC_ALTIVEC_H_ +#define VEC_IMPL_PPC_ALTIVEC_H_ + +#include +#include + +#include + +#define VEC_ALTIVEC_ALIGNMENT 16 + +/* GCC 4.2.1 on Mac OS X doesn't have these for some reason */ +#ifdef vec_mul +# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_mul(vec1.altivec, vec2.altivec) }; \ + } +# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \ + .mul = v##sign##int##bits##x##size##_altivec_mul, +#else +# define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) +# define VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) +#endif + +#ifdef vec_splats +# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_splat(sign##int##bits##_t x) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_splats(x) }; \ + } +# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \ + .splat = v##sign##int##bits##x##size##_altivec_splat, +#else +# define VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) +# define VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) +#endif + +#define VEC_ALTIVEC_uRSHIFT vec_sr +#define VEC_ALTIVEC_RSHIFT vec_sra + +#define VEC_ALTIVEC_DEFINE_uLRSHIFT(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lrshift(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_sr(vec1.altivec, vec2.altivec) }; \ + } +#define VEC_ALTIVEC_STRUCT_uLRSHIFT(sign, csign, bits, size) \ + .lrshift = v##sign##int##bits##x##size##_altivec_lrshift, + +#define VEC_ALTIVEC_DEFINE_LRSHIFT(sign, csign, bits, size) +#define VEC_ALTIVEC_STRUCT_LRSHIFT(sign, csign, bits, size) + +/* Since altivec conveniently made their API super user friendly, we can just use + * one giant macro to define literally everything */ +#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load_aligned(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_ld(0, in) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_load(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)) }; \ + } \ + \ + static void v##sign##int##bits##x##size##_altivec_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + vec_st(vec.altivec, 0, out); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_add(vec1.altivec, vec2.altivec) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_sub(vec1.altivec, vec2.altivec) }; \ + } \ + \ + VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_sl(vec1.altivec, vec2.altivec) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = VEC_ALTIVEC_##sign##RSHIFT(vec1.altivec, vec2.altivec) }; \ + } \ + \ + VEC_ALTIVEC_DEFINE_##sign##LRSHIFT(sign, csign, bits, size) \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_avg(vec1.altivec, vec2.altivec) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_and(vec1.altivec, vec2.altivec) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_or(vec1.altivec, vec2.altivec) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_altivec_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .altivec = vec_xor(vec1.altivec, vec2.altivec) }; \ + } \ + \ + VEC_ALTIVEC_DEFINE_SPLAT(sign, csign, bits, size) \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_altivec = { \ + .load_aligned = v##sign##int##bits##x##size##_altivec_load_aligned, \ + .load = v##sign##int##bits##x##size##_altivec_load, \ + .store_aligned = v##sign##int##bits##x##size##_altivec_store_aligned, \ + .add = v##sign##int##bits##x##size##_altivec_add, \ + .sub = v##sign##int##bits##x##size##_altivec_sub, \ + VEC_ALTIVEC_STRUCT_MUL(sign, csign, bits, size) \ + .lshift = v##sign##int##bits##x##size##_altivec_lshift, \ + .rshift = v##sign##int##bits##x##size##_altivec_rshift, \ + VEC_ALTIVEC_STRUCT_##sign##LRSHIFT(sign, csign, bits, size) \ + .avg = v##sign##int##bits##x##size##_altivec_avg, \ + .and = v##sign##int##bits##x##size##_altivec_and, \ + .or = v##sign##int##bits##x##size##_altivec_or, \ + .xor = v##sign##int##bits##x##size##_altivec_xor, \ + VEC_ALTIVEC_STRUCT_SPLAT(sign, csign, bits, size) \ + }; + +#define VEC_DEFINE_OPERATIONS(bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +VEC_DEFINE_OPERATIONS(8, 16) +VEC_DEFINE_OPERATIONS(16, 8) +VEC_DEFINE_OPERATIONS(32, 4) +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX +VEC_DEFINE_OPERATIONS(64, 2) +#endif + +#undef VEC_DEFINE_OPERATIONS +#undef VEC_DEFINE_OPERATIONS_SIGN +#undef VEC_ALTIVEC_DEFINE_MUL +#undef VEC_ALTIVEC_STRUCT_MUL +#undef VEC_ALTIVEC_DEFINE_LRSHIFT +#undef VEC_ALTIVEC_STRUCT_LRSHIFT +#undef VEC_ALTIVEC_DEFINE_uLRSHIFT +#undef VEC_ALTIVEC_STRUCT_uLRSHIFT +#undef VEC_ALTIVEC_DEFINE_SPLAT +#undef VEC_ALTIVEC_STRUCT_SPLAT +#undef VEC_ALTIVEC_uRSHIFT +#undef VEC_ALTIVEC_RSHIFT + +#endif /* VEC_IMPL_PPC_ALTIVEC_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/sse2.h --- a/include/vec/impl/sse2.h Tue Nov 19 15:55:01 2024 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,225 +0,0 @@ -/** - * vec - a tiny SIMD vector library in plain C99 - * - * Copyright (c) 2024 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#include - -#define VEC_SSE2_ALIGNMENT 16 - -#define VEC_SSE2_MUL_8x16(sign) \ - VEC_DECL_MUL(sign, 8, 16) \ - { \ - /* unpack and multiply */ \ - __m128i dst_even = _mm_mullo_epi16(vec1, vec2); \ - __m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8)); \ - \ - /* repack */ \ - return _mm_or_si128( \ - _mm_slli_epi16(dst_odd, 8), \ - _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ - ); \ - } - -#define VEC_SSE2_MUL_16x8(sign) \ - VEC_DECL_MUL(sign, 16, 8) \ - { \ - /* we have a real instruction for this */ \ - return _mm_mullo_epi16(vec1, vec2); \ - } - -#define VEC_SSE2_MUL_32x4(sign) \ - VEC_DECL_MUL(sign, 32, 4) \ - { \ - /* this was stolen from... somewhere :) */ \ - __m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); /* (-,a3,-,a1) */ \ - __m128i b13 = _mm_shuffle_epi32(vec2, 0xF5); /* (-,b3,-,b1) */ \ - __m128i prod02 = _mm_mul_epu32(vec1, vec2); /* (-,a2*b2,-,a0*b0) */ \ - __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ - __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ - __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ - return _mm_unpacklo_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \ - } - -#define VEC_SSE2_MUL_64x2(sign) \ - VEC_DECL_MUL(sign, 64, 2) \ - { \ - __m128i ac = _mm_mul_epu32(vec1, vec2); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \ - __m128i b = _mm_srli_epi64(vec1, 32); /* b = vec1 >> 32; */ \ - __m128i bc = _mm_mul_epu32(b, vec2); /* bc = b * (vec2 & UINT32_MAX); */ \ - __m128i d = _mm_srli_epi64(vec2, 32); /* d = vec2 >> 32; */ \ - __m128i ad = _mm_mul_epu32(vec1, d); /* ad = (vec1 & UINT32_MAX) * d; */ \ - __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \ - hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \ - return _mm_add_epi64(hi, ac); /* return ac + hi; */ \ - } - -#define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \ - VEC_DECL_LOAD_ALIGNED(sign, bits, size) \ - { \ - return _mm_load_si128((const __m128i *)in); \ - } \ - \ - VEC_DECL_LOAD(sign, bits, size) \ - { \ - return _mm_loadu_si128((const __m128i *)in); \ - } \ - \ - VEC_DECL_STORE_ALIGNED(sign, bits, size) \ - { \ - _mm_store_si128((__m128i *)out, vec); \ - } \ - \ - VEC_DECL_STORE(sign, bits, size) \ - { \ - _mm_storeu_si128((__m128i *)out, vec); \ - } \ - \ - VEC_DECL_ADD(sign, bits, size) \ - { \ - return _mm_add_epi##bits(vec1, vec2); \ - } \ - \ - VEC_DECL_SUB(sign, bits, size) \ - { \ - return _mm_sub_epi##bits(vec1, vec2); \ - } \ - \ - VEC_DECL_AND(sign, bits, size) \ - { \ - return _mm_and_si128(vec1, vec2); \ - } \ - \ - VEC_DECL_OR(sign, bits, size) \ - { \ - return _mm_or_si128(vec1, vec2); \ - } \ - \ - VEC_DECL_XOR(sign, bits, size) \ - { \ - return _mm_xor_si128(vec1, vec2); \ - } \ - \ - VEC_SSE2_MUL_##bits##x##size(sign) \ - \ - VEC_GENERIC_SPLAT(sign, csign, bits, size) \ - VEC_GENERIC_DIVIDE(sign, csign, bits, size) \ - VEC_GENERIC_SHIFTS(sign, csign, bits, size) \ - VEC_GENERIC_AVG(sign, bits, size) - -#define VEC_DEFINE_COMPARISONS_SIGNED(bits, size) \ - VEC_DECL_CMPEQ(, bits, size) \ - { \ - return _mm_cmpeq_epi##bits(vec1, vec2); \ - } \ - VEC_DECL_CMPLT(, bits, size) \ - { \ - return _mm_cmplt_epi##bits(vec1, vec2); \ - } \ - VEC_DECL_CMPGT(, bits, size) \ - { \ - return _mm_cmpgt_epi##bits(vec1, vec2); \ - } \ - VEC_GENERIC_THAN_OR_EQUAL(, bits, size) - -#ifndef VEC_VUINT16X8 -# define VEC_VUINT16X8 -typedef __m128i vuint16x8; -# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (_mm_setr_epi16(h, g, f, e, d, c, b, a)) -# define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 16, 8) -VEC_GENERIC_COMPARISONS(u, U, 16, 8) -#endif - -#ifndef VEC_VUINT32X4 -# define VEC_VUINT32X4 -typedef __m128i vuint32x4; -# define VUINT32x4_CONSTANT(a, b, c, d) \ - (_mm_setr_epi32(d, c, b, a)) -# define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 32, 4) -VEC_GENERIC_COMPARISONS(u, U, 32, 4) -#endif - -#ifndef VEC_VUINT64X2 -# define VEC_VUINT64X2 -typedef __m128i vuint64x2; -VEC_FUNC_KEYWORDS vuint64x2 VUINT64x2_CONSTANT(uint64_t a, uint64_t b) -{ - return _mm_setr_epi32(b, b >> 32, a, a >> 32); -} -# define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_DEFINE_OPERATIONS(u, U, 64, 2) -VEC_GENERIC_COMPARISONS(u, U, 64, 2) -#endif - -#ifndef VEC_VINT8X16 -# define VEC_VINT8X16 -typedef __m128i vint8x16; -# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ - (_mm_setr_epi8(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)) -# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 8, 16) -VEC_DEFINE_COMPARISONS_SIGNED(8, 16) -#endif - -#ifndef VEC_VINT16X8 -# define VEC_VINT16X8 -typedef __m128i vint16x8; -# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ - (_mm_setr_epi16(h, g, f, e, d, c, b, a)) -# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 16, 8) -VEC_DEFINE_COMPARISONS_SIGNED(16, 8) -#endif - -#ifndef VEC_VINT32X4 -# define VEC_VINT32X4 -typedef __m128i vint32x4; -# define VINT32x4_CONSTANT(a, b, c, d) \ - (_mm_setr_epi32(d, c, b, a)) -# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_DEFINE_OPERATIONS(, , 32, 4) -VEC_DEFINE_COMPARISONS_SIGNED(32, 4) -#endif - -#ifndef VEC_VINT64X2 -# define VEC_VINT64X2 -typedef __m128i vint64x2; -# define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT -VEC_FUNC_KEYWORDS vint64x2 VINT64x2_CONSTANT(int64_t a, int64_t b) -{ - return _mm_setr_epi32(b, vec_rshift(b, 32), a, vec_rshift(a, 32)); -} -VEC_DEFINE_OPERATIONS(, , 64, 2) -VEC_GENERIC_COMPARISONS(, , 64, 2) -#endif - -#undef VEC_DEFINE_OPERATIONS -#undef VEC_DEFINE_COMPARISONS_SIGNED - -/* multiply */ -#undef VEC_SSE2_MUL_8x16 -#undef VEC_SSE2_MUL_16x8 -#undef VEC_SSE2_MUL_32x4 -#undef VEC_SSE2_MUL_64x2 diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/x86/avx2.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/avx2.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,257 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_AVX2_H_ +#define VEC_IMPL_X86_AVX2_H_ + +#define VEC_AVX2_OPERATION_8x32_16x16(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m256i dst_even = _mm256_##op##_epi16(vec1.avx2, vec2.avx2); \ + __m256i dst_odd = _mm256_##op##_epi16(_mm256_srli_epi16(vec1.avx2, 8), _mm256_srli_epi16(vec2.avx2, 8)); \ + \ + /* repack */ \ + return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \ + _mm256_slli_epi16(dst_odd, 8), \ + _mm256_srli_epi16(_mm256_slli_epi16(dst_even, 8), 8) \ + )}; \ + } while (0) + +#define VEC_AVX2_OPERATION_8x32_32x8(op, sign) \ + do { \ + /* unpack */ \ + __m256i dst_1 = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \ + __m256i dst_2 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 8), _mm256_srli_epi32(vec2.avx2, 8)); \ + __m256i dst_3 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \ + __m256i dst_4 = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 24), _mm256_srli_epi32(vec2.avx2, 24)); \ + \ + /* repack */ \ + return (v##sign##int8x32){ .avx2 = _mm256_or_si256( \ + _mm256_or_si256( \ + _mm256_slli_epi32(dst_4, 8), \ + _mm256_srli_epi32(_mm256_slli_epi32(dst_3, 8), 8) \ + ), \ + _mm256_or_si256( \ + _mm256_slli_epi32(_mm256_slli_epi32(dst_2, 8), 16), \ + _mm256_srli_epi32(_mm256_slli_epi32(dst_1, 8), 24) \ + ) \ + )}; \ + } while (0) + +#define VEC_AVX2_OPERATION_16x16(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m256i dst_even = _mm256_##op##_epi32(vec1.avx2, vec2.avx2); \ + __m256i dst_odd = _mm256_##op##_epi32(_mm256_srli_epi32(vec1.avx2, 16), _mm256_srli_epi32(vec2.avx2, 16)); \ + \ + /* repack */ \ + return (v##sign##int16x16){ .avx2 = _mm256_or_si256( \ + _mm256_slli_epi32(dst_odd, 16), \ + _mm256_srli_epi32(_mm256_slli_epi16(dst_even, 16), 16) \ + )}; \ + } while (0) + +// shifting + +#define VEC_AVX2_LSHIFT_8x32(sign) \ + VEC_AVX2_OPERATION_8x32_32x8(sllv, sign) + +#define VEC_AVX2_LSHIFT_16x16(sign) \ + VEC_AVX2_OPERATION_16x16(sllv, sign) + +#define VEC_AVX2_LSHIFT_32x8(sign) \ + do { \ + return (v##sign##int32x8){ .avx2 = _mm256_sllv_epi32(vec1.avx2, vec2.avx2) }; \ + } while (0) + +#define VEC_AVX2_LSHIFT_64x4(sign) \ + do { \ + return (v##sign##int64x4){ .avx2 = _mm256_sllv_epi64(vec1.avx2, vec2.avx2) }; \ + } while (0) + +#define VEC_AVX2_RSHIFT_8x32(sign, aORl) \ + VEC_AVX2_OPERATION_8x32_32x8(sr##aORl##v, sign) + +#define VEC_AVX2_RSHIFT_16x16(sign, aORl) \ + VEC_AVX2_OPERATION_16x16(sr##aORl##v, sign) + +#define VEC_AVX2_RSHIFT_32x8(sign, aORl) \ + do { \ + return (v##sign##int32x8){ .avx2 = _mm256_sr##aORl##v_epi32(vec1.avx2, vec2.avx2) }; \ + } while (0) + +#define VEC_AVX2_aRSHIFT_64x4(sign) \ + do { \ + return v##sign##int64x4_fallback_rshift(vec1, vec2); \ + } while (0) + +#define VEC_AVX2_lRSHIFT_64x4(sign) \ + do { \ + return (v##sign##int64x4){ .avx2 = _mm256_srlv_epi64(vec1.avx2, vec2.avx2) }; \ + } while (0) + +#define VEC_AVX2_RSHIFT_64x4(sign, aORl) \ + VEC_AVX2_##aORl##RSHIFT_64x4(sign) + +// multiplication + +#define VEC_AVX2_MUL_8x32(sign) \ + VEC_AVX2_OPERATION_8x32_16x16(mullo, sign) + +#define VEC_AVX2_MUL_16x16(sign) \ + do { \ + return (v##sign##int16x16){ .avx2 = _mm256_mullo_epi16(vec1.avx2, vec2.avx2) }; \ + } while (0) + +#define VEC_AVX2_MUL_32x8(sign) \ + do { \ + return (v##sign##int32x8) { .avx2 = _mm256_mullo_epi32(vec1.avx2, vec2.avx2) }; \ + } while (0) + +#define VEC_AVX2_MUL_64x4(sign) \ + do { \ + __m256i ac = _mm256_mul_epu32(vec1.avx2, vec2.avx2); \ + __m256i b = _mm256_srli_epi64(vec1.avx2, 32); \ + __m256i bc = _mm256_mul_epu32(b, vec2.avx2); \ + __m256i d = _mm256_srli_epi64(vec2.avx2, 32); \ + __m256i ad = _mm256_mul_epu32(vec1.avx2, d); \ + __m256i hi = _mm256_add_epi64(bc, ad); \ + hi = _mm256_slli_epi64(hi, 32); \ + return (v##sign##int64x4) { .avx2 = _mm256_add_epi64(hi, ac) }; \ + } while (0) + +// operations + +#define VEC_AVX2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load_aligned(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .avx2 = _mm256_load_si256((const __m256i *)in) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_load(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .avx2 = _mm256_loadu_si256((const __m256i *)in) }; \ + } \ + \ + static void v##sign##int##bits##x##size##_avx2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + _mm256_store_si256((__m256i *)out, vec.avx2); \ + } \ + \ + static void v##sign##int##bits##x##size##_avx2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + _mm256_storeu_si256((__m256i *)out, vec.avx2); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx2 = _mm256_add_epi##bits(vec1.avx2, vec2.avx2) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx2 = _mm256_sub_epi##bits(vec1.avx2, vec2.avx2) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX2_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx2 = _mm256_and_si256(vec1.avx2, vec2.avx2) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx2 = _mm256_or_si256(vec1.avx2, vec2.avx2) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx2 = _mm256_xor_si256(vec1.avx2, vec2.avx2) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX2_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX2_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX2_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx2 = { \ + .load_aligned = v##sign##int##bits##x##size##_avx2_load_aligned, \ + .load = v##sign##int##bits##x##size##_avx2_load, \ + .store_aligned = v##sign##int##bits##x##size##_avx2_store_aligned, \ + .store = v##sign##int##bits##x##size##_avx2_store, \ + .add = v##sign##int##bits##x##size##_avx2_add, \ + .sub = v##sign##int##bits##x##size##_avx2_sub, \ + .mul = v##sign##int##bits##x##size##_avx2_mul, \ + .and = v##sign##int##bits##x##size##_avx2_and, \ + .or = v##sign##int##bits##x##size##_avx2_or, \ + .xor = v##sign##int##bits##x##size##_avx2_xor, \ + .lshift = v##sign##int##bits##x##size##_avx2_lshift, \ + .rshift = v##sign##int##bits##x##size##_avx2_rshift, \ + .lrshift = v##sign##int##bits##x##size##_avx2_lrshift, \ + }; + +#define VEC_AVX2_DEFINE_OPERATIONS(bits, size) \ + VEC_AVX2_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_AVX2_DEFINE_OPERATIONS_SIGN(u, bits, size) + +VEC_AVX2_DEFINE_OPERATIONS(8, 32) +VEC_AVX2_DEFINE_OPERATIONS(16, 16) +VEC_AVX2_DEFINE_OPERATIONS(32, 8) +VEC_AVX2_DEFINE_OPERATIONS(64, 4) + +#undef VEC_AVX2_DEFINE_OPERATIONS +#undef VEC_AVX2_DEFINE_OPERATIONS_SIGN +#undef VEC_AVX2_MUL_8x32 +#undef VEC_AVX2_MUL_16x16 +#undef VEC_AVX2_MUL_32x8 +#undef VEC_AVX2_MUL_64x4 +#undef VEC_AVX2_OPERATION_8x32_16x16 +#undef VEC_AVX2_OPERATION_8x32_32x8 +#undef VEC_AVX2_OPERATION_16x16 +#undef VEC_AVX2_LSHIFT_8x32 +#undef VEC_AVX2_LSHIFT_16x16 +#undef VEC_AVX2_LSHIFT_32x8 +#undef VEC_AVX2_LSHIFT_64x4 +#undef VEC_AVX2_RSHIFT_8x32 +#undef VEC_AVX2_RSHIFT_16x16 +#undef VEC_AVX2_RSHIFT_32x8 +#undef VEC_AVX2_aRSHIFT_64x4 +#undef VEC_AVX2_lRSHIFT_64x4 +#undef VEC_AVX2_RSHIFT_64x4 + +#endif /* VEC_IMPL_X86_AVX2_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/x86/avx512f.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/avx512f.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,254 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_AVX512F_H_ +#define VEC_IMPL_X86_AVX512F_H_ + +#define VEC_AVX512F_OPERATION_8x64(op, sign) \ + do { \ + /* unpack and add */ \ + __m512i dst_1 = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \ + __m512i dst_2 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 8), _mm512_srli_epi32(vec2.avx512f, 8)); \ + __m512i dst_3 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \ + __m512i dst_4 = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 24), _mm512_srli_epi32(vec2.avx512f, 24)); \ + \ + /* repack */ \ + return (v##sign##int8x64){ .avx512f = _mm512_or_si512( \ + _mm512_or_si512( \ + _mm512_slli_epi32(dst_4, 8), \ + _mm512_srli_epi32(_mm512_slli_epi32(dst_3, 8), 8) \ + ), \ + _mm512_or_si512( \ + _mm512_slli_epi32(_mm512_slli_epi32(dst_2, 8), 16), \ + _mm512_srli_epi32(_mm512_slli_epi32(dst_1, 8), 24) \ + ) \ + )}; \ + } while (0) + +#define VEC_AVX512F_OPERATION_16x32(op, sign) \ + do { \ + /* unpack and add */ \ + __m512i dst_even = _mm512_##op##_epi32(vec1.avx512f, vec2.avx512f); \ + __m512i dst_odd = _mm512_##op##_epi32(_mm512_srli_epi32(vec1.avx512f, 16), _mm512_srli_epi32(vec2.avx512f, 16)); \ + \ + /* repack */ \ + return (v##sign##int16x32){ .avx512f = _mm512_or_si512( \ + _mm512_slli_epi32(dst_odd, 16), \ + _mm512_srli_epi32(_mm512_slli_epi32(dst_even, 16), 16) \ + )}; \ + } while (0) + +#define VEC_AVX512F_ADD_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(add, sign) + +#define VEC_AVX512F_ADD_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(add, sign) + +#define VEC_AVX512F_ADD_32x16(sign) \ + do { \ + return (v##sign##int32x16) { .avx512f = _mm512_add_epi32(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_ADD_64x8(sign) \ + do { \ + return (v##sign##int64x8) { .avx512f = _mm512_add_epi64(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_SUB_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(sub, sign) + +#define VEC_AVX512F_SUB_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(sub, sign) + +#define VEC_AVX512F_SUB_32x16(sign) \ + do { \ + return (v##sign##int32x16) { .avx512f = _mm512_sub_epi32(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_SUB_64x8(sign) \ + do { \ + return (v##sign##int64x8) { .avx512f = _mm512_sub_epi64(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_MUL_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(mullo, sign) + +#define VEC_AVX512F_MUL_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(mullo, sign) + +#define VEC_AVX512F_MUL_32x16(sign) \ + do { \ + return (v##sign##int32x16) { .avx512f = _mm512_mullo_epi32(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_MUL_64x8(sign) \ + do { \ + __m512i ac = _mm512_mul_epu32(vec1.avx512f, vec2.avx512f); \ + __m512i b = _mm512_srli_epi64(vec1.avx512f, 32); \ + __m512i bc = _mm512_mul_epu32(b, vec2.avx512f); \ + __m512i d = _mm512_srli_epi64(vec2.avx512f, 32); \ + __m512i ad = _mm512_mul_epu32(vec1.avx512f, d); \ + __m512i hi = _mm512_add_epi64(bc, ad); \ + hi = _mm512_slli_epi64(hi, 32); \ + return (v##sign##int64x8) { .avx512f = _mm512_add_epi64(hi, ac) }; \ + } while (0) + +#define VEC_AVX512F_LSHIFT_8x64(sign) \ + VEC_AVX512F_OPERATION_8x64(sllv, sign) + +#define VEC_AVX512F_LSHIFT_16x32(sign) \ + VEC_AVX512F_OPERATION_16x32(sllv, sign) + +#define VEC_AVX512F_LSHIFT_32x16(sign) \ + do { \ + return (v##sign##int32x16){ .avx512f = _mm512_sllv_epi32(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_LSHIFT_64x8(sign) \ + do { \ + return (v##sign##int64x8){ .avx512f = _mm512_sllv_epi64(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_RSHIFT_8x64(sign, aORl) \ + VEC_AVX512F_OPERATION_8x64(sr##aORl##v, sign) + +#define VEC_AVX512F_RSHIFT_16x32(sign, aORl) \ + VEC_AVX512F_OPERATION_16x32(sr##aORl##v, sign) + +#define VEC_AVX512F_RSHIFT_32x16(sign, aORl) \ + do { \ + return (v##sign##int32x16){ .avx512f = _mm512_sr##aORl##v_epi32(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_RSHIFT_64x8(sign, aORl) \ + do { \ + return (v##sign##int64x8){ .avx512f = _mm512_sr##aORl##v_epi64(vec1.avx512f, vec2.avx512f) }; \ + } while (0) + +#define VEC_AVX512F_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load_aligned(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .avx512f = _mm512_load_si512((const __m512i *)in) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_load(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .avx512f = _mm512_loadu_si512((const __m512i *)in) }; \ + } \ + \ + static void v##sign##int##bits##x##size##_avx512f_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + _mm512_store_si512((__m512i *)out, vec.avx512f); \ + } \ + \ + static void v##sign##int##bits##x##size##_avx512f_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + _mm512_storeu_si512((__m512i *)out, vec.avx512f); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX512F_ADD_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX512F_SUB_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_AVX512F_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx512f = _mm512_and_si512(vec1.avx512f, vec2.avx512f) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx512f = _mm512_or_si512(vec1.avx512f, vec2.avx512f) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .avx512f = _mm512_xor_si512(vec1.avx512f, vec2.avx512f) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX512F_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX512F_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_avx512f_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_AVX512F_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_avx512f = { \ + .load_aligned = v##sign##int##bits##x##size##_avx512f_load_aligned, \ + .load = v##sign##int##bits##x##size##_avx512f_load, \ + .store_aligned = v##sign##int##bits##x##size##_avx512f_store_aligned, \ + .store = v##sign##int##bits##x##size##_avx512f_store, \ + .add = v##sign##int##bits##x##size##_avx512f_add, \ + .sub = v##sign##int##bits##x##size##_avx512f_sub, \ + .mul = v##sign##int##bits##x##size##_avx512f_mul, \ + .and = v##sign##int##bits##x##size##_avx512f_and, \ + .or = v##sign##int##bits##x##size##_avx512f_or, \ + .xor = v##sign##int##bits##x##size##_avx512f_xor, \ + }; + +#define VEC_AVX512F_DEFINE_OPERATIONS(bits, size) \ + VEC_AVX512F_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_AVX512F_DEFINE_OPERATIONS_SIGN(u, bits, size) + +VEC_AVX512F_DEFINE_OPERATIONS(8, 64) +VEC_AVX512F_DEFINE_OPERATIONS(16, 32) +VEC_AVX512F_DEFINE_OPERATIONS(32, 16) +VEC_AVX512F_DEFINE_OPERATIONS(64, 8) + +#undef VEC_AVX512F_DEFINE_OPERATIONS +#undef VEC_AVX512F_DEFINE_OPERATIONS_SIGN +#undef VEC_AVX512F_MUL_8x64 +#undef VEC_AVX512F_MUL_16x32 +#undef VEC_AVX512F_MUL_32x16 +#undef VEC_AVX512F_MUL_64x8 + +#undef VEC_AVX512F_LSHIFT_8x64 +#undef VEC_AVX512F_LSHIFT_16x32 +#undef VEC_AVX512F_LSHIFT_32x16 +#undef VEC_AVX512F_LSHIFT_64x8 + +#undef VEC_AVX512F_RSHIFT_8x64 +#undef VEC_AVX512F_RSHIFT_16x32 +#undef VEC_AVX512F_RSHIFT_32x16 +#undef VEC_AVX512F_RSHIFT_64x8 + +#endif /* VEC_IMPL_X86_AVX512F_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/x86/mmx.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/mmx.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,185 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_MMX_H_ +#define VEC_IMPL_X86_MMX_H_ + +#define VEC_MMX_OPERATION_8x8(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m64 dst_even = _mm_##op##_pi16(vec1.mmx, vec2.mmx); \ + __m64 dst_odd = _mm_##op##_pi16(_mm_srli_pi16(vec1.mmx, 8), _mm_srli_pi16(vec2.mmx, 8)); \ + \ + /* repack */ \ + return (v##sign##int8x8){ .mmx = _mm_or_si64( \ + _mm_slli_pi16(dst_odd, 8), \ + _mm_srli_pi16(_mm_slli_pi16(dst_even, 8), 8) \ + )}; \ + } while (0) + +// shifting +#define VEC_MMX_LSHIFT_8x8(sign) \ + VEC_MMX_OPERATION_8x8(sll, sign) + +#define VEC_MMX_LSHIFT_16x4(sign) \ + do { \ + return (v##sign##int16x4){ .mmx = _mm_sll_pi16(vec1.mmx, vec2.mmx) }; \ + } while (0) + +#define VEC_MMX_LSHIFT_32x2(sign) \ + do { \ + return (v##sign##int32x2){ .mmx = _mm_sll_pi32(vec1.mmx, vec2.mmx) }; \ + } while (0) + +#define VEC_MMX_RSHIFT_8x8(sign, aORl) \ + VEC_MMX_OPERATION_8x8(sr##aORl, sign) + +#define VEC_MMX_RSHIFT_16x4(sign, aORl) \ + do { \ + return (v##sign##int16x4){ .mmx = _mm_sr##aORl##_pi16(vec1.mmx, vec2.mmx) }; \ + } while (0) + +#define VEC_MMX_RSHIFT_32x2(sign, aORl) \ + do { \ + return (v##sign##int32x2){ .mmx = _mm_sr##aORl##_pi32(vec1.mmx, vec2.mmx) }; \ + } while (0) + +// shared between MMX variations +#define VEC_MMX_MUL_8x8(sign) \ + VEC_MMX_OPERATION_8x8(mullo, sign) + +#define VEC_MMX_MUL_16x4(sign) \ + do { \ + /* we have a real instruction for this */ \ + return (v##sign##int16x4){ .mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx) }; \ + } while (0) + +#define VEC_MMX_MUL_32x2(sign) \ + do { \ + __m64 ac = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \ + __m64 b = _mm_srli_pi32(vec1.mmx, 16); \ + __m64 bc = _mm_mullo_pi16(b, vec2.mmx); \ + __m64 d = _mm_srli_pi32(vec2.mmx, 16); \ + __m64 ad = _mm_mullo_pi16(vec1.mmx, d); \ + __m64 hi = _mm_add_pi32(bc, ad); \ + hi = _mm_slli_pi32(hi, 16); \ + return (v##sign##int32x2) { .mmx = _mm_add_pi32(hi, ac) }; /* return ac + hi; */ \ + } while (0) + +#define VEC_MMX_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_load_aligned(const sign##int##bits##_t in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + memcpy(&vec.mmx, in, sizeof(vec.mmx)); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_mmx_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + memcpy(out, &vec.mmx, sizeof(vec.mmx)); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .mmx = _mm_add_pi##bits(vec1.mmx, vec2.mmx) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .mmx = _mm_sub_pi##bits(vec1.mmx, vec2.mmx) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_MMX_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .mmx = _mm_and_si64(vec1.mmx, vec2.mmx) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .mmx = _mm_or_si64(vec1.mmx, vec2.mmx) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .mmx = _mm_xor_si64(vec1.mmx, vec2.mmx) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_MMX_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_MMX_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_mmx_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_MMX_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_mmx = { \ + .load_aligned = v##sign##int##bits##x##size##_mmx_load_aligned, \ + .load = v##sign##int##bits##x##size##_mmx_load_aligned, \ + .store_aligned = v##sign##int##bits##x##size##_mmx_store_aligned, \ + .store = v##sign##int##bits##x##size##_mmx_store_aligned, \ + .add = v##sign##int##bits##x##size##_mmx_add, \ + .sub = v##sign##int##bits##x##size##_mmx_sub, \ + .mul = v##sign##int##bits##x##size##_mmx_mul, \ + .and = v##sign##int##bits##x##size##_mmx_and, \ + .or = v##sign##int##bits##x##size##_mmx_or, \ + .xor = v##sign##int##bits##x##size##_mmx_xor, \ + .lshift = v##sign##int##bits##x##size##_mmx_lshift, \ + .rshift = v##sign##int##bits##x##size##_mmx_rshift, \ + .lrshift = v##sign##int##bits##x##size##_mmx_lrshift, \ + }; + +#define VEC_MMX_DEFINE_OPERATIONS(bits, size) \ + VEC_MMX_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_MMX_DEFINE_OPERATIONS_SIGN(u, bits, size) + +VEC_MMX_DEFINE_OPERATIONS(8, 8) +VEC_MMX_DEFINE_OPERATIONS(16, 4) +VEC_MMX_DEFINE_OPERATIONS(32, 2) + +#undef VEC_MMX_DEFINE_OPERATIONS +#undef VEC_MMX_DEFINE_OPERATIONS_SIGN +#undef VEC_MMX_MUL_8x8 +#undef VEC_MMX_MUL_16x4 +#undef VEC_MMX_MUL_32x2 +#undef VEC_MMX_OPERATION_8x8 +#undef VEC_MMX_LSHIFT_8x8 +#undef VEC_MMX_LSHIFT_16x4 +#undef VEC_MMX_LSHIFT_32x2 +#undef VEC_MMX_RSHIFT_8x8 +#undef VEC_MMX_RSHIFT_16x4 +#undef VEC_MMX_RSHIFT_32x2 + +#endif /* VEC_IMPL_X86_MMX_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/x86/sse2.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/sse2.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,230 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_SSE2_H_ +#define VEC_IMPL_X86_SSE2_H_ + +#define VEC_SSE2_OPERATION_8x16(op, sign) \ + do { \ + /* unpack and multiply */ \ + __m128i dst_even = _mm_##op##_epi16(vec1.sse, vec2.sse); \ + __m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \ + \ + /* repack */ \ + return (v##sign##int8x16){ .sse = _mm_or_si128( \ + _mm_slli_epi16(dst_odd, 8), \ + _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ + )}; \ + } while (0) + +// shifting +#define VEC_SSE2_LSHIFT_8x16(sign) \ + VEC_SSE2_OPERATION_8x16(sll, sign) + +#define VEC_SSE2_LSHIFT_16x8(sign) \ + do { \ + return (v##sign##int16x8){ .sse = _mm_sll_epi16(vec1.sse, vec2.sse) }; \ + } while (0) + +#define VEC_SSE2_LSHIFT_32x4(sign) \ + do { \ + return (v##sign##int32x4){ .sse = _mm_sll_epi32(vec1.sse, vec2.sse) }; \ + } while (0) + +#define VEC_SSE2_LSHIFT_64x2(sign) \ + do { \ + return (v##sign##int64x2){ .sse = _mm_sll_epi64(vec1.sse, vec2.sse) }; \ + } while (0) + +#define VEC_SSE2_RSHIFT_8x16(sign, aORl) \ + VEC_SSE2_OPERATION_8x16(sr##aORl, sign) + +#define VEC_SSE2_RSHIFT_16x8(sign, aORl) \ + do { \ + return (v##sign##int16x8){ .sse = _mm_sr##aORl##_epi16(vec1.sse, vec2.sse) }; \ + } while (0) + +#define VEC_SSE2_RSHIFT_32x4(sign, aORl) \ + do { \ + return (v##sign##int32x4){ .sse = _mm_sr##aORl##_epi32(vec1.sse, vec2.sse) }; \ + } while (0) + +#define VEC_SSE2_aRSHIFT_64x2(sign) \ + do { \ + return v##sign##int64x2_fallback_rshift(vec1, vec2); \ + } while (0) + +#define VEC_SSE2_lRSHIFT_64x2(sign) \ + do { \ + return (v##sign##int64x2){ .sse = _mm_srl_epi64(vec1.sse, vec2.sse) }; \ + } while (0) + +#define VEC_SSE2_RSHIFT_64x2(sign, aORl) \ + VEC_SSE2_##aORl##RSHIFT_64x2(sign) + +// shared between SSE2 variations +#define VEC_SSE2_MUL_8x16(sign) \ + VEC_SSE2_OPERATION_8x16(mullo, sign) + +#define VEC_SSE2_MUL_16x8(sign) \ + do { \ + /* we have a real instruction for this */ \ + return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \ + } while (0) + +#define VEC_SSE2_MUL_32x4(sign) \ + do { \ + /* this was stolen from... somewhere :) */ \ + __m128i a13 = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \ + __m128i b13 = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \ + __m128i prod02 = _mm_mul_epu32(vec1.sse, vec2.sse); /* (-,a2*b2,-,a0*b0) */ \ + __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ + __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ + __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ + return (v##sign##int32x4) { .sse = _mm_unpacklo_epi64(prod01, prod23) }; /* (ab3,ab2,ab1,ab0) */ \ + } while (0) + +#define VEC_SSE2_MUL_64x2(sign) \ + do { \ + __m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \ + __m128i b = _mm_srli_epi64(vec1.sse, 32); /* b = vec1 >> 32; */ \ + __m128i bc = _mm_mul_epu32(b, vec2.sse); /* bc = b * (vec2 & UINT32_MAX); */ \ + __m128i d = _mm_srli_epi64(vec2.sse, 32); /* d = vec2 >> 32; */ \ + __m128i ad = _mm_mul_epu32(vec1.sse, d); /* ad = (vec1 & UINT32_MAX) * d; */ \ + __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \ + hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \ + return (v##sign##int64x2) { .sse = _mm_add_epi64(hi, ac) }; /* return ac + hi; */ \ + } while (0) + +#define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .sse = _mm_load_si128((const __m128i *)in) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const sign##int##bits##_t in[size]) \ + { \ + return (v##sign##int##bits##x##size) { .sse = _mm_loadu_si128((const __m128i *)in) }; \ + } \ + \ + static void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + _mm_store_si128((__m128i *)out, vec.sse); \ + } \ + \ + static void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + _mm_storeu_si128((__m128i *)out, vec.sse); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .sse = _mm_add_epi##bits(vec1.sse, vec2.sse) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .sse = _mm_sub_epi##bits(vec1.sse, vec2.sse) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + VEC_SSE2_MUL_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return (v##sign##int##bits##x##size) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_SSE2_LSHIFT_##bits##x##size(sign); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_SSE2_RSHIFT_##bits##x##size(sign, a); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + VEC_SSE2_RSHIFT_##bits##x##size(sign, l); \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \ + .load_aligned = v##sign##int##bits##x##size##_sse2_load_aligned, \ + .load = v##sign##int##bits##x##size##_sse2_load, \ + .store_aligned = v##sign##int##bits##x##size##_sse2_store_aligned, \ + .store = v##sign##int##bits##x##size##_sse2_store, \ + .add = v##sign##int##bits##x##size##_sse2_add, \ + .sub = v##sign##int##bits##x##size##_sse2_sub, \ + .mul = v##sign##int##bits##x##size##_sse2_mul, \ + .and = v##sign##int##bits##x##size##_sse2_and, \ + .or = v##sign##int##bits##x##size##_sse2_or, \ + .xor = v##sign##int##bits##x##size##_sse2_xor, \ + .lshift = v##sign##int##bits##x##size##_sse2_lshift, \ + .rshift = v##sign##int##bits##x##size##_sse2_rshift, \ + .lrshift = v##sign##int##bits##x##size##_sse2_lrshift, \ + }; + +#define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \ + VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size) + +// SSE is *only* 128-bit +VEC_SSE2_DEFINE_OPERATIONS(8, 16) +VEC_SSE2_DEFINE_OPERATIONS(16, 8) +VEC_SSE2_DEFINE_OPERATIONS(32, 4) +VEC_SSE2_DEFINE_OPERATIONS(64, 2) + +#undef VEC_SSE2_DEFINE_OPERATIONS +#undef VEC_SSE2_DEFINE_OPERATIONS_SIGN +#undef VEC_SSE2_MUL_8x16 +#undef VEC_SSE2_MUL_16x8 +#undef VEC_SSE2_MUL_32x4 +#undef VEC_SSE2_MUL_64x2 +#undef VEC_SSE2_OPERATION_8x16 +#undef VEC_SSE2_LSHIFT_8x16 +#undef VEC_SSE2_LSHIFT_16x8 +#undef VEC_SSE2_LSHIFT_32x4 +#undef VEC_SSE2_LSHIFT_64x2 +#undef VEC_SSE2_RSHIFT_8x16 +#undef VEC_SSE2_RSHIFT_16x8 +#undef VEC_SSE2_RSHIFT_32x4 +#undef VEC_SSE2_aRSHIFT_64x2 +#undef VEC_SSE2_lRSHIFT_64x2 +#undef VEC_SSE2_RSHIFT_64x2 + +#endif /* VEC_IMPL_X86_SSE2_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/impl/x86/sse41.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/sse41.h Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,55 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_SSE41_H_ +#define VEC_IMPL_X86_SSE41_H_ + +#define VEC_SSE41_DEFINE_OPERATIONS(sign) \ + static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \ + { \ + return (v##sign##int32x4){ .sse = _mm_mullo_epi32(vec1.sse, vec2.sse) }; \ + } \ + \ + static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \ + .load_aligned = v##sign##int32x4_sse2_load_aligned, \ + .load = v##sign##int32x4_sse2_load, \ + .store_aligned = v##sign##int32x4_sse2_store_aligned, \ + .store = v##sign##int32x4_sse2_store, \ + .add = v##sign##int32x4_sse2_add, \ + .sub = v##sign##int32x4_sse2_sub, \ + .mul = v##sign##int32x4_sse41_mul, \ + .and = v##sign##int32x4_sse2_and, \ + .or = v##sign##int32x4_sse2_or, \ + .xor = v##sign##int32x4_sse2_xor, \ + .lshift = v##sign##int32x4_sse2_lshift, \ + .rshift = v##sign##int32x4_sse2_rshift, \ + .lrshift = v##sign##int32x4_sse2_lrshift, \ + }; + +VEC_SSE41_DEFINE_OPERATIONS() +VEC_SSE41_DEFINE_OPERATIONS(u) + +#undef VEC_SSE41_DEFINE_OPERATIONS + +#endif /* VEC_IMPL_X86_SSE41_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 include/vec/vec.h --- a/include/vec/vec.h Tue Nov 19 15:55:01 2024 -0500 +++ b/include/vec/vec.h Wed Nov 20 04:10:37 2024 -0500 @@ -29,6 +29,10 @@ #include #include +#define VEC_MAX(a, b) (((a) > (b)) ? (a) : (b)) +#define VEC_MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define VEC_CLAMP(x, min, max) (VEC_MIN(VEC_MAX((x), (min)), (max))) + #define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ (((a) >= (x)) && \ ((a) > x || (b) >= (y)) && \ @@ -39,14 +43,11 @@ /* GCC/clang attributes */ #if defined(__has_attribute) -# if __has_attribute(__always_inline__) -# define VEC_ALWAYS_INLINE __attribute__((__always_inline__)) -# endif # if __has_attribute(__aligned__) # define VEC_ALIGNED(x) __attribute__((__aligned__(x))) # endif # if __has_attribute(__vector_size__) -# define VEC_HAVE_GNUC_VECTORS +# define VEC_COMPILER_HAS_GNUC_VECTORS # endif #endif @@ -56,26 +57,6 @@ # endif #endif -/* FIXME: gcc 4.2 on Mac OS X doesn't have always_inline, - * even though docs and many online sources say that it - * should have it. */ - -#ifndef VEC_ALWAYS_INLINE -# define VEC_ALWAYS_INLINE -#endif - -/* Allow users to define all of the symbols externally in - * one translation unit, or as a shared library. */ -#ifdef VEC_EXTERN -# ifdef VEC_EXTERN_DEFINE -# define VEC_FUNC_KEYWORDS extern inline -# else -# define VEC_FUNC_KEYWORDS inline -# endif -#else -# define VEC_FUNC_KEYWORDS static inline VEC_ALWAYS_INLINE -#endif - #if (__STDC_VERSION__ >= 201112L) # define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg) #else @@ -97,131 +78,226 @@ /* --------------------------------------------------------------- */ /* Detect compiler SIMD support */ +#define VEC_GENERIC_ALIGNMENT 1 +#define VEC_ALTIVEC_ALIGNMENT 16 +#define VEC_SSE2_ALIGNMENT 16 +#define VEC_AVX2_ALIGNMENT 32 +#define VEC_AVX512F_ALIGNMENT 64 + +// for the generic implementation, 64-bit +#define VINT8x8_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VINT16x4_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VINT32x2_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VUINT8x8_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VUINT16x4_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VUINT32x2_ALIGNMENT VEC_GENERIC_ALIGNMENT + +#define VINT8x16_ALIGNMENT VINT8x8_ALIGNMENT +#define VINT16x8_ALIGNMENT VINT16x4_ALIGNMENT +#define VINT32x4_ALIGNMENT VINT32x2_ALIGNMENT +#define VINT64x2_ALIGNMENT VEC_GENERIC_ALIGNMENT +#define VUINT8x16_ALIGNMENT VUINT8x8_ALIGNMENT +#define VUINT16x8_ALIGNMENT VUINT16x4_ALIGNMENT +#define VUINT32x4_ALIGNMENT VUINT32x2_ALIGNMENT +#define VUINT64x2_ALIGNMENT VEC_GENERIC_ALIGNMENT + +#define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT +#define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT +#define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT +#define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT +#define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT +#define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT +#define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT +#define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT + +#define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT +#define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT +#define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT +#define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT +#define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT +#define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT +#define VUINT32x16_ALIGNMENT VUINT32x8_ALIGNMENT +#define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT + +#ifndef VEC_SUPPRESS_HW + // IIRC `__VEC__' is also defined, but I don't know for sure. // IBM says that `__ALTIVEC__' is standard though. #ifdef __ALTIVEC__ # include # define VEC_COMPILER_HAS_ALTIVEC +# if defined(__POWER8__) && defined(__VSX__) +# define VEC_COMPILER_HAS_ALTIVEC_VSX +# endif +# if VINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT8x16_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT16x8_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +# if VUINT64x2_ALIGNMENT < VEC_ALTIVEC_ALIGNMENT +# undef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT VEC_ALTIVEC_ALIGNMENT +# endif +#endif -# define VINT8x16_ALIGNMENT 16 -# define VINT16x8_ALIGNMENT 16 -# define VINT32x4_ALIGNMENT 16 -# define VINT64x2_ALIGNMENT 16 +#ifdef __MMX__ +# include +# define VEC_COMPILER_HAS_MMX #endif #ifdef __SSE2__ -# include +# include # define VEC_COMPILER_HAS_SSE2 -# ifdef __SSE42__ -# define VEC_COMPILER_HAS_SSE42 +# ifdef __SSE4_1__ +# define VEC_COMPILER_HAS_SSE41 +# endif +# if VINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT # endif - -# define VINT8x16_ALIGNMENT 16 -# define VINT16x8_ALIGNMENT 16 -# define VINT32x4_ALIGNMENT 16 -# define VINT64x2_ALIGNMENT 16 -#endif - -#ifndef VINT8x16_ALIGNMENT -# define VINT8x16_ALIGNMENT 1 -#endif -#ifndef VINT16x8_ALIGNMENT -# define VINT16x8_ALIGNMENT 1 -#endif -#ifndef VINT32x4_ALIGNMENT -# define VINT32x4_ALIGNMENT 1 -#endif -#ifndef VINT64x2_ALIGNMENT -# define VINT64x2_ALIGNMENT 1 -#endif -#ifndef VUINT8x16_ALIGNMENT -# define VUINT8x16_ALIGNMENT 1 -#endif -#ifndef VUINT16x8_ALIGNMENT -# define VUINT16x8_ALIGNMENT 1 -#endif -#ifndef VUINT32x4_ALIGNMENT -# define VUINT32x4_ALIGNMENT 1 -#endif -#ifndef VUINT64x2_ALIGNMENT -# define VUINT64x2_ALIGNMENT 1 +# if VINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT8x16_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT16x8_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif +# if VUINT64x2_ALIGNMENT < VEC_SSE2_ALIGNMENT +# undef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT +# endif #endif -// generic 256-bit is just doubled 128-bit -#ifndef VINT8x32_ALIGNMENT -# define VINT8x32_ALIGNMENT VINT8x16_ALIGNMENT -#endif -#ifndef VINT16x16_ALIGNMENT -# define VINT16x16_ALIGNMENT VINT16x8_ALIGNMENT -#endif -#ifndef VINT32x8_ALIGNMENT -# define VINT32x8_ALIGNMENT VINT32x4_ALIGNMENT -#endif -#ifndef VINT64x4_ALIGNMENT -# define VINT64x4_ALIGNMENT VINT64x2_ALIGNMENT -#endif -#ifndef VUINT8x32_ALIGNMENT -# define VUINT8x32_ALIGNMENT VUINT8x16_ALIGNMENT -#endif -#ifndef VUINT16x16_ALIGNMENT -# define VUINT16x16_ALIGNMENT VUINT16x8_ALIGNMENT -#endif -#ifndef VUINT32x8_ALIGNMENT -# define VUINT32x8_ALIGNMENT VUINT32x4_ALIGNMENT -#endif -#ifndef VUINT64x4_ALIGNMENT -# define VUINT64x4_ALIGNMENT VUINT64x2_ALIGNMENT +#ifdef __AVX2__ +# include +# define VEC_COMPILER_HAS_AVX2 +# if VINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT8x32_ALIGNMENT +# define VINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT16x16_ALIGNMENT +# define VINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT32x8_ALIGNMENT +# define VINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VINT64x4_ALIGNMENT +# define VINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT8x32_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT8x32_ALIGNMENT +# define VUINT8x32_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT16x16_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT16x16_ALIGNMENT +# define VUINT16x16_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT32x8_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT32x8_ALIGNMENT +# define VUINT32x8_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif +# if VUINT64x4_ALIGNMENT < VEC_AVX2_ALIGNMENT +# undef VUINT64x4_ALIGNMENT +# define VUINT64x4_ALIGNMENT VEC_AVX2_ALIGNMENT +# endif #endif -// generic 512-bit is just doubled 256-bit -#ifndef VINT8x64_ALIGNMENT -# define VINT8x64_ALIGNMENT VINT8x32_ALIGNMENT -#endif -#ifndef VINT16x32_ALIGNMENT -# define VINT16x32_ALIGNMENT VINT16x16_ALIGNMENT -#endif -#ifndef VINT32x16_ALIGNMENT -# define VINT32x16_ALIGNMENT VINT32x8_ALIGNMENT -#endif -#ifndef VINT64x8_ALIGNMENT -# define VINT64x8_ALIGNMENT VINT64x4_ALIGNMENT -#endif -#ifndef VUINT8x64_ALIGNMENT -# define VUINT8x64_ALIGNMENT VUINT8x32_ALIGNMENT -#endif -#ifndef VUINT16x32_ALIGNMENT -# define VUINT16x32_ALIGNMENT VUINT16x16_ALIGNMENT -#endif -#ifndef VUINT32x16_ALIGNMENT -# define VUINT32x16_ALIGNMENT VUINT32x16_ALIGNMENT -#endif -#ifndef VUINT64x8_ALIGNMENT -# define VUINT64x8_ALIGNMENT VUINT64x4_ALIGNMENT +#ifdef __AVX512F__ +# include +# define VEC_COMPILER_HAS_AVX512F +# if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT8x64_ALIGNMENT +# define VINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT16x32_ALIGNMENT +# define VINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT32x16_ALIGNMENT +# define VINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VINT64x8_ALIGNMENT +# define VINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT8x64_ALIGNMENT +# define VUINT8x64_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT16x32_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT16x32_ALIGNMENT +# define VUINT16x32_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT32x16_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT32x16_ALIGNMENT +# define VUINT32x16_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif +# if VUINT64x8_ALIGNMENT < VEC_AVX512F_ALIGNMENT +# undef VUINT64x8_ALIGNMENT +# define VUINT64x8_ALIGNMENT VEC_AVX512F_ALIGNMENT +# endif #endif -/* --------------------------------------------------------------- */ -/* Detect CPU SIMD support */ - -// stubs for now... will be implemented sometime -#define VEC_CPU_have_SSE2() (0) -#define VEC_CPU_have_SSE42() (0) -#define VEC_CPU_have_ALTIVEC() (0) -#define VEC_CPU_have_ALTIVEC_VSX() (0) +#endif /* --------------------------------------------------------------- */ /* bit shift */ -VEC_FUNC_KEYWORDS uintmax_t vec_ulrshift(uintmax_t x, unsigned int y) +inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y) { return x >> y; } -VEC_FUNC_KEYWORDS uintmax_t vec_ullshift(uintmax_t x, unsigned int y) +inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y) { return x << y; } -VEC_FUNC_KEYWORDS intmax_t vec_lrshift(intmax_t x, unsigned int y) +inline intmax_t vec_lrshift(intmax_t x, unsigned int y) { // reinterpret as unsigned integer and then shift union { @@ -234,7 +310,7 @@ return xx.d; } -VEC_FUNC_KEYWORDS intmax_t vec_llshift(intmax_t x, unsigned int y) +inline intmax_t vec_llshift(intmax_t x, unsigned int y) { // reinterpret as unsigned integer and then shift union { @@ -247,12 +323,12 @@ return xx.d; } -VEC_FUNC_KEYWORDS uintmax_t vec_urshift(uintmax_t x, unsigned int y) +inline uintmax_t vec_urshift(uintmax_t x, unsigned int y) { return x >> y; } -VEC_FUNC_KEYWORDS uintmax_t vec_ulshift(uintmax_t x, unsigned int y) +inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y) { return x << y; } @@ -283,7 +359,7 @@ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. **/ -VEC_FUNC_KEYWORDS intmax_t vec_rshift(intmax_t x, unsigned int y) +inline intmax_t vec_rshift(intmax_t x, unsigned int y) { static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); @@ -302,7 +378,7 @@ return xx.d; } -VEC_FUNC_KEYWORDS intmax_t vec_lshift(intmax_t x, unsigned int y) +inline intmax_t vec_lshift(intmax_t x, unsigned int y) { static const uintmax_t roffset = ((uintmax_t)1) << ((sizeof(intmax_t) * CHAR_BIT) - 1); @@ -320,110 +396,246 @@ return xx.d; } +#ifdef VEC_IMPLEMENTATION +extern inline uintmax_t vec_ulrshift(uintmax_t x, unsigned int y); +extern inline uintmax_t vec_ullshift(uintmax_t x, unsigned int y); +extern inline intmax_t vec_lrshift(intmax_t x, unsigned int y); +extern inline intmax_t vec_llshift(intmax_t x, unsigned int y); +extern inline uintmax_t vec_urshift(uintmax_t x, unsigned int y); +extern inline uintmax_t vec_ulshift(uintmax_t x, unsigned int y); +extern inline intmax_t vec_rshift(intmax_t x, unsigned int y); +extern inline intmax_t vec_lshift(intmax_t x, unsigned int y); +#endif + /* --------------------------------------------------------------- */ /* Array alignment macros */ -#include - +/* the alignment must be specified in bytes and must be a multiple of the + * type size. it is always assumed that the type will be on a boundary of + * its size, which may or may not be true */ #ifdef VEC_ALIGNED # define VEC_ALIGNED_ARRAY(type, var, length, align) \ VEC_ALIGNED(align) type var[length] # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ (sizeof(var)) #else -/* the alignment must be specified in bytes and must be a multiple of the - * type size. it is always assumed that the type will be on a boundary of - * its size, which may or may not be true */ # define VEC_ALIGNED_ARRAY(type, var, length, align) \ - VEC_STATIC_ASSERT(align % sizeof(type) == 0 && align != 0, "vec: alignment needs to be a multiple of the type size and non-zero"); \ - type vec_##var##_unaligned_[(length) + (align / sizeof(type)) - 1]; \ + VEC_STATIC_ASSERT(align && ((align & (align - 1)) == 0), "vec: alignment must be a power of two"); \ + type vec_##var##_unaligned_[(length) + (align / sizeof(type))]; \ type *var = (type *)(((uintptr_t)vec_##var##_unaligned_ + (align - 1)) & ~(align - 1)); \ VEC_ASSERT(((uintptr_t)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ - (sizeof(vec_##var##_unaligned_) - ((align) - 1)) + (sizeof(vec_##var##_unaligned_) - (align - 1)) #endif -#define VEC_ALIGNED_ARRAY_LENGTH(var, align) \ - (VEC_ALIGNED_ARRAY_SIZEOF(var, align)/sizeof(*var)) +#define VEC_ALIGNED_ARRAY_LENGTH(var) \ + (VEC_ALIGNED_ARRAY_SIZEOF(var)/sizeof(*var)) // ------------------------------------------------------------ // predefined variants for each vector type +#define VINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 8, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x8_ALIGNMENT) +#define VINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x8_ALIGNMENT) +#define VINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x8_ALIGNMENT == 0) + +#define VINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 4, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x4_ALIGNMENT) +#define VINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x4_ALIGNMENT) +#define VINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x4_ALIGNMENT == 0) + +#define VINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 2, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x2_ALIGNMENT) +#define VINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x2_ALIGNMENT) +#define VINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x2_ALIGNMENT == 0) + +#define VUINT8x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 8, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x8_ALIGNMENT) +#define VUINT8x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x8_ALIGNMENT == 0) + +#define VUINT16x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 4, VUINT16x4_ALIGNMENT) +#define VUINT16x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x4_ALIGNMENT) +#define VUINT16x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x4_ALIGNMENT == 0) + +#define VUINT32x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 2, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x2_ALIGNMENT) +#define VUINT32x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x2_ALIGNMENT == 0) + #define VINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 16, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x16_ALIGNMENT) +#define VINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x16_ALIGNMENT) #define VINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x16_ALIGNMENT == 0) #define VINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 8, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x8_ALIGNMENT) +#define VINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x8_ALIGNMENT) #define VINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x8_ALIGNMENT == 0) #define VINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 4, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x4_ALIGNMENT) +#define VINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x4_ALIGNMENT) #define VINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x4_ALIGNMENT == 0) #define VINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 2, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x2_ALIGNMENT) +#define VINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x2_ALIGNMENT) #define VINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x2_ALIGNMENT == 0) #define VUINT8x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 16, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x16_ALIGNMENT) +#define VUINT8x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x16_ALIGNMENT) #define VUINT8x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x16_ALIGNMENT == 0) #define VUINT16x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 8, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x8_ALIGNMENT) +#define VUINT16x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x8_ALIGNMENT) #define VUINT16x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x8_ALIGNMENT == 0) #define VUINT32x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 4, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x4_ALIGNMENT) +#define VUINT32x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x4_ALIGNMENT) #define VUINT32x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x4_ALIGNMENT == 0) #define VUINT64x2_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 2, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x2_ALIGNMENT) +#define VUINT64x2_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x2_ALIGNMENT) #define VUINT64x2_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x2_ALIGNMENT == 0) #define VINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 32, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x32_ALIGNMENT) +#define VINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x32_ALIGNMENT) #define VINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x32_ALIGNMENT == 0) #define VINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 16, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x16_ALIGNMENT) +#define VINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x16_ALIGNMENT) #define VINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) #define VINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 8, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x8_ALIGNMENT) +#define VINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x8_ALIGNMENT) #define VINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x8_ALIGNMENT == 0) #define VINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 4, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x4_ALIGNMENT) +#define VINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x4_ALIGNMENT) #define VINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x4_ALIGNMENT == 0) #define VUINT8x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 32, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x32_ALIGNMENT) +#define VUINT8x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x32_ALIGNMENT) #define VUINT8x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x32_ALIGNMENT == 0) #define VUINT16x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 16, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x16_ALIGNMENT) +#define VUINT16x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x16_ALIGNMENT) #define VUINT16x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) #define VUINT32x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 8, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x8_ALIGNMENT) +#define VUINT32x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x8_ALIGNMENT) #define VUINT32x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x8_ALIGNMENT == 0) #define VUINT64x4_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 4, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x4_ALIGNMENT) +#define VUINT64x4_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x4_ALIGNMENT) #define VUINT64x4_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x4_ALIGNMENT == 0) #define VINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int8_t, var, 64, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT8x64_ALIGNMENT) +#define VINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT8x64_ALIGNMENT) #define VINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT8x64_ALIGNMENT == 0) -#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x16_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int16_t, var, 32, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT16x32_ALIGNMENT) +#define VINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT16x32_ALIGNMENT) #define VINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT16x16_ALIGNMENT == 0) #define VINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int32_t, var, 16, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT32x16_ALIGNMENT) +#define VINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT32x16_ALIGNMENT) #define VINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT32x16_ALIGNMENT == 0) #define VINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(int64_t, var, 8, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VINT64x8_ALIGNMENT) +#define VINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VINT64x8_ALIGNMENT) #define VINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VINT64x8_ALIGNMENT == 0) #define VUINT8x64_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint8_t, var, 64, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT8x64_ALIGNMENT) +#define VUINT8x64_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT8x64_ALIGNMENT) #define VUINT8x64_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT8x64_ALIGNMENT == 0) -#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x16_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint16_t, var, 32, VUINT16x32_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT16x32_ALIGNMENT) +#define VUINT16x32_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT16x32_ALIGNMENT) #define VUINT16x32_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT16x16_ALIGNMENT == 0) #define VUINT32x16_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint32_t, var, 16, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT32x16_ALIGNMENT) +#define VUINT32x16_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT32x16_ALIGNMENT) #define VUINT32x16_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT32x16_ALIGNMENT == 0) #define VUINT64x8_ALIGNED_ARRAY(var) VEC_ALIGNED_ARRAY(uint64_t, var, 8, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_SIZEOF(var) VEC_ALIGNED_ARRAY_SIZEOF(var, VUINT64x8_ALIGNMENT) +#define VUINT64x8_ALIGNED_ARRAY_LENGTH(var) VEC_ALIGNED_ARRAY_LENGTH(var, VUINT64x8_ALIGNMENT) #define VUINT64x8_PTR_ALIGNED(ptr) (((uintptr_t)ptr) % VUINT64x8_ALIGNMENT == 0) /* --------------------------------------------------------------- */ /* Defines the structures for each vector type */ +// 64-bit +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif + + uint8_t generic[8]; +} vuint8x8; + +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif + + uint16_t generic[4]; +} vuint16x4; + +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif + + uint32_t generic[2]; +} vuint32x2; + +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif + + int8_t generic[8]; +} vint8x8; + +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif + + int16_t generic[4]; +} vint16x4; + +typedef union { +#ifdef VEC_COMPILER_HAS_MMX + __m64 mmx; +#endif + + int32_t generic[2]; +} vint32x2; + // 128-bit typedef union { #ifdef VEC_COMPILER_HAS_SSE2 @@ -432,7 +644,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector unsigned char altivec; #endif - uint8_t generic[16]; + vuint8x8 generic[2]; } vuint8x16; typedef union { @@ -442,7 +654,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector unsigned short altivec; #endif - uint16_t generic[8]; + vuint16x4 generic[2]; } vuint16x8; typedef union { @@ -452,7 +664,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector unsigned int altivec; #endif - uint32_t generic[4]; + vuint32x2 generic[2]; } vuint32x4; typedef union { @@ -472,7 +684,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector signed char altivec; #endif - int8_t generic[16]; + vint8x8 generic[2]; } vint8x16; typedef union { @@ -482,7 +694,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector signed short altivec; #endif - int16_t generic[8]; + vint16x4 generic[2]; } vint16x8; typedef union { @@ -492,7 +704,7 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector signed int altivec; #endif - int32_t generic[4]; + vint32x2 generic[2]; } vint32x4; typedef union { @@ -507,1162 +719,615 @@ // 256-bit typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vuint8x16 generic[2]; } vuint8x32; typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vuint16x8 generic[2]; } vuint16x16; typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vuint32x4 generic[2]; } vuint32x8; typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vuint64x2 generic[2]; } vuint64x4; typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vint8x16 generic[2]; } vint8x32; typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vint16x8 generic[2]; } vint16x16; typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vint32x4 generic[2]; } vint32x8; typedef union { +#ifdef VEC_COMPILER_HAS_AVX2 + __m256i avx2; +#endif vint64x2 generic[2]; } vint64x4; // 512-bit typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vuint8x32 generic[2]; } vuint8x64; typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vuint16x16 generic[2]; } vuint16x32; typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vuint32x8 generic[2]; } vuint32x16; typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vuint64x4 generic[2]; } vuint64x8; typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vint8x32 generic[2]; } vint8x64; typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vint16x16 generic[2]; } vint16x32; typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vint32x8 generic[2]; } vint32x16; typedef union { +#ifdef VEC_COMPILER_HAS_AVX512F + __m512i avx512f; +#endif vint64x4 generic[2]; } vint64x8; -// -------------------------------------------------------------------------------- -// okay, now onto the actual functions: -// -// we have generic variations of every major operation EXCEPT aligned load and -// aligned store. this means that a vector implementation can be created with -// only aligned load and aligned store implemented, which sucks, but it werks +// --------------------------------------------------------------------------------- +// function declarations + +int vec_init(void); + +#define VEC_DECLARE_OPERATIONS_SIGN(sign, bits, size) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]); \ + void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ + void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); -#define VEC_GENERIC_OPERATION(op, sign, csign, bits, size) \ - do { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr2); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ - v##sign##int##bits##x##size##_store_aligned(vec2, varr2); \ - \ - for (int i = 0; i < size; i++) varr1[i] = (op); \ - \ - return v##sign##int##bits##x##size##_load_aligned(varr1); \ - } while (0) +#define VEC_DECLARE_OPERATIONS(bits, size) \ + VEC_DECLARE_OPERATIONS_SIGN( , bits, size) \ + VEC_DECLARE_OPERATIONS_SIGN(u, bits, size) + +// 64-bit +VEC_DECLARE_OPERATIONS(8, 8) +VEC_DECLARE_OPERATIONS(16, 4) +VEC_DECLARE_OPERATIONS(32, 2) + +// 128-bit +VEC_DECLARE_OPERATIONS(8, 16) +VEC_DECLARE_OPERATIONS(16, 8) +VEC_DECLARE_OPERATIONS(32, 4) +VEC_DECLARE_OPERATIONS(64, 2) -#define VEC_GENERIC_ADD(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] + varr2[i], sign, csign, bits, size) -#define VEC_GENERIC_SUB(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] - varr2[i], sign, csign, bits, size) -#define VEC_GENERIC_MUL(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] * varr2[i], sign, csign, bits, size) -#define VEC_GENERIC_DIV(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr2[i] ? (varr1[i] / varr2[i]) : 0, sign, csign, bits, size) -#define VEC_GENERIC_AND(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] & varr2[i], sign, csign, bits, size) -#define VEC_GENERIC_OR(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] | varr2[i], sign, csign, bits, size) -#define VEC_GENERIC_XOR(sign, csign, bits, size) VEC_GENERIC_OPERATION(varr1[i] ^ varr2[i], sign, csign, bits, size) +// 256-bit +VEC_DECLARE_OPERATIONS(8, 32) +VEC_DECLARE_OPERATIONS(16, 16) +VEC_DECLARE_OPERATIONS(32, 8) +VEC_DECLARE_OPERATIONS(64, 4) -#define VEC_GENERIC_CMP(op, sign, csign, bits, size) \ - VEC_GENERIC_OPERATION((varr1[i] op varr1[i]) ? csign##INT##bits##_MAX : 0, sign, csign, bits, size) +// 512-bit +VEC_DECLARE_OPERATIONS(8, 64) +VEC_DECLARE_OPERATIONS(16, 32) +VEC_DECLARE_OPERATIONS(32, 16) +VEC_DECLARE_OPERATIONS(64, 8) -#define VEC_GENERIC_CMPLT(sign, csign, bits, size) VEC_GENERIC_CMP(<, sign, csign, bits, size) -#define VEC_GENERIC_CMPLE(sign, csign, bits, size) VEC_GENERIC_CMP(<=, sign, csign, bits, size) -#define VEC_GENERIC_CMPEQ(sign, csign, bits, size) VEC_GENERIC_CMP(==, sign, csign, bits, size) -#define VEC_GENERIC_CMPGE(sign, csign, bits, size) VEC_GENERIC_CMP(>=, sign, csign, bits, size) -#define VEC_GENERIC_CMPGT(sign, csign, bits, size) VEC_GENERIC_CMP(>, sign, csign, bits, size) +#undef VEC_DECLARE_OPERATIONS +#undef VEC_DECLARE_OPERATIONS_SIGN + +// --------------------------------------------------------------------------------- +// okay, now we can actually implement the functions + +#ifdef VEC_IMPLEMENTATION -#define VEC_GENERIC_SHIFT(op, sign, csign, bits, size) \ - do { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(varr1); \ - VUINT##bits##x##size##_ALIGNED_ARRAY(varr2); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec1, varr1); \ - vuint##bits##x##size##_store_aligned(vec2, varr2); \ - \ - for (int i = 0; i < size; i++) varr1[i] = (op); \ - \ - return v##sign##int##bits##x##size##_load_aligned(varr1); \ - } while (0) +// Fallback functions, need to be defined before everything else. +#include "impl/fallback.h" -#define VEC_GENERIC_LSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lshift(varr1[i], varr2[i]), sign, csign, bits, size) -#define VEC_GENERIC_RSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##rshift(varr1[i], varr2[i]), sign, csign, bits, size) -#define VEC_GENERIC_LRSHIFT(sign, csign, bits, size) VEC_GENERIC_SHIFT(vec_##sign##lrshift(varr1[i], varr2[i]), sign, csign, bits, size) +// okay, these are filled in for each supported backend +#define VEC_DEFINE_IMPL_STRUCT_SIGN(sign, bits, size) \ + typedef struct { \ + v##sign##int##bits##x##size (*splat)(sign##int##bits##_t x); \ + v##sign##int##bits##x##size (*load_aligned)(const sign##int##bits##_t in[size]); \ + v##sign##int##bits##x##size (*load)(const sign##int##bits##_t in[size]); \ + void (*store_aligned)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ + void (*store)(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]); \ + v##sign##int##bits##x##size (*add)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*sub)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*mul)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*div)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*avg)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*and)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*or)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*xor)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*not)(v##sign##int##bits##x##size vec); \ + v##sign##int##bits##x##size (*cmplt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*cmple)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*cmpeq)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*cmpge)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*cmpgt)(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2); \ + v##sign##int##bits##x##size (*lshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size (*rshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + v##sign##int##bits##x##size (*lrshift)(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2); \ + } v##sign##int##bits##x##size##_impl; -#ifdef VEC_COMPILER_HAS_SSE2 -// these are shared between SSE2 variations -# define VEC_SSE2_MUL_8x16(sign) \ - do { \ - /* unpack and multiply */ \ - __m128i dst_even = _mm_mullo_epi16(vec1.sse, vec2.sse); \ - __m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1.sse, 8), _mm_srli_epi16(vec2.sse, 8)); \ - \ - /* repack */ \ - return (v##sign##int8x16){ .sse = _mm_or_si128( \ - _mm_slli_epi16(dst_odd, 8), \ - _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \ - )}; \ - } while (0) +#define VEC_DEFINE_IMPL_STRUCT(bits, size) \ + VEC_DEFINE_IMPL_STRUCT_SIGN( , bits, size) \ + VEC_DEFINE_IMPL_STRUCT_SIGN(u, bits, size) -# define VEC_SSE2_MUL_16x8(sign) \ - do { \ - /* we have a real instruction for this */ \ - return (v##sign##int16x8){ .sse = _mm_mullo_epi16(vec1.sse, vec2.sse) }; \ - } while (0) +// 64-bit +VEC_DEFINE_IMPL_STRUCT(8, 8) +VEC_DEFINE_IMPL_STRUCT(16, 4) +VEC_DEFINE_IMPL_STRUCT(32, 2) + +// 128-bit +VEC_DEFINE_IMPL_STRUCT(8, 16) +VEC_DEFINE_IMPL_STRUCT(16, 8) +VEC_DEFINE_IMPL_STRUCT(32, 4) +VEC_DEFINE_IMPL_STRUCT(64, 2) -# define VEC_SSE2_MUL_32x4(sign) \ - do { \ - /* this was stolen from... somewhere :) */ \ - __m128i a13 = _mm_shuffle_epi32(vec1.sse, 0xF5); /* (-,a3,-,a1) */ \ - __m128i b13 = _mm_shuffle_epi32(vec2.sse, 0xF5); /* (-,b3,-,b1) */ \ - __m128i prod02 = _mm_mul_epu32(vec1, vec2); /* (-,a2*b2,-,a0*b0) */ \ - __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \ - __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \ - __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \ - return (v##sign##int32x4) {.sse = _mm_unpacklo_epi64(prod01, prod23)}; /* (ab3,ab2,ab1,ab0) */ \ - } while (0) +// 256-bit +VEC_DEFINE_IMPL_STRUCT(8, 32) +VEC_DEFINE_IMPL_STRUCT(16, 16) +VEC_DEFINE_IMPL_STRUCT(32, 8) +VEC_DEFINE_IMPL_STRUCT(64, 4) -# define VEC_SSE2_MUL_64x2(sign) \ - do { \ - __m128i ac = _mm_mul_epu32(vec1.sse, vec2.sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \ - __m128i b = _mm_srli_epi64(vec1.sse, 32); /* b = vec1 >> 32; */ \ - __m128i bc = _mm_mul_epu32(b, vec2.sse); /* bc = b * (vec2 & UINT32_MAX); */ \ - __m128i d = _mm_srli_epi64(vec2.sse, 32); /* d = vec2 >> 32; */ \ - __m128i ad = _mm_mul_epu32(vec1.sse, d); /* ad = (vec1 & UINT32_MAX) * d; */ \ - __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \ - hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \ - return (v##sign##int64x2) {.sse = _mm_add_epi64(hi, ac); } /* return ac + hi; */ \ - } while (0) +// 512-bit +VEC_DEFINE_IMPL_STRUCT(8, 64) +VEC_DEFINE_IMPL_STRUCT(16, 32) +VEC_DEFINE_IMPL_STRUCT(32, 16) +VEC_DEFINE_IMPL_STRUCT(64, 8) + +#undef VEC_DEFINE_IMPL_STRUCT +#undef VEC_DEFINE_IMPL_STRUCT_SIGN + +// ------------------------------------------------------------------------ + +#ifdef VEC_COMPILER_HAS_ALTIVEC +# include "impl/ppc/altivec.h" #endif -// -------------------------------------------------------------------------------- -// vuint8x16 implementation - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load_aligned(const uint8_t in[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vuint8x16) { .sse = _mm_load_si128((__m128i *)in) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_ld(0, in); - } else -#endif - { - vuint8x16 vec; - memcpy(vec.generic, in, sizeof(vec.generic)); - return vec; - } - - VEC_ASSERT(0, "No suitable load_aligned variant found"); - - return (vuint8x16){ 0 }; -} - -VEC_FUNC_KEYWORDS void vuint8x16_store_aligned(vuint8x16 vec, uint8_t out[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - _mm_store_si128((__m128i *)out, vec.sse); - return; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - vec_st(vec.altivec, 0, out); - return; - } else -#endif - { - memcpy(out, vec.generic, sizeof(vec.generic)); - return; - } - - VEC_ASSERT(0, "No suitable aligned store variant found"); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_splat(uint8_t x) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else +#ifdef VEC_COMPILER_HAS_AVX512F +# include "impl/x86/avx512f.h" #endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return (vuint8x16){ .altivec = vec_splat_u8(x) }; - } else -#endif - { - return (vuint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} }; - } - - // okay, we don't have a regular thing. call the load function with a splatted array - VUINT8x16_ALIGNED_ARRAY(arr); - for (int i = 0; i < 16; i++) arr[i] = x; - return vuint8x16_load_aligned(arr); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_load(const uint8_t in[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vuint8x16) { .sse = _mm_loadu_si128((__m128i *)in) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)); - } else -#endif - { - vuint8x16 vec; - memcpy(vec.generic, in, sizeof(vec.generic)); - return vec; - } - - // ok, we don't have unaligned load, copy the array - // and call the aligned load function - VUINT8x16_ALIGNED_ARRAY(aligned_in); - memcpy(aligned_in, in, 16); - return vuint8x16_load_aligned(aligned_in); -} - -VEC_FUNC_KEYWORDS void vuint8x16_store(vuint8x16 vec, uint8_t out[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - _mm_storeu_si128((__m128i *)out, vec.sse); - return; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // noop - } else -#endif - { - memcpy(out, vec.generic, sizeof(vec.generic)); - return; - } - // no unaligned store? use the aligned version - VUINT8x16_ALIGNED_ARRAY(aligned_out); - vuint8x16_store_aligned(vec, aligned_out); - - // then copy to the output buffer - memcpy(out, aligned_out, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_add(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vuint8x16) { .sse = _mm_add_epi8(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_add(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] += vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_ADD(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_sub(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vuint8x16) { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_sub(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] -= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_SUB(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_mul(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - VEC_SSE2_MUL_8x16(u); - } else +#ifdef VEC_COMPILER_HAS_AVX2 +# include "impl/x86/avx2.h" #endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { -# ifdef vec_mul // this isn't available on older compilers - return vec_mul(vec1.altivec, vec2.altivec); -# endif - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i]; - return vec1; - } - VEC_GENERIC_MUL(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_div(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX - if (VEC_CPU_have_ALTIVEC_VSX()) { - return vec_div(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0; - return vec1; - } - - VEC_GENERIC_DIV(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_and(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vuint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_and(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_AND(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_or(vuint8x16 vec1, vuint8x16 vec2) -{ #ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vuint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_or(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_OR(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_xor(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vuint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_xor(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_XOR(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lshift(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - //noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_sl(vec1, vec2); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulshift(vec1.generic[i], vec2.generic[i]); - return vec1; - } - - VEC_GENERIC_LSHIFT(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_rshift(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - //noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_sl(vec1, vec2); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec_urshift(vec1.generic[i], vec2.generic[i]); - return vec1; - } - - VEC_GENERIC_RSHIFT(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_lrshift(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - //noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_sl(vec1, vec2); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec_ulrshift(vec1.generic[i], vec2.generic[i]); - return vec1; - } - - VEC_GENERIC_LRSHIFT(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_avg(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_avg(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (uint8_t)(vec1.generic[i] + vec2.generic[i]) / 2; - return vec1; - } - - return vuint8x16_div(vuint8x16_add(vec1, vec2), vuint8x16_splat(2)); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmplt(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] < vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPLT(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmple(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] <= vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPLE(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpeq(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] == vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPEQ(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpgt(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] > vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPGT(u, U, 8, 16); -} - -VEC_FUNC_KEYWORDS vuint8x16 vuint8x16_cmpge(vuint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] >= vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPGE(u, U, 8, 16); -} - -// -------------------------------------------------------------------------------- -// vint8x16 implementation - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_load_aligned(const int8_t in[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vint8x16) { .sse = _mm_load_si128((__m128i *)in) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return (vint8x16) { .altivec = vec_ld(0, in) }; - } else +# include "impl/x86/sse2.h" #endif - { - vint8x16 vec; - memcpy(vec.generic, in, sizeof(vec.generic)); - return vec; - } - - VEC_ASSERT(0, "No suitable load_aligned variant found"); - - return (vint8x16){ 0 }; -} - -VEC_FUNC_KEYWORDS void vint8x16_store_aligned(vint8x16 vec, int8_t out[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - _mm_store_si128((__m128i *)out, vec.sse); - return; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - vec_st(vec.altivec, 0, out); - return; - } else -#endif - { - memcpy(out, vec.generic, sizeof(vec.generic)); - return; - } - - VEC_ASSERT(0, "No suitable aligned store variant found"); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_splat(int8_t x) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return (vint8x16){ .altivec = vec_splat_s8(x) }; - } else -#endif - { - return (vint8x16){ .generic = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x} }; - } - - // okay, we don't have a regular thing. call the load function with a splatted array - VINT8x16_ALIGNED_ARRAY(arr); - for (int i = 0; i < 16; i++) arr[i] = x; - return vint8x16_load_aligned(arr); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_load(const int8_t in[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vint8x16) { .sse = _mm_loadu_si128((__m128i *)in) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return (vint8x16) { .altivec = vec_perm(vec_ld(0, in), vec_ld(16, in), vec_lvsl(0, in)) }; - } else -#endif - { - vint8x16 vec; - memcpy(vec.generic, in, sizeof(vec.generic)); - return vec; - } - - // ok, we don't have unaligned load, copy the array - // and call the aligned load function - VINT8x16_ALIGNED_ARRAY(aligned_in); - memcpy(aligned_in, in, 16); - return vint8x16_load_aligned(aligned_in); -} - -VEC_FUNC_KEYWORDS void vint8x16_store(vint8x16 vec, int8_t out[16]) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - _mm_storeu_si128((__m128i *)out, vec.sse); - return; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // noop - } else -#endif - { - memcpy(out, vec.generic, sizeof(vec.generic)); - return; - } - - // no unaligned store? use the aligned version - VINT8x16_ALIGNED_ARRAY(aligned_out); - vint8x16_store_aligned(vec, aligned_out); - - // then copy to the output buffer - memcpy(out, aligned_out, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_add(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vint8x16) { .sse = _mm_add_epi8(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return (vint8x16) { .altivec = vec_add(vec1.altivec, vec2.altivec) }; - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] += vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_ADD(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_sub(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vint8x16) { .sse = _mm_sub_epi8(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return (vint8x16) { .altivec = vec_sub(vec1.altivec, vec2.altivec) }; - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] -= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_SUB(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_mul(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - VEC_SSE2_MUL_8x16(); - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { -# ifdef vec_mul // this isn't available on older compilers - return (vint8x16) { .altivec = vec_mul(vec1.altivec, vec2.altivec) }; -# endif - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] *= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_MUL(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_div(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX - if (VEC_CPU_have_ALTIVEC_VSX()) { - return (vint8x16) { .altivec = vec_div(vec1.altivec, vec2.altivec) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // noop - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec2.generic[i] ? (vec1.generic[i] / vec2.generic[i]) : 0; - return vec1; - } - - VEC_GENERIC_DIV(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_and(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vint8x16) { .sse = _mm_and_si128(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return (vint8x16) {.altivec = vec_and(vec1.altivec, vec2.altivec) }; - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] &= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_ADD(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_or(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vint8x16) { .sse = _mm_or_si128(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_or(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] |= vec2.generic[i]; - return vec1; - } - VEC_GENERIC_OR(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_xor(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - return (vint8x16) { .sse = _mm_xor_si128(vec1.sse, vec2.sse) }; - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_xor(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] ^= vec2.generic[i]; - return vec1; - } - - VEC_GENERIC_XOR(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_lshift(vint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - //noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_sl(vec1, vec2); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lshift(vec1.generic[i], vec2.generic[i]); - return vec1; - } - - VEC_GENERIC_LSHIFT(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_rshift(vint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - //noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_sl(vec1, vec2); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec_rshift(vec1.generic[i], vec2.generic[i]); - return vec1; - } - - VEC_GENERIC_RSHIFT(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_lrshift(vint8x16 vec1, vuint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - //noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_sl(vec1, vec2); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = vec_lrshift(vec1.generic[i], vec2.generic[i]); - return vec1; - } - - VEC_GENERIC_LRSHIFT(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_avg(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - return vec_avg(vec1.altivec, vec2.altivec); - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (int8_t)(vec1.generic[i] + vec2.generic[i]) / 2; - return vec1; - } - - return vint8x16_div(vint8x16_add(vec1, vec2), vint8x16_splat(2)); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmplt(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else +// depends on SSE2 functions; the only thing SSE4.1 provides for us +// is a native 32-bit multiply +#ifdef VEC_COMPILER_HAS_SSE41 +# include "impl/x86/sse41.h" #endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] < vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - VEC_GENERIC_CMPLT(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmple(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] <= vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPLE(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpeq(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho :) - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] == vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPEQ(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpgt(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else +#ifdef VEC_COMPILER_HAS_MMX +# include "impl/x86/mmx.h" #endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] > vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - VEC_GENERIC_CMPGT(, , 8, 16); -} - -VEC_FUNC_KEYWORDS vint8x16 vint8x16_cmpge(vint8x16 vec1, vint8x16 vec2) -{ -#ifdef VEC_COMPILER_HAS_SSE2 - if (VEC_CPU_have_SSE2()) { - // noop - } else -#endif -#ifdef VEC_COMPILER_HAS_ALTIVEC - if (VEC_CPU_have_ALTIVEC()) { - // these functions exist, no internet rn tho - } else -#endif - { - for (int i = 0; i < 16; i++) vec1.generic[i] = (vec1.generic[i] >= vec2.generic[i]) ? UINT8_MAX : 0; - return vec1; - } - - VEC_GENERIC_CMPGE(, , 8, 16); -} - -/* ----------------------------------------------------------------- */ -/* bitwise NOT is just an XOR with UINT[BITS]_MAX */ - -#define DEFINE_NOT_OPERATION(sign, bits, size) \ - VEC_FUNC_KEYWORDS v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ - { \ - return v##sign##int##bits##x##size##_xor(vec, v##sign##int##bits##x##size##_splat((sign##int##bits##_t)UINT##bits##_MAX)); \ - } - -DEFINE_NOT_OPERATION(, 8, 16) -DEFINE_NOT_OPERATION(u, 8, 16) - -DEFINE_NOT_OPERATION(, 8, 32) -DEFINE_NOT_OPERATION(, 16, 16) -DEFINE_NOT_OPERATION(, 32, 8) -DEFINE_NOT_OPERATION(, 64, 4) -DEFINE_NOT_OPERATION(u, 8, 32) -DEFINE_NOT_OPERATION(u, 16, 16) -DEFINE_NOT_OPERATION(u, 32, 8) -DEFINE_NOT_OPERATION(u, 64, 4) - -DEFINE_NOT_OPERATION(, 8, 64) -DEFINE_NOT_OPERATION(, 16, 32) -DEFINE_NOT_OPERATION(, 32, 16) -DEFINE_NOT_OPERATION(, 64, 8) -DEFINE_NOT_OPERATION(u, 8, 64) -DEFINE_NOT_OPERATION(u, 16, 32) -DEFINE_NOT_OPERATION(u, 32, 16) -DEFINE_NOT_OPERATION(u, 64, 8) - -#undef DEFINE_NOT_OPERATION +#include "impl/generic.h" /* ---------------------------------------------------------------- */ -/* cleanup */ -#undef VEC_OPERATION_DECL -#undef VEC_OPERATION_THIS_DECL -#undef VEC_TWOWAY_DECL +#include "impl/cpu.h" // CPU detection crap + +// 64-bit +static vint8x8_impl *vint8x8_impl_cpu = &vint8x8_impl_generic; +static vuint8x8_impl *vuint8x8_impl_cpu = &vuint8x8_impl_generic; +static vint16x4_impl *vint16x4_impl_cpu = &vint16x4_impl_generic; +static vuint16x4_impl *vuint16x4_impl_cpu = &vuint16x4_impl_generic; +static vint32x2_impl *vint32x2_impl_cpu = &vint32x2_impl_generic; +static vuint32x2_impl *vuint32x2_impl_cpu = &vuint32x2_impl_generic; + +// 128-bit +static vint8x16_impl *vint8x16_impl_cpu = &vint8x16_impl_generic; +static vuint8x16_impl *vuint8x16_impl_cpu = &vuint8x16_impl_generic; +static vint16x8_impl *vint16x8_impl_cpu = &vint16x8_impl_generic; +static vuint16x8_impl *vuint16x8_impl_cpu = &vuint16x8_impl_generic; +static vint32x4_impl *vint32x4_impl_cpu = &vint32x4_impl_generic; +static vuint32x4_impl *vuint32x4_impl_cpu = &vuint32x4_impl_generic; +static vint64x2_impl *vint64x2_impl_cpu = &vint64x2_impl_generic; +static vuint64x2_impl *vuint64x2_impl_cpu = &vuint64x2_impl_generic; + +// 256-bit +static vint8x32_impl *vint8x32_impl_cpu = &vint8x32_impl_generic; +static vuint8x32_impl *vuint8x32_impl_cpu = &vuint8x32_impl_generic; +static vint16x16_impl *vint16x16_impl_cpu = &vint16x16_impl_generic; +static vuint16x16_impl *vuint16x16_impl_cpu = &vuint16x16_impl_generic; +static vint32x8_impl *vint32x8_impl_cpu = &vint32x8_impl_generic; +static vuint32x8_impl *vuint32x8_impl_cpu = &vuint32x8_impl_generic; +static vint64x4_impl *vint64x4_impl_cpu = &vint64x4_impl_generic; +static vuint64x4_impl *vuint64x4_impl_cpu = &vuint64x4_impl_generic; + +// 512-bit +static vint8x64_impl *vint8x64_impl_cpu = &vint8x64_impl_generic; +static vuint8x64_impl *vuint8x64_impl_cpu = &vuint8x64_impl_generic; +static vint16x32_impl *vint16x32_impl_cpu = &vint16x32_impl_generic; +static vuint16x32_impl *vuint16x32_impl_cpu = &vuint16x32_impl_generic; +static vint32x16_impl *vint32x16_impl_cpu = &vint32x16_impl_generic; +static vuint32x16_impl *vuint32x16_impl_cpu = &vuint32x16_impl_generic; +static vint64x8_impl *vint64x8_impl_cpu = &vint64x8_impl_generic; +static vuint64x8_impl *vuint64x8_impl_cpu = &vuint64x8_impl_generic; + +int vec_init(void) +{ + // This function is NOT thread safe. However, once vec + // is initialized, all of the vector functions are thread-safe. + // + // In fact, it's possible to use vec without calling + // vec_init() at all, but it would be completely useless since + // it would just use a generic implementation without any + // vectorization whatsoever (unless maybe the compiler is + // smart enough to optimize it into vectors) + + vec_get_CPU_features(); -#undef VEC_DECL_SPLAT -#undef VEC_DECL_LOAD -#undef VEC_DECL_STORE -#undef VEC_DECL_ADD -#undef VEC_DECL_SUB -#undef VEC_DECL_MUL -#undef VEC_DECL_DIV -#undef VEC_DECL_AND -#undef VEC_DECL_OR -#undef VEC_DECL_XOR -#undef VEC_DECL_AVG -#undef VEC_DECL_SHIFT -#undef VEC_DECL_NOT +#ifdef VEC_COMPILER_HAS_ALTIVEC + if (vec_CPU_have_ALTIVEC()) { + vint8x16_impl_cpu = &vint8x16_impl_altivec; + vuint8x16_impl_cpu = &vuint8x16_impl_altivec; + vint16x8_impl_cpu = &vint16x8_impl_altivec; + vuint16x8_impl_cpu = &vuint16x8_impl_altivec; + vint32x4_impl_cpu = &vint32x4_impl_altivec; + vuint32x4_impl_cpu = &vuint32x4_impl_altivec; +#ifdef VEC_COMPILER_HAS_ALTIVEC_VSX + if (vec_CPU_have_ALTIVEC_VSX()) { + vint64x2_impl_cpu = &vint64x2_impl_altivec; + vuint64x2_impl_cpu = &vuint64x2_impl_altivec; + } +#endif + } +#endif +#ifdef VEC_COMPILER_HAS_AVX512F + if (vec_CPU_have_AVX512F()) { + vint8x64_impl_cpu = &vint8x64_impl_avx512f; + vuint8x64_impl_cpu = &vuint8x64_impl_avx512f; + vint16x32_impl_cpu = &vint16x32_impl_avx512f; + vuint16x32_impl_cpu = &vuint16x32_impl_avx512f; + vint32x16_impl_cpu = &vint32x16_impl_avx512f; + vuint32x16_impl_cpu = &vuint32x16_impl_avx512f; + vint64x8_impl_cpu = &vint64x8_impl_avx512f; + vuint64x8_impl_cpu = &vuint64x8_impl_avx512f; + } +#endif +#ifdef VEC_COMPILER_HAS_AVX2 + if (vec_CPU_have_AVX2()) { + vint8x32_impl_cpu = &vint8x32_impl_avx2; + vuint8x32_impl_cpu = &vuint8x32_impl_avx2; + vint16x16_impl_cpu = &vint16x16_impl_avx2; + vuint16x16_impl_cpu = &vuint16x16_impl_avx2; + vint32x8_impl_cpu = &vint32x8_impl_avx2; + vuint32x8_impl_cpu = &vuint32x8_impl_avx2; + vint64x4_impl_cpu = &vint64x4_impl_avx2; + vuint64x4_impl_cpu = &vuint64x4_impl_avx2; + } +#endif +#ifdef VEC_COMPILER_HAS_SSE2 + if (vec_CPU_have_SSE2()) { + vint8x16_impl_cpu = &vint8x16_impl_sse2; + vuint8x16_impl_cpu = &vuint8x16_impl_sse2; + vint16x8_impl_cpu = &vint16x8_impl_sse2; + vuint16x8_impl_cpu = &vuint16x8_impl_sse2; +# ifdef VEC_COMPILER_HAS_SSE41 + if (vec_CPU_have_SSE41()) { + vint32x4_impl_cpu = &vint32x4_impl_sse41; + vuint32x4_impl_cpu = &vuint32x4_impl_sse41; + } else +# endif + { + vint32x4_impl_cpu = &vint32x4_impl_sse2; + vuint32x4_impl_cpu = &vuint32x4_impl_sse2; + } + vint64x2_impl_cpu = &vint64x2_impl_sse2; + vuint64x2_impl_cpu = &vuint64x2_impl_sse2; + } +#endif +#ifdef VEC_COMPILER_HAS_MMX + if (vec_CPU_have_MMX()) { + vint8x8_impl_cpu = &vint8x8_impl_mmx; + vuint8x8_impl_cpu = &vuint8x8_impl_mmx; + vint16x4_impl_cpu = &vint16x4_impl_mmx; + vuint16x4_impl_cpu = &vuint16x4_impl_mmx; + vint32x2_impl_cpu = &vint32x2_impl_mmx; + vuint32x2_impl_cpu = &vuint32x2_impl_mmx; + } +#endif + { + // do nothing, they're already set to generics + } +} -#undef VEC_DECL_CMPLT -#undef VEC_DECL_CMPGT -#undef VEC_DECL_CMPEQ -#undef VEC_DECL_CMPLE -#undef VEC_DECL_CMPGE +/* ---------------------------------------------------------------- */ -#undef VEC_GENERIC_SPLAT -#undef VEC_GENERIC_DIVIDE -#undef VEC_GENERIC_SHIFT -#undef VEC_GENERIC_SHIFTS -#undef VEC_GENERIC_LSHIFT -#undef VEC_GENERIC_RSHIFT -#undef VEC_GENERIC_LRSHIFT -#undef VEC_GENERIC_AVG -#undef VEC_GENERIC_THAN_OR_EQUAL -#undef VEC_GENERIC_COMPARISON -#undef VEC_GENERIC_COMPARISONS +#define VEC_DEFINE_OPERATIONS_SIGN(sign, bits, size) \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->splat) \ + return v##sign##int##bits##x##size##_impl_cpu->splat(x); \ + \ + return v##sign##int##bits##x##size##_fallback_splat(x); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const sign##int##bits##_t in[size]) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->load_aligned) \ + return v##sign##int##bits##x##size##_impl_cpu->load_aligned(in); \ + \ + VEC_ASSERT(0, "vec: load_aligned is required to be implemented"); \ + return (v##sign##int##bits##x##size){0}; \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->load) \ + return v##sign##int##bits##x##size##_impl_cpu->load(in); \ + \ + return v##sign##int##bits##x##size##_fallback_load(in); \ + } \ + \ + void v##sign##int##bits##x##size##_store_aligned(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->store_aligned) { \ + v##sign##int##bits##x##size##_impl_cpu->store_aligned(vec, out); \ + return; \ + } \ + \ + VEC_ASSERT(0, "vec: store_aligned is required to be implemented"); \ + } \ + \ + void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->store) { \ + v##sign##int##bits##x##size##_impl_cpu->store(vec, out); \ + return; \ + } \ + \ + v##sign##int##bits##x##size##_fallback_store(vec, out); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->add) \ + v##sign##int##bits##x##size##_impl_cpu->add(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_add(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->sub) \ + v##sign##int##bits##x##size##_impl_cpu->sub(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_sub(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->mul) \ + v##sign##int##bits##x##size##_impl_cpu->mul(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_mul(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_div(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->div) \ + v##sign##int##bits##x##size##_impl_cpu->div(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_div(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_avg(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->avg) \ + v##sign##int##bits##x##size##_impl_cpu->avg(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_avg(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->and) \ + v##sign##int##bits##x##size##_impl_cpu->and(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_and(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->or) \ + v##sign##int##bits##x##size##_impl_cpu->or(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_or(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->xor) \ + v##sign##int##bits##x##size##_impl_cpu->xor(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_xor(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_not(v##sign##int##bits##x##size vec) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->not) \ + v##sign##int##bits##x##size##_impl_cpu->not(vec); \ + \ + return v##sign##int##bits##x##size##_fallback_not(vec); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmplt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->cmplt) \ + v##sign##int##bits##x##size##_impl_cpu->cmplt(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmplt(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmple(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->cmple) \ + v##sign##int##bits##x##size##_impl_cpu->cmple(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmple(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->cmpeq) \ + v##sign##int##bits##x##size##_impl_cpu->cmpeq(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmpeq(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpge(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->cmpge) \ + v##sign##int##bits##x##size##_impl_cpu->cmpge(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmpge(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_cmpgt(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->cmpgt) \ + v##sign##int##bits##x##size##_impl_cpu->cmpgt(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_cmpgt(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->lshift) \ + v##sign##int##bits##x##size##_impl_cpu->lshift(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_lshift(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_rshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->rshift) \ + v##sign##int##bits##x##size##_impl_cpu->rshift(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_rshift(vec1, vec2); \ + } \ + \ + v##sign##int##bits##x##size v##sign##int##bits##x##size##_lrshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + if (v##sign##int##bits##x##size##_impl_cpu->lrshift) \ + v##sign##int##bits##x##size##_impl_cpu->lrshift(vec1, vec2); \ + \ + return v##sign##int##bits##x##size##_fallback_lrshift(vec1, vec2); \ + } -#undef VEC_VINT8X16 -#undef VEC_VINT16X8 -#undef VEC_VINT32X4 -#undef VEC_VINT64X2 -#undef VEC_VUINT8X16 -#undef VEC_VUINT16X8 -#undef VEC_VUINT32X4 -#undef VEC_VUINT64X2 +#define VEC_DEFINE_OPERATIONS(bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN( , bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN(u, bits, size) + +// 64-bit +VEC_DEFINE_OPERATIONS(8, 8) +VEC_DEFINE_OPERATIONS(16, 4) +VEC_DEFINE_OPERATIONS(32, 2) + +// 128-bit +VEC_DEFINE_OPERATIONS(8, 16) +VEC_DEFINE_OPERATIONS(16, 8) +VEC_DEFINE_OPERATIONS(32, 4) +VEC_DEFINE_OPERATIONS(64, 2) -#undef VEC_VINT8X32 -#undef VEC_VINT16X16 -#undef VEC_VINT32X8 -#undef VEC_VINT64X4 -#undef VEC_VUINT8X32 -#undef VEC_VUINT16X16 -#undef VEC_VUINT32X8 -#undef VEC_VUINT64X4 +// 256-bit +VEC_DEFINE_OPERATIONS(8, 32) +VEC_DEFINE_OPERATIONS(16, 16) +VEC_DEFINE_OPERATIONS(32, 8) +VEC_DEFINE_OPERATIONS(64, 4) -#undef VEC_VINT8X64 -#undef VEC_VINT16X32 -#undef VEC_VINT32X16 -#undef VEC_VINT64X8 -#undef VEC_VUINT8X64 -#undef VEC_VUINT16X32 -#undef VEC_VUINT32X16 -#undef VEC_VUINT64X8 +// 512-bit +VEC_DEFINE_OPERATIONS(8, 64) +VEC_DEFINE_OPERATIONS(16, 32) +VEC_DEFINE_OPERATIONS(32, 16) +VEC_DEFINE_OPERATIONS(64, 8) + +#undef VEC_DEFINE_OPERATIONS +#undef VEC_DEFINE_OPERATIONS_SIGN + +#endif /* VEC_IMPLEMENTATION */ #endif /* VEC_VEC_H_ */ diff -r 981cf0bc7f3a -r e05c257c6a23 src/vec.c --- a/src/vec.c Tue Nov 19 15:55:01 2024 -0500 +++ b/src/vec.c Wed Nov 20 04:10:37 2024 -0500 @@ -1,3 +1,2 @@ -#define VEC_EXTERN -#define VEC_EXTERN_DEFINE +#define VEC_IMPLEMENTATION #include "vec/vec.h" diff -r 981cf0bc7f3a -r e05c257c6a23 test/Makefile --- a/test/Makefile Tue Nov 19 15:55:01 2024 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -CFLAGS += -std=c99 -I../include - -# binary files -BINS = test-gcc test-generic test-host -OBJS = $(BINS:=.o) - -.PHONY: all clean test - -all: $(BINS) - -# suppress the platform-dependent hardware stuff so we only have -# GCC vector extensions -test-gcc: CFLAGS += -DVEC_SUPPRESS_HW - -# also suppress GCC extensions, leaving only the defaults -test-generic: CFLAGS += -DVEC_SUPPRESS_HW -DVEC_SUPPRESS_GCC - -$(OBJS): main.c - $(CC) $(CFLAGS) -o $@ -c $^ - -$(BINS): %: %.o - $(CC) $(LDFLAGS) -o $@ $^ - -clean: - $(RM) $(BINS) $(OBJS) - -test: clean $(BINS) - ./test-gcc - ./test-generic - ./test-host diff -r 981cf0bc7f3a -r e05c257c6a23 test/Makefile.ppc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile.ppc Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,3 @@ +CFLAGS += -maltivec + +include Makefile.template \ No newline at end of file diff -r 981cf0bc7f3a -r e05c257c6a23 test/Makefile.template --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile.template Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,40 @@ +CFLAGS += -g -O2 -std=c99 -I../include + +HEADERS = ../include/vec/vec.h \ + ../include/vec/impl/ppc/altivec.h \ + ../include/vec/impl/x86/avx2.h \ + ../include/vec/impl/x86/avx512f.h \ + ../include/vec/impl/x86/mmx.h \ + ../include/vec/impl/x86/sse2.h \ + ../include/vec/impl/x86/sse41.h \ + ../include/vec/impl/cpu.h \ + ../include/vec/impl/fallback.h \ + ../include/vec/impl/generic.h +BINS = test-generic test-host +OBJS = vec-generic.o vec-host.o test.o + +.PHONY: all clean test + +all: $(BINS) + +vec-generic.o: ../src/vec.c + $(CC) $(CFLAGS) -DVEC_SUPPRESS_HW=1 -c -o $@ $< + +vec-host.o: ../src/vec.c + $(CC) $(CFLAGS) -c -o $@ $< + +test.o: test.c + $(CC) $(CFLAGS) -c -o $@ $< + +test-generic: vec-generic.o test.o + $(CC) $(LDFLAGS) -o $@ $^ + +test-host: vec-host.o test.o + $(CC) $(LDFLAGS) -o $@ $^ + +clean: + $(RM) $(BINS) $(OBJS) + +test: clean $(BINS) + ./test-generic + ./test-host diff -r 981cf0bc7f3a -r e05c257c6a23 test/Makefile.x86 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/Makefile.x86 Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,3 @@ +CFLAGS += -mmmx -msse2 -msse4.1 -mavx2 -mavx512f + +include Makefile.template \ No newline at end of file diff -r 981cf0bc7f3a -r e05c257c6a23 test/main.c --- a/test/main.c Tue Nov 19 15:55:01 2024 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#define VEC_EXTERN -#define VEC_EXTERN_DEFINE -#include "vec/vec.h" - -#include -#include - -#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) - -static const int8_t testval8[] = { - INT8_C(-80), INT8_C(-3), INT8_C(25), INT8_C(0x7F), - INT8_C(-42), INT8_C(27), INT8_C(24), INT8_C(0x40), -}; - -static const uint8_t testvalu8[] = { - UINT8_C(0x00), UINT8_C(0xFF), UINT8_C(0xFE), UINT8_C(0x7F), - UINT8_C(0xC0), UINT8_C(0x80), UINT8_C(0x20), UINT8_C(0x50), -}; - -static const int16_t testval16[] = { - INT16_C(-8000), INT16_C(-30), INT16_MAX, INT16_C(0x4000), - INT16_C(-42), INT16_C(250), INT16_MIN, INT16_C(0x500), -}; - -static const uint16_t testvalu16[] = { - UINT16_C(0x0000), UINT16_C(0xFFFF), UINT16_C(0xFEA), UINT16_C(0x7FF), - UINT16_C(0x7FFF), UINT16_C(0x8000), UINT16_C(0x20B), UINT16_C(0x50C), -}; - -static const int32_t testval32[] = { - INT32_C(-1000000), INT32_C(-3), INT32_C(0x00000000), INT32_C(0xFFFFFFFF), - INT32_C( -42), INT32_C(27), INT32_C(0xABCDEF03), INT32_C(0x00000FFF), - INT32_C(0xFFFFFFFF), INT32_C( 0), INT32_C(0xFFFFFFFE), INT32_C( 1), -}; - -static const uint32_t testvalu32[] = { - UINT32_C(0x00000000), UINT32_C(0xDEADBEEF), UINT32_C(42), UINT32_C(0x12340000), - UINT32_C(0xFFFFFFFF), UINT32_C(0xFEDCBA98), UINT32_C(17), UINT32_C(0x00012345), - UINT32_C(0xFFFFFFFF), UINT32_C(0xFFFFFFFE), UINT32_C( 0), UINT32_C( 1), -}; - -static const int64_t testval64[] = { - INT64_MAX, INT64_C(-3), INT64_C(0x00000000), INT64_C(0xFFFFFFFFF), - INT64_MIN, INT64_C(645366), INT64_C(0x12345ABCDE), INT64_C(0xF00000FFF), -}; - -static const uint64_t testvalu64[] = { - UINT64_MAX, UINT64_C(0x44354365), UINT64_C(0x00000000), UINT64_C(0xFFFFFFFFF), - UINT64_C(0xff), UINT64_C(645366), UINT64_C(0x12345ABCDE), UINT64_C(0xF00000FFF), -}; - -#define VTEST(sign, csign, bits, size) \ - static inline v##sign##int##bits##x##size vtest##sign##bits##x##size(const size_t start) \ - { \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(x); \ - for (size_t i = 0; i < size; i++) \ - x[i] = testval##sign##bits[(start + i) % ARRAY_SIZE(testval##sign##bits)]; \ - return v##sign##int##bits##x##size##_load_aligned(x); \ - } - -#define VPRINT(sign, csign, psign, bits, size) \ - static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \ - { \ - fputs("vector: ", file); \ - \ - V##csign##INT##bits##x##size##_ALIGNED_ARRAY(v); \ - \ - v##sign##int##bits##x##size##_store_aligned(vec, v); \ - \ - fprintf(file, "%" PRI ## psign ## bits, v[0]); \ - \ - for (int i = 1; i < size; i++) \ - fprintf(file, ", %" PRI ## psign ## bits, v[i]); \ - \ - fputs("\n", file); \ - \ - } - -#define DEF_VEC_TEST_FUNCS(bits, size) \ - VTEST(, , bits, size) VTEST(u, U, bits, size) \ - VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size) - -DEF_VEC_TEST_FUNCS(8, 16) - -#undef DEF_VEC_TEST_FUNCS -#undef VPRINT -#undef VTEST - -// ------------------------------------------------------------ - -#include "test_align.h" -#include "test_arith.h" -#include "test_compare.h" - -// ------------------------------------------------------------ - -int main(void) -{ - int ret = 0; - - ret |= test_align(); - ret |= test_arith(); - ret |= test_compare(); - - return ret; -} diff -r 981cf0bc7f3a -r e05c257c6a23 test/test.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test.c Wed Nov 20 04:10:37 2024 -0500 @@ -0,0 +1,123 @@ +#include "vec/vec.h" + +#include +#include + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) + +static const int8_t testval8[] = { + INT8_C(-80), INT8_C(-3), INT8_C(25), INT8_C(0x7F), + INT8_C(-42), INT8_C(27), INT8_C(24), INT8_C(0x40), +}; + +static const uint8_t testvalu8[] = { + UINT8_C(0x00), UINT8_C(0xFF), UINT8_C(0xFE), UINT8_C(0x7F), + UINT8_C(0xC0), UINT8_C(0x80), UINT8_C(0x20), UINT8_C(0x50), +}; + +static const int16_t testval16[] = { + INT16_C(-8000), INT16_C(-30), INT16_MAX, INT16_C(0x4000), + INT16_C(-42), INT16_C(250), INT16_MIN, INT16_C(0x500), +}; + +static const uint16_t testvalu16[] = { + UINT16_C(0x0000), UINT16_C(0xFFFF), UINT16_C(0xFEA), UINT16_C(0x7FF), + UINT16_C(0x7FFF), UINT16_C(0x8000), UINT16_C(0x20B), UINT16_C(0x50C), +}; + +static const int32_t testval32[] = { + INT32_C(-1000000), INT32_C(-3), INT32_C(0x00000000), INT32_C(0xFFFFFFFF), + INT32_C( -42), INT32_C(27), INT32_C(0xABCDEF03), INT32_C(0x00000FFF), + INT32_C(0xFFFFFFFF), INT32_C( 0), INT32_C(0xFFFFFFFE), INT32_C( 1), +}; + +static const uint32_t testvalu32[] = { + UINT32_C(0x00000000), UINT32_C(0xDEADBEEF), UINT32_C(42), UINT32_C(0x12340000), + UINT32_C(0xFFFFFFFF), UINT32_C(0xFEDCBA98), UINT32_C(17), UINT32_C(0x00012345), + UINT32_C(0xFFFFFFFF), UINT32_C(0xFFFFFFFE), UINT32_C( 0), UINT32_C( 1), +}; + +static const int64_t testval64[] = { + INT64_MAX, INT64_C(-3), INT64_C(0x00000000), INT64_C(0xFFFFFFFFF), + INT64_MIN, INT64_C(645366), INT64_C(0x12345ABCDE), INT64_C(0xF00000FFF), +}; + +static const uint64_t testvalu64[] = { + UINT64_MAX, UINT64_C(0x44354365), UINT64_C(0x00000000), UINT64_C(0xFFFFFFFFF), + UINT64_C(0xff), UINT64_C(645366), UINT64_C(0x12345ABCDE), UINT64_C(0xF00000FFF), +}; + +#define VTEST(sign, csign, bits, size) \ + static inline v##sign##int##bits##x##size vtest##sign##bits##x##size(const size_t start) \ + { \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(x); \ + for (size_t i = 0; i < size; i++) \ + x[i] = testval##sign##bits[(start + i) % ARRAY_SIZE(testval##sign##bits)]; \ + return v##sign##int##bits##x##size##_load_aligned(x); \ + } + +#define VPRINT(sign, csign, psign, bits, size) \ + static inline void print_v##sign##int##bits##x##size(FILE *file, v##sign##int##bits##x##size vec) \ + { \ + fputs("vector: ", file); \ + \ + V##csign##INT##bits##x##size##_ALIGNED_ARRAY(v); \ + \ + v##sign##int##bits##x##size##_store_aligned(vec, v); \ + \ + fprintf(file, "%" PRI ## psign ## bits, v[0]); \ + \ + for (int i = 1; i < size; i++) \ + fprintf(file, ", %" PRI ## psign ## bits, v[i]); \ + \ + fputs("\n", file); \ + \ + } + +#define DEF_VEC_TEST_FUNCS(bits, size) \ + VTEST(, , bits, size) VTEST(u, U, bits, size) \ + VPRINT(, , d, bits, size) VPRINT(u, U, u, bits, size) + +DEF_VEC_TEST_FUNCS(8, 8) +DEF_VEC_TEST_FUNCS(16, 4) +DEF_VEC_TEST_FUNCS(32, 2) + +DEF_VEC_TEST_FUNCS(8, 16) +DEF_VEC_TEST_FUNCS(16, 8) +DEF_VEC_TEST_FUNCS(32, 4) +DEF_VEC_TEST_FUNCS(64, 2) + +DEF_VEC_TEST_FUNCS(8, 32) +DEF_VEC_TEST_FUNCS(16, 16) +DEF_VEC_TEST_FUNCS(32, 8) +DEF_VEC_TEST_FUNCS(64, 4) + +DEF_VEC_TEST_FUNCS(8, 64) +DEF_VEC_TEST_FUNCS(16, 32) +DEF_VEC_TEST_FUNCS(32, 16) +DEF_VEC_TEST_FUNCS(64, 8) + +#undef DEF_VEC_TEST_FUNCS +#undef VPRINT +#undef VTEST + +// ------------------------------------------------------------ + +#include "test_align.h" +#include "test_arith.h" +#include "test_compare.h" + +// ------------------------------------------------------------ + +int main(void) +{ + int ret = 0; + + vec_init(); + + ret |= test_align(); + ret |= test_arith(); + ret |= test_compare(); + + return ret; +} diff -r 981cf0bc7f3a -r e05c257c6a23 test/test_align.h --- a/test/test_align.h Tue Nov 19 15:55:01 2024 -0500 +++ b/test/test_align.h Wed Nov 20 04:10:37 2024 -0500 @@ -31,7 +31,24 @@ RUN_TEST( , , bits, size) \ RUN_TEST(u, U, bits, size) + RUN_TESTS(8, 8) + RUN_TESTS(16, 4) + RUN_TESTS(32, 2) + RUN_TESTS(8, 16) + RUN_TESTS(16, 8) + RUN_TESTS(32, 4) + RUN_TESTS(64, 2) + + RUN_TESTS(8, 32) + RUN_TESTS(16, 16) + RUN_TESTS(32, 8) + RUN_TESTS(64, 4) + + RUN_TESTS(8, 64) + RUN_TESTS(16, 32) + RUN_TESTS(32, 16) + RUN_TESTS(64, 8) #undef RUN_TESTS #undef RUN_TEST diff -r 981cf0bc7f3a -r e05c257c6a23 test/test_arith.h --- a/test/test_arith.h Tue Nov 19 15:55:01 2024 -0500 +++ b/test/test_arith.h Wed Nov 20 04:10:37 2024 -0500 @@ -69,7 +69,24 @@ CREATE_TESTS_SIGN(, d, , bits, size) \ CREATE_TESTS_SIGN(u, u, U, bits, size) +CREATE_TESTS(8, 8) +CREATE_TESTS(16, 4) +CREATE_TESTS(32, 2) + CREATE_TESTS(8, 16) +CREATE_TESTS(16, 8) +CREATE_TESTS(32, 4) +CREATE_TESTS(64, 2) + +CREATE_TESTS(8, 32) +CREATE_TESTS(16, 16) +CREATE_TESTS(32, 8) +CREATE_TESTS(64, 4) + +CREATE_TESTS(8, 64) +CREATE_TESTS(16, 32) +CREATE_TESTS(32, 16) +CREATE_TESTS(64, 8) #undef CREATE_TESTS_SIGN #undef CREATE_TESTS @@ -109,7 +126,24 @@ RUN_TESTS_SIGN( , bits, size) \ RUN_TESTS_SIGN(u, bits, size) + RUN_TESTS(8, 8) + RUN_TESTS(16, 4) + RUN_TESTS(32, 2) + RUN_TESTS(8, 16) + RUN_TESTS(16, 8) + RUN_TESTS(32, 4) + RUN_TESTS(64, 2) + + RUN_TESTS(8, 32) + RUN_TESTS(16, 16) + RUN_TESTS(32, 8) + RUN_TESTS(64, 4) + + RUN_TESTS(8, 64) + RUN_TESTS(16, 32) + RUN_TESTS(32, 16) + RUN_TESTS(64, 8) #undef RUN_TESTS_SIGN #undef RUN_TESTS diff -r 981cf0bc7f3a -r e05c257c6a23 test/test_compare.h --- a/test/test_compare.h Tue Nov 19 15:55:01 2024 -0500 +++ b/test/test_compare.h Wed Nov 20 04:10:37 2024 -0500 @@ -32,7 +32,24 @@ #define CREATE_TESTS(bits, size) CREATE_TESTS_SIGN(, d, bits, size) CREATE_TESTS_SIGN(u, u, bits, size) +CREATE_TESTS(8, 8) +CREATE_TESTS(16, 4) +CREATE_TESTS(32, 2) + CREATE_TESTS(8, 16) +CREATE_TESTS(16, 8) +CREATE_TESTS(32, 4) +CREATE_TESTS(64, 2) + +CREATE_TESTS(8, 32) +CREATE_TESTS(16, 16) +CREATE_TESTS(32, 8) +CREATE_TESTS(64, 4) + +CREATE_TESTS(8, 64) +CREATE_TESTS(16, 32) +CREATE_TESTS(32, 16) +CREATE_TESTS(64, 8) #undef CREATE_TESTS_SIGN #undef CREATE_TESTS @@ -59,7 +76,24 @@ RUN_TESTS_SIGN( , bits, size) \ RUN_TESTS_SIGN(u, bits, size) + RUN_TESTS(8, 8) + RUN_TESTS(16, 4) + RUN_TESTS(32, 2) + RUN_TESTS(8, 16) + RUN_TESTS(16, 8) + RUN_TESTS(32, 4) + RUN_TESTS(64, 2) + + RUN_TESTS(8, 32) + RUN_TESTS(16, 16) + RUN_TESTS(32, 8) + RUN_TESTS(64, 4) + + RUN_TESTS(8, 64) + RUN_TESTS(16, 32) + RUN_TESTS(32, 16) + RUN_TESTS(64, 8) #undef RUN_TESTS_SIGN #undef RUN_TESTS