changeset 39:f9ca85d2f14c
*: rearrange some things; add avx512bw support
author      Paper <paper@tflc.us>
date        Sat, 26 Apr 2025 15:31:39 -0400
parents     fd42f9b1b95e
children    55cadb1fac4b
files       README include/vec/cpu.h include/vec/defs.h include/vec/impl/cpu.h include/vec/impl/generic.h include/vec/impl/ppc/altivec.h include/vec/impl/x86/avx2.h include/vec/impl/x86/avx512bw.h include/vec/impl/x86/avx512f.h include/vec/impl/x86/sse2.h include/vec/mem.h include/vec/vec.h test/Makefile.template test/Makefile.x86 test/test.c test/test_benchmark.h utils/gengeneric.c
diffstat    17 files changed, 1203 insertions(+), 743 deletions(-)
--- a/README Sat Apr 26 02:54:44 2025 -0400 +++ b/README Sat Apr 26 15:31:39 2025 -0400 @@ -138,9 +138,9 @@ multiple translation units and pass different command line arguments to the compiler to enable SSE2/AVX2/Altivec etc, and detect the vector modes the CPU supports at runtime. vec provides an optional public API -specifically for this use-case within `vec/impl/cpu.h`; bear in mind -though that it is not thread-safe, so if your program is multithreaded -you'll want to cache the results on startup. +specifically for this use-case within `vec/cpu.h`; bear in mind though +that it is not thread-safe, so if your program is multithreaded you'll want +to cache the results on startup. The CPU vector detection API is extremely simple, and self-explanatory. You call `vec_get_CPU_features()', and it returns a bit-mask of the @@ -177,6 +177,9 @@ The heap-based API is based off the good old C malloc API: + /* heap allocation stuff is only defined here: */ + #include "vec/mem.h" + vec_int32 *q = vec_malloc(1024 * sizeof(vec_int32)); /* q is now aligned, and ready for use with a vector aligned load
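For context, a minimal sketch of the startup caching the README recommends; it assumes only what the new vec/cpu.h below defines (vec_get_CPU_features() and the VEC_CPU_HAS_* mask bits):

    #include <stdio.h>
    #include "vec/cpu.h"

    /* query once, before any threads are spawned, and reuse the cached mask */
    static uint32_t g_cpu_features;

    int main(void)
    {
        g_cpu_features = vec_get_CPU_features();

        if (g_cpu_features & VEC_CPU_HAS_AVX2)
            puts("AVX2 available");
        if (g_cpu_features & VEC_CPU_HAS_AVX512F)
            puts("AVX512F available");
        return 0;
    }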
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/cpu.h Sat Apr 26 15:31:39 2025 -0400 @@ -0,0 +1,516 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024-2025 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_CPU_H_ +#define VEC_CPU_H_ + +#include "defs.h" + +/* Detect CPU SIMD support. Much of this code was stolen from SDL. + * + * Simple DirectMedia Layer + * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org> + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+*/ + +#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) +# include <sys/sysctl.h> // For AltiVec check +#elif defined(__OpenBSD__) && defined(__powerpc__) +# include <sys/types.h> +# include <sys/sysctl.h> // For AltiVec check +# include <machine/cpu.h> +#elif defined(__FreeBSD__) && defined(__powerpc__) +# include <machine/cpu.h> +# include <sys/auxv.h> +#elif defined(__ALTIVEC__) +# include <signal.h> +# include <setjmp.h> +#endif + +#ifdef __FreeBSD__ +# include <sys/param.h> +#endif + +#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__) +# include <unistd.h> +# include <sys/types.h> +# include <sys/stat.h> +# include <fcntl.h> +# include <elf.h> + +/*#include <asm/hwcap.h>*/ +# ifndef AT_HWCAP +# define AT_HWCAP 16 +# endif +# ifndef AT_PLATFORM +# define AT_PLATFORM 15 +# endif +# ifndef HWCAP_NEON +# define HWCAP_NEON (1 << 12) +# endif +#endif + +static inline int vec_CPU_have_CPUID(void) +{ + int has_CPUID = 0; + +#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) + __asm__ ( +" pushfl # Get original EFLAGS \n" +" popl %%eax \n" +" movl %%eax,%%ecx \n" +" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" +" pushl %%eax # Save new EFLAGS value on stack \n" +" popfl # Replace current EFLAGS value \n" +" pushfl # Get new EFLAGS \n" +" popl %%eax # Store new EFLAGS in EAX \n" +" xorl %%ecx,%%eax # Can not toggle ID bit, \n" +" jz 1f # Processor=80486 \n" +" movl $1,%0 # We have CPUID support \n" +"1: \n" + : "=m" (has_CPUID) + : + : "%eax", "%ecx" + ); +#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) +/* Technically, if this is being compiled under __x86_64__ then it has + CPUid by definition. But it's nice to be able to prove it. :) */ + __asm__ ( +" pushfq # Get original EFLAGS \n" +" popq %%rax \n" +" movq %%rax,%%rcx \n" +" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" +" pushq %%rax # Save new EFLAGS value on stack \n" +" popfq # Replace current EFLAGS value \n" +" pushfq # Get new EFLAGS \n" +" popq %%rax # Store new EFLAGS in EAX \n" +" xorl %%ecx,%%eax # Can not toggle ID bit, \n" +" jz 1f # Processor=80486 \n" +" movl $1,%0 # We have CPUID support \n" +"1: \n" + : "=m" (has_CPUID) + : + : "%rax", "%rcx" + ); +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) + __asm { + pushfd ; Get original EFLAGS + pop eax + mov ecx, eax + xor eax, 200000h ; Flip ID bit in EFLAGS + push eax ; Save new EFLAGS value on stack + popfd ; Replace current EFLAGS value + pushfd ; Get new EFLAGS + pop eax ; Store new EFLAGS in EAX + xor eax, ecx ; Can not toggle ID bit, + jz done ; Processor=80486 + mov has_CPUID,1 ; We have CPUID support +done: + } +#elif defined(_MSC_VER) && defined(_M_X64) + has_CPUID = 1; +#elif defined(__sun) && defined(__i386) + __asm ( +" pushfl \n" +" popl %eax \n" +" movl %eax,%ecx \n" +" xorl $0x200000,%eax \n" +" pushl %eax \n" +" popfl \n" +" pushfl \n" +" popl %eax \n" +" xorl %ecx,%eax \n" +" jz 1f \n" +" movl $1,-8(%ebp) \n" +"1: \n" + ); +#elif defined(__sun) && defined(__amd64) + __asm ( +" pushfq \n" +" popq %rax \n" +" movq %rax,%rcx \n" +" xorl $0x200000,%eax \n" +" pushq %rax \n" +" popfq \n" +" pushfq \n" +" popq %rax \n" +" xorl %ecx,%eax \n" +" jz 1f \n" +" movl $1,-8(%rbp) \n" +"1: \n" + ); +#endif + + return has_CPUID; +} + +#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm__ __volatile__( \ + " pushl %%ebx \n" \ + " xorl %%ecx,%%ecx \n" \ + " cpuid \n" \ + " movl %%ebx, %%esi \n" \ + " popl %%ebx 
\n" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "a"(func)) +#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm__ __volatile__( \ + " pushq %%rbx \n" \ + " xorq %%rcx,%%rcx \n" \ + " cpuid \n" \ + " movq %%rbx, %%rsi \n" \ + " popq %%rbx \n" \ + : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ + : "a"(func)) +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) +# define VEC_CPU_CPUID(func, a, b, c, d) \ + __asm { \ + __asm mov eax, func \ + __asm xor ecx, ecx \ + __asm cpuid \ + __asm mov a, eax \ + __asm mov b, ebx \ + __asm mov c, ecx \ + __asm mov d, edx \ + } +#elif (defined(_MSC_VER) && defined(_M_X64)) +// Use __cpuidex instead of __cpuid because ICL does not clear ecx register +# define VEC_CPU_CPUID(func, a, b, c, d) \ + do { \ + int CPUInfo[4]; \ + __cpuidex(CPUInfo, func, 0); \ + a = CPUInfo[0]; \ + b = CPUInfo[1]; \ + c = CPUInfo[2]; \ + d = CPUInfo[3]; \ + } while (0) +#else +# define VEC_CPU_CPUID(func, a, b, c, d) \ + do { \ + a = b = c = d = 0; \ + (void)a; \ + (void)b; \ + (void)c; \ + (void)d; \ + } while (0) +#endif + +// --------------------------------------------------------------- + +static int vec_CPU_CPUIDFeatures[4]; +static int vec_CPU_CPUIDMaxFunction = 0; +static int vec_CPU_OSSavesYMM = 0; +static int vec_CPU_OSSavesZMM = 0; + +static inline void vec_CPU_get_CPUID_features(void) +{ + static int checked = 0; + if (!checked) { + checked = 1; + if (vec_CPU_have_CPUID()) { + int a, b, c, d; + VEC_CPU_CPUID(0, a, b, c, d); + vec_CPU_CPUIDMaxFunction = a; + if (vec_CPU_CPUIDMaxFunction >= 1) { + VEC_CPU_CPUID(1, a, b, c, d); + vec_CPU_CPUIDFeatures[0] = a; + vec_CPU_CPUIDFeatures[1] = b; + vec_CPU_CPUIDFeatures[2] = c; + vec_CPU_CPUIDFeatures[3] = d; + + // Check to make sure we can call xgetbv + if (c & 0x08000000) { + // Call xgetbv to see if YMM (etc) register state is saved +#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__)) + __asm__(".byte 0x0f, 0x01, 0xd0" + : "=a"(a) + : "c"(0) + : "%edx"); +#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1 + a = (int)_xgetbv(0); +#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) + __asm { + xor ecx, ecx + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 + mov a, eax + } +#endif + vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0; + vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 
1 : 0; + } + } + } +} + +#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) +static jmp_buf vec_jmpbuf; +static void vec_CPU_illegal_instruction(int sig) +{ + longjmp(vec_jmpbuf, 1); +} +#endif + +static int vec_CPU_have_ALTIVEC(void) +{ + volatile int altivec = 0; +#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__)) + int selectors[2] = { +# ifdef __OpenBSD__ + CTL_MACHDEP, CPU_ALTIVEC +# else + CTL_HW, HW_VECTORUNIT +# endif + }; + int hasVectorUnit = 0; + vec_uintsize length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if (!error) + altivec = (hasVectorUnit != 0); +#elif defined(__FreeBSD__) && defined(__powerpc__) + unsigned long cpufeatures = 0; + elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)); + altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC; +#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) + void (*handler)(int sig); + handler = signal(SIGILL, vec_CPU_illegal_instruction); + if (!setjmp(vec_jmpbuf)) { + vector unsigned char vec; + vec_and(vec, vec); + altivec = 1; + } + signal(SIGILL, handler); +#endif + return altivec; +} + +static int vec_CPU_have_ALTIVEC_VSX(void) +{ + volatile int vsx = 0; +#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__) +# warning Compiling UNTESTED code for VSX. + void (*handler)(int sig); + handler = signal(SIGILL, vec_CPU_illegal_instruction); + if (!setjmp(vec_jmpbuf)) { + // this is completely untested + //__asm__ __volatile__("mtspr 256, %0\n\t" + // "xxland %%v0, %%v0, %%v0" ::"r"(-1)); + //vsx = 1; + } + signal(SIGILL, handler); +#endif + return vsx; +} + +#define vec_CPU_have_MMX() (vec_CPU_CPUIDFeatures[3] & 0x00800000) +#define vec_CPU_have_SSE() (vec_CPU_CPUIDFeatures[3] & 0x02000000) +#define vec_CPU_have_SSE2() (vec_CPU_CPUIDFeatures[3] & 0x04000000) +#define vec_CPU_have_SSE3() (vec_CPU_CPUIDFeatures[2] & 0x00000001) +#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000) +#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000) +#define vec_CPU_have_AVX() (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000)) + +static inline int vec_CPU_have_AVX2(void) +{ + if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { + int a, b, c, d; + VEC_CPU_CPUID(7, a, b, c, d); + return b & 0x00000020; + (void)a, (void)c, (void)d; + } + return 0; +} + +static inline int vec_CPU_have_AVX512F(void) +{ + if (vec_CPU_OSSavesZMM && (vec_CPU_CPUIDMaxFunction >= 7)) { + int a, b, c, d; + VEC_CPU_CPUID(7, a, b, c, d); + return b & 0x00010000; + (void)a, (void)c, (void)d; + } + return 0; +} + +#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL) +static int readProcAuxvForNeon(void) +{ + int neon = 0; + int fd; + + fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); + if (fd >= 0) { + Elf32_auxv_t aux; + while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) { + if (aux.a_type == AT_HWCAP) { + neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON; + break; + } + } + close(fd); + } + return neon; +} +#endif + +static int vec_CPU_have_NEON(void) +{ +/* The way you detect NEON is a privileged instruction on ARM, so you have + to query the OS kernel in a platform-specific way.
:/ */ +#if defined(SDL_CPUINFO_DISABLED) + return 0; /* disabled */ +#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64)) +/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */ +/* Seems to have been removed */ +#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE +#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19 +#endif + /* All WinRT ARM devices are required to support NEON, but just in case. */ + return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0; +#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__) + return 1; /* ARMv8 always has non-optional NEON support. */ +#elif defined(__VITA__) + return 1; +#elif defined(__3DS__) + return 0; +#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) + /* (note that sysctlbyname("hw.optional.neon") doesn't work!) */ + return 1; /* all Apple ARMv7 chips and later have NEON. */ +#elif defined(__APPLE__) + return 0; /* assume anything else from Apple doesn't have NEON. */ +#elif !defined(__arm__) + return 0; /* not an ARM CPU at all. */ +#elif defined(__OpenBSD__) + return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */ +#elif defined(HAVE_ELF_AUX_INFO) + unsigned long hasneon = 0; + if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0) + return 0; + + return ((hasneon & HWCAP_NEON) == HWCAP_NEON); +#elif defined(__QNXNTO__) + return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON; +#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON; +#elif defined(__linux__) + return readProcAuxvForNeon(); +#elif defined(__ANDROID__) + /* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */ + { + AndroidCpuFamily cpu_family = android_getCpuFamily(); + if (cpu_family == ANDROID_CPU_FAMILY_ARM) { + uint64_t cpu_features = android_getCpuFeatures(); + if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) { + return 1; + } + } + return 0; + } +#elif defined(__RISCOS__) + /* Use the VFPSupport_Features SWI to access the MVFR registers */ + { + _kernel_swi_regs regs; + regs.r[0] = 0; + if (_kernel_swi(VFPSupport_Features, ®s, ®s) == NULL) { + if ((regs.r[2] & 0xFFF000) == 0x111000) { + return 1; + } + } + return 0; + } +#else +#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me. 
+ return 0; +#endif +} + +enum { + VEC_CPU_HAS_ALTIVEC = (1 << 0), + VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1), + VEC_CPU_HAS_MMX = (1 << 2), + VEC_CPU_HAS_SSE = (1 << 3), + VEC_CPU_HAS_SSE2 = (1 << 4), + VEC_CPU_HAS_SSE3 = (1 << 5), + VEC_CPU_HAS_SSE41 = (1 << 6), + VEC_CPU_HAS_SSE42 = (1 << 7), + VEC_CPU_HAS_AVX = (1 << 8), + VEC_CPU_HAS_AVX2 = (1 << 9), + VEC_CPU_HAS_AVX512F = (1 << 10), + VEC_CPU_HAS_NEON = (1 << 11), +}; + +#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF) + +VEC_FUNC_IMPL uint32_t vec_get_CPU_features(void) +{ + static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET; + if (vec_CPU_features == VEC_CPU_FEATURES_RESET) { + vec_CPU_get_CPUID_features(); + vec_CPU_features = 0; + if (vec_CPU_have_ALTIVEC()) + vec_CPU_features |= VEC_CPU_HAS_ALTIVEC; + if (vec_CPU_have_ALTIVEC_VSX()) + vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX; + if (vec_CPU_have_MMX()) + vec_CPU_features |= VEC_CPU_HAS_MMX; + if (vec_CPU_have_SSE()) + vec_CPU_features |= VEC_CPU_HAS_SSE; + if (vec_CPU_have_SSE2()) + vec_CPU_features |= VEC_CPU_HAS_SSE2; + if (vec_CPU_have_SSE3()) + vec_CPU_features |= VEC_CPU_HAS_SSE3; + if (vec_CPU_have_SSE41()) + vec_CPU_features |= VEC_CPU_HAS_SSE41; + if (vec_CPU_have_SSE42()) + vec_CPU_features |= VEC_CPU_HAS_SSE42; + if (vec_CPU_have_AVX()) + vec_CPU_features |= VEC_CPU_HAS_AVX; + if (vec_CPU_have_AVX2()) + vec_CPU_features |= VEC_CPU_HAS_AVX2; + if (vec_CPU_have_AVX512F()) + vec_CPU_features |= VEC_CPU_HAS_AVX512F; + if (vec_CPU_have_NEON()) + vec_CPU_features |= VEC_CPU_HAS_NEON; + } + return vec_CPU_features; +} + +#endif /* VEC_CPU_H_ */
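This header is meant to pair with the multiple-translation-unit scheme the README describes; a sketch of runtime dispatch (the sum_* variants are hypothetical, each assumed to be compiled in its own translation unit with different vector flags):

    #include <stddef.h>
    #include <stdint.h>
    #include "vec/cpu.h"

    int64_t sum_generic(const int32_t *a, size_t n); /* hypothetical */
    int64_t sum_avx2(const int32_t *a, size_t n);    /* hypothetical */

    int64_t sum(const int32_t *a, size_t n)
    {
        /* cache this mask at startup in real code; see the README note */
        uint32_t feat = vec_get_CPU_features();
        return (feat & VEC_CPU_HAS_AVX2) ? sum_avx2(a, n) : sum_generic(a, n);
    }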
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/defs.h Sat Apr 26 15:31:39 2025 -0400 @@ -0,0 +1,126 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024-2025 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_DEFS_H_ +#define VEC_DEFS_H_ + +#include <string.h> +#include <stdlib.h> + +#ifdef VEC_CUSTOM_INTEGER_TYPEDEF +/* we already have custom integer typedefs; */ +# include "impl/integer.h" +#else +# if __cplusplus >= (201103L) +# include <cstdint> +# include <cstddef> +typedef std::size_t vec_uintsize; + +typedef std::uint8_t vec_uint8; +typedef std::uint16_t vec_uint16; +typedef std::uint32_t vec_uint32; +typedef std::uint64_t vec_uint64; +typedef std::uintmax_t vec_uintmax; +typedef std::uintptr_t vec_uintptr; + +typedef std::int8_t vec_int8; +typedef std::int16_t vec_int16; +typedef std::int32_t vec_int32; +typedef std::int64_t vec_int64; +typedef std::intmax_t vec_intmax; +# elif __STDC_VERSION__ >= 199901L +# include <stdint.h> +# include <stddef.h> +typedef uint8_t vec_uint8; +typedef uint16_t vec_uint16; +typedef uint32_t vec_uint32; +typedef uint64_t vec_uint64; +typedef uintmax_t vec_uintmax; +typedef uintptr_t vec_uintptr; +typedef size_t vec_uintsize; +typedef int8_t vec_int8; +typedef int16_t vec_int16; +typedef int32_t vec_int32; +typedef int64_t vec_int64; +typedef intmax_t vec_intmax; +# else +# error Unable to find integer types with known size. +# endif +#endif + +#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ + (((a) >= (x)) && \ + ((a) > x || (b) >= (y)) && \ + ((a) > x || (b) > (y) || (c) >= (z))) + +#ifdef __GNUC__ +# define VEC_GNUC_ATLEAST(x, y, z) \ + VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z) +#else +# define VEC_GNUC_ATLEAST(x, y, z) (0) +#endif + +/* GCC/clang attributes */ +#if defined(__has_attribute) +# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x) +#else +# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch) +#endif + +#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) +# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg) +#elif (__STDC_VERSION__ >= 201112L) +# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg) +#else +# define VEC_STATIC_ASSERT(x, msg) \ + extern int (*vec_impl_Static_assert_function_(void)) \ + [!!sizeof (struct { int __error_if_negative: (x) ? 
2 : -1; })] +#endif + +#if VEC_GNUC_HAS_ATTRIBUTE(__always_inline__, 4, 0, 0) +# define VEC_ALWAYS_INLINE __attribute__((__always_inline__)) +#else +# define VEC_ALWAYS_INLINE +#endif + +#define VEC_FUNC_IMPL static inline VEC_ALWAYS_INLINE + +/* --------------------------------------------------------------- */ +/* Get maximum value for type */ + +#define VEC_TYPE_SIGNED(t) (((t)(-1)) < ((t)0)) + +#define VEC_MAX_EX(t, TOPBIT) \ + (((0x1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL) | \ + ((TOPBIT) << ((sizeof(t) * 8ULL) - 4ULL))) + +#define VEC_MAX_OF_UNSIGNED(t) VEC_MAX_EX(t, 0xFULL) +#define VEC_MAX_OF_SIGNED(t) VEC_MAX_EX(t, 0x7ULL) + +#define VEC_MAX_OF_TYPE(t) \ + ((unsigned long long)(VEC_TYPE_SIGNED(t) \ + ? VEC_MAX_OF_SIGNED(t) \ + : VEC_MAX_OF_UNSIGNED(t))) + +#endif /* VEC_DEFS_H */
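The limit macros at the bottom of defs.h derive a type's maximum from its size and signedness alone; a few illustrative compile-time checks (a sketch, not tests shipped with vec):

    #include "vec/defs.h"

    /* VEC_MAX_EX fills the low bits, then ORs in the top nibble:
     * 0xF for unsigned types (sign bit set), 0x7 for signed (sign bit clear) */
    VEC_STATIC_ASSERT(VEC_TYPE_SIGNED(vec_int8), "int8 is signed");
    VEC_STATIC_ASSERT(!VEC_TYPE_SIGNED(vec_uint8), "uint8 is unsigned");
    VEC_STATIC_ASSERT(VEC_MAX_OF_TYPE(vec_uint8) == 255ULL, "uint8 max");
    VEC_STATIC_ASSERT(VEC_MAX_OF_TYPE(vec_int8) == 127ULL, "int8 max");
    VEC_STATIC_ASSERT(VEC_MAX_OF_TYPE(vec_uint16) == 65535ULL, "uint16 max");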
--- a/include/vec/impl/cpu.h Sat Apr 26 02:54:44 2025 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,514 +0,0 @@ -/** - * vec - a tiny SIMD vector library in C99 - * - * Copyright (c) 2024-2025 Paper - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. -**/ - -#ifndef VEC_IMPL_CPU_H_ -#define VEC_IMPL_CPU_H_ - -/* Detect CPU SIMD support. Much of this code was stolen from SDL. - * - * Simple DirectMedia Layer - * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org> - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. 
-*/ - -#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) -# include <sys/sysctl.h> // For AltiVec check -#elif defined(__OpenBSD__) && defined(__powerpc__) -# include <sys/types.h> -# include <sys/sysctl.h> // For AltiVec check -# include <machine/cpu.h> -#elif defined(__FreeBSD__) && defined(__powerpc__) -# include <machine/cpu.h> -# include <sys/auxv.h> -#elif defined(__ALTIVEC__) -# include <signal.h> -# include <setjmp.h> -#endif - -#ifdef __FreeBSD__ -# include <sys/param.h> -#endif - -#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__) -# include <unistd.h> -# include <sys/types.h> -# include <sys/stat.h> -# include <fcntl.h> -# include <elf.h> - -/*#include <asm/hwcap.h>*/ -# ifndef AT_HWCAP -# define AT_HWCAP 16 -# endif -# ifndef AT_PLATFORM -# define AT_PLATFORM 15 -# endif -# ifndef HWCAP_NEON -# define HWCAP_NEON (1 << 12) -# endif -#endif - -static inline int vec_CPU_have_CPUID(void) -{ - int has_CPUID = 0; - -#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) - __asm__ ( -" pushfl # Get original EFLAGS \n" -" popl %%eax \n" -" movl %%eax,%%ecx \n" -" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" -" pushl %%eax # Save new EFLAGS value on stack \n" -" popfl # Replace current EFLAGS value \n" -" pushfl # Get new EFLAGS \n" -" popl %%eax # Store new EFLAGS in EAX \n" -" xorl %%ecx,%%eax # Can not toggle ID bit, \n" -" jz 1f # Processor=80486 \n" -" movl $1,%0 # We have CPUID support \n" -"1: \n" - : "=m" (has_CPUID) - : - : "%eax", "%ecx" - ); -#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) -/* Technically, if this is being compiled under __x86_64__ then it has - CPUid by definition. But it's nice to be able to prove it. :) */ - __asm__ ( -" pushfq # Get original EFLAGS \n" -" popq %%rax \n" -" movq %%rax,%%rcx \n" -" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n" -" pushq %%rax # Save new EFLAGS value on stack \n" -" popfq # Replace current EFLAGS value \n" -" pushfq # Get new EFLAGS \n" -" popq %%rax # Store new EFLAGS in EAX \n" -" xorl %%ecx,%%eax # Can not toggle ID bit, \n" -" jz 1f # Processor=80486 \n" -" movl $1,%0 # We have CPUID support \n" -"1: \n" - : "=m" (has_CPUID) - : - : "%rax", "%rcx" - ); -#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) - __asm { - pushfd ; Get original EFLAGS - pop eax - mov ecx, eax - xor eax, 200000h ; Flip ID bit in EFLAGS - push eax ; Save new EFLAGS value on stack - popfd ; Replace current EFLAGS value - pushfd ; Get new EFLAGS - pop eax ; Store new EFLAGS in EAX - xor eax, ecx ; Can not toggle ID bit, - jz done ; Processor=80486 - mov has_CPUID,1 ; We have CPUID support -done: - } -#elif defined(_MSC_VER) && defined(_M_X64) - has_CPUID = 1; -#elif defined(__sun) && defined(__i386) - __asm ( -" pushfl \n" -" popl %eax \n" -" movl %eax,%ecx \n" -" xorl $0x200000,%eax \n" -" pushl %eax \n" -" popfl \n" -" pushfl \n" -" popl %eax \n" -" xorl %ecx,%eax \n" -" jz 1f \n" -" movl $1,-8(%ebp) \n" -"1: \n" - ); -#elif defined(__sun) && defined(__amd64) - __asm ( -" pushfq \n" -" popq %rax \n" -" movq %rax,%rcx \n" -" xorl $0x200000,%eax \n" -" pushq %rax \n" -" popfq \n" -" pushfq \n" -" popq %rax \n" -" xorl %ecx,%eax \n" -" jz 1f \n" -" movl $1,-8(%rbp) \n" -"1: \n" - ); -#endif - - return has_CPUID; -} - -#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__) -# define VEC_CPU_CPUID(func, a, b, c, d) \ - __asm__ __volatile__( \ - " pushl %%ebx \n" \ - " xorl %%ecx,%%ecx \n" \ - " cpuid \n" \ - " movl %%ebx, %%esi \n" \ - " popl %%ebx 
\n" \ - : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ - : "a"(func)) -#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__) -# define VEC_CPU_CPUID(func, a, b, c, d) \ - __asm__ __volatile__( \ - " pushq %%rbx \n" \ - " xorq %%rcx,%%rcx \n" \ - " cpuid \n" \ - " movq %%rbx, %%rsi \n" \ - " popq %%rbx \n" \ - : "=a"(a), "=S"(b), "=c"(c), "=d"(d) \ - : "a"(func)) -#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) -# define VEC_CPU_CPUID(func, a, b, c, d) \ - __asm { \ - __asm mov eax, func \ - __asm xor ecx, ecx \ - __asm cpuid \ - __asm mov a, eax \ - __asm mov b, ebx \ - __asm mov c, ecx \ - __asm mov d, edx \ - } -#elif (defined(_MSC_VER) && defined(_M_X64)) -// Use __cpuidex instead of __cpuid because ICL does not clear ecx register -# define VEC_CPU_CPUID(func, a, b, c, d) \ - do { \ - int CPUInfo[4]; \ - __cpuidex(CPUInfo, func, 0); \ - a = CPUInfo[0]; \ - b = CPUInfo[1]; \ - c = CPUInfo[2]; \ - d = CPUInfo[3]; \ - } while (0) -#else -# define VEC_CPU_CPUID(func, a, b, c, d) \ - do { \ - a = b = c = d = 0; \ - (void)a; \ - (void)b; \ - (void)c; \ - (void)d; \ - } while (0) -#endif - -// --------------------------------------------------------------- - -static int vec_CPU_CPUIDFeatures[4]; -static int vec_CPU_CPUIDMaxFunction = 0; -static int vec_CPU_OSSavesYMM = 0; -static int vec_CPU_OSSavesZMM = 0; - -static inline void vec_CPU_get_CPUID_features(void) -{ - static int checked = 0; - if (!checked) { - checked = 1; - if (vec_CPU_have_CPUID()) { - int a, b, c, d; - VEC_CPU_CPUID(0, a, b, c, d); - vec_CPU_CPUIDMaxFunction = a; - if (vec_CPU_CPUIDMaxFunction >= 1) { - VEC_CPU_CPUID(1, a, b, c, d); - vec_CPU_CPUIDFeatures[0] = a; - vec_CPU_CPUIDFeatures[1] = b; - vec_CPU_CPUIDFeatures[2] = c; - vec_CPU_CPUIDFeatures[3] = d; - - // Check to make sure we can call xgetbv - if (c & 0x08000000) { - // Call xgetbv to see if YMM (etc) register state is saved -#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__)) - __asm__(".byte 0x0f, 0x01, 0xd0" - : "=a"(a) - : "c"(0) - : "%edx"); -#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1 - a = (int)_xgetbv(0); -#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__) - __asm { - xor ecx, ecx - _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 - mov a, eax - } -#endif - vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0; - vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 
1 : 0; - } - } - } - } -} - -#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) -static jmp_buf vec_jmpbuf; -static void vec_CPU_illegal_instruction(int sig) -{ - longjmp(vec_jmpbuf, 1); -} -#endif - -static int vec_CPU_have_ALTIVEC(void) -{ - volatile int altivec = 0; -#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__)) - int selectors[2] = { -# ifdef __OpenBSD__ - CTL_MACHDEP, CPU_ALTIVEC -# else - CTL_HW, HW_VECTORUNIT -# endif - }; - int hasVectorUnit = 0; - vec_uintsize length = sizeof(hasVectorUnit); - int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); - if (!error) - altivec = (hasVectorUnit != 0); -#elif defined(__FreeBSD__) && defined(__powerpc__) - unsigned long cpufeatures = 0; - elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures)); - altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC; -#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__) - void (*handler)(int sig); - handler = signal(SIGILL, vec_CPU_illegal_instruction); - if (!setjmp(vec_jmpbuf)) { - vector unsigned char vec; - vec_and(vec, vec); - altivec = 1; - } - signal(SIGILL, handler); -#endif - return altivec; -} - -static int vec_CPU_have_ALTIVEC_VSX(void) -{ - volatile int vsx = 0; -#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__) -# warning Compiling UNTESTED code for VSX. - void (*handler)(int sig); - handler = signal(SIGILL, vec_CPU_illegal_instruction); - if (!setjmp(vec_jmpbuf)) { - // this is completely untested - //__asm__ __volatile__("mtspr 256, %0\n\t" - // "xxland %%v0, %%v0, %%v0" ::"r"(-1)); - //vsx = 1; - } - signal(SIGILL, handler); -#endif - return vsx; -} - -#define vec_CPU_have_MMX() (vec_CPU_CPUIDFeatures[3] & 0x00800000) -#define vec_CPU_have_SSE() (vec_CPU_CPUIDFeatures[3] & 0x02000000) -#define vec_CPU_have_SSE2() (vec_CPU_CPUIDFeatures[3] & 0x04000000) -#define vec_CPU_have_SSE3() (vec_CPU_CPUIDFeatures[2] & 0x00000001) -#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000) -#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000) -#define vec_CPU_have_AVX() (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000)) - -static inline int vec_CPU_have_AVX2(void) -{ - if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { - int a, b, c, d; - VEC_CPU_CPUID(7, a, b, c, d); - return b & 0x00000020; - (void)a, (void)c, (void)d; - } - return 0; -} - -static inline int vec_CPU_have_AVX512F(void) -{ - if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) { - int a, b, c, d; - VEC_CPU_CPUID(7, a, b, c, d); - return b & 0x00000020; - (void)a, (void)c, (void)d; - } - return 0; -} - -#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL) -static int readProcAuxvForNeon(void) -{ - int neon = 0; - int fd; - - fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); - if (fd >= 0) { - Elf32_auxv_t aux; - while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) { - if (aux.a_type == AT_HWCAP) { - neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON; - break; - } - } - close(fd); - } - return neon; -} -#endif - -static int vec_CPU_have_NEON(void) -{ -/* The way you detect NEON is a privileged instruction on ARM, so you have - query the OS kernel in a platform-specific way. 
:/ */ -#if defined(SDL_CPUINFO_DISABLED) - return 0; /* disabled */ -#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64)) -/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */ -/* Seems to have been removed */ -#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE -#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19 -#endif - /* All WinRT ARM devices are required to support NEON, but just in case. */ - return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0; -#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__) - return 1; /* ARMv8 always has non-optional NEON support. */ -#elif defined(__VITA__) - return 1; -#elif defined(__3DS__) - return 0; -#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) - /* (note that sysctlbyname("hw.optional.neon") doesn't work!) */ - return 1; /* all Apple ARMv7 chips and later have NEON. */ -#elif defined(__APPLE__) - return 0; /* assume anything else from Apple doesn't have NEON. */ -#elif !defined(__arm__) - return 0; /* not an ARM CPU at all. */ -#elif defined(__OpenBSD__) - return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */ -#elif defined(HAVE_ELF_AUX_INFO) - unsigned long hasneon = 0; - if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0) - return 0; - - return ((hasneon & HWCAP_NEON) == HWCAP_NEON); -#elif defined(__QNXNTO__) - return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON; -#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL) - return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON; -#elif defined(__linux__) - return readProcAuxvForNeon(); -#elif defined(__ANDROID__) - /* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */ - { - AndroidCpuFamily cpu_family = android_getCpuFamily(); - if (cpu_family == ANDROID_CPU_FAMILY_ARM) { - uint64_t cpu_features = android_getCpuFeatures(); - if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) { - return 1; - } - } - return 0; - } -#elif defined(__RISCOS__) - /* Use the VFPSupport_Features SWI to access the MVFR registers */ - { - _kernel_swi_regs regs; - regs.r[0] = 0; - if (_kernel_swi(VFPSupport_Features, ®s, ®s) == NULL) { - if ((regs.r[2] & 0xFFF000) == 0x111000) { - return 1; - } - } - return 0; - } -#else -#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me. 
- return 0; -#endif -} - -enum { - VEC_CPU_HAS_ALTIVEC = (1 << 0), - VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1), - VEC_CPU_HAS_MMX = (1 << 2), - VEC_CPU_HAS_SSE = (1 << 3), - VEC_CPU_HAS_SSE2 = (1 << 4), - VEC_CPU_HAS_SSE3 = (1 << 5), - VEC_CPU_HAS_SSE41 = (1 << 6), - VEC_CPU_HAS_SSE42 = (1 << 7), - VEC_CPU_HAS_AVX = (1 << 8), - VEC_CPU_HAS_AVX2 = (1 << 9), - VEC_CPU_HAS_AVX512F = (1 << 10), - VEC_CPU_HAS_NEON = (1 << 11), -}; - -#define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF) - -VEC_FUNC_IMPL uint32_t vec_get_CPU_features(void) -{ - static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET; - if (vec_CPU_features == VEC_CPU_FEATURES_RESET) { - vec_CPU_get_CPUID_features(); - vec_CPU_features = 0; - if (vec_CPU_have_ALTIVEC()) - vec_CPU_features |= VEC_CPU_HAS_ALTIVEC; - if (vec_CPU_have_ALTIVEC_VSX()) - vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX; - if (vec_CPU_have_MMX()) - vec_CPU_features |= VEC_CPU_HAS_MMX; - if (vec_CPU_have_SSE()) - vec_CPU_features |= VEC_CPU_HAS_SSE; - if (vec_CPU_have_SSE2()) - vec_CPU_features |= VEC_CPU_HAS_SSE2; - if (vec_CPU_have_SSE3()) - vec_CPU_features |= VEC_CPU_HAS_SSE3; - if (vec_CPU_have_SSE41()) - vec_CPU_features |= VEC_CPU_HAS_SSE41; - if (vec_CPU_have_SSE42()) - vec_CPU_features |= VEC_CPU_HAS_SSE42; - if (vec_CPU_have_AVX()) - vec_CPU_features |= VEC_CPU_HAS_AVX; - if (vec_CPU_have_AVX2()) - vec_CPU_features |= VEC_CPU_HAS_AVX2; - if (vec_CPU_have_AVX512F()) - vec_CPU_features |= VEC_CPU_HAS_AVX512F; - if (vec_CPU_have_NEON()) - vec_CPU_features |= VEC_CPU_HAS_NEON; - } - return vec_CPU_features; -} - -#endif /* VEC_IMPL_CPU_H_ */
--- a/include/vec/impl/generic.h Sat Apr 26 02:54:44 2025 -0400 +++ b/include/vec/impl/generic.h Sat Apr 26 15:31:39 2025 -0400 @@ -28,10 +28,6 @@ #ifndef VEC_IMPL_GENERIC_H_ #define VEC_IMPL_GENERIC_H_ -#include <string.h> - -// ----------------------------------------------------------------- - #define VEC_GENERIC_OPERATION(op, sign, bits, size) \ do { \ int i; \
--- a/include/vec/impl/ppc/altivec.h Sat Apr 26 02:54:44 2025 -0400 +++ b/include/vec/impl/ppc/altivec.h Sat Apr 26 15:31:39 2025 -0400 @@ -27,8 +27,6 @@ #ifndef VEC_IMPL_PPC_ALTIVEC_H_ #define VEC_IMPL_PPC_ALTIVEC_H_ -#include <altivec.h> - /* GCC 4.2.1 on Mac OS X doesn't have these for some reason */ #ifdef vec_mul # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
--- a/include/vec/impl/x86/avx2.h Sat Apr 26 02:54:44 2025 -0400 +++ b/include/vec/impl/x86/avx2.h Sat Apr 26 15:31:39 2025 -0400 @@ -157,6 +157,14 @@ /* -------------------------------------------------------------------- */ /* generic ops */ +#define VEC_AVX2_SPLAT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx2 = _mm256_set1_epi##bits(x); \ + return vec; \ + } + #define VEC_AVX2_LOAD_ALIGNED(sign, bits, size) \ VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ { \ @@ -195,6 +203,11 @@ /* ------------------------------------------------------------------------ */ /* 8x32 */ +#ifndef VINT8x32_SPLAT_DEFINED +VEC_AVX2_SPLAT(/* nothing */, 8, 32) +# define VINT8x32_SPLAT_DEFINED +#endif + #ifndef VINT8x32_LOAD_ALIGNED_DEFINED VEC_AVX2_LOAD_ALIGNED(/* nothing */, 8, 32) # define VINT8x32_LOAD_ALIGNED_DEFINED @@ -242,6 +255,11 @@ /* u8x32 */ +#ifndef VUINT8x32_SPLAT_DEFINED +VEC_AVX2_SPLAT(u, 8, 32) +# define VUINT8x32_SPLAT_DEFINED +#endif + #ifndef VUINT8x32_LOAD_ALIGNED_DEFINED VEC_AVX2_LOAD_ALIGNED(u, 8, 32) # define VUINT8x32_LOAD_ALIGNED_DEFINED @@ -290,6 +308,11 @@ /* ------------------------------------------------------------------------ */ /* 16x16 */ +#ifndef VINT16x16_SPLAT_DEFINED +VEC_AVX2_SPLAT(/* nothing */, 16, 16) +# define VINT16x16_SPLAT_DEFINED +#endif + #ifndef VINT16x16_LOAD_ALIGNED_DEFINED VEC_AVX2_LOAD_ALIGNED(/* nothing */, 16, 16) # define VINT16x16_LOAD_ALIGNED_DEFINED @@ -337,6 +360,11 @@ /* u16x16 */ +#ifndef VUINT16x16_SPLAT_DEFINED +VEC_AVX2_SPLAT(u, 16, 16) +# define VUINT16x16_SPLAT_DEFINED +#endif + #ifndef VUINT16x16_LOAD_ALIGNED_DEFINED VEC_AVX2_LOAD_ALIGNED(u, 16, 16) # define VUINT16x16_LOAD_ALIGNED_DEFINED @@ -385,6 +413,11 @@ /* ------------------------------------------------------------------------ */ /* 32x8 */ +#ifndef VINT32x8_SPLAT_DEFINED +VEC_AVX2_SPLAT(/* nothing */, 32, 8) +# define VINT32x8_SPLAT_DEFINED +#endif + #ifndef VINT32x8_LOAD_ALIGNED_DEFINED VEC_AVX2_LOAD_ALIGNED(/* nothing */, 32, 8) # define VINT32x8_LOAD_ALIGNED_DEFINED @@ -432,6 +465,11 @@ /* u32x8 */ +#ifndef VUINT32x8_SPLAT_DEFINED +VEC_AVX2_SPLAT(u, 32, 8) +# define VUINT32x8_SPLAT_DEFINED +#endif + #ifndef VUINT32x8_LOAD_ALIGNED_DEFINED VEC_AVX2_LOAD_ALIGNED(u, 32, 8) # define VUINT32x8_LOAD_ALIGNED_DEFINED
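Like the neighboring op macros, VEC_AVX2_SPLAT builds both the function name and the intrinsic by token pasting; for instance the VEC_AVX2_SPLAT(u, 32, 8) instantiation above expands to, roughly:

    VEC_FUNC_IMPL vuint32x8 vuint32x8_splat(vec_uint32 x)
    {
        vuint32x8 vec;
        vec.avx2 = _mm256_set1_epi32(x); /* broadcast x to all 8 lanes */
        return vec;
    }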
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/x86/avx512bw.h Sat Apr 26 15:31:39 2025 -0400 @@ -0,0 +1,166 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024-2025 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_X86_AVX512BW_H_ +#define VEC_IMPL_X86_AVX512BW_H_ + +#define VEC_AVX512BW_OP(SIGN, BITS, SIZE, INTLSIGN, SECONDSIGN, OP, NAME) \ + VEC_FUNC_IMPL v##SIGN##int##BITS##x##SIZE v##SIGN##int##BITS##x##SIZE##_##NAME(v##SIGN##int##BITS##x##SIZE vec1, v##SECONDSIGN##int##BITS##x##SIZE vec2) \ + { \ + vec1.avx512f = _mm512_##OP##_ep##INTLSIGN##BITS(vec1.avx512f, vec2.avx512f); \ + \ + return vec1; \ + } + +/* -- 8x64 */ + +#ifndef VINT8x64_MIN_DEFINED +VEC_AVX512BW_OP(, 8, 64, i, , min, min) +# define VINT8x64_MIN_DEFINED +#endif + +#ifndef VINT8x64_MAX_DEFINED +VEC_AVX512BW_OP(, 8, 64, i, , max, max) +# define VINT8x64_MAX_DEFINED +#endif + +#ifndef VINT8x64_ADD_DEFINED +VEC_AVX512BW_OP(, 8, 64, i, , add, add) +# define VINT8x64_ADD_DEFINED +#endif + +#ifndef VINT8x64_SUB_DEFINED +VEC_AVX512BW_OP(, 8, 64, i, , sub, sub) +# define VINT8x64_SUB_DEFINED +#endif + +/* -- unsigned */ + +#ifndef VUINT8x64_MIN_DEFINED +VEC_AVX512BW_OP(u, 8, 64, u, u, min, min) +# define VUINT8x64_MIN_DEFINED +#endif + +#ifndef VUINT8x64_MAX_DEFINED +VEC_AVX512BW_OP(u, 8, 64, u, u, max, max) +# define VUINT8x64_MAX_DEFINED +#endif + +#ifndef VUINT8x64_ADD_DEFINED +VEC_AVX512BW_OP(u, 8, 64, i, u, add, add) +# define VUINT8x64_ADD_DEFINED +#endif + +#ifndef VUINT8x64_SUB_DEFINED +VEC_AVX512BW_OP(u, 8, 64, i, u, sub, sub) +# define VUINT8x64_SUB_DEFINED +#endif + +/* -- 16x32 */ + +#ifndef VINT16x32_MIN_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, , min, min) +# define VINT16x32_MIN_DEFINED +#endif + +#ifndef VINT16x32_MAX_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, , max, max) +# define VINT16x32_MAX_DEFINED +#endif + +#ifndef VINT16x32_ADD_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, , add, add) +# define VINT16x32_ADD_DEFINED +#endif + +#ifndef VINT16x32_SUB_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, , sub, sub) +# define VINT16x32_SUB_DEFINED +#endif + +#ifndef VINT16x32_MUL_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, , mullo, mul) +# define VINT16x32_MUL_DEFINED +#endif + +#ifndef VINT16x32_LSHIFT_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, u, sllv, lshift) +# define VINT16x32_LSHIFT_DEFINED +#endif + +#ifndef VINT16x32_RSHIFT_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, u, srav, rshift) +# define VINT16x32_RSHIFT_DEFINED +#endif + +#ifndef 
VINT16x32_LRSHIFT_DEFINED +VEC_AVX512BW_OP(, 16, 32, i, u, srlv, lrshift) +# define VINT16x32_LRSHIFT_DEFINED +#endif + +/* -- unsigned */ + +#ifndef VUINT16x32_MIN_DEFINED +VEC_AVX512BW_OP(u, 16, 32, u, u, min, min) +# define VUINT16x32_MIN_DEFINED +#endif + +#ifndef VUINT16x32_MAX_DEFINED +VEC_AVX512BW_OP(u, 16, 32, u, u, max, max) +# define VUINT16x32_MAX_DEFINED +#endif + +#ifndef VUINT16x32_ADD_DEFINED +VEC_AVX512BW_OP(u, 16, 32, i, u, add, add) +# define VUINT16x32_ADD_DEFINED +#endif + +#ifndef VUINT16x32_SUB_DEFINED +VEC_AVX512BW_OP(u, 16, 32, i, u, sub, sub) +# define VUINT16x32_SUB_DEFINED +#endif + +#ifndef VUINT16x32_MUL_DEFINED +VEC_AVX512BW_OP(u, 16, 32, i, u, mullo, mul) +# define VUINT16x32_MUL_DEFINED +#endif + +#ifndef VUINT16x32_LSHIFT_DEFINED +VEC_AVX512BW_OP(u, 16, 32, i, u, sllv, lshift) +# define VUINT16x32_LSHIFT_DEFINED +#endif + +#ifndef VUINT16x32_RSHIFT_DEFINED +VEC_AVX512BW_OP(u, 16, 32, i, u, srlv, rshift) +# define VUINT16x32_RSHIFT_DEFINED +#endif + +#ifndef VUINT16x32_LRSHIFT_DEFINED +VEC_AVX512BW_OP(u, 16, 32, i, u, srlv, lrshift) +# define VUINT16x32_LRSHIFT_DEFINED +#endif + +/* no mul for 8-bit */ + +#endif /* VEC_IMPL_X86_AVX512BW_H_ */ \ No newline at end of file
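VEC_AVX512BW_OP follows the same pasting scheme; INTLSIGN picks the epi/epu intrinsic suffix independently of the vector's own signedness, and SECONDSIGN the type of the second operand (shift counts are unsigned). The VEC_AVX512BW_OP(, 16, 32, i, , mullo, mul) instantiation, for example, expands to roughly:

    VEC_FUNC_IMPL vint16x32 vint16x32_mul(vint16x32 vec1, vint16x32 vec2)
    {
        /* 32-lane 16-bit multiply, keeping the low half of each product */
        vec1.avx512f = _mm512_mullo_epi16(vec1.avx512f, vec2.avx512f);
        return vec1;
    }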
--- a/include/vec/impl/x86/avx512f.h Sat Apr 26 02:54:44 2025 -0400 +++ b/include/vec/impl/x86/avx512f.h Sat Apr 26 15:31:39 2025 -0400 @@ -133,6 +133,14 @@ /* ------------------------------------------------------------------------ */ +#define VEC_AVX512F_SPLAT(sign, bits, size) \ + VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.avx512f = _mm512_set1_epi##bits(x); \ + return vec; \ + } + #define VEC_AVX512F_LOAD_ALIGNED(sign, bits, size) \ VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_load_aligned(const vec_##sign##int##bits in[size]) \ { \ @@ -168,6 +176,183 @@ return vec1; \ } +/* ------------------------------------------------------------------------ */ +/* 8x64; there is very little we can do here */ + +#ifndef VINT8x64_SPLAT_DEFINED +VEC_AVX512F_SPLAT(, 8, 64) +# define VINT8x64_SPLAT_DEFINED +#endif + +#ifndef VINT8x64_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(, 8, 64) +# define VINT8x64_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT8x64_LOAD_DEFINED +VEC_AVX512F_LOAD(, 8, 64) +# define VINT8x64_LOAD_DEFINED +#endif + +#ifndef VINT8x64_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(, 8, 64) +# define VINT8x64_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT8x64_STORE_DEFINED +VEC_AVX512F_STORE(, 8, 64) +# define VINT8x64_STORE_DEFINED +#endif + +#ifndef VINT8x64_AND_DEFINED +VEC_AVX512F_BITWISE(and, , 8, 64) +# define VINT8x64_AND_DEFINED +#endif + +#ifndef VINT8x64_OR_DEFINED +VEC_AVX512F_BITWISE(or, , 8, 64) +# define VINT8x64_OR_DEFINED +#endif + +#ifndef VINT8x64_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, , 8, 64) +# define VINT8x64_XOR_DEFINED +#endif + +/* ---- unsigned */ + +#ifndef VUINT8x64_SPLAT_DEFINED +VEC_AVX512F_SPLAT(u, 8, 64) +# define VUINT8x64_SPLAT_DEFINED +#endif + +#ifndef VUINT8x64_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(u, 8, 64) +# define VUINT8x64_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x64_LOAD_DEFINED +VEC_AVX512F_LOAD(u, 8, 64) +# define VUINT8x64_LOAD_DEFINED +#endif + +#ifndef VUINT8x64_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(u, 8, 64) +# define VUINT8x64_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT8x64_STORE_DEFINED +VEC_AVX512F_STORE(u, 8, 64) +# define VUINT8x64_STORE_DEFINED +#endif + +#ifndef VUINT8x64_AND_DEFINED +VEC_AVX512F_BITWISE(and, u, 8, 64) +# define VUINT8x64_AND_DEFINED +#endif + +#ifndef VUINT8x64_OR_DEFINED +VEC_AVX512F_BITWISE(or, u, 8, 64) +# define VUINT8x64_OR_DEFINED +#endif + +#ifndef VUINT8x64_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, u, 8, 64) +# define VUINT8x64_XOR_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ +/* 16x32; there is very little we can do here */ + +#ifndef VINT16x32_SPLAT_DEFINED +VEC_AVX512F_SPLAT(, 16, 32) +# define VINT16x32_SPLAT_DEFINED +#endif + +#ifndef VINT16x32_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(, 16, 32) +# define VINT16x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VINT16x32_LOAD_DEFINED +VEC_AVX512F_LOAD(, 16, 32) +# define VINT16x32_LOAD_DEFINED +#endif + +#ifndef VINT16x32_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(, 16, 32) +# define VINT16x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VINT16x32_STORE_DEFINED +VEC_AVX512F_STORE(, 16, 32) +# define VINT16x32_STORE_DEFINED +#endif + +#ifndef VINT16x32_AND_DEFINED +VEC_AVX512F_BITWISE(and, , 16, 32) +# define VINT16x32_AND_DEFINED +#endif + +#ifndef VINT16x32_OR_DEFINED +VEC_AVX512F_BITWISE(or, , 16, 32) +# define VINT16x32_OR_DEFINED 
+#endif + +#ifndef VINT16x32_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, , 16, 32) +# define VINT16x32_XOR_DEFINED +#endif + +/* ---- unsigned */ + +#ifndef VUINT16x32_SPLAT_DEFINED +VEC_AVX512F_SPLAT(u, 16, 32) +# define VUINT16x32_SPLAT_DEFINED +#endif + +#ifndef VUINT16x32_LOAD_ALIGNED_DEFINED +VEC_AVX512F_LOAD_ALIGNED(u, 16, 32) +# define VUINT16x32_LOAD_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x32_LOAD_DEFINED +VEC_AVX512F_LOAD(u, 16, 32) +# define VUINT16x32_LOAD_DEFINED +#endif + +#ifndef VUINT16x32_STORE_ALIGNED_DEFINED +VEC_AVX512F_STORE_ALIGNED(u, 16, 32) +# define VUINT16x32_STORE_ALIGNED_DEFINED +#endif + +#ifndef VUINT16x32_STORE_DEFINED +VEC_AVX512F_STORE(u, 16, 32) +# define VUINT16x32_STORE_DEFINED +#endif + +#ifndef VUINT16x32_AND_DEFINED +VEC_AVX512F_BITWISE(and, u, 16, 32) +# define VUINT16x32_AND_DEFINED +#endif + +#ifndef VUINT16x32_OR_DEFINED +VEC_AVX512F_BITWISE(or, u, 16, 32) +# define VUINT16x32_OR_DEFINED +#endif + +#ifndef VUINT16x32_XOR_DEFINED +VEC_AVX512F_BITWISE(xor, u, 16, 32) +# define VUINT16x32_XOR_DEFINED +#endif + +/* ------------------------------------------------------------------------ */ + +#ifndef VINT32x16_SPLAT_DEFINED +VEC_AVX512F_SPLAT(, 32, 16) +# define VINT32x16_SPLAT_DEFINED +#endif + #ifndef VINT32x16_LOAD_ALIGNED_DEFINED VEC_AVX512F_LOAD_ALIGNED(, 32, 16) # define VINT32x16_LOAD_ALIGNED_DEFINED @@ -218,6 +403,11 @@ # define VINT32x16_XOR_DEFINED #endif +#ifndef VUINT32x16_SPLAT_DEFINED +VEC_AVX512F_SPLAT(u, 32, 16) +# define VUINT32x16_SPLAT_DEFINED +#endif + #ifndef VUINT32x16_LOAD_ALIGNED_DEFINED VEC_AVX512F_LOAD_ALIGNED(u, 32, 16) # define VUINT32x16_LOAD_ALIGNED_DEFINED @@ -268,6 +458,11 @@ # define VUINT32x16_XOR_DEFINED #endif +#ifndef VINT64x8_SPLAT_DEFINED +VEC_AVX512F_SPLAT(, 64, 8) +# define VINT64x8_SPLAT_DEFINED +#endif + #ifndef VINT64x8_LOAD_ALIGNED_DEFINED VEC_AVX512F_LOAD_ALIGNED(, 64, 8) # define VINT64x8_LOAD_ALIGNED_DEFINED @@ -318,6 +513,11 @@ # define VINT64x8_XOR_DEFINED #endif +#ifndef VUINT64x8_SPLAT_DEFINED +VEC_AVX512F_SPLAT(u, 64, 8) +# define VUINT64x8_SPLAT_DEFINED +#endif + #ifndef VUINT64x8_LOAD_ALIGNED_DEFINED VEC_AVX512F_LOAD_ALIGNED(u, 64, 8) # define VUINT64x8_LOAD_ALIGNED_DEFINED
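A short usage sketch for the new 512-bit splat (illustrative only; it assumes an -mavx512f build and vec's usual store_aligned(vector, array) argument order):

    #include "vec/vec.h"

    /* out must be aligned for the 32x16 type; vec_malloc guarantees that */
    void fill16(vec_int32 out[16], vec_int32 value)
    {
        vint32x16_store_aligned(vint32x16_splat(value), out);
    }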
--- a/include/vec/impl/x86/sse2.h Sat Apr 26 02:54:44 2025 -0400 +++ b/include/vec/impl/x86/sse2.h Sat Apr 26 15:31:39 2025 -0400 @@ -25,8 +25,6 @@ #ifndef VEC_IMPL_X86_SSE2_H_ #define VEC_IMPL_X86_SSE2_H_ -#include <emmintrin.h> - /* eh */ #define VEC_SSE2_SET1_8(x) _mm_set1_epi8(x) #define VEC_SSE2_SET1_16(x) _mm_set1_epi16(x)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/mem.h Sat Apr 26 15:31:39 2025 -0400 @@ -0,0 +1,123 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024-2025 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_MEM_H_ +#define VEC_MEM_H_ + +#include "defs.h" + +#define VEC_MALLOC_ALIGNMENT (256) + +VEC_STATIC_ASSERT(!(VEC_MALLOC_ALIGNMENT & (VEC_MALLOC_ALIGNMENT - 1)) && (VEC_MALLOC_ALIGNMENT > 0), "VEC_MALLOC_ALIGNMENT must be a power of two"); + +typedef unsigned char vec_alignment_type; + +VEC_STATIC_ASSERT(VEC_MALLOC_ALIGNMENT - 1 <= VEC_MAX_OF_TYPE(vec_alignment_type), "VEC_MALLOC_ALIGNMENT must fit in the pointer alignment data"); + +#define VEC_MALLOC_ADDITIONAL_SIZE (sizeof(vec_alignment_type) + (VEC_MALLOC_ALIGNMENT - 1)) +#define VEC_MALLOC_MAX_SIZE (SIZE_MAX - VEC_MALLOC_ADDITIONAL_SIZE) + +VEC_FUNC_IMPL void *vec_internal_align_ptr_(void *q) +{ + vec_alignment_type diff; + + diff = (((uintptr_t)q + (VEC_MALLOC_ALIGNMENT - 1)) & ~(VEC_MALLOC_ALIGNMENT - 1)) - (uintptr_t)q; + q = (char *)q + diff; + + memcpy((char *)q - sizeof(diff), &diff, sizeof(diff)); + + return q; +} + +/* reverses vec_internal_align_ptr_ */ +VEC_FUNC_IMPL void *vec_internal_unalign_ptr_(void *q) +{ + vec_alignment_type diff; + + memcpy(&diff, (char *)q - sizeof(diff), sizeof(diff)); + q = (char *)q - diff; + + return q; +} + +VEC_FUNC_IMPL void *vec_malloc(size_t size) +{ + void *q; + + if (size > VEC_MALLOC_MAX_SIZE) + return NULL; + + /* allocate space for the diff (we have to do this, + * for realloc has no way of knowing the original ptr) */ + q = malloc(size + VEC_MALLOC_ADDITIONAL_SIZE); + if (!q) + return NULL; + + return vec_internal_align_ptr_(q); +} + +VEC_FUNC_IMPL void *vec_calloc(size_t count, size_t nmemb) +{ + size_t size; + void *q; + + size = count * nmemb; + if ((size && size / count != nmemb) || size > VEC_MALLOC_MAX_SIZE) + return NULL; /* nope */ + + q = calloc(size + VEC_MALLOC_ADDITIONAL_SIZE, 1); + if (!q) + return NULL; + + return vec_internal_align_ptr_(q); +} + +VEC_FUNC_IMPL void *vec_realloc(void *ptr, size_t newsize) +{ + void *q; + + if (!ptr) + return vec_malloc(newsize); + + if (newsize > VEC_MALLOC_MAX_SIZE) + return NULL; + + q = realloc(vec_internal_unalign_ptr_(ptr), newsize + VEC_MALLOC_ADDITIONAL_SIZE); + if (!q) + return NULL; + + return vec_internal_align_ptr_(q); +} + +VEC_FUNC_IMPL void vec_free(void *ptr) +{ + if (ptr) + free(vec_internal_unalign_ptr_(ptr)); +} + +#endif /* VEC_MEM_H_ */
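A hedged usage sketch of the relocated allocator; the assertions just restate the alignment guarantee the header encodes:

    #include <assert.h>
    #include "vec/mem.h"

    int main(void)
    {
        vec_int32 *q = vec_malloc(1024 * sizeof(vec_int32));
        assert(q && (vec_uintptr)q % VEC_MALLOC_ALIGNMENT == 0);

        /* realloc keeps the guarantee: the stored offset byte lets it
         * recover the original malloc'd pointer first */
        q = vec_realloc(q, 2048 * sizeof(vec_int32));
        assert(q && (vec_uintptr)q % VEC_MALLOC_ALIGNMENT == 0);

        vec_free(q);
        return 0;
    }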
--- a/include/vec/vec.h Sat Apr 26 02:54:44 2025 -0400 +++ b/include/vec/vec.h Sat Apr 26 15:31:39 2025 -0400 @@ -25,116 +25,7 @@ #ifndef VEC_VEC_H_ #define VEC_VEC_H_ -#ifdef __cplusplus -extern "C" { -#endif - - -#ifdef VEC_HAVE_IMPL_INTEGER_H -# include "impl/integer.h" -#else -# if __cplusplus >= (201103L) -# include <cstdint> -# include <cstddef> -typedef std::size_t vec_uintsize; - -typedef std::uint8_t vec_uint8; -typedef std::uint16_t vec_uint16; -typedef std::uint32_t vec_uint32; -typedef std::uint64_t vec_uint64; -typedef std::uintmax_t vec_uintmax; -typedef std::uintptr_t vec_uintptr; - -typedef std::int8_t vec_int8; -typedef std::int16_t vec_int16; -typedef std::int32_t vec_int32; -typedef std::int64_t vec_int64; -typedef std::intmax_t vec_intmax; -# elif __STDC_VERSION__ >= 199901L -# include <stdint.h> -# include <stddef.h> -typedef uint8_t vec_uint8; -typedef uint16_t vec_uint16; -typedef uint32_t vec_uint32; -typedef uint64_t vec_uint64; -typedef uintmax_t vec_uintmax; -typedef uintptr_t vec_uintptr; -typedef size_t vec_uintsize; -typedef int8_t vec_int8; -typedef int16_t vec_int16; -typedef int32_t vec_int32; -typedef int64_t vec_int64; -typedef intmax_t vec_intmax; -# else -# error Unable to find integer types with known size. -# endif -#endif - -#include <string.h> -#include <stdlib.h> - -#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ - (((a) >= (x)) && \ - ((a) > x || (b) >= (y)) && \ - ((a) > x || (b) > (y) || (c) >= (z))) - -#ifdef __GNUC__ -# define VEC_GNUC_ATLEAST(x, y, z) \ - VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z) -#else -# define VEC_GNUC_ATLEAST(x, y, z) (0) -#endif - -/* GCC/clang attributes */ -#if defined(__has_attribute) -# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) __has_attribute(x) -#else -# define VEC_GNUC_HAS_ATTRIBUTE(x, major, minor, patch) VEC_GNUC_ATLEAST(major, minor, patch) -#endif - -#if (__cplusplus >= 201103L) || (__STDC_VERSION__ >= 202311L) -# define VEC_STATIC_ASSERT(x, msg) static_assert(x, msg) -#elif (__STDC_VERSION__ >= 201112L) -# define VEC_STATIC_ASSERT(x, msg) _Static_assert(x, msg) -#else -# define VEC_STATIC_ASSERT(x, msg) \ - extern int (*vec_impl_Static_assert_function_(void)) \ - [!!sizeof (struct { int __error_if_negative: (x) ? 2 : -1; })] -#endif - -#ifndef VEC_ASSERT -# ifndef VEC_DISABLE_ASSERTIONS -# include <assert.h> -# define VEC_ASSERT(x, msg) assert(msg && x) -# else -# define VEC_ASSERT(x, msg) -# endif -#endif - -#if VEC_GNUC_HAS_ATTRIBUTE(__always_inline__, 4, 0, 0) -# define VEC_ALWAYS_INLINE __attribute__((__always_inline__)) -#else -# define VEC_ALWAYS_INLINE -#endif - -#define VEC_FUNC_IMPL static inline VEC_ALWAYS_INLINE - -/* --------------------------------------------------------------- */ -/* Get maximum value for type */ - -#define VEC_TYPE_SIGNED(t) (((t)(-1)) < ((t)0)) - -#define VEC_MAX_EX(t, TOPBIT) \ - (((0x1ULL << ((sizeof(t) * 8ULL) - 1ULL)) - 1ULL) | \ - ((TOPBIT) << ((sizeof(t) * 8ULL) - 4ULL))) - -#define VEC_MAX_OF_UNSIGNED(t) VEC_MAX_EX(t, 0xFULL) -#define VEC_MAX_OF_SIGNED(t) VEC_MAX_EX(t, 0x7ULL) - -#define VEC_MAX_OF_TYPE(t) \ - ((unsigned long long)(VEC_TYPE_SIGNED(t) \ - ? 
VEC_MAX_OF_SIGNED(t) \ - : VEC_MAX_OF_UNSIGNED(t))) +#include "defs.h" /* --------------------------------------------------------------- */ /* Detect compiler SIMD support */ @@ -386,6 +277,9 @@ #ifdef __AVX512F__ # include <immintrin.h> +# ifdef __AVX512BW__ +# define VEC_COMPILER_HAS_AVX512BW +# endif # define VEC_COMPILER_HAS_AVX512F # if VINT8x64_ALIGNMENT < VEC_AVX512F_ALIGNMENT # undef VINT8x64_ALIGNMENT @@ -423,6 +317,10 @@ #endif +#ifdef __cplusplus +extern "C" { +#endif + /* --------------------------------------------------------------- */ /* bit shift */ @@ -497,7 +395,6 @@ }; \ unsigned char vec_unaligned_##var##_[((length) * sizeof(type)) + (align) - 1]; \ type *var = ((union vec_aligned_union_##var##_ *)(((vec_uintptr)vec_unaligned_##var##_ + (align - 1)) & ~(align - 1)))->arr; \ - VEC_ASSERT(((vec_uintptr)var) % align == 0, "vec: VEC_ALIGNED_ARRAY result is actually not aligned") # define VEC_ALIGNED_ARRAY_SIZEOF(var, align) \ (sizeof(vec_unaligned_##var##_) - (align - 1)) #endif @@ -1024,6 +921,10 @@ /* ------------------------------------------------------------------------ */ /* finally; we can import the real implementations */ +#ifdef VEC_COMPILER_HAS_AVX512BW +# include "impl/x86/avx512bw.h" +#endif + #ifdef VEC_COMPILER_HAS_AVX512F # include "impl/x86/avx512f.h" #endif @@ -1051,97 +952,6 @@ #include "impl/generic.h" /* ------------------------------------------------------------------------ */ -/* very minimal aligned malloc */ - -#define VEC_MALLOC_ALIGNMENT (64) - -VEC_STATIC_ASSERT(!(VEC_MALLOC_ALIGNMENT & (VEC_MALLOC_ALIGNMENT - 1)) - && (VEC_MALLOC_ALIGNMENT > 0), - "VEC_MALLOC_ALIGNMENT must be a power of two"); - -typedef unsigned char vec_alignment_type; - -#define VEC_MALLOC_ADDITIONAL_SIZE (sizeof(vec_alignment_type) + (VEC_MALLOC_ALIGNMENT - 1)) -#define VEC_MALLOC_MAX_SIZE (SIZE_MAX - VEC_MALLOC_ADDITIONAL_SIZE) - -VEC_FUNC_IMPL void *vec_internal_align_ptr_(void *q) -{ - vec_alignment_type diff; - - diff = (((uintptr_t)q + (VEC_MALLOC_ALIGNMENT - 1)) & ~(VEC_MALLOC_ALIGNMENT - 1)) - (uintptr_t)q; - q = (char *)q + diff; - - memcpy((char *)q - sizeof(diff), &diff, sizeof(diff)); - - return q; -} - -/* reverses vec_align_ptr */ -VEC_FUNC_IMPL void *vec_internal_unalign_ptr_(void *q) -{ - vec_alignment_type diff; - - memcpy(&diff, (char *)q - sizeof(diff), sizeof(diff)); - q = (char *)q - diff; - - return q; -} - -VEC_FUNC_IMPL void *vec_malloc(size_t size) -{ - void *q; - - if (size > VEC_MALLOC_MAX_SIZE) - return NULL; - - /* allocate space for the diff (we have to do this, - * for realloc has no way of knowing the original ptr) */ - q = malloc(size + VEC_MALLOC_ADDITIONAL_SIZE); - if (!q) - return NULL; - - return vec_internal_align_ptr_(q); -} - -VEC_FUNC_IMPL void *vec_calloc(size_t count, size_t nmemb) -{ - size_t size; - void *q; - - size = count * nmemb; - if ((size && size / count != nmemb) - || size > VEC_MALLOC_MAX_SIZE) - return NULL; /* nope */ - - q = calloc(size + VEC_MALLOC_ADDITIONAL_SIZE, 1); - if (!q) - return NULL; - - return vec_internal_align_ptr_(q); -} - -VEC_FUNC_IMPL void *vec_realloc(void *ptr, size_t newsize) -{ - void *q; - - if (!ptr) - return vec_malloc(newsize); - - if (newsize > VEC_MALLOC_MAX_SIZE) - return NULL; - - q = realloc(vec_internal_unalign_ptr_(ptr), VEC_MALLOC_ADDITIONAL_SIZE); - if (!q) - return NULL; - - return vec_internal_align_ptr_(q); -} - -VEC_FUNC_IMPL void vec_free(void *ptr) -{ - if (ptr) - free(vec_internal_unalign_ptr_(ptr)); -} #ifdef __cplusplus }
--- a/test/Makefile.template Sat Apr 26 02:54:44 2025 -0400 +++ b/test/Makefile.template Sat Apr 26 15:31:39 2025 -0400 @@ -3,19 +3,26 @@ CXXFLAGS += $(CPPFLAGS) -std=c++11 HEADERS = ../include/vec/vec.h \ + ../include/vec/cpu.h \ + ../include/vec/mem.h \ + ../include/vec/defs.h \ ../include/vec/impl/ppc/altivec.h \ ../include/vec/impl/x86/avx2.h \ ../include/vec/impl/x86/avx512f.h \ + ../include/vec/impl/x86/avx512bw.h \ ../include/vec/impl/x86/mmx.h \ ../include/vec/impl/x86/sse2.h \ + ../include/vec/impl/x86/sse3.h \ ../include/vec/impl/x86/sse41.h \ - ../include/vec/impl/cpu.h \ + ../include/vec/impl/x86/sse42.h \ ../include/vec/impl/generic.h \ test_align.h \ test_arith.h \ test_compare.h \ test_shift.h \ - test_benchmark.h + test_benchmark.h \ + test_benchmark_vec.c \ + test_benchmark_simple.c BINS = test-generic test-host test-cxx OBJS = test.o test-cxx.o test_benchmark_simple.o test_benchmark_vec.o
--- a/test/Makefile.x86 Sat Apr 26 02:54:44 2025 -0400 +++ b/test/Makefile.x86 Sat Apr 26 15:31:39 2025 -0400 @@ -1,3 +1,3 @@ -CPPFLAGS += -mmmx -msse2 -mavx512f +CPPFLAGS += -mmmx -msse2 -msse3 -msse4.1 -msse4.2 -mavx512f -mavx512bw include Makefile.template \ No newline at end of file
--- a/test/test.c Sat Apr 26 02:54:44 2025 -0400 +++ b/test/test.c Sat Apr 26 15:31:39 2025 -0400 @@ -1,4 +1,5 @@ #include "vec/vec.h" +#include "vec/mem.h" #include <stdio.h> #include <string.h>
--- a/test/test_benchmark.h Sat Apr 26 02:54:44 2025 -0400 +++ b/test/test_benchmark.h Sat Apr 26 15:31:39 2025 -0400 @@ -2,10 +2,6 @@ /* ------------------------------------------------------------------------ */ /* simple benchmark for getting the min/max range of an audio sample. */ -/* prevent GCC from optimizing these function calls away - i think there's - * probably a better way to do this, but I haven't found it yet :) */ - - extern void test_benchmark_sample_minmax_simple_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax); extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax); @@ -14,19 +10,19 @@ int32_t min, max; clock_t start, end; int i; - int16_t *q = vec_malloc(16000000u * 2u); + int16_t *q = vec_malloc(16000001u * 2u); - printf("\nsigned 16-bit audio sample min/max - 1 thousand passes - 16000000 samples\n\n"); + printf("\nsigned 16-bit audio sample min/max - 1 thousand passes - 16000001 samples\n\n"); /* generate random sample values */ - for (i = 0; i < 16000000; i++) + for (i = 0; i < 16000001; i++) q[i] = rand(); start = clock(); for (i = 0; i < 1000; i++) { min = INT32_MAX; max = INT32_MIN; - test_benchmark_sample_minmax_vec_impl(q, 16000000u, &min, &max); + test_benchmark_sample_minmax_vec_impl(q, 16000001u, &min, &max); } end = clock(); @@ -36,7 +32,7 @@ for (i = 0; i < 1000; i++) { min = INT32_MAX; max = INT32_MIN; - test_benchmark_sample_minmax_simple_impl(q, 16000000u, &min, &max); + test_benchmark_sample_minmax_simple_impl(q, 16000001u, &min, &max); } end = clock();
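The sample count changes from 16000000 to 16000001, presumably so that the
buffer length is no longer a multiple of any vector width and the scalar tail
path of the vec implementation gets exercised as well. The two *_impl
functions are kept in separate translation units (test_benchmark_simple.c and
test_benchmark_vec.c, now listed in the Makefile) so the compiler cannot fold
the benchmark calls away. The scalar baseline itself is not part of this
hunk; a hypothetical sketch matching the declared signature might look like:

    #include <stdint.h>

    /* illustrative scalar baseline; the real implementation lives in
     * test_benchmark_simple.c, which is not shown in this changeset */
    void test_benchmark_sample_minmax_simple_impl(int16_t *smpl,
        uint32_t length, int32_t *pmin, int32_t *pmax)
    {
        int32_t min = *pmin, max = *pmax;
        uint32_t i;

        for (i = 0; i < length; i++) {
            if (smpl[i] < min) min = smpl[i];
            if (smpl[i] > max) max = smpl[i];
        }

        *pmin = min;
        *pmax = max;
    }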
--- a/utils/gengeneric.c Sat Apr 26 02:54:44 2025 -0400 +++ b/utils/gengeneric.c Sat Apr 26 15:31:39 2025 -0400 @@ -66,10 +66,6 @@ "#ifndef VEC_IMPL_GENERIC_H_\n" "#define VEC_IMPL_GENERIC_H_\n" "\n" - "#include <string.h>\n" - "\n" - "// -----------------------------------------------------------------\n" - "\n" "#define VEC_GENERIC_OPERATION(op, sign, bits, size) \\\n" " do { \\\n" " int i; \\\n"