Mercurial > vec
view src/cpu.c @ 23:e26874655738
*: huge refactor, new major release (hahaha)
I keep finding things that are broken...
The problem NOW was that vec would unintentionally build some
functions with extended instruction sets, which is Bad and would
mean that for all intents and purposes the CPU detection was
completely broken.
Now vec is no longer header only either. Boohoo. However this gives
a lot more flexibility to vec since we no longer want or need to
care about C++ crap.
The NEON and Altivec implementations have not been updated which
means they won't compile hence why they're commented out in the
cmake build file.
author | Paper <paper@tflc.us> |
---|---|
date | Sun, 24 Nov 2024 02:52:40 -0500 (8 weeks ago) |
parents | |
children | 92156fe32755 |
line wrap: on
line source
/**
 * vec - a tiny SIMD vector library in C99
 *
 * Copyright (c) 2024 Paper
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

/* Detect CPU SIMD support. Much of this code was stolen from SDL.
 *
 * Simple DirectMedia Layer
 * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
*/

#include "vec/cpu.h"

#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
# include <sys/sysctl.h> // For AltiVec check
#elif defined(__OpenBSD__) && defined(__powerpc__)
# include <sys/types.h>
# include <sys/sysctl.h> // For AltiVec check
# include <machine/cpu.h>
#elif defined(__FreeBSD__) && defined(__powerpc__)
# include <machine/cpu.h>
# include <sys/auxv.h>
#elif defined(__ALTIVEC__)
# include <signal.h>
# include <setjmp.h>
#endif

#ifdef __FreeBSD__
# include <sys/param.h>
#endif

#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
# include <unistd.h>
# include <sys/types.h>
# include <sys/stat.h>
# include <fcntl.h>
# include <elf.h>

/*#include <asm/hwcap.h>*/
# ifndef AT_HWCAP
#  define AT_HWCAP 16
# endif
# ifndef AT_PLATFORM
#  define AT_PLATFORM 15
# endif
# ifndef HWCAP_NEON
#  define HWCAP_NEON (1 << 12)
# endif
#endif

/* Pick exactly one AltiVec detection strategy per target, so that the
 * function bodies below and the helpers they rely on are always guarded
 * by the same condition. */
#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
# define VEC_CPU_ALTIVEC_USE_SYSCTL 1
#elif defined(__FreeBSD__) && defined(__powerpc__)
# define VEC_CPU_ALTIVEC_USE_AUXV 1
#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
# define VEC_CPU_ALTIVEC_USE_SIGILL 1
#endif

/* The SIGILL handler (and the headers it needs) must exist whenever *any*
 * probe that installs it is compiled in: the AltiVec SIGILL probe, or the
 * VSX probe, which has no platform-specific alternative. */
#if defined(VEC_CPU_ALTIVEC_USE_SIGILL) || (defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__))
# define VEC_CPU_NEED_SIGILL_HANDLER 1
# include <signal.h>
# include <setjmp.h>
#endif

/* Returns nonzero if the CPUID instruction is usable.
 *
 * On 32-bit x86 this is tested by trying to toggle the ID bit (0x200000)
 * in EFLAGS. On targets with no matching branch below, 0 is returned. */
static inline int vec_CPU_have_CPUID(void)
{
	/* NOTE: the Sun Studio branches address this variable as -8(%ebp) /
	 * -8(%rbp), so it has to stay the only local in this function. */
	int has_CPUID = 0;

#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
	__asm__ (
		" pushfl # Get original EFLAGS \n"
		" popl %%eax \n"
		" movl %%eax,%%ecx \n"
		" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n"
		" pushl %%eax # Save new EFLAGS value on stack \n"
		" popfl # Replace current EFLAGS value \n"
		" pushfl # Get new EFLAGS \n"
		" popl %%eax # Store new EFLAGS in EAX \n"
		" xorl %%ecx,%%eax # Can not toggle ID bit, \n"
		" jz 1f # Processor=80486 \n"
		" movl $1,%0 # We have CPUID support \n"
		"1: \n"
		: "=m" (has_CPUID)
		:
		: "%eax", "%ecx"
	);
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
	/* Technically, if this is being compiled under __x86_64__ then it has
	   CPUid by definition. But it's nice to be able to prove it. :) */
	__asm__ (
		" pushfq # Get original EFLAGS \n"
		" popq %%rax \n"
		" movq %%rax,%%rcx \n"
		" xorl $0x200000,%%eax # Flip ID bit in EFLAGS \n"
		" pushq %%rax # Save new EFLAGS value on stack \n"
		" popfq # Replace current EFLAGS value \n"
		" pushfq # Get new EFLAGS \n"
		" popq %%rax # Store new EFLAGS in EAX \n"
		" xorl %%ecx,%%eax # Can not toggle ID bit, \n"
		" jz 1f # Processor=80486 \n"
		" movl $1,%0 # We have CPUID support \n"
		"1: \n"
		: "=m" (has_CPUID)
		:
		: "%rax", "%rcx"
	);
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
	__asm {
		pushfd                      ; Get original EFLAGS
		pop     eax
		mov     ecx, eax
		xor     eax, 200000h        ; Flip ID bit in EFLAGS
		push    eax                 ; Save new EFLAGS value on stack
		popfd                       ; Replace current EFLAGS value
		pushfd                      ; Get new EFLAGS
		pop     eax                 ; Store new EFLAGS in EAX
		xor     eax, ecx            ; Can not toggle ID bit,
		jz      done                ; Processor=80486
		mov     has_CPUID,1         ; We have CPUID support
done:
	}
#elif defined(_MSC_VER) && defined(_M_X64)
	has_CPUID = 1;
#elif defined(__sun) && defined(__i386)
	__asm (
		" pushfl \n"
		" popl %eax \n"
		" movl %eax,%ecx \n"
		" xorl $0x200000,%eax \n"
		" pushl %eax \n"
		" popfl \n"
		" pushfl \n"
		" popl %eax \n"
		" xorl %ecx,%eax \n"
		" jz 1f \n"
		" movl $1,-8(%ebp) \n"
		"1: \n"
	);
#elif defined(__sun) && defined(__amd64)
	__asm (
		" pushfq \n"
		" popq %rax \n"
		" movq %rax,%rcx \n"
		" xorl $0x200000,%eax \n"
		" pushq %rax \n"
		" popfq \n"
		" pushfq \n"
		" popq %rax \n"
		" xorl %ecx,%eax \n"
		" jz 1f \n"
		" movl $1,-8(%rbp) \n"
		"1: \n"
	);
#endif

	return has_CPUID;
}

/* VEC_CPU_CPUID(func, a, b, c, d)
 *
 * Executes CPUID leaf `func` with ECX cleared (sub-leaf 0) and stores
 * EAX/EBX/ECX/EDX into the lvalues a/b/c/d. On targets without a matching
 * implementation all four outputs are simply set to 0. */
#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		" pushl %%ebx \n" \
		" xorl %%ecx,%%ecx \n" \
		" cpuid \n" \
		" movl %%ebx, %%esi \n" \
		" popl %%ebx \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		" pushq %%rbx \n" \
		" xorq %%rcx,%%rcx \n" \
		" cpuid \n" \
		" movq %%rbx, %%rsi \n" \
		" popq %%rbx \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm { \
		__asm mov eax, func \
		__asm xor ecx, ecx \
		__asm cpuid \
		__asm mov a, eax \
		__asm mov b, ebx \
		__asm mov c, ecx \
		__asm mov d, edx \
	}
#elif (defined(_MSC_VER) && defined(_M_X64))
// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		int CPUInfo[4]; \
		__cpuidex(CPUInfo, func, 0); \
		a = CPUInfo[0]; \
		b = CPUInfo[1]; \
		c = CPUInfo[2]; \
		d = CPUInfo[3]; \
	} while (0)
#else
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		a = b = c = d = 0; \
		(void)a; \
		(void)b; \
		(void)c; \
		(void)d; \
	} while (0)
#endif

// ---------------------------------------------------------------

/* Cached results of CPUID leaf 1: [0]=EAX, [1]=EBX, [2]=ECX, [3]=EDX. */
static int vec_CPU_CPUIDFeatures[4];
/* EAX of CPUID leaf 0, i.e. the highest supported standard leaf. */
static int vec_CPU_CPUIDMaxFunction = 0;
/* Nonzero if XCR0 says the OS preserves YMM (resp. ZMM) register state. */
static int vec_CPU_OSSavesYMM = 0;
static int vec_CPU_OSSavesZMM = 0;

/* Fills the CPUID caches above. Only the first call does any work;
 * later calls return immediately. */
static inline void vec_CPU_get_CPUID_features(void)
{
	static int checked = 0;
	int a, b, c, d;

	if (checked)
		return;
	checked = 1;

	if (!vec_CPU_have_CPUID())
		return;

	VEC_CPU_CPUID(0, a, b, c, d);
	vec_CPU_CPUIDMaxFunction = a;
	if (vec_CPU_CPUIDMaxFunction < 1)
		return;

	VEC_CPU_CPUID(1, a, b, c, d);
	vec_CPU_CPUIDFeatures[0] = a;
	vec_CPU_CPUIDFeatures[1] = b;
	vec_CPU_CPUIDFeatures[2] = c;
	vec_CPU_CPUIDFeatures[3] = d;

	// Check to make sure we can call xgetbv
	if (!(c & 0x08000000))
		return;

	// Call xgetbv to see if YMM (etc) register state is saved
#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
	__asm__(".byte 0x0f, 0x01, 0xd0"
			: "=a"(a)
			: "c"(0)
			: "%edx");
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
	a = (int)_xgetbv(0);
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
	__asm {
		xor ecx, ecx
		_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
		mov a, eax
	}
#endif

	/* XCR0 bits 1-2 must both be set for YMM state; bits 5-7 must
	 * additionally all be set for ZMM state. */
	vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
	vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
}

#ifdef VEC_CPU_NEED_SIGILL_HANDLER
/* Jump target used to escape from the SIGILL handler during a probe. */
static jmp_buf vec_jmpbuf;

/* SIGILL handler: the probed instruction is not supported, so unwind
 * back to the setjmp() in the probing function. */
static void vec_CPU_illegal_instruction(int sig)
{
	(void)sig;
	longjmp(vec_jmpbuf, 1);
}
#endif

/* Returns nonzero if AltiVec is available at runtime.
 *
 * Depending on the target this asks sysctl (Mac OS X / OpenBSD), reads the
 * ELF auxiliary vector (FreeBSD), or executes an AltiVec operation with a
 * temporary SIGILL handler installed. Other targets always report 0. */
static int vec_CPU_have_ALTIVEC(void)
{
	/* volatile: the value must survive the longjmp() out of the handler */
	volatile int altivec = 0;
#if defined(VEC_CPU_ALTIVEC_USE_SYSCTL)
	int selectors[2] = {
# ifdef __OpenBSD__
		CTL_MACHDEP, CPU_ALTIVEC
# else
		CTL_HW, HW_VECTORUNIT
# endif
	};
	int hasVectorUnit = 0;
	vec_uintsize length = sizeof(hasVectorUnit);
	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
	if (!error)
		altivec = (hasVectorUnit != 0);
#elif defined(VEC_CPU_ALTIVEC_USE_AUXV)
	unsigned long cpufeatures = 0;
	/* on failure cpufeatures keeps its initial 0, i.e. "no AltiVec" */
	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
	/* normalise to 0/1 instead of narrowing the masked unsigned long */
	altivec = ((cpufeatures & PPC_FEATURE_HAS_ALTIVEC) != 0);
#elif defined(VEC_CPU_ALTIVEC_USE_SIGILL)
	void (*handler)(int sig);
	handler = signal(SIGILL, vec_CPU_illegal_instruction);
	if (!setjmp(vec_jmpbuf)) {
		vector unsigned char vec;
		vec_and(vec, vec);
		altivec = 1;
	}
	/* put back whatever handler was installed before the probe */
	signal(SIGILL, handler);
#endif
	return altivec;
}

/* Returns nonzero if VSX is available at runtime.
 *
 * The actual probe instruction is still commented out, so this currently
 * always returns 0. */
static int vec_CPU_have_ALTIVEC_VSX(void)
{
	volatile int vsx = 0;
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
# warning Compiling UNTESTED code for VSX.
	void (*handler)(int sig);
	handler = signal(SIGILL, vec_CPU_illegal_instruction);
	if (!setjmp(vec_jmpbuf)) {
		// this is completely untested
		//__asm__ __volatile__("mtspr 256, %0\n\t"
		//			"xxland %%v0, %%v0, %%v0" ::"r"(-1));
		//vsx = 1;
	}
	signal(SIGILL, handler);
#endif
	return vsx;
}

/* x86 feature tests against the cached CPUID leaf 1 registers
 * ([3] = EDX, [2] = ECX). Only meaningful after
 * vec_CPU_get_CPUID_features() has run. */
#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000)
#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000)
#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000)
#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001)
#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000)
#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000)
#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))

/* Returns nonzero if AVX2 is usable: the OS saves YMM state and
 * CPUID leaf 7 reports EBX bit 5. */
static inline int vec_CPU_have_AVX2(void)
{
	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		(void)a, (void)c, (void)d;
		return b & 0x00000020;
	}
	return 0;
}

/* Returns nonzero if AVX-512 Foundation is usable: the OS saves ZMM state
 * and CPUID leaf 7 reports EBX bit 16.
 *
 * This used to test the YMM flag and the AVX2 bit, which reported AVX-512F
 * on every AVX2-capable machine. */
static inline int vec_CPU_have_AVX512F(void)
{
	if (vec_CPU_OSSavesZMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		(void)a, (void)c, (void)d;
		return b & 0x00010000;
	}
	return 0;
}

#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
/* Scans /proc/self/auxv for the AT_HWCAP entry and returns 1 if its NEON
 * bit is set. Returns 0 if the file cannot be opened or no AT_HWCAP entry
 * is found. */
static int readProcAuxvForNeon(void)
{
	int neon = 0;
	int fd;

	fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
	if (fd >= 0) {
		Elf32_auxv_t aux;
		while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
			if (aux.a_type == AT_HWCAP) {
				neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
				break;
			}
		}
		close(fd);
	}
	return neon;
}
#endif

/* Returns nonzero if ARM NEON is available, using whichever compile-time
 * knowledge or OS query applies to the target. */
static int vec_CPU_have_NEON(void)
{
/* The way you detect NEON is a privileged instruction on ARM, so you have
   to query the OS kernel in a platform-specific way. :/ */
#if defined(SDL_CPUINFO_DISABLED)
	return 0; /* disabled */
#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
/* Seems to have been removed */
#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
#endif
	/* All WinRT ARM devices are required to support NEON, but just in case. */
	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
	return 1; /* ARMv8 always has non-optional NEON support. */
#elif defined(__VITA__)
	return 1;
#elif defined(__3DS__)
	return 0;
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
	return 1; /* all Apple ARMv7 chips and later have NEON. */
#elif defined(__APPLE__)
	return 0; /* assume anything else from Apple doesn't have NEON. */
#elif !defined(__arm__)
	return 0; /* not an ARM CPU at all. */
#elif defined(__OpenBSD__)
	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
#elif defined(HAVE_ELF_AUX_INFO)
	unsigned long hasneon = 0;
	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
		return 0;
	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
#elif defined(__QNXNTO__)
	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
#elif defined(__linux__)
	return readProcAuxvForNeon();
#elif defined(__ANDROID__)
	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
	{
		AndroidCpuFamily cpu_family = android_getCpuFamily();
		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
			uint64_t cpu_features = android_getCpuFeatures();
			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
				return 1;
			}
		}
		return 0;
	}
#elif defined(__RISCOS__)
	/* Use the VFPSupport_Features SWI to access the MVFR registers */
	{
		_kernel_swi_regs regs;
		regs.r[0] = 0;
		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
			if ((regs.r[2] & 0xFFF000) == 0x111000) {
				return 1;
			}
		}
		return 0;
	}
#else
#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
	return 0;
#endif
}

/* Sentinel meaning "feature mask has not been computed yet". */
#define VEC_CPU_FEATURES_RESET VEC_UINT32_C(0xFFFFFFFF)

static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;

/* Returns a bitmask of VEC_CPU_HAS_* flags describing the SIMD extensions
 * detected on the running CPU. Detection runs on the first call; the
 * result is cached and returned directly afterwards. */
vec_uint32 vec_get_CPU_features(void)
{
	if (vec_CPU_features == VEC_CPU_FEATURES_RESET) {
		/* Accumulate locally and store once, so the cache never holds
		 * a partially built mask. */
		vec_uint32 features = 0;

		vec_CPU_get_CPUID_features();

		if (vec_CPU_have_ALTIVEC())
			features |= VEC_CPU_HAS_ALTIVEC;
		if (vec_CPU_have_ALTIVEC_VSX())
			features |= VEC_CPU_HAS_ALTIVEC_VSX;
		if (vec_CPU_have_MMX())
			features |= VEC_CPU_HAS_MMX;
		if (vec_CPU_have_SSE())
			features |= VEC_CPU_HAS_SSE;
		if (vec_CPU_have_SSE2())
			features |= VEC_CPU_HAS_SSE2;
		if (vec_CPU_have_SSE3())
			features |= VEC_CPU_HAS_SSE3;
		if (vec_CPU_have_SSE41())
			features |= VEC_CPU_HAS_SSE41;
		if (vec_CPU_have_SSE42())
			features |= VEC_CPU_HAS_SSE42;
		if (vec_CPU_have_AVX())
			features |= VEC_CPU_HAS_AVX;
		if (vec_CPU_have_AVX2())
			features |= VEC_CPU_HAS_AVX2;
		if (vec_CPU_have_AVX512F())
			features |= VEC_CPU_HAS_AVX512F;
		if (vec_CPU_have_NEON())
			features |= VEC_CPU_HAS_NEON;

		vec_CPU_features = features;
	}
	return vec_CPU_features;
}