Mercurial > vec
view src/cpu.c @ 26:6c91cd9a2f2d
include/vec/vec: fix vec_avg implementation
now it's exactly the same as AltiVec's
author | Paper <paper@tflc.us> |
---|---|
date | Mon, 25 Nov 2024 04:43:22 +0000 |
parents | 92156fe32755 |
children |
line wrap: on
line source
/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

/* Detect CPU SIMD support. Much of this code was stolen from SDL.
 *
 * Simple DirectMedia Layer
 * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
*/

#include "vec/vec.h"
#include "vec/cpu.h"

#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
# include <sys/sysctl.h> // For AltiVec check
#elif defined(__OpenBSD__) && defined(__powerpc__)
# include <sys/types.h>
# include <sys/sysctl.h> // For AltiVec check
# include <machine/cpu.h>
#elif defined(__FreeBSD__) && defined(__powerpc__)
# include <machine/cpu.h>
# include <sys/auxv.h>
#elif defined(VEC_COMPILER_HAS_ALTIVEC)
# include <signal.h>
# include <setjmp.h>
# if defined(__GNUC__)
// <signal.h> and <setjmp.h> are only included on this branch, so every piece
// of code that relies on the SIGILL/longjmp probe is keyed off this macro.
#  define VEC_CPU_USE_SIGILL_PROBE 1
# endif
#endif

#ifdef __FreeBSD__
# include <sys/param.h>
#endif

#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
# include <unistd.h>
# include <sys/types.h>
# include <sys/stat.h>
# include <fcntl.h>
# include <elf.h>

/*#include <asm/hwcap.h>*/
# ifndef AT_HWCAP
#  define AT_HWCAP 16
# endif
# ifndef AT_PLATFORM
#  define AT_PLATFORM 15
# endif
# ifndef HWCAP_NEON
#  define HWCAP_NEON (1 << 12)
# endif
#endif

/* NOTE(review): several platform branches below call functions whose headers
 * are not included by this file (e.g. IsProcessorFeaturePresent, getauxval,
 * __cpuidex, _xgetbv, android_getCpuFamily, _kernel_swi). Presumably
 * "vec/vec.h" or the build system provides them -- confirm. */

/**
 * Checks whether the CPUID instruction is available.
 *
 * On 32/64-bit x86 this tries to toggle the ID bit (0x200000) in EFLAGS;
 * if the bit can be flipped, CPUID is supported. On MSVC x64 CPUID is
 * assumed to exist. On every other target this returns 0.
 *
 * @return nonzero if CPUID can be used, 0 otherwise.
 */
static inline int vec_CPU_have_CPUID(void)
{
	int has_CPUID = 0;

#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
	__asm__ (
"        pushfl                      # Get original EFLAGS             \n"
"        popl    %%eax                                                 \n"
"        movl    %%eax,%%ecx                                           \n"
"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
"        popfl                       # Replace current EFLAGS value    \n"
"        pushfl                      # Get new EFLAGS                  \n"
"        popl    %%eax               # Store new EFLAGS in EAX         \n"
"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
"        jz      1f                  # Processor=80486                 \n"
"        movl    $1,%0               # We have CPUID support           \n"
"1:                                                                    \n"
	: "=m" (has_CPUID)
	:
	: "%eax", "%ecx"
	);
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
	/* Technically, if this is being compiled under __x86_64__ then it has
	   CPUid by definition. But it's nice to be able to prove it. :) */
	__asm__ (
"        pushfq                      # Get original EFLAGS             \n"
"        popq    %%rax                                                 \n"
"        movq    %%rax,%%rcx                                           \n"
"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
"        popfq                       # Replace current EFLAGS value    \n"
"        pushfq                      # Get new EFLAGS                  \n"
"        popq    %%rax               # Store new EFLAGS in EAX         \n"
"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
"        jz      1f                  # Processor=80486                 \n"
"        movl    $1,%0               # We have CPUID support           \n"
"1:                                                                    \n"
	: "=m" (has_CPUID)
	:
	: "%rax", "%rcx"
	);
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
	__asm {
		pushfd                      ; Get original EFLAGS
		pop     eax
		mov     ecx, eax
		xor     eax, 200000h        ; Flip ID bit in EFLAGS
		push    eax                 ; Save new EFLAGS value on stack
		popfd                       ; Replace current EFLAGS value
		pushfd                      ; Get new EFLAGS
		pop     eax                 ; Store new EFLAGS in EAX
		xor     eax, ecx            ; Can not toggle ID bit,
		jz      done                ; Processor=80486
		mov     has_CPUID,1         ; We have CPUID support
done:
	}
#elif defined(_MSC_VER) && defined(_M_X64)
	has_CPUID = 1;
#elif defined(__sun) && defined(__i386)
	/* NOTE(review): the result is stored to a hard-coded frame offset
	 * (-8(%ebp)), which presumably is where has_CPUID lives -- confirm. */
	__asm (
"        pushfl                 \n"
"        popl    %eax           \n"
"        movl    %eax,%ecx      \n"
"        xorl    $0x200000,%eax \n"
"        pushl   %eax           \n"
"        popfl                  \n"
"        pushfl                 \n"
"        popl    %eax           \n"
"        xorl    %ecx,%eax      \n"
"        jz      1f             \n"
"        movl    $1,-8(%ebp)    \n"
"1:                             \n"
	);
#elif defined(__sun) && defined(__amd64)
	/* NOTE(review): same hard-coded frame offset as the i386 variant. */
	__asm (
"        pushfq                 \n"
"        popq    %rax           \n"
"        movq    %rax,%rcx      \n"
"        xorl    $0x200000,%eax \n"
"        pushq   %rax           \n"
"        popfq                  \n"
"        pushfq                 \n"
"        popq    %rax           \n"
"        xorl    %ecx,%eax      \n"
"        jz      1f             \n"
"        movl    $1,-8(%rbp)    \n"
"1:                             \n"
	);
#endif

	return has_CPUID;
}

/*
 * VEC_CPU_CPUID(func, a, b, c, d)
 *
 * Executes CPUID with EAX = func and ECX = 0, then stores EAX/EBX/ECX/EDX
 * into the lvalues a/b/c/d. The GNU variants save and restore EBX/RBX
 * around the instruction and hand EBX back through ESI/RSI. On targets
 * with no implementation all four outputs are set to 0.
 */
#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        pushl %%ebx        \n" \
		"        xorl %%ecx,%%ecx   \n" \
		"        cpuid              \n" \
		"        movl %%ebx, %%esi  \n" \
		"        popl %%ebx         \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        pushq %%rbx        \n" \
		"        xorq %%rcx,%%rcx   \n" \
		"        cpuid              \n" \
		"        movq %%rbx, %%rsi  \n" \
		"        popq %%rbx         \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm { \
		__asm mov eax, func \
		__asm xor ecx, ecx \
		__asm cpuid \
		__asm mov a, eax \
		__asm mov b, ebx \
		__asm mov c, ecx \
		__asm mov d, edx \
	}
#elif (defined(_MSC_VER) && defined(_M_X64))
// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		int CPUInfo[4]; \
		__cpuidex(CPUInfo, func, 0); \
		a = CPUInfo[0]; \
		b = CPUInfo[1]; \
		c = CPUInfo[2]; \
		d = CPUInfo[3]; \
	} while (0)
#else
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		a = b = c = d = 0; \
		(void)a; \
		(void)b; \
		(void)c; \
		(void)d; \
	} while (0)
#endif

// ---------------------------------------------------------------

// EBX feature bits reported by CPUID leaf 7 (sub-leaf 0).
#define VEC_CPU_CPUID7_EBX_AVX2    0x00000020 /* bit 5 */
#define VEC_CPU_CPUID7_EBX_AVX512F 0x00010000 /* bit 16 */

// EAX, EBX, ECX, EDX as returned by CPUID leaf 1 (all zero until probed).
static int vec_CPU_CPUIDFeatures[4];
// Highest standard CPUID leaf, i.e. EAX returned by CPUID leaf 0.
static int vec_CPU_CPUIDMaxFunction = 0;
// Nonzero when XGETBV(0) reports bits 1 and 2 set (mask 0x6).
static int vec_CPU_OSSavesYMM = 0;
// Nonzero when YMM state is saved and XGETBV(0) reports bits 5-7 set (mask 0xe0).
static int vec_CPU_OSSavesZMM = 0;

/**
 * Fills in the file-level CPUID caches (vec_CPU_CPUIDFeatures,
 * vec_CPU_CPUIDMaxFunction, vec_CPU_OSSavesYMM, vec_CPU_OSSavesZMM).
 *
 * The probe runs only on the first call; later calls return immediately.
 * When CPUID is unavailable the caches keep their zero defaults.
 */
static inline void vec_CPU_get_CPUID_features(void)
{
	static int checked = 0;
	int a, b, c, d;

	if (checked)
		return;
	checked = 1;

	if (!vec_CPU_have_CPUID())
		return;

	VEC_CPU_CPUID(0, a, b, c, d);
	vec_CPU_CPUIDMaxFunction = a;
	if (vec_CPU_CPUIDMaxFunction < 1)
		return;

	VEC_CPU_CPUID(1, a, b, c, d);
	vec_CPU_CPUIDFeatures[0] = a;
	vec_CPU_CPUIDFeatures[1] = b;
	vec_CPU_CPUIDFeatures[2] = c;
	vec_CPU_CPUIDFeatures[3] = d;

	// Check to make sure we can call xgetbv (ECX bit 27)
	if (c & 0x08000000) {
		// `a` still holds EAX from CPUID leaf 1 here. Clear it so that a
		// build which matches none of the xgetbv variants below reports
		// "state not saved" instead of interpreting that stale value.
		a = 0;

		// Call xgetbv to see if YMM (etc) register state is saved
#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
		__asm__(".byte 0x0f, 0x01, 0xd0"
				: "=a"(a)
				: "c"(0)
				: "%edx");
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
		a = (int)_xgetbv(0);
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
		__asm {
			xor ecx, ecx
			_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
			mov a, eax
		}
#endif
		vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
		vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
	}
}

#if defined(VEC_CPU_USE_SIGILL_PROBE)
// Jump target used to recover from a SIGILL raised by a probe instruction.
static jmp_buf vec_jmpbuf;

// SIGILL handler for the probes: unwinds back to the setjmp() in the prober.
static void vec_CPU_illegal_instruction(int sig)
{
	(void)sig;
	longjmp(vec_jmpbuf, 1);
}
#endif

/**
 * Detects AltiVec support at run time.
 *
 * Mac OS X and OpenBSD on PowerPC ask sysctl; FreeBSD on PowerPC reads
 * AT_HWCAP through elf_aux_info(); other GNU-compatible builds with
 * VEC_COMPILER_HAS_ALTIVEC execute an AltiVec instruction and catch SIGILL.
 *
 * @return nonzero if AltiVec is usable, 0 otherwise (including on targets
 *         with no detection method).
 */
static int vec_CPU_have_ALTIVEC(void)
{
	// volatile: the value must survive the longjmp() out of the signal handler
	volatile int altivec = 0;
#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
	int selectors[2] = {
# ifdef __OpenBSD__
		CTL_MACHDEP, CPU_ALTIVEC
# else
		CTL_HW, HW_VECTORUNIT
# endif
	};
	int hasVectorUnit = 0;
	vec_uintsize length = sizeof(hasVectorUnit);
	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
	if (!error)
		altivec = (hasVectorUnit != 0);
#elif defined(__FreeBSD__) && defined(__powerpc__)
	unsigned long cpufeatures = 0;
	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
	altivec = (cpufeatures & PPC_FEATURE_HAS_ALTIVEC) != 0;
#elif defined(VEC_CPU_USE_SIGILL_PROBE)
	void (*handler)(int sig);
	handler = signal(SIGILL, vec_CPU_illegal_instruction);
	if (!setjmp(vec_jmpbuf)) {
		// Reached only if the instructions below do not raise SIGILL.
		__asm__ __volatile__("mtspr 256, %0\n\t"
							 "vand %%v0, %%v0, %%v0" ::"r"(-1));
		altivec = 1;
	}
	// Put back whatever SIGILL handler was installed before the probe.
	signal(SIGILL, handler);
#endif
	return altivec;
}

/**
 * Detects VSX support at run time.
 *
 * The actual probe instruction is still commented out (untested), so this
 * currently always returns 0. The scaffolding is only compiled where the
 * SIGILL probe helpers exist.
 *
 * @return nonzero if VSX is usable, 0 otherwise.
 */
static int vec_CPU_have_ALTIVEC_VSX(void)
{
	volatile int vsx = 0;
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(VEC_CPU_USE_SIGILL_PROBE)
# warning Compiling UNTESTED code for VSX.
	void (*handler)(int sig);
	handler = signal(SIGILL, vec_CPU_illegal_instruction);
	if (!setjmp(vec_jmpbuf)) {
		// this is completely untested
		//__asm__ __volatile__("mtspr 256, %0\n\t"
		//					 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
		//vsx = 1;
	}
	signal(SIGILL, handler);
#endif
	return vsx;
}

// Feature tests on the cached CPUID leaf 1 registers ([2] = ECX, [3] = EDX).
// They are only meaningful after vec_CPU_get_CPUID_features() has run.
#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000)
#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000)
#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000)
#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001)
#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000)
#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000)
#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))

/**
 * Reports AVX2 support: requires that YMM state is saved (per the cached
 * XGETBV result) and that CPUID leaf 7 sets EBX bit 5.
 *
 * @return nonzero if AVX2 is usable, 0 otherwise.
 */
static inline int vec_CPU_have_AVX2(void)
{
	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		(void)a, (void)c, (void)d;
		return b & VEC_CPU_CPUID7_EBX_AVX2;
	}
	return 0;
}

/**
 * Reports AVX-512 Foundation support: requires that ZMM state is saved
 * (per the cached XGETBV result) and that CPUID leaf 7 sets EBX bit 16.
 *
 * @return nonzero if AVX-512F is usable, 0 otherwise.
 */
static inline int vec_CPU_have_AVX512F(void)
{
	if (vec_CPU_OSSavesZMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		(void)a, (void)c, (void)d;
		return b & VEC_CPU_CPUID7_EBX_AVX512F;
	}
	return 0;
}

#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
/**
 * Scans /proc/self/auxv for the AT_HWCAP entry and tests its HWCAP_NEON bit.
 *
 * @return 1 if the NEON bit is set, 0 if it is clear, the entry is missing,
 *         or the file cannot be opened.
 */
static int readProcAuxvForNeon(void)
{
	Elf32_auxv_t entry;
	int result = 0;
	const int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);

	if (fd < 0)
		return 0;

	// Stop at the first short read (end of vector) or at AT_HWCAP.
	while (read(fd, &entry, sizeof(entry)) == sizeof(entry)) {
		if (entry.a_type == AT_HWCAP) {
			result = (entry.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
			break;
		}
	}

	close(fd);
	return result;
}
#endif

/**
 * Detects ARM NEON support using whichever mechanism the target platform
 * offers (compile-time knowledge, OS queries, or the ELF auxiliary vector).
 *
 * @return nonzero if NEON is available, 0 otherwise.
 */
static int vec_CPU_have_NEON(void)
{
/* The way you detect NEON is a privileged instruction on ARM, so you have to
   query the OS kernel in a platform-specific way. :/ */
#if defined(SDL_CPUINFO_DISABLED)
	return 0; /* disabled */
#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
/* Seems to have been removed */
#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
#endif
	/* All WinRT ARM devices are required to support NEON, but just in case. */
	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
	return 1; /* ARMv8 always has non-optional NEON support. */
#elif defined(__VITA__)
	return 1;
#elif defined(__3DS__)
	return 0;
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
	return 1; /* all Apple ARMv7 chips and later have NEON. */
#elif defined(__APPLE__)
	return 0; /* assume anything else from Apple doesn't have NEON. */
#elif !defined(__arm__)
	return 0; /* not an ARM CPU at all. */
#elif defined(__OpenBSD__)
	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
#elif defined(HAVE_ELF_AUX_INFO)
	unsigned long hasneon = 0;
	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
		return 0;
	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
#elif defined(__QNXNTO__)
	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
#elif defined(__linux__)
	return readProcAuxvForNeon();
#elif defined(__ANDROID__)
	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
	{
		AndroidCpuFamily cpu_family = android_getCpuFamily();
		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
			uint64_t cpu_features = android_getCpuFeatures();
			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
				return 1;
			}
		}
		return 0;
	}
#elif defined(__RISCOS__)
	/* Use the VFPSupport_Features SWI to access the MVFR registers */
	{
		_kernel_swi_regs regs;
		regs.r[0] = 0;
		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
			if ((regs.r[2] & 0xFFF000) == 0x111000) {
				return 1;
			}
		}
		return 0;
	}
#else
#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
	return 0;
#endif
}

// Sentinel stored in vec_CPU_features until the first detection pass has run.
#define VEC_CPU_FEATURES_RESET VEC_UINT32_C(0xFFFFFFFF)

// Cached result of vec_get_CPU_features().
static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;

/**
 * Returns the set of SIMD features supported by the running CPU as a
 * bitwise OR of VEC_CPU_HAS_* flags.
 *
 * Detection is performed on the first call and the result is cached in
 * vec_CPU_features; subsequent calls return the cached mask. The cache is a
 * plain static with no synchronisation.
 *
 * @return bitmask of VEC_CPU_HAS_* flags.
 */
vec_uint32 vec_get_CPU_features(void)
{
	if (vec_CPU_features == VEC_CPU_FEATURES_RESET) {
		// The x86 checks below read the caches this call fills in.
		vec_CPU_get_CPUID_features();
		vec_CPU_features = 0;
		if (vec_CPU_have_ALTIVEC())
			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
		if (vec_CPU_have_ALTIVEC_VSX())
			vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
		if (vec_CPU_have_MMX())
			vec_CPU_features |= VEC_CPU_HAS_MMX;
		if (vec_CPU_have_SSE())
			vec_CPU_features |= VEC_CPU_HAS_SSE;
		if (vec_CPU_have_SSE2())
			vec_CPU_features |= VEC_CPU_HAS_SSE2;
		if (vec_CPU_have_SSE3())
			vec_CPU_features |= VEC_CPU_HAS_SSE3;
		if (vec_CPU_have_SSE41())
			vec_CPU_features |= VEC_CPU_HAS_SSE41;
		if (vec_CPU_have_SSE42())
			vec_CPU_features |= VEC_CPU_HAS_SSE42;
		if (vec_CPU_have_AVX())
			vec_CPU_features |= VEC_CPU_HAS_AVX;
		if (vec_CPU_have_AVX2())
			vec_CPU_features |= VEC_CPU_HAS_AVX2;
		if (vec_CPU_have_AVX512F())
			vec_CPU_features |= VEC_CPU_HAS_AVX512F;
		if (vec_CPU_have_NEON())
			vec_CPU_features |= VEC_CPU_HAS_NEON;
	}
	return vec_CPU_features;
}