view src/cpu.c @ 25:92156fe32755

impl/ppc/altivec: update to new implementation the signed average function is wrong; it needs to round up the number when only one of them is odd, but that doesn't necessarily seem to be true because altivec is weird, and that's what we need to emulate the quirks for. ugh. also the altivec backend uses the generic functions instead of fallbacks because it does indeed use the exact same memory structure as the generic implementation...
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 11:15:59 +0000
parents e26874655738
children
line wrap: on
line source

/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

/* Detect CPU SIMD support. Much of this code was stolen from SDL.
 *
 * Simple DirectMedia Layer
 * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
*/

#include "vec/vec.h"
#include "vec/cpu.h"

#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
# include <sys/sysctl.h> // For AltiVec check
#elif defined(__OpenBSD__) && defined(__powerpc__)
# include <sys/types.h>
# include <sys/sysctl.h> // For AltiVec check
# include <machine/cpu.h>
#elif defined(__FreeBSD__) && defined(__powerpc__)
# include <machine/cpu.h>
# include <sys/auxv.h>
#elif defined(VEC_COMPILER_HAS_ALTIVEC)
# include <signal.h>
# include <setjmp.h>
#endif

#ifdef __FreeBSD__
# include <sys/param.h>
#endif

#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
# include <unistd.h>
# include <sys/types.h>
# include <sys/stat.h>
# include <fcntl.h>
# include <elf.h>

/*#include <asm/hwcap.h>*/
# ifndef AT_HWCAP
# define AT_HWCAP 16
# endif
# ifndef AT_PLATFORM
#  define AT_PLATFORM 15
# endif
# ifndef HWCAP_NEON
#  define HWCAP_NEON (1 << 12)
# endif
#endif

/**
 * Reports whether the CPUID instruction can be used.
 *
 * On x86 builds with a supported inline-assembly dialect, the code reads the
 * flags register, flips the ID bit (0x200000), writes the flags back, reads
 * them again and compares against the original value. If the bit could be
 * toggled, has_CPUID is set to 1. MSVC x64 builds set it to 1 unconditionally.
 * When no branch matches the compiler/architecture, nothing runs and the
 * function returns 0.
 *
 * NOTE(review): the Sun branches store through a hard-coded frame offset
 * (-8 from the frame pointer), which presumably is where has_CPUID lives;
 * confirm against the compiler's actual stack layout.
 *
 * @return non-zero if CPUID is available, 0 otherwise.
 */
static inline int vec_CPU_have_CPUID(void)
{
	int has_CPUID = 0;

#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
	__asm__ (
"        pushfl                      # Get original EFLAGS             \n"
"        popl    %%eax                                                 \n"
"        movl    %%eax,%%ecx                                           \n"
"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
"        popfl                       # Replace current EFLAGS value    \n"
"        pushfl                      # Get new EFLAGS                  \n"
"        popl    %%eax               # Store new EFLAGS in EAX         \n"
"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
"        jz      1f                  # Processor=80486                 \n"
"        movl    $1,%0               # We have CPUID support           \n"
"1:                                                                    \n"
	: "=m" (has_CPUID)
	:
	: "%eax", "%ecx"
	);
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
/* Technically, if this is being compiled under __x86_64__ then it has
   CPUid by definition.  But it's nice to be able to prove it.  :)      */
	__asm__ (
"        pushfq                      # Get original EFLAGS             \n"
"        popq    %%rax                                                 \n"
"        movq    %%rax,%%rcx                                           \n"
"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
"        popfq                       # Replace current EFLAGS value    \n"
"        pushfq                      # Get new EFLAGS                  \n"
"        popq    %%rax               # Store new EFLAGS in EAX         \n"
"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
"        jz      1f                  # Processor=80486                 \n"
"        movl    $1,%0               # We have CPUID support           \n"
"1:                                                                    \n"
	: "=m" (has_CPUID)
	:
	: "%rax", "%rcx"
	);
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
	__asm {
		pushfd                      ; Get original EFLAGS
		pop     eax
		mov     ecx, eax
		xor     eax, 200000h        ; Flip ID bit in EFLAGS
		push    eax                 ; Save new EFLAGS value on stack
		popfd                       ; Replace current EFLAGS value
		pushfd                      ; Get new EFLAGS
		pop     eax                 ; Store new EFLAGS in EAX
		xor     eax, ecx            ; Can not toggle ID bit,
		jz      done                ; Processor=80486
		mov     has_CPUID,1         ; We have CPUID support
done:
	}
#elif defined(_MSC_VER) && defined(_M_X64)
	/* No probe here: CPUID is simply assumed to exist on this target. */
	has_CPUID = 1;
#elif defined(__sun) && defined(__i386)
	__asm (
"       pushfl                 \n"
"       popl    %eax           \n"
"       movl    %eax,%ecx      \n"
"       xorl    $0x200000,%eax \n"
"       pushl   %eax           \n"
"       popfl                  \n"
"       pushfl                 \n"
"       popl    %eax           \n"
"       xorl    %ecx,%eax      \n"
"       jz      1f             \n"
"       movl    $1,-8(%ebp)    \n"
"1:                            \n"
	);
#elif defined(__sun) && defined(__amd64)
	__asm (
"       pushfq                 \n"
"       popq    %rax           \n"
"       movq    %rax,%rcx      \n"
"       xorl    $0x200000,%eax \n"
"       pushq   %rax           \n"
"       popfq                  \n"
"       pushfq                 \n"
"       popq    %rax           \n"
"       xorl    %ecx,%eax      \n"
"       jz      1f             \n"
"       movl    $1,-8(%rbp)    \n"
"1:                            \n"
	);
#endif

	return has_CPUID;
}

/* VEC_CPU_CPUID(func, a, b, c, d)
 *
 * Executes the CPUID instruction with EAX = func and ECX = 0, then stores the
 * resulting EAX, EBX, ECX and EDX into a, b, c and d respectively. a..d must
 * be assignable lvalues.
 *
 * The GCC/Clang variants save EBX/RBX on the stack before CPUID, copy its
 * result out through ESI/RSI and restore the saved value afterwards, so the
 * compiler never sees EBX/RBX change.
 * NOTE(review): those variants push/pop inside inline asm; on x86-64 confirm
 * this cannot disturb data the compiler keeps below the stack pointer.
 *
 * On any compiler/architecture without a matching variant the macro simply
 * zeroes all four outputs. */
#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        pushl %%ebx        \n" \
		"        xorl %%ecx,%%ecx   \n" \
		"        cpuid              \n" \
		"        movl %%ebx, %%esi  \n" \
		"        popl %%ebx         \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        pushq %%rbx        \n" \
		"        xorq %%rcx,%%rcx   \n" \
		"        cpuid              \n" \
		"        movq %%rbx, %%rsi  \n" \
		"        popq %%rbx         \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm { \
		__asm mov eax, func \
		__asm xor ecx, ecx \
		__asm cpuid \
		__asm mov a, eax \
		__asm mov b, ebx \
		__asm mov c, ecx \
		__asm mov d, edx \
	}
#elif (defined(_MSC_VER) && defined(_M_X64))
// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		int CPUInfo[4]; \
		__cpuidex(CPUInfo, func, 0); \
		a = CPUInfo[0]; \
		b = CPUInfo[1]; \
		c = CPUInfo[2]; \
		d = CPUInfo[3]; \
	} while (0)
#else
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		a = b = c = d = 0; \
		(void)a; \
		(void)b; \
		(void)c; \
		(void)d; \
	} while (0)
#endif

// ---------------------------------------------------------------

/* Cached results, filled in once by vec_CPU_get_CPUID_features(). */

/* EAX, EBX, ECX, EDX (indices 0..3) returned by CPUID function 1. */
static int vec_CPU_CPUIDFeatures[4];
/* EAX returned by CPUID function 0. */
static int vec_CPU_CPUIDMaxFunction = 0;
/* 1 when the XGETBV result has bits 1 and 2 (mask 6) both set. */
static int vec_CPU_OSSavesYMM = 0;
/* 1 when vec_CPU_OSSavesYMM is set and XGETBV bits 5..7 (mask 0xe0) are all set. */
static int vec_CPU_OSSavesZMM = 0;

/**
 * Queries CPUID once and caches the results in the file-scope variables
 * vec_CPU_CPUIDMaxFunction, vec_CPU_CPUIDFeatures, vec_CPU_OSSavesYMM and
 * vec_CPU_OSSavesZMM. Calls after the first return immediately.
 *
 * If CPUID is unavailable, or function 1 is not supported, the cached
 * variables keep their zero initial values.
 *
 * NOTE(review): the `checked` flag is a plain static int with no
 * synchronisation, so concurrent first calls are not guarded here.
 */
static inline void vec_CPU_get_CPUID_features(void)
{
	static int checked = 0;
	if (!checked) {
		checked = 1;
		if (vec_CPU_have_CPUID()) {
			int a, b, c, d;
			/* Function 0: EAX holds the highest supported function number. */
			VEC_CPU_CPUID(0, a, b, c, d);
			vec_CPU_CPUIDMaxFunction = a;
			if (vec_CPU_CPUIDMaxFunction >= 1) {
				/* Function 1: feature bits, cached verbatim. */
				VEC_CPU_CPUID(1, a, b, c, d);
				vec_CPU_CPUIDFeatures[0] = a;
				vec_CPU_CPUIDFeatures[1] = b;
				vec_CPU_CPUIDFeatures[2] = c;
				vec_CPU_CPUIDFeatures[3] = d;

				// Check to make sure we can call xgetbv
				if (c & 0x08000000) {
					// Call xgetbv to see if YMM (etc) register state is saved
					// (the byte sequence 0x0f 0x01 0xd0 is emitted with ECX = 0
					// and the result is read back from EAX into `a`)
					// NOTE(review): if none of the branches below is compiled,
					// `a` still holds EAX from CPUID function 1 when the masks
					// are tested afterwards.
#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
					__asm__(".byte 0x0f, 0x01, 0xd0"
							: "=a"(a)
							: "c"(0)
							: "%edx");
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
					a = (int)_xgetbv(0);
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
					__asm {
						xor ecx, ecx
						_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
						mov a, eax
					}
#endif
					/* Bits 1 and 2 both set => YMM state flagged as saved. */
					vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
					/* ZMM additionally needs bits 5, 6 and 7 all set. */
					vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
				}
			}
		}
	}
}

/* SIGILL-based probing support: compiled only for GCC-compatible compilers
 * with AltiVec enabled, on targets other than Mac OS X/PowerPC and
 * OpenBSD/PowerPC (those use sysctl instead).
 *
 * NOTE(review): <signal.h> and <setjmp.h> are pulled in by a separate #elif
 * chain at the top of this file whose conditions differ from this guard
 * (e.g. the FreeBSD/PowerPC branch does not include them); confirm every
 * configuration that enables this block also gets those headers. */
#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
/* Jump target set by the probing functions before running a test instruction. */
static jmp_buf vec_jmpbuf;
/* Signal handler: jumps back to the setjmp() point with value 1.
 * The signal number `sig` is not used. */
static void vec_CPU_illegal_instruction(int sig)
{
	longjmp(vec_jmpbuf, 1);
}
#endif

/**
 * Detects AltiVec support at run time.
 *
 * Three strategies are selected at compile time:
 *  - Mac OS X/PowerPC and OpenBSD/PowerPC: ask the kernel via sysctl().
 *  - FreeBSD/PowerPC: read AT_HWCAP with elf_aux_info() and test
 *    PPC_FEATURE_HAS_ALTIVEC.
 *  - Other GCC-compatible builds with AltiVec enabled: execute AltiVec
 *    instructions under a temporary SIGILL handler.
 * If none applies, the function returns 0.
 *
 * @return non-zero if AltiVec was detected, 0 otherwise. In the FreeBSD
 *         branch the value is the masked hwcap bit rather than exactly 1.
 */
static int vec_CPU_have_ALTIVEC(void)
{
	/* Written between setjmp() and a possible longjmp() in the SIGILL branch;
	 * presumably volatile for that reason. */
	volatile int altivec = 0;
#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
	int selectors[2] = {
# ifdef __OpenBSD__
		CTL_MACHDEP, CPU_ALTIVEC
# else
		CTL_HW, HW_VECTORUNIT
# endif
	};
	int hasVectorUnit = 0;
	vec_uintsize length = sizeof(hasVectorUnit);
	int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
	/* A failed sysctl() leaves altivec at 0. */
	if (!error)
		altivec = (hasVectorUnit != 0);
#elif defined(__FreeBSD__) && defined(__powerpc__)
	/* The return value of elf_aux_info() is not checked; cpufeatures is
	 * pre-initialised to 0 for that case. */
	unsigned long cpufeatures = 0;
	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
	/* Install the SIGILL handler, remembering the previous one. */
	void (*handler)(int sig);
	handler = signal(SIGILL, vec_CPU_illegal_instruction);
	if (!setjmp(vec_jmpbuf)) {
		/* If either instruction raises SIGILL, the handler longjmps back,
		 * setjmp() returns non-zero and altivec stays 0. */
		__asm__ __volatile__("mtspr 256, %0\n\t"
		    "vand %%v0, %%v0, %%v0" ::"r"(-1));
		altivec = 1;
	}
	/* Put the previous SIGILL disposition back. */
	signal(SIGILL, handler);
#endif
	return altivec;
}

/**
 * Intended to detect VSX support at run time.
 *
 * The probing instructions are currently commented out, so `vsx` is never
 * set and this function always returns 0, even when the VSX branch is
 * compiled in (in which case it only installs and restores the SIGILL
 * handler).
 *
 * NOTE(review): the compiled branch uses vec_jmpbuf and
 * vec_CPU_illegal_instruction, which are declared under a different
 * preprocessor guard (VEC_COMPILER_HAS_ALTIVEC, excluding Mac OS X and
 * OpenBSD PowerPC); confirm the two guards can never disagree.
 *
 * @return always 0 at present.
 */
static int vec_CPU_have_ALTIVEC_VSX(void)
{
	volatile int vsx = 0;
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
# warning Compiling UNTESTED code for VSX.
	void (*handler)(int sig);
	handler = signal(SIGILL, vec_CPU_illegal_instruction);
	if (!setjmp(vec_jmpbuf)) {
		// this is completely untested
		//__asm__ __volatile__("mtspr 256, %0\n\t"
		//			 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
		//vsx = 1;
	}
	signal(SIGILL, handler);
#endif
	return vsx;
}

/* Feature tests against the cached CPUID function 1 registers
 * (vec_CPU_CPUIDFeatures[2] is ECX, vec_CPU_CPUIDFeatures[3] is EDX).
 * Each expands to a non-zero value when the corresponding bit is set; the
 * value is the masked bit, not necessarily 1. They are only meaningful after
 * vec_CPU_get_CPUID_features() has run. */
#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000) /* EDX bit 23 */
#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000) /* EDX bit 25 */
#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000) /* EDX bit 26 */
#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001) /* ECX bit 0 */
#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000) /* ECX bit 19 */
#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000) /* ECX bit 20 */
/* AVX additionally requires that YMM state was flagged as saved (ECX bit 28). */
#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))

/**
 * Reports whether AVX2 can be used.
 *
 * Requires that YMM state was flagged as saved (vec_CPU_OSSavesYMM) and that
 * CPUID function 7 exists; the answer is then bit 5 of EBX from CPUID
 * function 7. Only meaningful after vec_CPU_get_CPUID_features() has run.
 *
 * @return non-zero (the masked bit, 0x20) if AVX2 is available, 0 otherwise.
 */
static inline int vec_CPU_have_AVX2(void)
{
	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		/* Only EBX is inspected; mark the other outputs as used. This must
		 * come before the return, otherwise the statement is unreachable. */
		(void)a, (void)c, (void)d;
		return b & 0x00000020;
	}
	return 0;
}

static inline int vec_CPU_have_AVX512F(void)
{
	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		return b & 0x00000020;
		(void)a, (void)c, (void)d;
	}
	return 0;
}

#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
/**
 * Scans /proc/self/auxv for the AT_HWCAP entry and tests its HWCAP_NEON bit.
 *
 * Reading stops at the first AT_HWCAP record, or as soon as a read does not
 * deliver one complete record.
 *
 * @return 1 if the AT_HWCAP entry has HWCAP_NEON set, 0 otherwise (including
 *         when the file cannot be opened or no AT_HWCAP entry is found).
 */
static int readProcAuxvForNeon(void)
{
	Elf32_auxv_t entry;
	int has_neon = 0;
	int auxv_fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);

	if (auxv_fd < 0)
		return 0;

	for (;;) {
		/* a short read or an error ends the scan */
		if (read(auxv_fd, &entry, sizeof(entry)) != sizeof(entry))
			break;

		if (entry.a_type != AT_HWCAP)
			continue;

		has_neon = (entry.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
		break;
	}

	close(auxv_fd);
	return has_neon;
}
#endif

static int vec_CPU_have_NEON(void)
{
/* The way you detect NEON is a privileged instruction on ARM, so you have
   query the OS kernel in a platform-specific way. :/ */
#if defined(SDL_CPUINFO_DISABLED)
	return 0; /* disabled */
#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
/* Seems to have been removed */
#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
#endif
	/* All WinRT ARM devices are required to support NEON, but just in case. */
	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
	return 1; /* ARMv8 always has non-optional NEON support. */
#elif defined(__VITA__)
	return 1;
#elif defined(__3DS__)
	return 0;
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
	return 1; /* all Apple ARMv7 chips and later have NEON. */
#elif defined(__APPLE__)
	return 0; /* assume anything else from Apple doesn't have NEON. */
#elif !defined(__arm__)
	return 0; /* not an ARM CPU at all. */
#elif defined(__OpenBSD__)
	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
#elif defined(HAVE_ELF_AUX_INFO)
	unsigned long hasneon = 0;
	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
		return 0;

	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
#elif defined(__QNXNTO__)
	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
#elif defined(__linux__)
	return readProcAuxvForNeon();
#elif defined(__ANDROID__)
	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
	{
		AndroidCpuFamily cpu_family = android_getCpuFamily();
		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
			uint64_t cpu_features = android_getCpuFeatures();
			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
				return 1;
			}
		}
		return 0;
	}
#elif defined(__RISCOS__)
	/* Use the VFPSupport_Features SWI to access the MVFR registers */
	{
		_kernel_swi_regs regs;
		regs.r[0] = 0;
		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
			if ((regs.r[2] & 0xFFF000) == 0x111000) {
				return 1;
			}
		}
		return 0;
	}
#else
#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
	return 0;
#endif
}

/* Sentinel stored in vec_CPU_features until detection has run. */
#define VEC_CPU_FEATURES_RESET VEC_UINT32_C(0xFFFFFFFF)

/* Cached bitmask of VEC_CPU_HAS_* flags; computed on the first call to
 * vec_get_CPU_features(). */
static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;

/**
 * Returns the bitmask of SIMD features detected on the running CPU.
 *
 * Detection runs on the first call only; the result is cached in
 * vec_CPU_features and returned directly afterwards. The CPUID registers are
 * fetched first because the x86 feature tests read those cached values.
 *
 * @return a combination of VEC_CPU_HAS_* flags (0 if nothing was detected).
 */
vec_uint32 vec_get_CPU_features(void)
{
	/* fast path: already probed */
	if (vec_CPU_features != VEC_CPU_FEATURES_RESET) {
		return vec_CPU_features;
	}

	vec_CPU_get_CPUID_features();
	vec_CPU_features = 0;

	/* PowerPC */
	if (vec_CPU_have_ALTIVEC()) {
		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
	}
	if (vec_CPU_have_ALTIVEC_VSX()) {
		vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
	}

	/* x86 */
	if (vec_CPU_have_MMX()) {
		vec_CPU_features |= VEC_CPU_HAS_MMX;
	}
	if (vec_CPU_have_SSE()) {
		vec_CPU_features |= VEC_CPU_HAS_SSE;
	}
	if (vec_CPU_have_SSE2()) {
		vec_CPU_features |= VEC_CPU_HAS_SSE2;
	}
	if (vec_CPU_have_SSE3()) {
		vec_CPU_features |= VEC_CPU_HAS_SSE3;
	}
	if (vec_CPU_have_SSE41()) {
		vec_CPU_features |= VEC_CPU_HAS_SSE41;
	}
	if (vec_CPU_have_SSE42()) {
		vec_CPU_features |= VEC_CPU_HAS_SSE42;
	}
	if (vec_CPU_have_AVX()) {
		vec_CPU_features |= VEC_CPU_HAS_AVX;
	}
	if (vec_CPU_have_AVX2()) {
		vec_CPU_features |= VEC_CPU_HAS_AVX2;
	}
	if (vec_CPU_have_AVX512F()) {
		vec_CPU_features |= VEC_CPU_HAS_AVX512F;
	}

	/* ARM */
	if (vec_CPU_have_NEON()) {
		vec_CPU_features |= VEC_CPU_HAS_NEON;
	}

	return vec_CPU_features;
}