view src/cpu.c @ 24:e49e70f7012f

impl/x86: add static assertions for alignment and size
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 03:32:53 -0500
parents e26874655738
children 92156fe32755
line wrap: on
line source

/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

/* Detect CPU SIMD support. Much of this code was stolen from SDL.
 *
 * Simple DirectMedia Layer
 * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
*/

#include "vec/cpu.h"

#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
# include <sys/sysctl.h> // For AltiVec check
#elif defined(__OpenBSD__) && defined(__powerpc__)
# include <sys/types.h>
# include <sys/sysctl.h> // For AltiVec check
# include <machine/cpu.h>
#elif defined(__FreeBSD__) && defined(__powerpc__)
# include <machine/cpu.h>
# include <sys/auxv.h>
#elif defined(__ALTIVEC__)
# include <signal.h>
# include <setjmp.h>
#endif

#ifdef __FreeBSD__
# include <sys/param.h>
#endif

#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
# include <unistd.h>
# include <sys/types.h>
# include <sys/stat.h>
# include <fcntl.h>
# include <elf.h>

/*#include <asm/hwcap.h>*/
# ifndef AT_HWCAP
# define AT_HWCAP 16
# endif
# ifndef AT_PLATFORM
#  define AT_PLATFORM 15
# endif
# ifndef HWCAP_NEON
#  define HWCAP_NEON (1 << 12)
# endif
#endif

/* Report whether the CPUID instruction can be used on this processor.
 *
 * The x86 branches try to toggle the ID bit (0x200000) of EFLAGS: the flags
 * are read, the bit is flipped and written back, and the flags are read
 * again. If the second read differs from the first, has_CPUID is set to 1.
 * MSVC on x64 simply reports 1. When no branch matches the compiler and
 * architecture, nothing is executed and the result stays 0.
 *
 * Returns 1 when CPUID is available, 0 otherwise. */
static inline int vec_CPU_have_CPUID(void)
{
	int has_CPUID = 0;

	/* GCC/Clang, 32-bit x86 */
#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
	__asm__ (
"        pushfl                      # Get original EFLAGS             \n"
"        popl    %%eax                                                 \n"
"        movl    %%eax,%%ecx                                           \n"
"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
"        popfl                       # Replace current EFLAGS value    \n"
"        pushfl                      # Get new EFLAGS                  \n"
"        popl    %%eax               # Store new EFLAGS in EAX         \n"
"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
"        jz      1f                  # Processor=80486                 \n"
"        movl    $1,%0               # We have CPUID support           \n"
"1:                                                                    \n"
	: "=m" (has_CPUID)
	:
	: "%eax", "%ecx"
	);
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
/* Technically, if this is being compiled under __x86_64__ then it has
   CPUid by definition.  But it's nice to be able to prove it.  :)      */
	/* NOTE(review): this sequence pushes onto the stack from inside inline
	   asm without informing the compiler; confirm that is safe with respect
	   to the x86-64 red zone. */
	__asm__ (
"        pushfq                      # Get original EFLAGS             \n"
"        popq    %%rax                                                 \n"
"        movq    %%rax,%%rcx                                           \n"
"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
"        pushq   %%rax               # Save new EFLAGS value on stack  \n"
"        popfq                       # Replace current EFLAGS value    \n"
"        pushfq                      # Get new EFLAGS                  \n"
"        popq    %%rax               # Store new EFLAGS in EAX         \n"
"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
"        jz      1f                  # Processor=80486                 \n"
"        movl    $1,%0               # We have CPUID support           \n"
"1:                                                                    \n"
	: "=m" (has_CPUID)
	:
	: "%rax", "%rcx"
	);
	/* MSVC (32-bit) or Watcom: same test in Intel syntax */
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
	__asm {
		pushfd                      ; Get original EFLAGS
		pop     eax
		mov     ecx, eax
		xor     eax, 200000h        ; Flip ID bit in EFLAGS
		push    eax                 ; Save new EFLAGS value on stack
		popfd                       ; Replace current EFLAGS value
		pushfd                      ; Get new EFLAGS
		pop     eax                 ; Store new EFLAGS in EAX
		xor     eax, ecx            ; Can not toggle ID bit,
		jz      done                ; Processor=80486
		mov     has_CPUID,1         ; We have CPUID support
done:
	}
	/* MSVC x64: no probe is run, CPUID is reported as present */
#elif defined(_MSC_VER) && defined(_M_X64)
	has_CPUID = 1;
	/* Sun compiler, 32-bit x86. The result is stored straight to -8(%ebp);
	   presumably that is the stack slot of has_CPUID -- TODO confirm, since
	   the asm has no operand tying it to the variable. */
#elif defined(__sun) && defined(__i386)
	__asm (
"       pushfl                 \n"
"       popl    %eax           \n"
"       movl    %eax,%ecx      \n"
"       xorl    $0x200000,%eax \n"
"       pushl   %eax           \n"
"       popfl                  \n"
"       pushfl                 \n"
"       popl    %eax           \n"
"       xorl    %ecx,%eax      \n"
"       jz      1f             \n"
"       movl    $1,-8(%ebp)    \n"
"1:                            \n"
	);
	/* Sun compiler, amd64. Same hard-coded frame offset, relative to %rbp. */
#elif defined(__sun) && defined(__amd64)
	__asm (
"       pushfq                 \n"
"       popq    %rax           \n"
"       movq    %rax,%rcx      \n"
"       xorl    $0x200000,%eax \n"
"       pushq   %rax           \n"
"       popfq                  \n"
"       pushfq                 \n"
"       popq    %rax           \n"
"       xorl    %ecx,%eax      \n"
"       jz      1f             \n"
"       movl    $1,-8(%rbp)    \n"
"1:                            \n"
	);
#endif

	return has_CPUID;
}

/* VEC_CPU_CPUID(func, a, b, c, d)
 *
 * Execute CPUID for leaf `func` with ECX cleared beforehand, then store
 * EAX, EBX, ECX and EDX into the lvalues a, b, c and d respectively.
 * One definition is selected per compiler/architecture; when none applies
 * the fallback at the bottom just zeroes all four outputs. */

/* GCC/Clang, 32-bit x86: EBX is saved and restored by hand around CPUID and
   its value is handed back through ESI (the "=S" output). */
#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        pushl %%ebx        \n" \
		"        xorl %%ecx,%%ecx   \n" \
		"        cpuid              \n" \
		"        movl %%ebx, %%esi  \n" \
		"        popl %%ebx         \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
/* GCC/Clang, x86-64: same scheme using the 64-bit registers. */
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        pushq %%rbx        \n" \
		"        xorq %%rcx,%%rcx   \n" \
		"        cpuid              \n" \
		"        movq %%rbx, %%rsi  \n" \
		"        popq %%rbx         \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
/* MSVC (32-bit) or Watcom: inline assembler, registers copied out directly. */
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm { \
		__asm mov eax, func \
		__asm xor ecx, ecx \
		__asm cpuid \
		__asm mov a, eax \
		__asm mov b, ebx \
		__asm mov c, ecx \
		__asm mov d, edx \
	}
/* MSVC x64: no inline assembler, so the compiler intrinsic is used. */
#elif (defined(_MSC_VER) && defined(_M_X64))
// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		int CPUInfo[4]; \
		__cpuidex(CPUInfo, func, 0); \
		a = CPUInfo[0]; \
		b = CPUInfo[1]; \
		c = CPUInfo[2]; \
		d = CPUInfo[3]; \
	} while (0)
/* Anything else: CPUID is not executed; all outputs become 0. The (void)
   casts mark the outputs as used. */
#else
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		a = b = c = d = 0; \
		(void)a; \
		(void)b; \
		(void)c; \
		(void)d; \
	} while (0)
#endif

// ---------------------------------------------------------------

/* Cached CPUID state, filled in by vec_CPU_get_CPUID_features(). */

/* EAX, EBX, ECX and EDX (indices 0..3) as returned by CPUID leaf 1. */
static int vec_CPU_CPUIDFeatures[4];
/* EAX returned by CPUID leaf 0; compared against leaf numbers before they
   are queried. */
static int vec_CPU_CPUIDMaxFunction = 0;
/* 1 when the xgetbv result has bits 1 and 2 set ((value & 6) == 6). */
static int vec_CPU_OSSavesYMM = 0;
/* 1 when vec_CPU_OSSavesYMM is set and the xgetbv result also has bits
   5..7 set ((value & 0xe0) == 0xe0). */
static int vec_CPU_OSSavesZMM = 0;

/* Fill the cached CPUID state: the highest supported leaf, the four
 * registers of leaf 1 and the OS YMM/ZMM save flags. Only the first call
 * does any work; later calls return immediately. */
static inline void vec_CPU_get_CPUID_features(void)
{
	static int checked = 0;
	int a, b, c, d;

	if (checked)
		return;
	checked = 1;

	/* Without CPUID every cached value keeps its zero default. */
	if (!vec_CPU_have_CPUID())
		return;

	/* Leaf 0: EAX holds the highest leaf we are allowed to query. */
	VEC_CPU_CPUID(0, a, b, c, d);
	vec_CPU_CPUIDMaxFunction = a;
	if (vec_CPU_CPUIDMaxFunction < 1)
		return;

	/* Leaf 1: remember all four registers for the feature macros. */
	VEC_CPU_CPUID(1, a, b, c, d);
	vec_CPU_CPUIDFeatures[0] = a;
	vec_CPU_CPUIDFeatures[1] = b;
	vec_CPU_CPUIDFeatures[2] = c;
	vec_CPU_CPUIDFeatures[3] = d;

	// Check to make sure we can call xgetbv
	if (!(c & 0x08000000))
		return;

	// Call xgetbv to see if YMM (etc) register state is saved
#if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
	__asm__(".byte 0x0f, 0x01, 0xd0"
			: "=a"(a)
			: "c"(0)
			: "%edx");
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
	a = (int)_xgetbv(0);
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
	__asm {
		xor ecx, ecx
		_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
		mov a, eax
	}
#endif
	/* Bits 1 and 2 must both be set for YMM state; ZMM additionally
	   needs bits 5..7. */
	vec_CPU_OSSavesYMM = ((a & 6) == 6);
	vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0));
}

/* SIGILL trampoline for the instruction-probing fallbacks below. It is only
 * compiled when the sysctl-based detection (Mac OS X / OpenBSD on PowerPC)
 * is not in use, VEC_COMPILER_HAS_ALTIVEC is defined and the compiler is
 * GCC-compatible.
 *
 * NOTE(review): <signal.h> and <setjmp.h> are included at the top of this
 * file under `__ALTIVEC__`, not `VEC_COMPILER_HAS_ALTIVEC`, and
 * vec_CPU_have_ALTIVEC_VSX() uses these symbols under
 * `VEC_COMPILER_HAS_ALTIVEC_VSX` -- confirm the three conditions always
 * agree. */
#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
/* Jump target armed with setjmp() right before a probe instruction runs. */
static jmp_buf vec_jmpbuf;
/* Signal handler: abandon the faulting probe by jumping back to the
   setjmp() site, which then sees a return value of 1. `sig` is unused. */
static void vec_CPU_illegal_instruction(int sig)
{
	longjmp(vec_jmpbuf, 1);
}
#endif

/* Detect AltiVec support at run time.
 *
 * Mac OS X and OpenBSD on PowerPC query sysctl(); FreeBSD on PowerPC reads
 * AT_HWCAP through elf_aux_info(); other GCC-compatible AltiVec builds
 * execute a vector instruction with a temporary SIGILL handler installed.
 * On every other configuration no probe is compiled and 0 is returned.
 *
 * Returns non-zero when AltiVec is available, 0 otherwise. */
static int vec_CPU_have_ALTIVEC(void)
{
	/* kept volatile, presumably because of the setjmp()/longjmp() probe */
	volatile int altivec = 0;
#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
# ifdef __OpenBSD__
	int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
# else
	int mib[2] = { CTL_HW, HW_VECTORUNIT };
# endif
	int vector_unit = 0;
	vec_uintsize vector_unit_size = sizeof(vector_unit);

	/* A failing sysctl() leaves the answer at "no AltiVec". */
	if (sysctl(mib, 2, &vector_unit, &vector_unit_size, NULL, 0) == 0)
		altivec = (vector_unit != 0);
#elif defined(__FreeBSD__) && defined(__powerpc__)
	unsigned long hwcap = 0;

	elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
	altivec = hwcap & PPC_FEATURE_HAS_ALTIVEC;
#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
	/* Run one AltiVec operation; if it raises SIGILL the handler jumps
	   back here and the assignment below is skipped. */
	void (*previous)(int sig) = signal(SIGILL, vec_CPU_illegal_instruction);

	if (setjmp(vec_jmpbuf) == 0) {
		vector unsigned char vec;
		vec_and(vec, vec);
		altivec = 1;
	}
	/* Put back whatever SIGILL disposition was active before. */
	signal(SIGILL, previous);
#endif
	return altivec;
}

/* Detect VSX support at run time.
 *
 * The probe itself is still commented out, so `vsx` is never set and this
 * function currently always returns 0. When VEC_COMPILER_HAS_ALTIVEC_VSX is
 * defined on a GCC-compatible compiler, the SIGILL handler is nevertheless
 * installed and restored around the empty probe. */
static int vec_CPU_have_ALTIVEC_VSX(void)
{
	volatile int vsx = 0;
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
# warning Compiling UNTESTED code for VSX.
	void (*previous)(int sig) = signal(SIGILL, vec_CPU_illegal_instruction);

	if (setjmp(vec_jmpbuf) == 0) {
		// this is completely untested
		//__asm__ __volatile__("mtspr 256, %0\n\t"
		//			 "xxland %%v0, %%v0, %%v0" ::"r"(-1));
		//vsx = 1;
	}
	signal(SIGILL, previous);
#endif
	return vsx;
}

/* Feature tests against the cached CPUID leaf 1 registers
 * (vec_CPU_CPUIDFeatures[2] is ECX, [3] is EDX). Each expands to a non-zero
 * value when the feature bit is set; they are only meaningful after
 * vec_CPU_get_CPUID_features() has run. */
#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & 0x00800000) /* EDX bit 23 */
#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & 0x02000000) /* EDX bit 25 */
#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & 0x04000000) /* EDX bit 26 */
#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & 0x00000001) /* ECX bit 0 */
#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & 0x00080000) /* ECX bit 19 */
#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & 0x00100000) /* ECX bit 20 */
/* AVX additionally requires vec_CPU_OSSavesYMM (ECX bit 28). */
#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & 0x10000000))

/* Detect AVX2.
 *
 * Requires that the OS saves YMM state (vec_CPU_OSSavesYMM) and that CPUID
 * leaf 7 exists; the feature itself is EBX bit 5 of that leaf. Relies on
 * vec_CPU_get_CPUID_features() having populated the cached state.
 *
 * Returns non-zero when AVX2 is usable, 0 otherwise. */
static inline int vec_CPU_have_AVX2(void)
{
	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		/* Only EBX is consumed; mark the rest as deliberately unused.
		   This must come before the return to be reachable. */
		(void)a;
		(void)c;
		(void)d;
		return b & 0x00000020;
	}
	return 0;
}

static inline int vec_CPU_have_AVX512F(void)
{
	if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
		int a, b, c, d;
		VEC_CPU_CPUID(7, a, b, c, d);
		return b & 0x00000020;
		(void)a, (void)c, (void)d;
	}
	return 0;
}

#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
/* Scan /proc/self/auxv for the AT_HWCAP entry and test its HWCAP_NEON bit.
 * Used on 32-bit ARM Linux when getauxval() is not available.
 *
 * Returns 1 when the bit is set, 0 when it is clear, when the file cannot
 * be opened, or when no AT_HWCAP entry is found. */
static int readProcAuxvForNeon(void)
{
	Elf32_auxv_t entry;
	int has_neon = 0;
	int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);

	if (fd < 0)
		return 0;

	/* The file is read one auxv record at a time; a short read ends the scan. */
	while (read(fd, &entry, sizeof(entry)) == sizeof(entry)) {
		if (entry.a_type != AT_HWCAP)
			continue;
		has_neon = (entry.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
		break;
	}

	close(fd);
	return has_neon;
}
#endif

static int vec_CPU_have_NEON(void)
{
/* The way you detect NEON is a privileged instruction on ARM, so you have
   query the OS kernel in a platform-specific way. :/ */
#if defined(SDL_CPUINFO_DISABLED)
	return 0; /* disabled */
#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
/* Seems to have been removed */
#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
#endif
	/* All WinRT ARM devices are required to support NEON, but just in case. */
	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__)
	return 1; /* ARMv8 always has non-optional NEON support. */
#elif defined(__VITA__)
	return 1;
#elif defined(__3DS__)
	return 0;
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
	return 1; /* all Apple ARMv7 chips and later have NEON. */
#elif defined(__APPLE__)
	return 0; /* assume anything else from Apple doesn't have NEON. */
#elif !defined(__arm__)
	return 0; /* not an ARM CPU at all. */
#elif defined(__OpenBSD__)
	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
#elif defined(HAVE_ELF_AUX_INFO)
	unsigned long hasneon = 0;
	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
		return 0;

	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
#elif defined(__QNXNTO__)
	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
#elif defined(__linux__)
	return readProcAuxvForNeon();
#elif defined(__ANDROID__)
	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
	{
		AndroidCpuFamily cpu_family = android_getCpuFamily();
		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
			uint64_t cpu_features = android_getCpuFeatures();
			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
				return 1;
			}
		}
		return 0;
	}
#elif defined(__RISCOS__)
	/* Use the VFPSupport_Features SWI to access the MVFR registers */
	{
		_kernel_swi_regs regs;
		regs.r[0] = 0;
		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
			if ((regs.r[2] & 0xFFF000) == 0x111000) {
				return 1;
			}
		}
		return 0;
	}
#else
#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
	return 0;
#endif
}

/* Sentinel meaning "feature detection has not run yet". */
#define VEC_CPU_FEATURES_RESET VEC_UINT32_C(0xFFFFFFFF)

/* Cached feature mask; holds the sentinel until the first query. */
static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;

/* Return the bitmask of VEC_CPU_HAS_* flags describing the SIMD extensions
 * usable on the running CPU.
 *
 * The first call runs every probe and caches the result in
 * vec_CPU_features; later calls return the cached mask. */
vec_uint32 vec_get_CPU_features(void)
{
	/* Already detected: nothing to do. */
	if (vec_CPU_features != VEC_CPU_FEATURES_RESET)
		return vec_CPU_features;

	/* The x86 feature macros below read the cached CPUID state. */
	vec_CPU_get_CPUID_features();
	vec_CPU_features = 0;

	/* PowerPC */
	vec_CPU_features |= vec_CPU_have_ALTIVEC()     ? VEC_CPU_HAS_ALTIVEC     : 0;
	vec_CPU_features |= vec_CPU_have_ALTIVEC_VSX() ? VEC_CPU_HAS_ALTIVEC_VSX : 0;

	/* x86 */
	vec_CPU_features |= vec_CPU_have_MMX()     ? VEC_CPU_HAS_MMX     : 0;
	vec_CPU_features |= vec_CPU_have_SSE()     ? VEC_CPU_HAS_SSE     : 0;
	vec_CPU_features |= vec_CPU_have_SSE2()    ? VEC_CPU_HAS_SSE2    : 0;
	vec_CPU_features |= vec_CPU_have_SSE3()    ? VEC_CPU_HAS_SSE3    : 0;
	vec_CPU_features |= vec_CPU_have_SSE41()   ? VEC_CPU_HAS_SSE41   : 0;
	vec_CPU_features |= vec_CPU_have_SSE42()   ? VEC_CPU_HAS_SSE42   : 0;
	vec_CPU_features |= vec_CPU_have_AVX()     ? VEC_CPU_HAS_AVX     : 0;
	vec_CPU_features |= vec_CPU_have_AVX2()    ? VEC_CPU_HAS_AVX2    : 0;
	vec_CPU_features |= vec_CPU_have_AVX512F() ? VEC_CPU_HAS_AVX512F : 0;

	/* ARM */
	vec_CPU_features |= vec_CPU_have_NEON() ? VEC_CPU_HAS_NEON : 0;

	return vec_CPU_features;
}