comparison src/cpu.c @ 23:e26874655738

*: huge refactor, new major release (hahaha) I keep finding things that are broken... The problem NOW was that vec would unintentionally build some functions with extended instruction sets, which is Bad and would mean that for all intents and purposes the CPU detection was completely broken. Now vec is no longer header only either. Boohoo. However this gives a lot more flexibility to vec since we no longer want or need to care about C++ crap. The NEON and Altivec implementations have not been updated which means they won't compile hence why they're commented out in the cmake build file.
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 02:52:40 -0500
parents
children 92156fe32755
comparison
equal deleted inserted replaced
22:fbcd3fa6f8fc 23:e26874655738
1 /**
2 * vec - a tiny SIMD vector library in C99
3 *
4 * Copyright (c) 2024 Paper
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 **/
24
25 /* Detect CPU SIMD support. Much of this code was stolen from SDL.
26 *
27 * Simple DirectMedia Layer
28 * Copyright (C) 1997-2024 Sam Lantinga <slouken@libsdl.org>
29 *
30 * This software is provided 'as-is', without any express or implied
31 * warranty. In no event will the authors be held liable for any damages
32 * arising from the use of this software.
33 *
34 * Permission is granted to anyone to use this software for any purpose,
35 * including commercial applications, and to alter it and redistribute it
36 * freely, subject to the following restrictions:
37 *
38 * 1. The origin of this software must not be misrepresented; you must not
39 * claim that you wrote the original software. If you use this software
40 * in a product, an acknowledgment in the product documentation would be
41 * appreciated but is not required.
42 * 2. Altered source versions must be plainly marked as such, and must not be
43 * misrepresented as being the original software.
44 * 3. This notice may not be removed or altered from any source distribution.
45 */
46
47 #include "vec/cpu.h"
48
49 #if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
50 # include <sys/sysctl.h> // For AltiVec check
51 #elif defined(__OpenBSD__) && defined(__powerpc__)
52 # include <sys/types.h>
53 # include <sys/sysctl.h> // For AltiVec check
54 # include <machine/cpu.h>
55 #elif defined(__FreeBSD__) && defined(__powerpc__)
56 # include <machine/cpu.h>
57 # include <sys/auxv.h>
58 #elif defined(__ALTIVEC__)
59 # include <signal.h>
60 # include <setjmp.h>
61 #endif
62
63 #ifdef __FreeBSD__
64 # include <sys/param.h>
65 #endif
66
67 #if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__)
68 # include <unistd.h>
69 # include <sys/types.h>
70 # include <sys/stat.h>
71 # include <fcntl.h>
72 # include <elf.h>
73
74 /*#include <asm/hwcap.h>*/
75 # ifndef AT_HWCAP
76 # define AT_HWCAP 16
77 # endif
78 # ifndef AT_PLATFORM
79 # define AT_PLATFORM 15
80 # endif
81 # ifndef HWCAP_NEON
82 # define HWCAP_NEON (1 << 12)
83 # endif
84 #endif
85
/**
 * Report whether the CPUID instruction can be executed.
 *
 * On 32-bit x86 the probe tries to flip the ID bit (mask 0x200000) of
 * EFLAGS: if the bit can be toggled, CPUID is available. 64-bit x86
 * targets always provide CPUID, so no probing is done there.
 *
 * @return 1 if CPUID is usable, 0 otherwise (including every target or
 *         compiler for which no branch below is compiled).
 */
static inline int vec_CPU_have_CPUID(void)
{
	int has_CPUID = 0;

#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
	/* has_CPUID is only written when the ID bit toggles, so it has to be a
	 * read-write ("+m") operand: with an output-only constraint the
	 * compiler is free to discard the zero initialisation above. */
	__asm__ (
	"        pushfl                      # Get original EFLAGS             \n"
	"        popl    %%eax                                                 \n"
	"        movl    %%eax,%%ecx                                           \n"
	"        xorl    $0x200000,%%eax     # Flip ID bit in EFLAGS           \n"
	"        pushl   %%eax               # Save new EFLAGS value on stack  \n"
	"        popfl                       # Replace current EFLAGS value    \n"
	"        pushfl                      # Get new EFLAGS                  \n"
	"        popl    %%eax               # Store new EFLAGS in EAX         \n"
	"        xorl    %%ecx,%%eax         # Can not toggle ID bit,          \n"
	"        jz      1f                  # Processor=80486                 \n"
	"        movl    $1,%0               # We have CPUID support           \n"
	"1:                                                                    \n"
	: "+m" (has_CPUID)
	:
	: "%eax", "%ecx", "cc"
	);
#elif ((defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)) \
	|| (defined(_MSC_VER) && defined(_M_X64)) \
	|| (defined(__sun) && defined(__amd64))
	/* Anything compiled for 64-bit x86 has CPUID by definition. Setting
	 * the flag directly also avoids pushing onto the stack from inline
	 * assembly, which the compiler cannot see and which may overwrite
	 * locals kept below the stack pointer (the x86-64 red zone). */
	has_CPUID = 1;
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
	__asm {
		pushfd                      ; Get original EFLAGS
		pop     eax
		mov     ecx, eax
		xor     eax, 200000h        ; Flip ID bit in EFLAGS
		push    eax                 ; Save new EFLAGS value on stack
		popfd                       ; Replace current EFLAGS value
		pushfd                      ; Get new EFLAGS
		pop     eax                 ; Store new EFLAGS in EAX
		xor     eax, ecx            ; Can not toggle ID bit,
		jz      done                ; Processor=80486
		mov     has_CPUID,1         ; We have CPUID support
done:
	}
#elif defined(__sun) && defined(__i386)
	/* NOTE(review): the store targets -8(%ebp) directly; presumably that
	 * is the stack slot of has_CPUID with this compiler - confirm. */
	__asm (
	"        pushfl                 \n"
	"        popl    %eax           \n"
	"        movl    %eax,%ecx      \n"
	"        xorl    $0x200000,%eax \n"
	"        pushl   %eax           \n"
	"        popfl                  \n"
	"        pushfl                 \n"
	"        popl    %eax           \n"
	"        xorl    %ecx,%eax      \n"
	"        jz      1f             \n"
	"        movl    $1,-8(%ebp)    \n"
	"1:                             \n"
	);
#endif

	return has_CPUID;
}
179
/*
 * VEC_CPU_CPUID(func, a, b, c, d)
 *
 * Executes CPUID with EAX = func and ECX = 0, then stores the resulting
 * EAX, EBX, ECX and EDX register values into the lvalues a, b, c and d.
 * When no implementation exists for the compiler/target combination, all
 * four outputs are simply set to 0.
 */
#if (defined(__GNUC__) || defined(__llvm__)) && defined(__i386__)
/* EBX is saved and restored around the instruction and its result is
 * returned through ESI - presumably because EBX can be reserved by the
 * compiler on this target (e.g. as the PIC base register). */
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        pushl %%ebx        \n" \
		"        xorl %%ecx,%%ecx   \n" \
		"        cpuid              \n" \
		"        movl %%ebx, %%esi  \n" \
		"        popl %%ebx         \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(__GNUC__) || defined(__llvm__)) && defined(__x86_64__)
/* RBX is preserved by swapping it with RSI around the instruction instead
 * of pushing it: a push inside inline assembly writes below the stack
 * pointer without the compiler knowing, which can corrupt locals living
 * in the x86-64 red zone. After the second swap RBX holds its old value
 * again and RSI holds the EBX result. */
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm__ __volatile__( \
		"        xorq %%rcx,%%rcx   \n" \
		"        xchgq %%rbx,%%rsi  \n" \
		"        cpuid              \n" \
		"        xchgq %%rbx,%%rsi  \n" \
		: "=a"(a), "=S"(b), "=c"(c), "=d"(d) \
		: "a"(func))
#elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
# define VEC_CPU_CPUID(func, a, b, c, d) \
	__asm { \
		__asm mov eax, func \
		__asm xor ecx, ecx \
		__asm cpuid \
		__asm mov a, eax \
		__asm mov b, ebx \
		__asm mov c, ecx \
		__asm mov d, edx \
	}
#elif (defined(_MSC_VER) && defined(_M_X64))
// Use __cpuidex instead of __cpuid because ICL does not clear ecx register
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		int CPUInfo[4]; \
		__cpuidex(CPUInfo, func, 0); \
		a = CPUInfo[0]; \
		b = CPUInfo[1]; \
		c = CPUInfo[2]; \
		d = CPUInfo[3]; \
	} while (0)
#else
/* No CPUID on this target/compiler: report "nothing supported". */
# define VEC_CPU_CPUID(func, a, b, c, d) \
	do { \
		a = b = c = d = 0; \
		(void)a; \
		(void)b; \
		(void)c; \
		(void)d; \
	} while (0)
#endif
232
233 // ---------------------------------------------------------------
234
/* Cached results of the CPUID probe. Everything starts out as zero (static
 * storage) and is filled in by vec_CPU_get_CPUID_features(). */

/* EAX, EBX, ECX and EDX (in that order) as returned by CPUID function 1. */
static int vec_CPU_CPUIDFeatures[4];
/* Highest CPUID function number, i.e. EAX as returned by CPUID function 0. */
static int vec_CPU_CPUIDMaxFunction;
/* Non-zero when xgetbv reports that the OS saves YMM register state. */
static int vec_CPU_OSSavesYMM;
/* Non-zero when the OS additionally saves ZMM register state. */
static int vec_CPU_OSSavesZMM;
239
240 static inline void vec_CPU_get_CPUID_features(void)
241 {
242 static int checked = 0;
243 if (!checked) {
244 checked = 1;
245 if (vec_CPU_have_CPUID()) {
246 int a, b, c, d;
247 VEC_CPU_CPUID(0, a, b, c, d);
248 vec_CPU_CPUIDMaxFunction = a;
249 if (vec_CPU_CPUIDMaxFunction >= 1) {
250 VEC_CPU_CPUID(1, a, b, c, d);
251 vec_CPU_CPUIDFeatures[0] = a;
252 vec_CPU_CPUIDFeatures[1] = b;
253 vec_CPU_CPUIDFeatures[2] = c;
254 vec_CPU_CPUIDFeatures[3] = d;
255
256 // Check to make sure we can call xgetbv
257 if (c & 0x08000000) {
258 // Call xgetbv to see if YMM (etc) register state is saved
259 #if (defined(__GNUC__) || defined(__llvm__)) && (defined(__i386__) || defined(__x86_64__))
260 __asm__(".byte 0x0f, 0x01, 0xd0"
261 : "=a"(a)
262 : "c"(0)
263 : "%edx");
264 #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && (_MSC_FULL_VER >= 160040219) // VS2010 SP1
265 a = (int)_xgetbv(0);
266 #elif (defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)
267 __asm {
268 xor ecx, ecx
269 _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0
270 mov a, eax
271 }
272 #endif
273 vec_CPU_OSSavesYMM = ((a & 6) == 6) ? 1 : 0;
274 vec_CPU_OSSavesZMM = (vec_CPU_OSSavesYMM && ((a & 0xe0) == 0xe0)) ? 1 : 0;
275 }
276 }
277 }
278 }
279 }
280
#if !((defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))) && defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
// Jump target used to get back out of the SIGILL handler below.
static jmp_buf vec_jmpbuf;

// SIGILL handler for the instruction probes: unwinds to the matching
// setjmp(), making it return 1.
static void vec_CPU_illegal_instruction(int sig)
{
	longjmp(vec_jmpbuf, 1);
}
#endif

/**
 * Detect AltiVec support at run time.
 *
 * Mac OS X and OpenBSD on PowerPC are asked through sysctl(), FreeBSD on
 * PowerPC through elf_aux_info(); otherwise, when the compiler supports
 * AltiVec, an AltiVec operation is attempted with a temporary SIGILL
 * handler installed.
 *
 * @return non-zero if AltiVec was detected, 0 otherwise (always 0 when
 *         none of the branches is compiled in).
 */
static int vec_CPU_have_ALTIVEC(void)
{
	volatile int altivec = 0;
#if (defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))) || (defined(__OpenBSD__) && defined(__powerpc__))
# ifdef __OpenBSD__
	int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
# else
	int selectors[2] = { CTL_HW, HW_VECTORUNIT };
# endif
	int hasVectorUnit = 0;
	vec_uintsize length = sizeof(hasVectorUnit);

	// Only trust the answer when the sysctl itself succeeded.
	if (sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0) == 0) {
		altivec = (hasVectorUnit != 0);
	}
#elif defined(__FreeBSD__) && defined(__powerpc__)
	unsigned long cpufeatures = 0;

	elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
	altivec = cpufeatures & PPC_FEATURE_HAS_ALTIVEC;
#elif defined(VEC_COMPILER_HAS_ALTIVEC) && defined(__GNUC__)
	// Install our handler, remembering the previous one for restoration.
	void (*handler)(int sig) = signal(SIGILL, vec_CPU_illegal_instruction);

	if (setjmp(vec_jmpbuf) == 0) {
		// Reached on the direct return of setjmp(); if the operation
		// below raises SIGILL the handler jumps back and this body is
		// skipped, leaving altivec at 0.
		vector unsigned char vec;
		vec_and(vec, vec);
		altivec = 1;
	}
	signal(SIGILL, handler);
#endif
	return altivec;
}
321
/**
 * Detect VSX support at run time.
 *
 * The actual instruction probe is still disabled (see the note in the
 * body), so this currently always returns 0.
 *
 * @return 0.
 */
static int vec_CPU_have_ALTIVEC_VSX(void)
{
	volatile int vsx = 0;
#if defined(VEC_COMPILER_HAS_ALTIVEC_VSX) && defined(__GNUC__)
# warning Compiling UNTESTED code for VSX.
	// Swap in the SIGILL handler for the duration of the probe.
	void (*handler)(int sig) = signal(SIGILL, vec_CPU_illegal_instruction);

	if (setjmp(vec_jmpbuf) == 0) {
		// Completely untested, hence left disabled. Intended probe:
		//__asm__ __volatile__("mtspr 256, %0\n\t"
		//	"xxland %%v0, %%v0, %%v0" ::"r"(-1));
		//vsx = 1;
	}
	signal(SIGILL, handler);
#endif
	return vsx;
}
339
/* Feature tests on the cached CPUID function 1 registers:
 * vec_CPU_CPUIDFeatures[2] holds ECX and vec_CPU_CPUIDFeatures[3] holds EDX.
 * Each macro evaluates to the masked bit (non-zero when the feature is
 * present); the AVX test evaluates to 0 or 1 and additionally requires the
 * OS to save YMM register state. */
#define vec_CPU_have_MMX()   (vec_CPU_CPUIDFeatures[3] & (1 << 23))
#define vec_CPU_have_SSE()   (vec_CPU_CPUIDFeatures[3] & (1 << 25))
#define vec_CPU_have_SSE2()  (vec_CPU_CPUIDFeatures[3] & (1 << 26))
#define vec_CPU_have_SSE3()  (vec_CPU_CPUIDFeatures[2] & (1 << 0))
#define vec_CPU_have_SSE41() (vec_CPU_CPUIDFeatures[2] & (1 << 19))
#define vec_CPU_have_SSE42() (vec_CPU_CPUIDFeatures[2] & (1 << 20))
#define vec_CPU_have_AVX()   (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDFeatures[2] & (1 << 28)))
347
348 static inline int vec_CPU_have_AVX2(void)
349 {
350 if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
351 int a, b, c, d;
352 VEC_CPU_CPUID(7, a, b, c, d);
353 return b & 0x00000020;
354 (void)a, (void)c, (void)d;
355 }
356 return 0;
357 }
358
359 static inline int vec_CPU_have_AVX512F(void)
360 {
361 if (vec_CPU_OSSavesYMM && (vec_CPU_CPUIDMaxFunction >= 7)) {
362 int a, b, c, d;
363 VEC_CPU_CPUID(7, a, b, c, d);
364 return b & 0x00000020;
365 (void)a, (void)c, (void)d;
366 }
367 return 0;
368 }
369
#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL)
/**
 * Look up the AT_HWCAP entry in /proc/self/auxv and test its NEON bit.
 *
 * @return 1 if the entry was found and has HWCAP_NEON set, 0 otherwise
 *         (including when the file cannot be opened).
 */
static int readProcAuxvForNeon(void)
{
	Elf32_auxv_t entry;
	int found_neon = 0;
	const int auxv_fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);

	if (auxv_fd < 0) {
		return 0;
	}

	/* Walk the vector one complete record at a time; stop on a short
	 * read or as soon as the hardware capability word shows up. */
	for (;;) {
		if (read(auxv_fd, &entry, sizeof(entry)) != sizeof(entry)) {
			break;
		}
		if (entry.a_type == AT_HWCAP) {
			found_neon = (entry.a_un.a_val & HWCAP_NEON) == HWCAP_NEON;
			break;
		}
	}

	close(auxv_fd);
	return found_neon;
}
#endif
390
/**
 * Detect ARM NEON support.
 *
 * Exactly one branch of the preprocessor chain below is compiled; it
 * either returns a constant known for the platform or asks the operating
 * system.
 *
 * @return non-zero if NEON is available, 0 otherwise (always 0 on
 *         non-ARM targets).
 */
static int vec_CPU_have_NEON(void)
{
/* The way you detect NEON is a privileged instruction on ARM, so you have
   to query the OS kernel in a platform-specific way. :/ */
#if defined(SDL_CPUINFO_DISABLED)
	return 0; /* disabled */
#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64))
/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */
/* Seems to have been removed */
# ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
#  define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
# endif
	/* All WinRT ARM devices are required to support NEON, but just in case. */
	return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0;
#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__) || defined(_M_ARM64)
	/* ARMv8 always has non-optional NEON support. _M_ARM64 covers 64-bit
	   ARM builds with Visual Studio that did not take the branch above;
	   without it they would end up in the "not an ARM CPU" case. */
	return 1;
#elif defined(__VITA__)
	return 1;
#elif defined(__3DS__)
	return 0;
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7)
	/* (note that sysctlbyname("hw.optional.neon") doesn't work!) */
	return 1; /* all Apple ARMv7 chips and later have NEON. */
#elif defined(__APPLE__)
	return 0; /* assume anything else from Apple doesn't have NEON. */
#elif !defined(__arm__)
	return 0; /* not an ARM CPU at all. */
#elif defined(__OpenBSD__)
	return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */
#elif defined(HAVE_ELF_AUX_INFO)
	unsigned long hasneon = 0;
	if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0)
		return 0;

	return ((hasneon & HWCAP_NEON) == HWCAP_NEON);
#elif defined(__QNXNTO__)
	return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON;
#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL)
	return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON;
#elif defined(__linux__)
	return readProcAuxvForNeon();
#elif defined(__ANDROID__)
	/* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */
	{
		AndroidCpuFamily cpu_family = android_getCpuFamily();
		if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
			uint64_t cpu_features = android_getCpuFeatures();
			if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) {
				return 1;
			}
		}
		return 0;
	}
#elif defined(__RISCOS__)
	/* Use the VFPSupport_Features SWI to access the MVFR registers */
	{
		_kernel_swi_regs regs;
		regs.r[0] = 0;
		if (_kernel_swi(VFPSupport_Features, &regs, &regs) == NULL) {
			if ((regs.r[2] & 0xFFF000) == 0x111000) {
				return 1;
			}
		}
		return 0;
	}
#else
#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me.
	return 0;
#endif
}
461
462 #define VEC_CPU_FEATURES_RESET VEC_UINT32_C(0xFFFFFFFF)
463
464 static vec_uint32 vec_CPU_features = VEC_CPU_FEATURES_RESET;
465
466 vec_uint32 vec_get_CPU_features(void)
467 {
468 if (vec_CPU_features == VEC_CPU_FEATURES_RESET) {
469 vec_CPU_get_CPUID_features();
470 vec_CPU_features = 0;
471 if (vec_CPU_have_ALTIVEC())
472 vec_CPU_features |= VEC_CPU_HAS_ALTIVEC;
473 if (vec_CPU_have_ALTIVEC_VSX())
474 vec_CPU_features |= VEC_CPU_HAS_ALTIVEC_VSX;
475 if (vec_CPU_have_MMX())
476 vec_CPU_features |= VEC_CPU_HAS_MMX;
477 if (vec_CPU_have_SSE())
478 vec_CPU_features |= VEC_CPU_HAS_SSE;
479 if (vec_CPU_have_SSE2())
480 vec_CPU_features |= VEC_CPU_HAS_SSE2;
481 if (vec_CPU_have_SSE3())
482 vec_CPU_features |= VEC_CPU_HAS_SSE3;
483 if (vec_CPU_have_SSE41())
484 vec_CPU_features |= VEC_CPU_HAS_SSE41;
485 if (vec_CPU_have_SSE42())
486 vec_CPU_features |= VEC_CPU_HAS_SSE42;
487 if (vec_CPU_have_AVX())
488 vec_CPU_features |= VEC_CPU_HAS_AVX;
489 if (vec_CPU_have_AVX2())
490 vec_CPU_features |= VEC_CPU_HAS_AVX2;
491 if (vec_CPU_have_AVX512F())
492 vec_CPU_features |= VEC_CPU_HAS_AVX512F;
493 if (vec_CPU_have_NEON())
494 vec_CPU_features |= VEC_CPU_HAS_NEON;
495 }
496 return vec_CPU_features;
497 }