Mercurial > vec
changeset 18:cf04071d2148
impl: initial NEON support; test: verify bit shifting functions
author | Paper <paper@tflc.us> |
---|---|
date | Wed, 20 Nov 2024 14:33:19 -0500 |
parents | 41dd962abdd1 |
children | 4de858e14464 627d548b23c8 |
files | include/vec/impl/arm/neon.h include/vec/impl/cpu.h include/vec/impl/ppc/altivec.h include/vec/vec.h test/Makefile test/Makefile.template test/test.c test/test_shift.h test/vec.pc |
diffstat | 9 files changed, 770 insertions(+), 267 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/arm/neon.h Wed Nov 20 14:33:19 2024 -0500 @@ -0,0 +1,489 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +#ifndef VEC_IMPL_ARM_NEON_H_ +#define VEC_IMPL_ARM_NEON_H_ + +#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vld1_##sign##bits(in); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + vstore_lane_##bits(sign, vec.neon, out); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_add, \ + v##sign##int##bits##x##size##_neon_sub, \ + v##sign##int##bits##x##size##_neon_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_neon_and, \ + v##sign##int##bits##x##size##_neon_or, \ + v##sign##int##bits##x##size##_neon_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_neon_lshift, \ + /* .rshift = */ NULL, \ + /* .lrshift = */ NULL, \ + }; + +#define VEC_DEFINE_OPERATIONS(bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +// Ok, we'll start out with the 64-bit types. + +#define vadd_8 vadd_s8 +#define vadd_16 vadd_s16 +#define vadd_32 vadd_s32 +#define vsub_8 vsub_s8 +#define vsub_16 vsub_s16 +#define vsub_32 vsub_s32 +#define vmul_8 vmul_s8 +#define vmul_16 vmul_s16 +#define vmul_32 vmul_s32 +#define vshl_8 vshl_s8 +#define vshl_16 vshl_s16 +#define vshl_32 vshl_s32 +#define veor_8 veor_s8 +#define veor_16 veor_s16 +#define veor_32 veor_s32 +#define vorr_8 vorr_s8 +#define vorr_16 vorr_s16 +#define vorr_32 vorr_s32 +#define vand_8 vand_s8 +#define vand_16 vand_s16 +#define vand_32 vand_s32 +#define vld1_8 vld1_s8 +#define vld1_16 vld1_s16 +#define vld1_32 vld1_s32 +#define vget_lane_8 vget_lane_s8 +#define vget_lane_16 vget_lane_s16 +#define vget_lane_32 vget_lane_s32 +#define vstore_lane_8(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##8(vec, 0); \ + out[1] = vget_lane_##sign##8(vec, 1); \ + out[2] = vget_lane_##sign##8(vec, 2); \ + out[3] = vget_lane_##sign##8(vec, 3); \ + out[4] = vget_lane_##sign##8(vec, 4); \ + out[5] = vget_lane_##sign##8(vec, 5); \ + out[6] = vget_lane_##sign##8(vec, 6); \ + out[7] = vget_lane_##sign##8(vec, 7); \ + } while (0) +#define vstore_lane_16(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##16(vec, 0); \ + out[1] = vget_lane_##sign##16(vec, 1); \ + out[2] = vget_lane_##sign##16(vec, 2); \ + out[3] = vget_lane_##sign##16(vec, 3); \ + } while (0) +#define vstore_lane_32(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##32(vec, 0); \ + out[1] = vget_lane_##sign##32(vec, 1); \ + } while (0) +#define vreinterpret_8_u8(x) vreinterpret_s8_u8(x) +#define vreinterpret_16_u16(x) vreinterpret_s16_u16(x) +#define vreinterpret_32_u32(x) vreinterpret_s32_u32(x) + +VEC_DEFINE_OPERATIONS(8, 8) +VEC_DEFINE_OPERATIONS(16, 4) +VEC_DEFINE_OPERATIONS(32, 2) + +#undef vadd_8 +#undef vadd_16 +#undef vadd_32 +#undef vsub_8 +#undef vsub_16 +#undef vsub_32 +#undef vmul_8 +#undef vmul_16 +#undef vmul_32 +#undef vshl_8 +#undef vshl_16 +#undef vshl_32 +#undef veor_8 +#undef veor_16 +#undef veor_32 +#undef vorr_8 +#undef vorr_16 +#undef vorr_32 +#undef vand_8 +#undef vand_16 +#undef vand_32 +#undef vld1_8 +#undef vld1_16 +#undef vld1_32 +#undef vget_lane_8 +#undef vget_lane_16 +#undef vget_lane_32 +#undef vstore_lane_8 +#undef vstore_lane_16 +#undef vstore_lane_32 +#undef vreinterpret_8_u8 +#undef vreinterpret_16_u16 +#undef vreinterpret_32_u32 + +/////////////////////////////////////////////////////////////////////////////// +// 128-bit + +// Now we can go ahead and do the 128-bit ones. + +// NEON doesn't have native 64-bit multiplication, so we have +// to do it ourselves +static inline int64x2_t vmulq_s64(const int64x2_t a, const int64x2_t b) +{ + const uint32x2_t ac = vreinterpret_u32_s32(vmovn_s64(a)); + const uint32x2_t pr = vreinterpret_u32_s32(vmovn_s64(b)); + + const int32x4_t hi = vmulq_s32(vreinterpretq_s32_s64(b), vreinterpretq_s32_s64(a)); + + return vreinterpretq_s64_u64(vmlal_u32(vreinterpretq_u64_s64(vshlq_n_s64(vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s32(hi))), 32)), ac, pr)); +} + +static inline uint64x2_t vmulq_u64(const uint64x2_t a, const uint64x2_t b) +{ + const uint32x2_t ac = vmovn_u64(a); + const uint32x2_t pr = vmovn_u64(b); + + const uint32x4_t hi = vmulq_u32(vreinterpretq_u32_u64(b), vreinterpretq_u32_u64(a)); + + return vmlal_u32(vshlq_n_u64(vpaddlq_u32(hi), 32), ac, pr); +} + +#define vadd_8 vaddq_s8 +#define vadd_16 vaddq_s16 +#define vadd_32 vaddq_s32 +#define vadd_64 vaddq_s64 +#define vadd_u8 vaddq_u8 +#define vadd_u16 vaddq_u16 +#define vadd_u32 vaddq_u32 +#define vadd_u64 vaddq_u64 +#define vsub_8 vsubq_s8 +#define vsub_16 vsubq_s16 +#define vsub_32 vsubq_s32 +#define vsub_64 vsubq_s64 +#define vsub_u8 vsubq_u8 +#define vsub_u16 vsubq_u16 +#define vsub_u32 vsubq_u32 +#define vsub_u64 vsubq_u64 +#define vmul_8 vmulq_s8 +#define vmul_16 vmulq_s16 +#define vmul_32 vmulq_s32 +#define vmul_64 vmulq_s64 +#define vmul_u8 vmulq_u8 +#define vmul_u16 vmulq_u16 +#define vmul_u32 vmulq_u32 +#define vmul_u64 vmulq_u64 +#define vshl_8 vshlq_s8 +#define vshl_16 vshlq_s16 +#define vshl_32 vshlq_s32 +#define vshl_64 vshlq_s64 +#define vshl_u8 vshlq_u8 +#define vshl_u16 vshlq_u16 +#define vshl_u32 vshlq_u32 +#define vshl_u64 vshlq_u64 +#define veor_8 veorq_s8 +#define veor_16 veorq_s16 +#define veor_32 veorq_s32 +#define veor_64 veorq_s64 +#define veor_u8 veorq_u8 +#define veor_u16 veorq_u16 +#define veor_u32 veorq_u32 +#define veor_u64 veorq_u64 +#define vorr_8 vorrq_s8 +#define vorr_16 vorrq_s16 +#define vorr_32 vorrq_s32 +#define vorr_64 vorrq_s64 +#define vorr_u8 vorrq_u8 +#define vorr_u16 vorrq_u16 +#define vorr_u32 vorrq_u32 +#define vorr_u64 vorrq_u64 +#define vand_8 vandq_s8 +#define vand_16 vandq_s16 +#define vand_32 vandq_s32 +#define vand_64 vandq_s64 +#define vand_u8 vandq_u8 +#define vand_u16 vandq_u16 +#define vand_u32 vandq_u32 +#define vand_u64 vandq_u64 +#define vld1_8 vld1q_s8 +#define vld1_16 vld1q_s16 +#define vld1_32 vld1q_s32 +#define vld1_64 vld1q_s64 +#define vld1_u8 vld1q_u8 +#define vld1_u16 vld1q_u16 +#define vld1_u32 vld1q_u32 +#define vld1_u64 vld1q_u64 +#define vget_lane_8 vgetq_lane_s8 +#define vget_lane_16 vgetq_lane_s16 +#define vget_lane_32 vgetq_lane_s32 +#define vget_lane_64 vgetq_lane_s64 +#define vget_lane_u8 vgetq_lane_u8 +#define vget_lane_u16 vgetq_lane_u16 +#define vget_lane_u32 vgetq_lane_u32 +#define vget_lane_u64 vgetq_lane_u64 +#define vstore_lane_8(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##8(vec, 0); \ + out[1] = vget_lane_##sign##8(vec, 1); \ + out[2] = vget_lane_##sign##8(vec, 2); \ + out[3] = vget_lane_##sign##8(vec, 3); \ + out[4] = vget_lane_##sign##8(vec, 4); \ + out[5] = vget_lane_##sign##8(vec, 5); \ + out[6] = vget_lane_##sign##8(vec, 6); \ + out[7] = vget_lane_##sign##8(vec, 7); \ + out[8] = vget_lane_##sign##8(vec, 8); \ + out[9] = vget_lane_##sign##8(vec, 9); \ + out[10] = vget_lane_##sign##8(vec, 10); \ + out[11] = vget_lane_##sign##8(vec, 11); \ + out[12] = vget_lane_##sign##8(vec, 12); \ + out[13] = vget_lane_##sign##8(vec, 13); \ + out[14] = vget_lane_##sign##8(vec, 14); \ + out[15] = vget_lane_##sign##8(vec, 15); \ + } while (0) +#define vstore_lane_16(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##16(vec, 0); \ + out[1] = vget_lane_##sign##16(vec, 1); \ + out[2] = vget_lane_##sign##16(vec, 2); \ + out[3] = vget_lane_##sign##16(vec, 3); \ + out[4] = vget_lane_##sign##16(vec, 4); \ + out[5] = vget_lane_##sign##16(vec, 5); \ + out[6] = vget_lane_##sign##16(vec, 6); \ + out[7] = vget_lane_##sign##16(vec, 7); \ + } while (0) +#define vstore_lane_32(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##32(vec, 0); \ + out[1] = vget_lane_##sign##32(vec, 1); \ + out[2] = vget_lane_##sign##32(vec, 2); \ + out[3] = vget_lane_##sign##32(vec, 3); \ + } while (0) +#define vstore_lane_64(sign, vec, out) \ + do { \ + out[0] = vget_lane_##sign##64(vec, 0); \ + out[1] = vget_lane_##sign##64(vec, 1); \ + } while (0) +#define vreinterpret_8_u8(x) vreinterpretq_s8_u8(x) +#define vreinterpret_16_u16(x) vreinterpretq_s16_u16(x) +#define vreinterpret_32_u32(x) vreinterpretq_s32_u32(x) +#define vreinterpret_64_u64(x) vreinterpretq_s64_u64(x) + +#define VEC_DEFINE_OPERATIONS_SIGN(sign, csign, bits, size) \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_load_aligned(const vec_##sign##int##bits in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vld1_##sign##bits(in); \ + return vec; \ + } \ + \ + static void v##sign##int##bits##x##size##_neon_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \ + { \ + vstore_lane_##bits(sign, vec.neon, out); \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vadd_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vsub_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vmul_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_lshift(v##sign##int##bits##x##size vec1, vuint##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vshl_##sign##bits(vec1.neon, vreinterpret_##bits##_u##bits(vec2.neon)); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vand_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = vorr_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size v##sign##int##bits##x##size##_neon_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + vec.neon = veor_##sign##bits(vec1.neon, vec2.neon); \ + return vec; \ + } \ + \ + static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_neon = { \ + /* .splat = */ NULL, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_load_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_store_aligned, \ + v##sign##int##bits##x##size##_neon_add, \ + v##sign##int##bits##x##size##_neon_sub, \ + v##sign##int##bits##x##size##_neon_mul, \ + /* .div = */ NULL, \ + /* .avg = */ NULL, \ + v##sign##int##bits##x##size##_neon_and, \ + v##sign##int##bits##x##size##_neon_or, \ + v##sign##int##bits##x##size##_neon_xor, \ + /* .not = */ NULL, \ + v##sign##int##bits##x##size##_neon_lshift, \ + /* .rshift = */ NULL, \ + /* .lrshift = */ NULL, \ + }; + +#define VEC_DEFINE_OPERATIONS(bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN( , , bits, size) \ + VEC_DEFINE_OPERATIONS_SIGN(u, U, bits, size) + +VEC_DEFINE_OPERATIONS(8, 16) +VEC_DEFINE_OPERATIONS(16, 8) +VEC_DEFINE_OPERATIONS(32, 4) +VEC_DEFINE_OPERATIONS(64, 2) + +#undef vadd_8 +#undef vadd_16 +#undef vadd_32 +#undef vadd_64 +#undef vsub_8 +#undef vsub_16 +#undef vsub_32 +#undef vsub_64 +#undef vmul_8 +#undef vmul_16 +#undef vmul_32 +#undef vmul_64 +#undef vshl_8 +#undef vshl_16 +#undef vshl_32 +#undef vshl_64 +#undef veor_8 +#undef veor_16 +#undef veor_32 +#undef veor_64 +#undef vorr_8 +#undef vorr_16 +#undef vorr_32 +#undef vorr_64 +#undef vand_8 +#undef vand_16 +#undef vand_32 +#undef vand_64 +#undef vld1_8 +#undef vld1_16 +#undef vld1_32 +#undef vld1_64 +#undef vget_lane_8 +#undef vget_lane_16 +#undef vget_lane_32 +#undef vget_lane_64 +#undef vstore_lane_8 +#undef vstore_lane_16 +#undef vstore_lane_32 +#undef vstore_lane_64 + +#undef VEC_DEFINE_OPERATIONS +#undef VEC_DEFINE_OPERATIONS_SIGN + +#endif /* VEC_IMPL_ARM_NEON_H_ */
--- a/include/vec/impl/cpu.h Wed Nov 20 12:02:15 2024 -0500 +++ b/include/vec/impl/cpu.h Wed Nov 20 14:33:19 2024 -0500 @@ -47,23 +47,42 @@ * 3. This notice may not be removed or altered from any source distribution. */ -# if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) -# include <sys/sysctl.h> // For AltiVec check -# elif defined(__OpenBSD__) && defined(__powerpc__) -# include <sys/types.h> -# include <sys/sysctl.h> // For AltiVec check -# include <machine/cpu.h> -# elif defined(__FreeBSD__) && defined(__powerpc__) -# include <machine/cpu.h> -# include <sys/auxv.h> -# elif defined(__ALTIVEC__) -# include <signal.h> -# include <setjmp.h> +#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) +# include <sys/sysctl.h> // For AltiVec check +#elif defined(__OpenBSD__) && defined(__powerpc__) +# include <sys/types.h> +# include <sys/sysctl.h> // For AltiVec check +# include <machine/cpu.h> +#elif defined(__FreeBSD__) && defined(__powerpc__) +# include <machine/cpu.h> +# include <sys/auxv.h> +#elif defined(__ALTIVEC__) +# include <signal.h> +# include <setjmp.h> +#endif + +#ifdef __FreeBSD__ +# include <sys/param.h> +#endif + +#if (defined(__linux__) || defined(__ANDROID__)) && defined(__arm__) +# include <unistd.h> +# include <sys/types.h> +# include <sys/stat.h> +# include <fcntl.h> +# include <elf.h> + +/*#include <asm/hwcap.h>*/ +# ifndef AT_HWCAP +# define AT_HWCAP 16 # endif - -# ifdef __FreeBSD__ -# include <sys/param.h> +# ifndef AT_PLATFORM +# define AT_PLATFORM 15 # endif +# ifndef HWCAP_NEON +# define HWCAP_NEON (1 << 12) +# endif +#endif static inline int vec_CPU_have_CPUID(void) { @@ -348,6 +367,98 @@ return 0; } +#if defined(__linux__) && defined(__arm__) && !defined(HAVE_GETAUXVAL) +static int readProcAuxvForNeon(void) +{ + int neon = 0; + int fd; + + fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); + if (fd >= 0) { + Elf32_auxv_t aux; + while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) { + if (aux.a_type == AT_HWCAP) { + neon = (aux.a_un.a_val & HWCAP_NEON) == HWCAP_NEON; + break; + } + } + close(fd); + } + return neon; +} +#endif + +static int vec_CPU_have_NEON(void) +{ +/* The way you detect NEON is a privileged instruction on ARM, so you have + query the OS kernel in a platform-specific way. :/ */ +#if defined(SDL_CPUINFO_DISABLED) + return 0; /* disabled */ +#elif (defined(__WINDOWS__) || defined(__WINRT__) || defined(__GDK__)) && (defined(_M_ARM) || defined(_M_ARM64)) +/* Visual Studio, for ARM, doesn't define __ARM_ARCH. Handle this first. */ +/* Seems to have been removed */ +#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE +#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19 +#endif + /* All WinRT ARM devices are required to support NEON, but just in case. */ + return IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE) != 0; +#elif (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || defined(__aarch64__) + return 1; /* ARMv8 always has non-optional NEON support. */ +#elif defined(__VITA__) + return 1; +#elif defined(__3DS__) + return 0; +#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) + /* (note that sysctlbyname("hw.optional.neon") doesn't work!) */ + return 1; /* all Apple ARMv7 chips and later have NEON. */ +#elif defined(__APPLE__) + return 0; /* assume anything else from Apple doesn't have NEON. */ +#elif !defined(__arm__) + return 0; /* not an ARM CPU at all. */ +#elif defined(__OpenBSD__) + return 1; /* OpenBSD only supports ARMv7 CPUs that have NEON. */ +#elif defined(HAVE_ELF_AUX_INFO) + unsigned long hasneon = 0; + if (elf_aux_info(AT_HWCAP, (void *)&hasneon, (int)sizeof(hasneon)) != 0) + return 0; + + return ((hasneon & HWCAP_NEON) == HWCAP_NEON); +#elif defined(__QNXNTO__) + return SYSPAGE_ENTRY(cpuinfo)->flags & ARM_CPU_FLAG_NEON; +#elif (defined(__linux__) || defined(__ANDROID__)) && defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_NEON) == HWCAP_NEON; +#elif defined(__linux__) + return readProcAuxvForNeon(); +#elif defined(__ANDROID__) + /* Use NDK cpufeatures to read either /proc/self/auxv or /proc/cpuinfo */ + { + AndroidCpuFamily cpu_family = android_getCpuFamily(); + if (cpu_family == ANDROID_CPU_FAMILY_ARM) { + uint64_t cpu_features = android_getCpuFeatures(); + if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) { + return 1; + } + } + return 0; + } +#elif defined(__RISCOS__) + /* Use the VFPSupport_Features SWI to access the MVFR registers */ + { + _kernel_swi_regs regs; + regs.r[0] = 0; + if (_kernel_swi(VFPSupport_Features, ®s, ®s) == NULL) { + if ((regs.r[2] & 0xFFF000) == 0x111000) { + return 1; + } + } + return 0; + } +#else +#warning vec_CPU_have_NEON is not implemented for this ARM platform. Write me. + return 0; +#endif +} + enum { VEC_CPU_HAS_ALTIVEC = (1 << 0), VEC_CPU_HAS_ALTIVEC_VSX = (1 << 1), @@ -360,6 +471,7 @@ VEC_CPU_HAS_AVX = (1 << 8), VEC_CPU_HAS_AVX2 = (1 << 9), VEC_CPU_HAS_AVX512F = (1 << 10), + VEC_CPU_HAS_NEON = (1 << 11), }; #define VEC_CPU_FEATURES_RESET UINT32_C(0xFFFFFFFF) @@ -392,6 +504,8 @@ vec_CPU_features |= VEC_CPU_HAS_AVX2; if (vec_CPU_have_AVX512F()) vec_CPU_features |= VEC_CPU_HAS_AVX512F; + if (vec_CPU_have_NEON()) + vec_CPU_features |= VEC_CPU_HAS_NEON; } #endif /* VEC_IMPL_CPU_H_ */
--- a/include/vec/impl/ppc/altivec.h Wed Nov 20 12:02:15 2024 -0500 +++ b/include/vec/impl/ppc/altivec.h Wed Nov 20 14:33:19 2024 -0500 @@ -27,12 +27,8 @@ #ifndef VEC_IMPL_PPC_ALTIVEC_H_ #define VEC_IMPL_PPC_ALTIVEC_H_ -#include <string.h> - #include <altivec.h> -#define VEC_ALTIVEC_ALIGNMENT 16 - /* GCC 4.2.1 on Mac OS X doesn't have these for some reason */ #ifdef vec_mul # define VEC_ALTIVEC_DEFINE_MUL(sign, csign, bits, size) \
--- a/include/vec/vec.h Wed Nov 20 12:02:15 2024 -0500 +++ b/include/vec/vec.h Wed Nov 20 14:33:19 2024 -0500 @@ -111,6 +111,7 @@ /* --------------------------------------------------------------- */ /* Detect compiler SIMD support */ +#define VEC_NEON_ALIGNMENT 16 #define VEC_ALTIVEC_ALIGNMENT 16 #define VEC_SSE2_ALIGNMENT 16 #define VEC_AVX2_ALIGNMENT 32 @@ -203,6 +204,67 @@ # endif #endif +#ifdef __ARM_NEON +# include <arm_neon.h> +# define VEC_COMPILER_HAS_NEON +# if VINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT8x8_ALIGNMENT +# define VINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT16x4_ALIGNMENT +# define VINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT8x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT8x8_ALIGNMENT +# define VUINT8x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT16x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT16x4_ALIGNMENT +# define VUINT16x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT8x16_ALIGNMENT +# define VINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT16x8_ALIGNMENT +# define VINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT32x4_ALIGNMENT +# define VINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VINT64x2_ALIGNMENT +# define VINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT8x16_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT8x16_ALIGNMENT +# define VUINT8x16_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT16x8_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT16x8_ALIGNMENT +# define VUINT16x8_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT32x4_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT32x4_ALIGNMENT +# define VUINT32x4_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +# if VUINT64x2_ALIGNMENT < VEC_NEON_ALIGNMENT +# undef VUINT64x2_ALIGNMENT +# define VUINT64x2_ALIGNMENT VEC_NEON_ALIGNMENT +# endif +#endif + #ifdef __MMX__ # include <mmintrin.h> # define VEC_COMPILER_HAS_MMX @@ -410,7 +472,6 @@ xx.d = x; - // I have no idea what this does :) xx.u += roffset; xx.u >>= y; xx.u -= roffset >> y; @@ -485,6 +546,9 @@ #ifdef VEC_COMPILER_HAS_MMX __m64 mmx; #endif +#ifdef VEC_COMPILER_HAS_NEON + uint8x8_t neon; +#endif vuint8x4 generic[2]; } vuint8x8; @@ -493,6 +557,9 @@ #ifdef VEC_COMPILER_HAS_MMX __m64 mmx; #endif +#ifdef VEC_COMPILER_HAS_NEON + uint16x4_t neon; +#endif vuint16x2 generic[2]; } vuint16x4; @@ -501,6 +568,9 @@ #ifdef VEC_COMPILER_HAS_MMX __m64 mmx; #endif +#ifdef VEC_COMPILER_HAS_NEON + uint32x2_t neon; +#endif vec_uint32 generic[2]; } vuint32x2; @@ -509,6 +579,9 @@ #ifdef VEC_COMPILER_HAS_MMX __m64 mmx; #endif +#ifdef VEC_COMPILER_HAS_NEON + int8x8_t neon; +#endif vint8x4 generic[2]; } vint8x8; @@ -517,6 +590,9 @@ #ifdef VEC_COMPILER_HAS_MMX __m64 mmx; #endif +#ifdef VEC_COMPILER_HAS_NEON + int16x4_t neon; +#endif vint16x2 generic[2]; } vint16x4; @@ -525,6 +601,9 @@ #ifdef VEC_COMPILER_HAS_MMX __m64 mmx; #endif +#ifdef VEC_COMPILER_HAS_NEON + int32x2_t neon; +#endif vec_int32 generic[2]; } vint32x2; @@ -537,6 +616,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector unsigned char altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + uint8x16_t neon; +#endif vuint8x8 generic[2]; } vuint8x16; @@ -547,6 +629,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector unsigned short altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + uint16x8_t neon; +#endif vuint16x4 generic[2]; } vuint16x8; @@ -557,6 +642,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector unsigned int altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + uint32x4_t neon; +#endif vuint32x2 generic[2]; } vuint32x4; @@ -567,6 +655,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX vector unsigned long long altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + uint64x2_t neon; +#endif vec_uint64 generic[2]; } vuint64x2; @@ -577,6 +668,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector signed char altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + int8x16_t neon; +#endif vint8x8 generic[2]; } vint8x16; @@ -587,6 +681,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector signed short altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + int16x8_t neon; +#endif vint16x4 generic[2]; } vint16x8; @@ -597,6 +694,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC vector signed int altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + int32x4_t neon; +#endif vint32x2 generic[2]; } vint32x4; @@ -607,6 +707,9 @@ #ifdef VEC_COMPILER_HAS_ALTIVEC_VSX vector signed long long altivec; #endif +#ifdef VEC_COMPILER_HAS_NEON + int64x2_t neon; +#endif vec_int64 generic[2]; } vint64x2; @@ -891,6 +994,10 @@ # include "impl/x86/mmx.h" #endif +#ifdef VEC_COMPILER_HAS_NEON +# include "impl/arm/neon.h" +#endif + #include "impl/generic.h" /* ---------------------------------------------------------------- */ @@ -1029,6 +1136,27 @@ vuint32x2_impl_cpu = &vuint32x2_impl_mmx; } #endif +#ifdef VEC_COMPILER_HAS_NEON + if (vec_CPU_have_NEON()) { + // 64-bit + vint8x8_impl_cpu = &vint8x8_impl_neon; + vuint8x8_impl_cpu = &vuint8x8_impl_neon; + vint16x4_impl_cpu = &vint16x4_impl_neon; + vuint16x4_impl_cpu = &vuint16x4_impl_neon; + vint32x2_impl_cpu = &vint32x2_impl_neon; + vuint32x2_impl_cpu = &vuint32x2_impl_neon; + + // 64-bit + vint8x16_impl_cpu = &vint8x16_impl_neon; + vuint8x16_impl_cpu = &vuint8x16_impl_neon; + vint16x8_impl_cpu = &vint16x8_impl_neon; + vuint16x8_impl_cpu = &vuint16x8_impl_neon; + vint32x4_impl_cpu = &vint32x4_impl_neon; + vuint32x4_impl_cpu = &vuint32x4_impl_neon; + vint64x2_impl_cpu = &vint64x2_impl_neon; + vuint64x2_impl_cpu = &vuint64x2_impl_neon; + } +#endif { // do nothing, they're already set to generics }
--- a/test/Makefile Wed Nov 20 12:02:15 2024 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ -# CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 3.25 - -# Default target executed when no arguments are given to make. -default_target: all -.PHONY : default_target - -# Allow only one "make -f Makefile2" at a time, but pass parallelism. -.NOTPARALLEL: - -#============================================================================= -# Special targets provided by cmake. - -# Disable implicit rules so canonical targets will work. -.SUFFIXES: - -# Disable VCS-based implicit rules. -% : %,v - -# Disable VCS-based implicit rules. -% : RCS/% - -# Disable VCS-based implicit rules. -% : RCS/%,v - -# Disable VCS-based implicit rules. -% : SCCS/s.% - -# Disable VCS-based implicit rules. -% : s.% - -.SUFFIXES: .hpux_make_needs_suffix_list - -# Command-line flag to silence nested $(MAKE). -$(VERBOSE)MAKESILENT = -s - -#Suppress display of executed commands. -$(VERBOSE).SILENT: - -# A target that is always out of date. -cmake_force: -.PHONY : cmake_force - -#============================================================================= -# Set environment variables for the build. - -# The shell in which to execute make rules. -SHELL = /bin/sh - -# The CMake executable. -CMAKE_COMMAND = /usr/bin/cmake - -# The command to remove a file. -RM = /usr/bin/cmake -E rm -f - -# Escaping for special characters. -EQUALS = = - -# The top-level source directory on which CMake was run. -CMAKE_SOURCE_DIR = /home/paper/Documents/src/hg/vec - -# The top-level build directory on which CMake was run. -CMAKE_BINARY_DIR = /home/paper/Documents/src/hg/vec/test - -#============================================================================= -# Targets provided globally by CMake. - -# Special rule for the target edit_cache -edit_cache: - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..." - /usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available. -.PHONY : edit_cache - -# Special rule for the target edit_cache -edit_cache/fast: edit_cache -.PHONY : edit_cache/fast - -# Special rule for the target rebuild_cache -rebuild_cache: - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." - /usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) -.PHONY : rebuild_cache - -# Special rule for the target rebuild_cache -rebuild_cache/fast: rebuild_cache -.PHONY : rebuild_cache/fast - -# Special rule for the target list_install_components -list_install_components: - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\"" -.PHONY : list_install_components - -# Special rule for the target list_install_components -list_install_components/fast: list_install_components -.PHONY : list_install_components/fast - -# Special rule for the target install -install: preinstall - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." - /usr/bin/cmake -P cmake_install.cmake -.PHONY : install - -# Special rule for the target install -install/fast: preinstall/fast - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..." - /usr/bin/cmake -P cmake_install.cmake -.PHONY : install/fast - -# Special rule for the target install/local -install/local: preinstall - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." - /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake -.PHONY : install/local - -# Special rule for the target install/local -install/local/fast: preinstall/fast - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..." - /usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake -.PHONY : install/local/fast - -# Special rule for the target install/strip -install/strip: preinstall - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." - /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake -.PHONY : install/strip - -# Special rule for the target install/strip -install/strip/fast: preinstall/fast - @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..." - /usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake -.PHONY : install/strip/fast - -# The main all target -all: cmake_check_build_system - $(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles /home/paper/Documents/src/hg/vec/test//CMakeFiles/progress.marks - $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all - $(CMAKE_COMMAND) -E cmake_progress_start /home/paper/Documents/src/hg/vec/test/CMakeFiles 0 -.PHONY : all - -# The main clean target -clean: - $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean -.PHONY : clean - -# The main clean target -clean/fast: clean -.PHONY : clean/fast - -# Prepare targets for installation. -preinstall: all - $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall -.PHONY : preinstall - -# Prepare targets for installation. -preinstall/fast: - $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall -.PHONY : preinstall/fast - -# clear depends -depend: - $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1 -.PHONY : depend - -#============================================================================= -# Target rules for targets named vec - -# Build rule for target. -vec: cmake_check_build_system - $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 vec -.PHONY : vec - -# fast build rule for target. -vec/fast: - $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/build -.PHONY : vec/fast - -src/vec.o: src/vec.c.o -.PHONY : src/vec.o - -# target to build an object file -src/vec.c.o: - $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.o -.PHONY : src/vec.c.o - -src/vec.i: src/vec.c.i -.PHONY : src/vec.i - -# target to preprocess a source file -src/vec.c.i: - $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.i -.PHONY : src/vec.c.i - -src/vec.s: src/vec.c.s -.PHONY : src/vec.s - -# target to generate assembly for a file -src/vec.c.s: - $(MAKE) $(MAKESILENT) -f CMakeFiles/vec.dir/build.make CMakeFiles/vec.dir/src/vec.c.s -.PHONY : src/vec.c.s - -# Help Target -help: - @echo "The following are some of the valid targets for this Makefile:" - @echo "... all (the default if no target is provided)" - @echo "... clean" - @echo "... depend" - @echo "... edit_cache" - @echo "... install" - @echo "... install/local" - @echo "... install/strip" - @echo "... list_install_components" - @echo "... rebuild_cache" - @echo "... vec" - @echo "... src/vec.o" - @echo "... src/vec.i" - @echo "... src/vec.s" -.PHONY : help - - - -#============================================================================= -# Special targets to cleanup operation of make. - -# Special rule to run CMake to check the build system integrity. -# No rule that depends on this can have commands that come from listfiles -# because they might be regenerated. -cmake_check_build_system: - $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0 -.PHONY : cmake_check_build_system -
--- a/test/Makefile.template Wed Nov 20 12:02:15 2024 -0500 +++ b/test/Makefile.template Wed Nov 20 14:33:19 2024 -0500 @@ -14,7 +14,8 @@ ../include/vec/impl/generic.h \ test_align.h \ test_arith.h \ - test_compare.h + test_compare.h \ + test_shift.h BINS = test-generic test-host test-cxx OBJS = vec-generic.o vec-host.o test.o test-cxx.o @@ -22,10 +23,10 @@ all: $(BINS) -vec-generic.o: ../src/vec.c +vec-generic.o: ../src/vec.c $(HEADERS) $(CC) $(CFLAGS) -DVEC_SUPPRESS_HW=1 -c -o $@ $< -vec-host.o: ../src/vec.c +vec-host.o: ../src/vec.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< test.o: test.c @@ -40,8 +41,8 @@ test-host: vec-host.o test.o $(CC) $(LDFLAGS) -o $@ $^ -test-cxx: test-cxx.o - $(CXX) $(LDFLAGS) -o $@ $^ +test-cxx: test-cxx.o $(HEADERS) + $(CXX) $(LDFLAGS) -o $@ $< clean: $(RM) $(BINS) $(OBJS)
--- a/test/test.c Wed Nov 20 12:02:15 2024 -0500 +++ b/test/test.c Wed Nov 20 14:33:19 2024 -0500 @@ -112,6 +112,7 @@ #include "test_align.h" #include "test_arith.h" #include "test_compare.h" +#include "test_shift.h" // ------------------------------------------------------------ @@ -124,6 +125,7 @@ ret |= test_align(); ret |= test_arith(); ret |= test_compare(); + ret |= test_shift(); return ret; }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_shift.h Wed Nov 20 14:33:19 2024 -0500 @@ -0,0 +1,15 @@ +static int test_shift(void) +{ + int ret = 0; + + ret |= (vec_ulrshift(0xFFFFFFFF, 16) != 0xFFFF); + ret |= (vec_ullshift(0xFFFF, 16) != 0xFFFF0000); + ret |= (vec_lrshift(0xFFFFFFFF, 16) != 0xFFFF); + ret |= (vec_llshift(0xFFFF, 16) != 0xFFFF0000); + ret |= (vec_urshift(0xFFFFFFFF, 16) != 0xFFFF); + ret |= (vec_ulshift(0xFFFF, 16) != 0xFFFF0000); + ret |= (vec_rshift(-0xFFFF, 8) != -0x100); + ret |= (vec_lshift(-0xFFFF, 8) != -0xFFFF00); + + return ret; +}
--- a/test/vec.pc Wed Nov 20 12:02:15 2024 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -prefix=/usr/local -exec_prefix=/usr/local -libdir=${exec_prefix}/lib -includedir=${prefix}/include - -Name: vec -Description: a tiny C99 SIMD vector library -Version: 2.0.0 - -Requires: -Libs: -L${libdir} -lvec -Cflags: -I${includedir}