changeset 22:fbcd3fa6f8fc
chore: merge diverging branches
| author | Paper <paper@tflc.us> |
|---|---|
| date | Sat, 23 Nov 2024 04:09:44 +0000 |
| parents | 697b9ba1c1de (current diff) 4de858e14464 (diff) |
| children | e26874655738 |
| files | |
| diffstat | 3 files changed, 73 insertions(+), 23 deletions(-) |
```diff
--- a/include/vec/impl/x86/sse2.h	Thu Nov 21 21:55:20 2024 +0000
+++ b/include/vec/impl/x86/sse2.h	Sat Nov 23 04:09:44 2024 +0000
@@ -104,9 +104,8 @@
 #define VEC_SSE2_MUL_16x8(sign) \
 	do { \
 		/* we have a real instruction for this */ \
-		v##sign##int16x8 vec; \
-		vec.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \
-		return vec; \
+		vec1.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \
+		return vec1; \
 	} while (0)
 
 #define VEC_SSE2_MUL_32x4(sign) \
@@ -119,9 +118,8 @@
 		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
 		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
 \
-		v##sign##int32x4 vec; \
-		vec.sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
-		return vec; \
+		vec1.sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
+		return vec1; \
 	} while (0)
 
 #define VEC_SSE2_MUL_64x2(sign) \
@@ -134,9 +132,45 @@
 		__m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \
 		hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \
 \
-		v##sign##int64x2 vec; \
-		vec.sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \
-		return vec; \
+		vec1.sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \
+		return vec1; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_8x16(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi8(vec1.sse, vec2.sse); \
+		return vec1; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_16x8(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi16(vec1.sse, vec2.sse); \
+		return vec1; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_32x4(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \
+		return vec1; \
+	} while (0)
+
+// SSE2 doesn't have an intrinsic for 64x2 equality comparison,
+// so how can we take a 32x4 comparison result and turn it into
+// a 64x2 comparison result?
+//
+// well, Intel conveniently provided an operation where we can
+// shuffle around 32-bit integers (_mm_shuffle_epi32).
+//
+// this means all we have to do is simply do the 32-bit operation,
+// shuffle the parts, and then return a bitwise AND of the result.
+
+#define VEC_SSE2_CMPEQ_64x2(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \
+		vec2.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(1, 1, 3, 3)); \
+		vec1.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(0, 0, 2, 2)); \
+		vec1.sse = _mm_and_si128(vec1.sse, vec2.sse); \
+		return vec1; \
 	} while (0)
 
 #define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
@@ -219,6 +253,11 @@
 		VEC_SSE2_RSHIFT_##bits##x##size(sign, l); \
 	} \
 \
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_CMPEQ_##bits##x##size(sign); \
+	} \
+\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \
 		/* .splat = */ NULL, \
 		v##sign##int##bits##x##size##_sse2_load_aligned, \
@@ -237,6 +276,11 @@
 		v##sign##int##bits##x##size##_sse2_lshift, \
 		v##sign##int##bits##x##size##_sse2_rshift, \
 		v##sign##int##bits##x##size##_sse2_lrshift, \
+		/* .cmplt = */ NULL, \
+		/* .cmple = */ NULL, \
+		v##sign##int##bits##x##size##_sse2_cmpeq, \
+		/* .cmpge = */ NULL, \
+		/* .cmpgt = */ NULL, \
 	};
 
 #define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \
```
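The shuffle-and-AND trick in `VEC_SSE2_CMPEQ_64x2` is easiest to see outside the macro machinery. Below is a minimal, self-contained sketch of the same idea: compare as four 32-bit lanes, then AND each lane with the other half of its 64-bit element, so an element only stays all-ones when both of its halves matched. It uses a single within-pair shuffle (`_MM_SHUFFLE(2, 3, 0, 1)`) rather than the two shuffles in the macro above, and the helper name `cmpeq_epi64_sse2` is invented for illustration; it is not part of vec.

```c
#include <emmintrin.h> /* SSE2 intrinsics */
#include <inttypes.h>
#include <stdio.h>

/* Hypothetical standalone helper (not vec's API): 64x2 equality on SSE2. */
static __m128i cmpeq_epi64_sse2(__m128i a, __m128i b)
{
	/* compare as four 32-bit lanes: each lane becomes all-ones or all-zeros */
	__m128i cmp32 = _mm_cmpeq_epi32(a, b);
	/* swap the two 32-bit halves within each 64-bit element */
	__m128i swapped = _mm_shuffle_epi32(cmp32, _MM_SHUFFLE(2, 3, 0, 1));
	/* a 64-bit element is equal only if BOTH of its 32-bit halves matched */
	return _mm_and_si128(cmp32, swapped);
}

int main(void)
{
	int64_t a[2] = { 42, -1 }, b[2] = { 42, 7 };
	int64_t r[2];

	_mm_storeu_si128((__m128i *)r,
	                 cmpeq_epi64_sse2(_mm_loadu_si128((const __m128i *)a),
	                                  _mm_loadu_si128((const __m128i *)b)));

	/* expected: ffffffffffffffff 0 */
	printf("%" PRIx64 " %" PRIx64 "\n", (uint64_t)r[0], (uint64_t)r[1]);
	return 0;
}
```

On SSE 4.1 and later, `_mm_cmpeq_epi64` (PCMPEQQ) does this in a single instruction, which is presumably why the workaround lives only in the SSE2 backend.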
```diff
--- a/include/vec/impl/x86/sse41.h	Thu Nov 21 21:55:20 2024 +0000
+++ b/include/vec/impl/x86/sse41.h	Sat Nov 23 04:09:44 2024 +0000
@@ -25,6 +25,7 @@
 #ifndef VEC_IMPL_X86_SSE41_H_
 #define VEC_IMPL_X86_SSE41_H_
 
+// SSE 4.1 provides a real _mm_mullo_epi32
 #define VEC_SSE41_DEFINE_OPERATIONS(sign) \
 	static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \
 	{ \
@@ -35,22 +36,27 @@
 \
 	static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \
 		/* .splat = */ NULL, \
-		v##sign##int32x4##_sse2_load_aligned, \
-		v##sign##int32x4##_sse2_load, \
-		v##sign##int32x4##_sse2_store_aligned, \
-		v##sign##int32x4##_sse2_store, \
-		v##sign##int32x4##_sse2_add, \
-		v##sign##int32x4##_sse2_sub, \
-		v##sign##int32x4##_sse41_mul, \
+		v##sign##int32x4_sse2_load_aligned, \
+		v##sign##int32x4_sse2_load, \
+		v##sign##int32x4_sse2_store_aligned, \
+		v##sign##int32x4_sse2_store, \
+		v##sign##int32x4_sse2_add, \
+		v##sign##int32x4_sse2_sub, \
+		v##sign##int32x4_sse41_mul, \
 		/* .div = */ NULL, \
 		/* .avg = */ NULL, \
-		v##sign##int32x4##_sse2_and, \
-		v##sign##int32x4##_sse2_or, \
-		v##sign##int32x4##_sse2_xor, \
+		v##sign##int32x4_sse2_and, \
+		v##sign##int32x4_sse2_or, \
+		v##sign##int32x4_sse2_xor, \
 		/* .not = */ NULL, \
-		v##sign##int32x4##_sse2_lshift, \
-		v##sign##int32x4##_sse2_rshift, \
-		v##sign##int32x4##_sse2_lrshift, \
+		v##sign##int32x4_sse2_lshift, \
+		v##sign##int32x4_sse2_rshift, \
+		v##sign##int32x4_sse2_lrshift, \
+		/* .cmplt = */ NULL, \
+		/* .cmple = */ NULL, \
+		v##sign##int32x4_sse2_cmpeq, \
+		/* .cmpge = */ NULL, \
+		/* .cmpgt = */ NULL, \
 	};
 
 VEC_SSE41_DEFINE_OPERATIONS()
```
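For contrast with the SSE2 backend, here is a rough standalone sketch of what the two 32x4 multiply paths boil down to: SSE 4.1 does the whole job with `_mm_mullo_epi32`, while SSE2 has to multiply the even and odd lanes separately with `_mm_mul_epu32` and interleave the low halves back together. The helper names are invented for illustration, and this sketch recombines the partial products with `_mm_unpacklo_epi64`; the macros in sse2.h above remain the authoritative version.

```c
#include <smmintrin.h> /* SSE4.1 (pulls in SSE2) */
#include <stdint.h>
#include <stdio.h>

/* SSE4.1: a real lane-wise 32-bit low multiply exists. */
static __m128i mullo32_sse41(__m128i a, __m128i b)
{
	return _mm_mullo_epi32(a, b);
}

/* SSE2 fallback: _mm_mul_epu32 only multiplies lanes 0 and 2 into 64-bit
 * products, so do the odd lanes separately and stitch the low 32 bits back. */
static __m128i mullo32_sse2(__m128i a, __m128i b)
{
	__m128i prod02 = _mm_mul_epu32(a, b);                  /* (a2*b2, a0*b0) as 64-bit */
	__m128i prod13 = _mm_mul_epu32(_mm_srli_si128(a, 4),
	                               _mm_srli_si128(b, 4));  /* (a3*b3, a1*b1) as 64-bit */
	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);   /* (-,-,a1*b1,a0*b0) */
	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);   /* (-,-,a3*b3,a2*b2) */
	return _mm_unpacklo_epi64(prod01, prod23);             /* (a3*b3,a2*b2,a1*b1,a0*b0) */
}

int main(void)
{
	int32_t a[4] = { 1, -2, 3, 100000 }, b[4] = { 7, 7, -7, 100000 };
	int32_t r1[4], r2[4];
	__m128i va = _mm_loadu_si128((const __m128i *)a);
	__m128i vb = _mm_loadu_si128((const __m128i *)b);

	_mm_storeu_si128((__m128i *)r1, mullo32_sse41(va, vb));
	_mm_storeu_si128((__m128i *)r2, mullo32_sse2(va, vb));

	for (int i = 0; i < 4; i++)
		printf("%d %d\n", r1[i], r2[i]); /* both columns should match */
	return 0;
}
```

Build with SSE 4.1 enabled (e.g. `-msse4.1` on gcc/clang); the two columns of output agree because only the low 32 bits of each product are kept either way.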
```diff
--- a/include/vec/vec.h	Thu Nov 21 21:55:20 2024 +0000
+++ b/include/vec/vec.h	Sat Nov 23 04:09:44 2024 +0000
@@ -1146,7 +1146,7 @@
 		vint32x2_impl_cpu = &vint32x2_impl_neon;
 		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
 
-		// 64-bit
+		// 128-bit
 		vint8x16_impl_cpu = &vint8x16_impl_neon;
 		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
 		vint16x8_impl_cpu = &vint16x8_impl_neon;
```
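For context, the `*_impl_cpu` assignments in vec.h and the `*_impl_sse2` / `*_impl_sse41` tables in the two headers are the same mechanism seen from both ends: each vector type has a table of function pointers, NULL slots presumably mean "no accelerated version for this backend", and a table is picked at runtime from CPU feature detection. A heavily simplified, hypothetical sketch of that shape (names and struct layout invented here, not vec's actual definitions):

```c
#include <emmintrin.h> /* SSE2 */
#include <stddef.h>

/* Hypothetical, stripped-down version of the dispatch pattern:
 * one table of function pointers per vector type. */
typedef struct {
	__m128i sse; /* stand-in for the real storage */
} u32x4;

typedef struct {
	u32x4 (*add)(u32x4 vec1, u32x4 vec2);
	u32x4 (*mul)(u32x4 vec1, u32x4 vec2);   /* NULL => no accelerated version */
	u32x4 (*cmpeq)(u32x4 vec1, u32x4 vec2);
} u32x4_impl;

static u32x4 u32x4_sse2_add(u32x4 vec1, u32x4 vec2)
{
	vec1.sse = _mm_add_epi32(vec1.sse, vec2.sse);
	return vec1;
}

static u32x4 u32x4_sse2_cmpeq(u32x4 vec1, u32x4 vec2)
{
	vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse);
	return vec1;
}

static const u32x4_impl u32x4_impl_sse2 = {
	u32x4_sse2_add,
	NULL,             /* a single-instruction 32-bit low mul needs SSE 4.1 */
	u32x4_sse2_cmpeq,
};

/* Chosen once at startup, e.g. after cpuid-based feature detection. */
static const u32x4_impl *u32x4_impl_cpu = &u32x4_impl_sse2;
```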