# HG changeset patch
# User Paper
# Date 1729702285 14400
# Node ID 1f070512497f6656fdd45d25eafd4773da1f5b90
# Parent 75ab77f874e284d7259122e43bab5f7e696af983
impl/sse2: unify multiplication implementations into macros

they were basically the same thing anyway

diff -r 75ab77f874e2 -r 1f070512497f include/vec/impl/sse2.h
--- a/include/vec/impl/sse2.h	Wed Oct 23 10:13:25 2024 -0400
+++ b/include/vec/impl/sse2.h	Wed Oct 23 12:51:25 2024 -0400
@@ -26,6 +26,53 @@
 
 #define VEC_SSE2_ALIGNMENT 16
 
+#define VEC_SSE2_MUL_8x16(sign) \
+	VEC_DECL_MUL(sign, 8, 16) \
+	{ \
+		/* unpack and multiply */ \
+		__m128i dst_even = _mm_mullo_epi16(vec1, vec2); \
+		__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8)); \
+	\
+		/* repack */ \
+		return _mm_or_si128( \
+			_mm_slli_epi16(dst_odd, 8), \
+			_mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
+		); \
+	}
+
+#define VEC_SSE2_MUL_16x8(sign) \
+	VEC_DECL_MUL(sign, 16, 8) \
+	{ \
+		/* we have a real instruction for this */ \
+		return _mm_mullo_epi16(vec1, vec2); \
+	}
+
+#define VEC_SSE2_MUL_32x4(sign) \
+	VEC_DECL_MUL(sign, 32, 4) \
+	{ \
+		/* this was stolen from... somewhere :) */ \
+		__m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); /* (-,a3,-,a1) */ \
+		__m128i b13 = _mm_shuffle_epi32(vec2, 0xF5); /* (-,b3,-,b1) */ \
+		__m128i prod02 = _mm_mul_epu32(vec1, vec2); /* (-,a2*b2,-,a0*b0) */ \
+		__m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \
+		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
+		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
+		return _mm_unpacklo_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
+	}
+
+#define VEC_SSE2_MUL_64x2(sign) \
+	VEC_DECL_MUL(sign, 64, 2) \
+	{ \
+		__m128i ac = _mm_mul_epu32(vec1, vec2); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
+		__m128i b = _mm_srli_epi64(vec1, 32); /* b = vec1 >> 32; */ \
+		__m128i bc = _mm_mul_epu32(b, vec2); /* bc = b * (vec2 & UINT32_MAX); */ \
+		__m128i d = _mm_srli_epi64(vec2, 32); /* d = vec2 >> 32; */ \
+		__m128i ad = _mm_mul_epu32(vec1, d); /* ad = (vec1 & UINT32_MAX) * d; */ \
+		__m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \
+		hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \
+		return _mm_add_epi64(hi, ac); /* return ac + hi; */ \
+	}
+
 #define VEC_DEFINE_OPERATIONS(sign, csign, bits, size) \
 	VEC_DECL_LOAD_ALIGNED(sign, bits, size) \
 	{ \
@@ -72,10 +119,11 @@
 		return _mm_xor_si128(vec1, vec2); \
 	} \
 	\
+	VEC_SSE2_MUL_##bits##x##size(sign) \
+	\
 	VEC_GENERIC_SPLAT(sign, csign, bits, size) \
 	VEC_GENERIC_DIVIDE(sign, csign, bits, size) \
 	VEC_GENERIC_SHIFTS(sign, csign, bits, size) \
-	VEC_DECL_MUL(sign, bits, size); \
 	VEC_GENERIC_AVG(sign, bits, size)
 
 #define VEC_DEFINE_COMPARISONS_SIGNED(bits, size) \
@@ -101,15 +149,6 @@
 # define VUINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(u, U, 8, 16)
 VEC_GENERIC_COMPARISONS(u, U, 8, 16)
-VEC_DECL_MUL(u, 8, 16)
-{
-	// unpack and multiply
-	__m128i dst_even = _mm_mullo_epi16(vec1, vec2);
-	__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8));
-
-	// repack
-	return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8));
-}
 #endif
 
 #ifndef VEC_VUINT16X8
@@ -120,10 +159,6 @@
 # define VUINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(u, U, 16, 8)
 VEC_GENERIC_COMPARISONS(u, U, 16, 8)
-VEC_DECL_MUL(u, 16, 8)
-{
-	return _mm_mullo_epi16(vec1, vec2);
-}
 #endif
 
 #ifndef VEC_VUINT32X4
@@ -134,17 +169,6 @@
 # define VUINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(u, U, 32, 4)
 VEC_GENERIC_COMPARISONS(u, U, 32, 4)
-VEC_DECL_MUL(u, 32, 4)
-{
-	/* this was stolen from... somewhere :) */
-	__m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); // (-,a3,-,a1)
-	__m128i b13 = _mm_shuffle_epi32(vec2, 0xF5); // (-,b3,-,b1)
-	__m128i prod02 = _mm_mul_epu32(vec1, vec2); // (-,a2*b2,-,a0*b0)
-	__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
-	__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0)
-	__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2)
-	return _mm_unpacklo_epi64(prod01, prod23); // (ab3,ab2,ab1,ab0)
-}
 #endif
 
 #ifndef VEC_VUINT64X2
@@ -157,32 +181,6 @@
 # define VUINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(u, U, 64, 2)
 VEC_GENERIC_COMPARISONS(u, U, 64, 2)
-VEC_DECL_MUL(u, 64, 2)
-{
-	/* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */
-	__m128i ac = _mm_mul_epu32(vec1, vec2);
-
-	/* b = vec1 >> 32; */
-	__m128i b = _mm_srli_epi64(vec1, 32);
-
-	/* bc = b * (vec2 & 0xFFFFFFFF); */
-	__m128i bc = _mm_mul_epu32(b, vec2);
-
-	/* d = vec2 >> 32; */
-	__m128i d = _mm_srli_epi64(vec2, 32);
-
-	/* ad = (vec1 & 0xFFFFFFFF) * d; */
-	__m128i ad = _mm_mul_epu32(vec1, d);
-
-	/* high = bc + ad; */
-	__m128i high = _mm_add_epi64(bc, ad);
-
-	/* high <<= 32; */
-	high = _mm_slli_epi64(high, 32);
-
-	/* return ac + high; */
-	return _mm_add_epi64(high, ac);
-}
 #endif
 
 #ifndef VEC_VINT8X16
@@ -193,15 +191,6 @@
 # define VINT8x16_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(, , 8, 16)
 VEC_DEFINE_COMPARISONS_SIGNED(8, 16)
-VEC_DECL_MUL(, 8, 16)
-{
-	// unpack and multiply
-	__m128i dst_even = _mm_mullo_epi16(vec1, vec2);
-	__m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8));
-
-	// repack
-	return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8));
-}
 #endif
 
 #ifndef VEC_VINT16X8
@@ -212,10 +201,6 @@
 # define VINT16x8_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(, , 16, 8)
 VEC_DEFINE_COMPARISONS_SIGNED(16, 8)
-VEC_DECL_MUL(, 16, 8)
-{
-	return _mm_mullo_epi16(vec1, vec2);
-}
 #endif
 
 #ifndef VEC_VINT32X4
@@ -226,16 +211,6 @@
 # define VINT32x4_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(, , 32, 4)
 VEC_DEFINE_COMPARISONS_SIGNED(32, 4)
-VEC_DECL_MUL(, 32, 4)
-{
-	__m128i a13 = _mm_shuffle_epi32(vec1, 0xF5); // (-,a3,-,a1)
-	__m128i b13 = _mm_shuffle_epi32(vec2, 0xF5); // (-,b3,-,b1)
-	__m128i prod02 = _mm_mul_epu32(vec1, vec2); // (-,a2*b2,-,a0*b0)
-	__m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1)
-	__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0)
-	__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2)
-	return _mm_unpacklo_epi64(prod01, prod23); // (ab3,ab2,ab1,ab0)
-}
 #endif
 
 #ifndef VEC_VINT64X2
@@ -248,34 +223,13 @@
 # define VINT64x2_ALIGNMENT VEC_SSE2_ALIGNMENT
 VEC_DEFINE_OPERATIONS(, , 64, 2)
 VEC_GENERIC_COMPARISONS(, , 64, 2)
-VEC_DECL_MUL(, 64, 2)
-{
-	/* ac = (vec1 & 0xFFFFFFFF) * (vec2 & 0xFFFFFFFF); */
-	__m128i ac = _mm_mul_epu32(vec1, vec2);
-
-	/* b = vec1 >> 32; */
-	__m128i b = _mm_srli_epi64(vec1, 32);
-
-	/* bc = b * (vec2 & 0xFFFFFFFF); */
-	__m128i bc = _mm_mul_epu32(b, vec2);
-
-	/* d = vec2 >> 32; */
-	__m128i d = _mm_srli_epi64(vec2, 32);
-
-	/* ad = (vec1 & 0xFFFFFFFF) * d; */
-	__m128i ad = _mm_mul_epu32(vec1, d);
-
-	/* high = bc + ad; */
-	__m128i high = _mm_add_epi64(bc, ad);
-
-	/* high <<= 32; */
-	high = _mm_slli_epi64(high, 32);
-
-	/* return ac + high; */
-	return _mm_add_epi64(high, ac);
-}
 #endif
 
 #undef VEC_DEFINE_OPERATIONS
-#undef VEC_SSE2_8x16_SHIFT
-#undef VEC_SSE2_16x8_SHIFT
+#undef VEC_DEFINE_COMPARISONS_SIGNED
+
+/* multiply */
+#undef VEC_SSE2_MUL_8x16
+#undef VEC_SSE2_MUL_16x8
+#undef VEC_SSE2_MUL_32x4
+#undef VEC_SSE2_MUL_64x2
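
Note on the 64x2 case (commentary, not part of the patch): VEC_SSE2_MUL_64x2 assembles a per-lane 64-bit multiply from _mm_mul_epu32, which only multiplies the low 32 bits of each 64-bit lane. The sketch below shows the same partial-product decomposition for a single scalar lane; the helper name mul64_from_32bit_products is illustrative and not part of the library.

#include <stdint.h>
#include <stdio.h>

/* Split each operand into 32-bit halves and form the three partial products
 * that matter; hi(a)*hi(b) is dropped because it only contributes to bits
 * at or above 2^64. */
static uint64_t mul64_from_32bit_products(uint64_t a, uint64_t b)
{
	uint64_t ac = (a & UINT32_MAX) * (b & UINT32_MAX); /* lo(a)*lo(b), what _mm_mul_epu32 computes per lane */
	uint64_t bc = (a >> 32)        * (b & UINT32_MAX); /* hi(a)*lo(b) */
	uint64_t ad = (a & UINT32_MAX) * (b >> 32);        /* lo(a)*hi(b) */

	/* (bc + ad) << 32 wraps modulo 2^64, like _mm_add_epi64/_mm_slli_epi64 */
	return ac + ((bc + ad) << 32);
}

int main(void)
{
	uint64_t a = 0x123456789ABCDEF0u, b = 0x0FEDCBA987654321u;

	/* prints 1: the rebuilt product matches a native 64-bit multiply */
	printf("%d\n", mul64_from_32bit_products(a, b) == a * b);
	return 0;
}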