changeset 19:4de858e14464

sse2: implement equality operators
author Paper <paper@tflc.us>
date Fri, 22 Nov 2024 23:03:34 -0500
parents cf04071d2148
children fbcd3fa6f8fc
files include/vec/impl/x86/sse2.h include/vec/impl/x86/sse41.h include/vec/vec.h
diffstat 3 files changed, 73 insertions(+), 23 deletions(-)
--- a/include/vec/impl/x86/sse2.h	Wed Nov 20 14:33:19 2024 -0500
+++ b/include/vec/impl/x86/sse2.h	Fri Nov 22 23:03:34 2024 -0500
@@ -104,9 +104,8 @@
 #define VEC_SSE2_MUL_16x8(sign) \
 	do { \
 		/* we have a real instruction for this */ \
-		v##sign##int16x8 vec; \
-		vec.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \
-		return vec; \
+		vec1.sse = _mm_mullo_epi16(vec1.sse, vec2.sse); \
+		return vec1; \
 	} while (0)
 
 #define VEC_SSE2_MUL_32x4(sign) \
@@ -119,9 +118,8 @@
 		__m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
 		__m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
 	\
-		v##sign##int32x4 vec; \
-		vec.sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
-		return vec; \
+		vec1.sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
+		return vec1; \
 	} while (0)
 
 #define VEC_SSE2_MUL_64x2(sign) \
@@ -134,9 +132,45 @@
 		__m128i hi = _mm_add_epi64(bc, ad);             /* hi = bc + ad; */ \
 		hi = _mm_slli_epi64(hi, 32);                    /* hi <<= 32; */ \
 	\
-		v##sign##int64x2 vec; \
-		vec.sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \
-		return vec; \
+		vec1.sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \
+		return vec1; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_8x16(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi8(vec1.sse, vec2.sse); \
+		return vec1; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_16x8(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi16(vec1.sse, vec2.sse); \
+		return vec1; \
+	} while (0)
+
+#define VEC_SSE2_CMPEQ_32x4(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \
+		return vec1; \
+	} while (0)
+
+// SSE2 doesn't have an intrinsic for 64x2 equality comparison,
+// so we have to build it out of the 32x4 one.
+//
+// a 64-bit lane is equal only when *both* of its 32-bit halves
+// are equal, so after the 32-bit comparison we need to AND each
+// half's mask with its neighbour's.
+//
+// _mm_shuffle_epi32 lets us line those halves up: shuffle the odd
+// halves next to the even ones and return the bitwise AND.
+
+#define VEC_SSE2_CMPEQ_64x2(sign) \
+	do { \
+		vec1.sse = _mm_cmpeq_epi32(vec1.sse, vec2.sse); \
+		vec2.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(3, 3, 1, 1)); \
+		vec1.sse = _mm_shuffle_epi32(vec1.sse, _MM_SHUFFLE(2, 2, 0, 0)); \
+		vec1.sse = _mm_and_si128(vec1.sse, vec2.sse); \
+		return vec1; \
 	} while (0)
 
 #define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
@@ -219,6 +253,11 @@
 		VEC_SSE2_RSHIFT_##bits##x##size(sign, l); \
 	} \
 	\
+	static v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
+	{ \
+		VEC_SSE2_CMPEQ_##bits##x##size(sign); \
+	} \
+	\
 	static v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \
 		/* .splat = */ NULL, \
 		v##sign##int##bits##x##size##_sse2_load_aligned, \
@@ -237,6 +276,11 @@
 		v##sign##int##bits##x##size##_sse2_lshift, \
 		v##sign##int##bits##x##size##_sse2_rshift, \
 		v##sign##int##bits##x##size##_sse2_lrshift, \
+		/* .cmplt = */ NULL, \
+		/* .cmple = */ NULL, \
+		v##sign##int##bits##x##size##_sse2_cmpeq, \
+		/* .cmpge = */ NULL, \
+		/* .cmpgt = */ NULL, \
 	};
 
 #define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \
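
A quick aside on the 64x2 trick above, read outside of the macro machinery: the sketch below is a standalone C translation of the same idea. The function name cmpeq_epi64_sse2 and the small test harness are illustrative only and not part of vec.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* emulate _mm_cmpeq_epi64 on plain SSE2: compare the 32-bit halves, then
 * AND each half's mask with its neighbour's, so a 64-bit lane ends up
 * all-ones only when both of its halves matched */
static __m128i cmpeq_epi64_sse2(__m128i a, __m128i b)
{
	__m128i m    = _mm_cmpeq_epi32(a, b);
	__m128i odd  = _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 3, 1, 1)); /* odd halves  */
	__m128i even = _mm_shuffle_epi32(m, _MM_SHUFFLE(2, 2, 0, 0)); /* even halves */
	return _mm_and_si128(even, odd);
}

int main(void)
{
	int64_t a[2] = { 42, -1 }, b[2] = { 42, 7 }, r[2];
	__m128i res = cmpeq_epi64_sse2(_mm_loadu_si128((const __m128i *)a),
	                               _mm_loadu_si128((const __m128i *)b));
	_mm_storeu_si128((__m128i *)r, res);
	printf("%lld %lld\n", (long long)r[0], (long long)r[1]); /* prints "-1 0" */
	return 0;
}

On top of the 32-bit comparison this costs two shuffles and an AND, which is about as cheap as the emulation can get without SSE4.1's _mm_cmpeq_epi64.
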
--- a/include/vec/impl/x86/sse41.h	Wed Nov 20 14:33:19 2024 -0500
+++ b/include/vec/impl/x86/sse41.h	Fri Nov 22 23:03:34 2024 -0500
@@ -25,6 +25,7 @@
 #ifndef VEC_IMPL_X86_SSE41_H_
 #define VEC_IMPL_X86_SSE41_H_
 
+// SSE 4.1 provides a real _mm_mullo_epi32
 #define VEC_SSE41_DEFINE_OPERATIONS(sign) \
 	static v##sign##int32x4 v##sign##int32x4_sse41_mul(v##sign##int32x4 vec1, v##sign##int32x4 vec2) \
 	{ \
@@ -35,22 +36,27 @@
 	\
 	static v##sign##int32x4_impl v##sign##int32x4_impl_sse41 = { \
 		/* .splat = */ NULL, \
-		v##sign##int32x4##_sse2_load_aligned, \
-		v##sign##int32x4##_sse2_load, \
-		v##sign##int32x4##_sse2_store_aligned, \
-		v##sign##int32x4##_sse2_store, \
-		v##sign##int32x4##_sse2_add, \
-		v##sign##int32x4##_sse2_sub, \
-		v##sign##int32x4##_sse41_mul, \
+		v##sign##int32x4_sse2_load_aligned, \
+		v##sign##int32x4_sse2_load, \
+		v##sign##int32x4_sse2_store_aligned, \
+		v##sign##int32x4_sse2_store, \
+		v##sign##int32x4_sse2_add, \
+		v##sign##int32x4_sse2_sub, \
+		v##sign##int32x4_sse41_mul, \
 		/* .div = */ NULL, \
 		/* .avg = */ NULL, \
-		v##sign##int32x4##_sse2_and, \
-		v##sign##int32x4##_sse2_or, \
-		v##sign##int32x4##_sse2_xor, \
+		v##sign##int32x4_sse2_and, \
+		v##sign##int32x4_sse2_or, \
+		v##sign##int32x4_sse2_xor, \
 		/* .not = */ NULL, \
-		v##sign##int32x4##_sse2_lshift, \
-		v##sign##int32x4##_sse2_rshift, \
-		v##sign##int32x4##_sse2_lrshift, \
+		v##sign##int32x4_sse2_lshift, \
+		v##sign##int32x4_sse2_rshift, \
+		v##sign##int32x4_sse2_lrshift, \
+		/* .cmplt = */ NULL, \
+		/* .cmple = */ NULL, \
+		v##sign##int32x4_sse2_cmpeq, \
+		/* .cmpge = */ NULL, \
+		/* .cmpgt = */ NULL, \
 	};
 
 VEC_SSE41_DEFINE_OPERATIONS()
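
The comment added at the top of sse41.h explains why this backend only overrides mul and reuses the SSE2 functions for everything else: SSE4.1 has a native low 32x4 multiply, while SSE2 has to widen through _mm_mul_epu32 and reassemble the low halves. The sketch below shows the two paths; the function names are illustrative, and the SSE2 reassembly here finishes with the conventional _mm_unpacklo_epi64 combine.

#include <emmintrin.h>
#include <smmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* SSE4.1: one instruction yields the low 32 bits of each 32x32 product */
static __m128i mullo32_sse41(__m128i a, __m128i b)
{
	return _mm_mullo_epi32(a, b);
}

/* SSE2: multiply the even and odd lanes as 32x32->64 with _mm_mul_epu32,
 * then gather the low halves back into one vector; the low 32 bits of a
 * product are the same for signed and unsigned operands, so this covers both */
static __m128i mullo32_sse2(__m128i a, __m128i b)
{
	__m128i even = _mm_mul_epu32(a, b);                  /* products of lanes 0 and 2 */
	__m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),
	                             _mm_srli_si128(b, 4));  /* products of lanes 1 and 3 */
	__m128i lo01 = _mm_unpacklo_epi32(even, odd);        /* low halves of products 0, 1 */
	__m128i lo23 = _mm_unpackhi_epi32(even, odd);        /* low halves of products 2, 3 */
	return _mm_unpacklo_epi64(lo01, lo23);               /* (p3, p2, p1, p0) */
}

int main(void)
{
	int32_t a[4] = { 3, -5, 7, 100000 }, b[4] = { 9, 4, -6, 100000 };
	int32_t r[4], r41[4];
	__m128i x = _mm_loadu_si128((const __m128i *)a);
	__m128i y = _mm_loadu_si128((const __m128i *)b);
	_mm_storeu_si128((__m128i *)r,   mullo32_sse2(x, y));
	_mm_storeu_si128((__m128i *)r41, mullo32_sse41(x, y));
	printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);         /* 27 -20 -42 1410065408 */
	printf("%d %d %d %d\n", r41[0], r41[1], r41[2], r41[3]); /* same lanes */
	return 0;
}

Built with -msse4.1, both paths print the same four lanes; vec makes the equivalent choice at runtime by pointing the impl struct's mul slot at one function or the other.
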
--- a/include/vec/vec.h	Wed Nov 20 14:33:19 2024 -0500
+++ b/include/vec/vec.h	Fri Nov 22 23:03:34 2024 -0500
@@ -1146,7 +1146,7 @@
 		vint32x2_impl_cpu  = &vint32x2_impl_neon;
 		vuint32x2_impl_cpu = &vuint32x2_impl_neon;
 
-		// 64-bit
+		// 128-bit
 		vint8x16_impl_cpu  = &vint8x16_impl_neon;
 		vuint8x16_impl_cpu = &vuint8x16_impl_neon;
 		vint16x8_impl_cpu  = &vint16x8_impl_neon;
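
A closing note on how the new cmpeq results are meant to be used: like the other SSE comparison intrinsics, they produce per-lane masks (all ones where the lanes compared equal, all zeros elsewhere). Before SSE4.1's blend instructions, the usual way to consume such a mask is a branchless AND/ANDNOT/OR select; the helper below is a small illustration and not part of the vec API.

#include <emmintrin.h>

/* branchless per-lane select: where mask is all-ones take a, else take b;
 * this is the classic SSE2 idiom for consuming comparison masks */
static __m128i select_by_mask(__m128i mask, __m128i a, __m128i b)
{
	return _mm_or_si128(_mm_and_si128(mask, a),
	                    _mm_andnot_si128(mask, b));
}

For example, select_by_mask(_mm_cmpeq_epi32(x, y), x, z) keeps the lanes of x that matched y and takes the corresponding lanes of z everywhere else.
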