diff test/test_benchmark_vec.c @ 37:4b5a557aa64f

*: turns out extern is a practical joke. rewrite to be always inline again; the sample benchmark performs about 3x as well with optimizations disabled :)
author Paper <paper@tflc.us>
date Sat, 26 Apr 2025 01:04:35 -0400
parents
children c6e0df09b86f
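
The gist of the change, as a minimal sketch (the names and the attribute spelling below are illustrative, not the actual vec internals): an extern implementation costs a real function call per vector operation, which unoptimized builds never elide, while a forced-inline implementation is pasted into the call site even at -O0.

	#include <stdint.h>

	/* before: every op is an out-of-line call, even in -O0 builds */
	extern int16_t clamp16_extern(int32_t x);

	/* after: the body is forced into each call site regardless of the
	 * optimization level (GCC/Clang-style attribute; hypothetical name) */
	static inline __attribute__((always_inline))
	int16_t clamp16_inline(int32_t x)
	{
		return (int16_t)(x > INT16_MAX ? INT16_MAX :
		                 x < INT16_MIN ? INT16_MIN : x);
	}
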
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_benchmark_vec.c	Sat Apr 26 01:04:35 2025 -0400
@@ -0,0 +1,43 @@
+#include "vec/vec.h"
+
+extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl,
+	uint32_t length, int32_t *pmin, int32_t *pmax)
+{
+	int32_t smin = INT32_MAX, smax = INT32_MIN;
+	uint32_t len32;
+	int i;
+	vint16x32 min = vint16x32_splat(*pmin); /* seeds truncate to int16_t, */
+	vint16x32 max = vint16x32_splat(*pmax); /* so they must fit in 16 bits */
+	VINT16x32_ALIGNED_ARRAY(mins);
+	VINT16x32_ALIGNED_ARRAY(maxs);
+
+	len32 = length / 32;
+	while (len32--) {
+		vint16x32 vec = vint16x32_load_aligned(smpl);
+
+		min = vint16x32_min(vec, min);
+		max = vint16x32_max(vec, max);
+
+		smpl += 32;
+	}
+
+	vint16x32_store_aligned(min, mins);
+	vint16x32_store_aligned(max, maxs);
+
+	/* get the lowest minimum and highest maximum across the lanes */
+	for (i = 0; i < 32; i++) {
+		if (mins[i] < smin) smin = mins[i];
+		if (maxs[i] > smax) smax = maxs[i];
+	}
+
+	len32 = length % 32; /* samples left over after the vector loop */
+	while (len32--) {
+		if (*smpl < smin) smin = *smpl;
+		if (*smpl > smax) smax = *smpl;
+
+		smpl++;
+	}
+
+	*pmin = smin;
+	*pmax = smax;
+}
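
A hypothetical driver for the routine above might look like the following. The buffer, its size, and the test data are made up for illustration; two real constraints from the code do carry over: smpl must satisfy the alignment that vint16x32_load_aligned expects, and the *pmin/*pmax seeds must fit in int16_t because they are splatted into 16-bit lanes.

	#include <stdint.h>
	#include <stdio.h>

	extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl,
		uint32_t length, int32_t *pmin, int32_t *pmax);

	int main(void)
	{
		/* 64-byte alignment is an assumption; the real requirement
		 * comes from vec.h's aligned-load machinery */
		static _Alignas(64) int16_t smpl[1024];
		int32_t min = INT16_MAX, max = INT16_MIN; /* 16-bit-safe seeds */
		uint32_t i;

		/* fill with dummy samples in [-2048, 2047] */
		for (i = 0; i < 1024; i++)
			smpl[i] = (int16_t)(((i * 31) % 4096) - 2048);

		test_benchmark_sample_minmax_vec_impl(smpl, 1024, &min, &max);
		printf("min=%d max=%d\n", (int)min, (int)max);
		return 0;
	}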