diff test/test_benchmark.h @ 45:7955bed1d169 default tip

*: add preliminary floating point support; no x86 intrinsics just yet, but I did add altivec since it's (arguably) the simplest :)
author Paper <paper@tflc.us>
date Wed, 30 Apr 2025 18:36:38 -0400
parents c6e0df09b86f
children
line wrap: on
line diff
--- a/test/test_benchmark.h	Tue Apr 29 16:54:13 2025 -0400
+++ b/test/test_benchmark.h	Wed Apr 30 18:36:38 2025 -0400
@@ -2,37 +2,115 @@
 /* ------------------------------------------------------------------------ */
 /* simple benchmark for getting the min/max range of an audio sample. */
 
-extern void test_benchmark_sample_minmax_simple_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax);
-extern void test_benchmark_sample_minmax_vec_impl(int16_t *smpl, uint32_t length, int32_t *pmin, int32_t *pmax);
+extern void test_benchmark_sample_minmax_int8x2_impl(vec_int8 *smpl, uint32_t length, vec_int8 *pmin, vec_int8 *pmax);
 
 VEC_FUNC_IMPL void test_benchmark_sample_minmax(void)
 {
-	int32_t min, max;
-	clock_t start, end;
 	int i;
-	int16_t *q = vec_malloc(16000001u * 2u);
+
+	union {
+		vec_int8   int8[16000001];
+		vec_uint8  uint8[16000001];
+		vec_int16  int16[16000001];
+		vec_uint16 uint16[16000001];
+		vec_int32  int32[16000001];
+		vec_uint32 uint32[16000001];
+		vec_int64  int64[16000001];
+		vec_uint64 uint64[16000001];
+		vec_f32    f32[16000001];
+		vec_f64    f64[16000001];
+	} *q;
+
+	q = vec_malloc(sizeof(*q));
+
+	for (i = 0; i < 16000001; i++)
+		q->f64[i] = (double)rand() / RAND_MAX;
 
-	printf("\nsigned 16-bit audio sample min/max - 1 thousand passes - 16000001 samples\n\n");
+	printf("\naudio sample min/max - 1 thousand passes - 16000001 samples\n\n");
 
-	start = clock();
-	for (i = 0; i < 100; i++) {
-		min = INT32_MAX;
-		max = INT32_MIN;
-		test_benchmark_sample_minmax_vec_impl(q, 16000001u, &min, &max);
+#define DO_TIMER(TYPE,NAME,MIN,MAX) \
+	{ \
+		vec_##TYPE min, max; \
+		clock_t start, end; \
+	\
+		start = clock(); \
+	\
+		for (i = 0; i < 1000; i++) { \
+			extern void test_benchmark_sample_minmax_##NAME##_impl(vec_##TYPE *smpl, uint32_t length, vec_##TYPE *pmin, vec_##TYPE *pmax); \
+	\
+			min = MAX; \
+			max = MIN; \
+	\
+			test_benchmark_sample_minmax_##NAME##_impl(q->TYPE, 16000001u, &min, &max); \
+		} \
+	\
+		end = clock(); \
+	\
+		printf("- %s: took %f secs\n", #NAME, (double)(end - start) / CLOCKS_PER_SEC); \
 	}
-	end = clock();
+
+	DO_TIMER(int8, int8x2, INT8_MIN, INT8_MAX)
+	DO_TIMER(int8, int8x4, INT8_MIN, INT8_MAX)
+	DO_TIMER(int8, int8x8, INT8_MIN, INT8_MAX)
+	DO_TIMER(int8, int8x16, INT8_MIN, INT8_MAX)
+	DO_TIMER(int8, int8x32, INT8_MIN, INT8_MAX)
+	DO_TIMER(int8, int8x64, INT8_MIN, INT8_MAX)
+	DO_TIMER(int8, int8, INT8_MIN, INT8_MAX)
 
-	printf("- vec: took %f secs\n", (double)(end - start) / CLOCKS_PER_SEC);
+	DO_TIMER(int16, int16x2, INT16_MIN, INT16_MAX)
+	DO_TIMER(int16, int16x4, INT16_MIN, INT16_MAX)
+	DO_TIMER(int16, int16x8, INT16_MIN, INT16_MAX)
+	DO_TIMER(int16, int16x16, INT16_MIN, INT16_MAX)
+	DO_TIMER(int16, int16x32, INT16_MIN, INT16_MAX)
+	DO_TIMER(int16, int16, INT16_MIN, INT16_MAX)
+
+	DO_TIMER(int32, int32x2, INT32_MIN, INT32_MAX)
+	DO_TIMER(int32, int32x4, INT32_MIN, INT32_MAX)
+	DO_TIMER(int32, int32x8, INT32_MIN, INT32_MAX)
+	DO_TIMER(int32, int32x16, INT32_MIN, INT32_MAX)
+	DO_TIMER(int32, int32, INT32_MIN, INT32_MAX)
+
+	DO_TIMER(int64, int64x2, INT64_MIN, INT64_MAX)
+	DO_TIMER(int64, int64x4, INT64_MIN, INT64_MAX)
+	DO_TIMER(int64, int64x8, INT64_MIN, INT64_MAX)
+	DO_TIMER(int64, int64, INT64_MIN, INT64_MAX)
 
-	start = clock();
-	for (i = 0; i < 100; i++) {
-		min = INT32_MAX;
-		max = INT32_MIN;
-		test_benchmark_sample_minmax_simple_impl(q, 16000001u, &min, &max);
-	}
-	end = clock();
+	DO_TIMER(uint8, uint8x2, 0, UINT8_MAX)
+	DO_TIMER(uint8, uint8x4, 0, UINT8_MAX)
+	DO_TIMER(uint8, uint8x8, 0, UINT8_MAX)
+	DO_TIMER(uint8, uint8x16, 0, UINT8_MAX)
+	DO_TIMER(uint8, uint8x32, 0, UINT8_MAX)
+	DO_TIMER(uint8, uint8x64, 0, UINT8_MAX)
+	DO_TIMER(uint8, uint8, 0, UINT8_MAX)
+
+	DO_TIMER(uint16, uint16x2, 0, UINT16_MAX)
+	DO_TIMER(uint16, uint16x4, 0, UINT16_MAX)
+	DO_TIMER(uint16, uint16x8, 0, UINT16_MAX)
+	DO_TIMER(uint16, uint16x16, 0, UINT16_MAX)
+	DO_TIMER(uint16, uint16x32, 0, UINT16_MAX)
+	DO_TIMER(uint16, uint16, 0, UINT16_MAX)
 
-	printf("- simple: took %f secs\n", (double)(end - start) / CLOCKS_PER_SEC);
+	DO_TIMER(uint32, uint32x2, 0, UINT32_MAX)
+	DO_TIMER(uint32, uint32x4, 0, UINT32_MAX)
+	DO_TIMER(uint32, uint32x8, 0, UINT32_MAX)
+	DO_TIMER(uint32, uint32x16, 0, UINT32_MAX)
+	DO_TIMER(uint32, uint32, 0, UINT32_MAX)
+
+	DO_TIMER(uint64, uint64x2, 0, UINT64_MAX)
+	DO_TIMER(uint64, uint64x4, 0, UINT64_MAX)
+	DO_TIMER(uint64, uint64x8, 0, UINT64_MAX)
+	DO_TIMER(uint64, uint64, 0, UINT64_MAX)
+
+	DO_TIMER(f32, f32x2, -1.0f, 1.0f)
+	DO_TIMER(f32, f32x4, -1.0f, 1.0f)
+	DO_TIMER(f32, f32x8, -1.0f, 1.0f)
+	DO_TIMER(f32, f32x16, -1.0f, 1.0f)
+	DO_TIMER(f32, f32, -1.0f, 1.0f)
+
+	DO_TIMER(f64, f64x2, -1.0, 1.0)
+	DO_TIMER(f64, f64x4, -1.0, 1.0)
+	DO_TIMER(f64, f64x8, -1.0, 1.0)
+	DO_TIMER(f64, f64, -1.0, 1.0)
 
 	printf("\n");