include/vec/impl/x86/mmx.h @ 37:4b5a557aa64f

*: turns out extern is a practical joke; rewrite to be always-inline again. The sample benchmark performs about 3x as well with optimizations disabled :)
author Paper <paper@tflc.us>
date Sat, 26 Apr 2025 01:04:35 -0400
parents 677c03c382b8
children fd42f9b1b95e

/**
 * vec - a tiny SIMD vector library in C99
 * 
 * Copyright (c) 2024 Paper
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_IMPL_X86_MMX_H_
#define VEC_IMPL_X86_MMX_H_

/* ------------------------------------------------------------------------ */
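/* Generic binary arithmetic wrapper: VEC_MMX_OP_EX token-pastes the MMX
 * intrinsic name (_mm_<op>_p<intlsign><bits>) and leaves room for extra
 * declarations (VARS) plus pre/post transforms (TRANS1/TRANS2) around the
 * intrinsic call. VEC_MMX_OP is the common case with all three hooks empty. */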

#define VEC_MMX_OP_EX(name, intlsign, op, sign, bits, size, VARS, TRANS1, TRANS2) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		VARS \
	\
		TRANS1 \
	\
		vec1.mmx = _mm_##op##_p##intlsign##bits(vec1.mmx, vec2.mmx); \
	\
		TRANS2 \
	\
		return vec1; \
	}

#define VEC_MMX_OP(name, intlsign, op, sign, bits, size) \
	VEC_MMX_OP_EX(name, intlsign, op, sign, bits, size, /* nothing */, /* nothing */, /* nothing */)

/* ------------------------------------------------------------------------ */
/* comparison */

/* helper funcs: MMX only has signed compares (_mm_cmpeq_pi<bits> and
 * _mm_cmpgt_pi<bits>), so the unsigned variants XOR both operands with the
 * sign bit first to bias them into signed range; cmplt is just cmpgt with
 * the operands swapped. */
#define VEC_xMMX_CMP(name, op, sign, bits, size, first, second, VARS, TRANS1, TRANS2) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		VARS \
	\
		TRANS1 \
	\
		vec1.mmx = _mm_##op##_pi##bits(vec##first.mmx, vec##second.mmx); \
	\
		TRANS2 \
	\
		return vec1; \
	}

#define VEC_MMX_CMP(name, op, bits, size, first, second) \
	VEC_xMMX_CMP(name, op, /* nothing */, bits, size, first, second, /* nothing */, /* nothing */, /* nothing */)

#define VEC_uMMX_CMP(name, op, bits, size, first, second) \
	VEC_xMMX_CMP(name, op, u, bits, size, first, second, \
		__m64 xor_val = _mm_set1_pi##bits((vec_int##bits)(1u << (bits - 1))); \
	, { \
		vec1.mmx = _mm_xor_si64(vec1.mmx, xor_val); \
		vec2.mmx = _mm_xor_si64(vec2.mmx, xor_val); \
	}, \
	{ \
		/* nothing */ \
	})

#define VEC_MMX_CMPEQ(sign, bits, size) VEC_xMMX_CMP(cmpeq, cmpeq, sign, bits, size, 1, 2, /* nothing */, /* nothing */, /* nothing */)
#define VEC_MMX_CMPLT(sign, bits, size) VEC_##sign##MMX_CMP(cmplt, cmpgt, bits, size, 2, 1)
#define VEC_MMX_CMPGT(sign, bits, size) VEC_##sign##MMX_CMP(cmpgt, cmpgt, bits, size, 1, 2)

/* ------------------------------------------------------------------------ */
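/* Splat broadcasts one scalar into every lane via _mm_set1_pi<bits>. The
 * aligned and unaligned load/store variants are identical: both are plain
 * memcpy, since the MMX intrinsics offer no separate aligned move for an
 * __m64 (memcpy from string.h is assumed to be provided by the core vec
 * header). The bitwise ops act on the whole 64-bit register, so one macro
 * covers every lane width. */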

#define VEC_MMX_SPLAT(sign, bits, size) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.mmx = _mm_set1_pi##bits(x); \
		return vec; \
	}

#define VEC_MMX_LOAD_EX(name, sign, bits, size) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(const vec_##sign##int##bits in[size]) \
	{ \
		v##sign##int##bits##x##size vec; \
		memcpy(&vec, in, sizeof(vec)); \
		return vec; \
	}

#define VEC_MMX_LOAD(sign, bits, size) VEC_MMX_LOAD_EX(load, sign, bits, size)
#define VEC_MMX_LOAD_ALIGNED(sign, bits, size) VEC_MMX_LOAD_EX(load_aligned, sign, bits, size)

#define VEC_MMX_STORE_EX(name, sign, bits, size) \
	VEC_FUNC_IMPL void v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		memcpy(out, &vec, sizeof(vec)); \
	}

#define VEC_MMX_STORE(sign, bits, size) VEC_MMX_STORE_EX(store, sign, bits, size)
#define VEC_MMX_STORE_ALIGNED(sign, bits, size) VEC_MMX_STORE_EX(store_aligned, sign, bits, size)

#define VEC_MMX_BITWISE(name, sign, bits, size) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		vec1.mmx = _mm_##name##_si64(vec1.mmx, vec2.mmx); \
	\
		return vec1; \
	}

/* ------------------------------------------------------------------------ */
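/* vint8x8 */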

#ifndef VINT8x8_SPLAT_DEFINED
VEC_MMX_SPLAT(, 8, 8)
# define VINT8x8_SPLAT_DEFINED
#endif

#ifndef VINT8x8_LOAD_DEFINED
VEC_MMX_LOAD(, 8, 8)
# define VINT8x8_LOAD_DEFINED
#endif

#ifndef VINT8x8_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(, 8, 8)
# define VINT8x8_LOAD_ALIGNED_DEFINED
#endif

#ifndef VINT8x8_STORE_DEFINED
VEC_MMX_STORE(, 8, 8)
# define VINT8x8_STORE_DEFINED
#endif

#ifndef VINT8x8_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(, 8, 8)
# define VINT8x8_STORE_ALIGNED_DEFINED
#endif

#ifndef VINT8x8_ADD_DEFINED
VEC_MMX_OP(add, i, add, /* nothing */, 8, 8)
# define VINT8x8_ADD_DEFINED
#endif

#ifndef VINT8x8_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, /* nothing */, 8, 8)
# define VINT8x8_SUB_DEFINED
#endif

#ifndef VINT8x8_AND_DEFINED
VEC_MMX_BITWISE(and, /* nothing */, 8, 8)
# define VINT8x8_AND_DEFINED
#endif

#ifndef VINT8x8_OR_DEFINED
VEC_MMX_BITWISE(or, /* nothing */, 8, 8)
# define VINT8x8_OR_DEFINED
#endif

#ifndef VINT8x8_XOR_DEFINED
VEC_MMX_BITWISE(xor, /* nothing */, 8, 8)
# define VINT8x8_XOR_DEFINED
#endif

#ifndef VINT8x8_CMPEQ_DEFINED
VEC_MMX_CMPEQ(, 8, 8)
# define VINT8x8_CMPEQ_DEFINED
#endif

#ifndef VINT8x8_CMPLT_DEFINED
VEC_MMX_CMPLT(, 8, 8)
# define VINT8x8_CMPLT_DEFINED
#endif

#ifndef VINT8x8_CMPGT_DEFINED
VEC_MMX_CMPGT(, 8, 8)
# define VINT8x8_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */
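/* vuint8x8 */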

#ifndef VUINT8x8_SPLAT_DEFINED
VEC_MMX_SPLAT(u, 8, 8)
# define VUINT8x8_SPLAT_DEFINED
#endif

#ifndef VUINT8x8_LOAD_DEFINED
VEC_MMX_LOAD(u, 8, 8)
# define VUINT8x8_LOAD_DEFINED
#endif

#ifndef VUINT8x8_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(u, 8, 8)
# define VUINT8x8_LOAD_ALIGNED_DEFINED
#endif

#ifndef VUINT8x8_STORE_DEFINED
VEC_MMX_STORE(u, 8, 8)
# define VUINT8x8_STORE_DEFINED
#endif

#ifndef VUINT8x8_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(u, 8, 8)
# define VUINT8x8_STORE_ALIGNED_DEFINED
#endif

#ifndef VUINT8x8_ADD_DEFINED
VEC_MMX_OP(add, i, add, u, 8, 8)
# define VUINT8x8_ADD_DEFINED
#endif

#ifndef VUINT8x8_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, u, 8, 8)
# define VUINT8x8_SUB_DEFINED
#endif

#ifndef VUINT8x8_AND_DEFINED
VEC_MMX_BITWISE(and, u, 8, 8)
# define VUINT8x8_AND_DEFINED
#endif

#ifndef VUINT8x8_OR_DEFINED
VEC_MMX_BITWISE(or, u, 8, 8)
# define VUINT8x8_OR_DEFINED
#endif

#ifndef VUINT8x8_XOR_DEFINED
VEC_MMX_BITWISE(xor, u, 8, 8)
# define VUINT8x8_XOR_DEFINED
#endif

#ifndef VUINT8x8_CMPEQ_DEFINED
VEC_MMX_CMPEQ(u, 8, 8)
# define VUINT8x8_CMPEQ_DEFINED
#endif

#ifndef VUINT8x8_CMPLT_DEFINED
VEC_MMX_CMPLT(u, 8, 8)
# define VUINT8x8_CMPLT_DEFINED
#endif

#ifndef VUINT8x8_CMPGT_DEFINED
VEC_MMX_CMPGT(u, 8, 8)
# define VUINT8x8_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */
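/* vint16x4/vuint16x4: MMX only has a 16-bit multiply. _mm_mullo_pi16 keeps
 * the low 16 bits of each product, which are the same for signed and
 * unsigned operands, so a single macro serves both; the 8- and 32-bit
 * element types get no mul here. */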

#define VEC_MMX_MUL_16x4(sign) \
	VEC_FUNC_IMPL v##sign##int16x4 v##sign##int16x4_mul(v##sign##int16x4 vec1, v##sign##int16x4 vec2) \
	{ \
		vec1.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \
		return vec1; \
	}

#ifndef VINT16x4_SPLAT_DEFINED
VEC_MMX_SPLAT(, 16, 4)
# define VINT16x4_SPLAT_DEFINED
#endif

#ifndef VINT16x4_LOAD_DEFINED
VEC_MMX_LOAD(, 16, 4)
# define VINT16x4_LOAD_DEFINED
#endif

#ifndef VINT16x4_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(, 16, 4)
# define VINT16x4_LOAD_ALIGNED_DEFINED
#endif

#ifndef VINT16x4_STORE_DEFINED
VEC_MMX_STORE(, 16, 4)
# define VINT16x4_STORE_DEFINED
#endif

#ifndef VINT16x4_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(, 16, 4)
# define VINT16x4_STORE_ALIGNED_DEFINED
#endif

#ifndef VINT16x4_ADD_DEFINED
VEC_MMX_OP(add, i, add, /* nothing */, 16, 4)
# define VINT16x4_ADD_DEFINED
#endif

#ifndef VINT16x4_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, /* nothing */, 16, 4)
# define VINT16x4_SUB_DEFINED
#endif

#ifndef VINT16x4_MUL_DEFINED
VEC_MMX_MUL_16x4(/* nothing */)
# define VINT16x4_MUL_DEFINED
#endif

#ifndef VINT16x4_AND_DEFINED
VEC_MMX_BITWISE(and, /* nothing */, 16, 4)
# define VINT16x4_AND_DEFINED
#endif

#ifndef VINT16x4_OR_DEFINED
VEC_MMX_BITWISE(or, /* nothing */, 16, 4)
# define VINT16x4_OR_DEFINED
#endif

#ifndef VINT16x4_XOR_DEFINED
VEC_MMX_BITWISE(xor, /* nothing */, 16, 4)
# define VINT16x4_XOR_DEFINED
#endif

#ifndef VINT16x4_CMPEQ_DEFINED
VEC_MMX_CMPEQ(, 16, 4)
# define VINT16x4_CMPEQ_DEFINED
#endif

#ifndef VINT16x4_CMPLT_DEFINED
VEC_MMX_CMPLT(, 16, 4)
# define VINT16x4_CMPLT_DEFINED
#endif

#ifndef VINT16x4_CMPGT_DEFINED
VEC_MMX_CMPGT(, 16, 4)
# define VINT16x4_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */
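/* vuint16x4 */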

#ifndef VUINT16x4_SPLAT_DEFINED
VEC_MMX_SPLAT(u, 16, 4)
# define VUINT16x4_SPLAT_DEFINED
#endif

#ifndef VUINT16x4_LOAD_DEFINED
VEC_MMX_LOAD(u, 16, 4)
# define VUINT16x4_LOAD_DEFINED
#endif

#ifndef VUINT16x4_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(u, 16, 4)
# define VUINT16x4_LOAD_ALIGNED_DEFINED
#endif

#ifndef VUINT16x4_STORE_DEFINED
VEC_MMX_STORE(u, 16, 4)
# define VUINT16x4_STORE_DEFINED
#endif

#ifndef VUINT16x4_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(u, 16, 4)
# define VUINT16x4_STORE_ALIGNED_DEFINED
#endif

#ifndef VUINT16x4_ADD_DEFINED
VEC_MMX_OP(add, i, add, u, 16, 4)
# define VUINT16x4_ADD_DEFINED
#endif

#ifndef VUINT16x4_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, u, 16, 4)
# define VUINT16x4_SUB_DEFINED
#endif

#ifndef VUINT16x4_MUL_DEFINED
VEC_MMX_MUL_16x4(u)
# define VUINT16x4_MUL_DEFINED
#endif

#ifndef VUINT16x4_AND_DEFINED
VEC_MMX_BITWISE(and, u, 16, 4)
# define VUINT16x4_AND_DEFINED
#endif

#ifndef VUINT16x4_OR_DEFINED
VEC_MMX_BITWISE(or, u, 16, 4)
# define VUINT16x4_OR_DEFINED
#endif

#ifndef VUINT16x4_XOR_DEFINED
VEC_MMX_BITWISE(xor, u, 16, 4)
# define VUINT16x4_XOR_DEFINED
#endif

#ifndef VUINT16x4_CMPEQ_DEFINED
VEC_MMX_CMPEQ(u, 16, 4)
# define VUINT16x4_CMPEQ_DEFINED
#endif

#ifndef VUINT16x4_CMPLT_DEFINED
VEC_MMX_CMPLT(u, 16, 4)
# define VUINT16x4_CMPLT_DEFINED
#endif

#ifndef VUINT16x4_CMPGT_DEFINED
VEC_MMX_CMPGT(u, 16, 4)
# define VUINT16x4_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */
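/* vint32x2 */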

#ifndef VINT32x2_SPLAT_DEFINED
VEC_MMX_SPLAT(, 32, 2)
# define VINT32x2_SPLAT_DEFINED
#endif

#ifndef VINT32x2_LOAD_DEFINED
VEC_MMX_LOAD(, 32, 2)
# define VINT32x2_LOAD_DEFINED
#endif

#ifndef VINT32x2_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(, 32, 2)
# define VINT32x2_LOAD_ALIGNED_DEFINED
#endif

#ifndef VINT32x2_STORE_DEFINED
VEC_MMX_STORE(, 32, 2)
# define VINT32x2_STORE_DEFINED
#endif

#ifndef VINT32x2_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(, 32, 2)
# define VINT32x2_STORE_ALIGNED_DEFINED
#endif

#ifndef VINT32x2_ADD_DEFINED
VEC_MMX_OP(add, i, add, /* nothing */, 32, 2)
# define VINT32x2_ADD_DEFINED
#endif

#ifndef VINT32x2_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, /* nothing */, 32, 2)
# define VINT32x2_SUB_DEFINED
#endif

#ifndef VINT32x2_AND_DEFINED
VEC_MMX_BITWISE(and, /* nothing */, 32, 2)
# define VINT32x2_AND_DEFINED
#endif

#ifndef VINT32x2_OR_DEFINED
VEC_MMX_BITWISE(or, /* nothing */, 32, 2)
# define VINT32x2_OR_DEFINED
#endif

#ifndef VINT32x2_XOR_DEFINED
VEC_MMX_BITWISE(xor, /* nothing */, 32, 2)
# define VINT32x2_XOR_DEFINED
#endif

#ifndef VINT32x2_CMPEQ_DEFINED
VEC_MMX_CMPEQ(, 32, 2)
# define VINT32x2_CMPEQ_DEFINED
#endif

#ifndef VINT32x2_CMPLT_DEFINED
VEC_MMX_CMPLT(, 32, 2)
# define VINT32x2_CMPLT_DEFINED
#endif

#ifndef VINT32x2_CMPGT_DEFINED
VEC_MMX_CMPGT(, 32, 2)
# define VINT32x2_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */
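/* vuint32x2 */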

#ifndef VUINT32x2_SPLAT_DEFINED
VEC_MMX_SPLAT(u, 32, 2)
# define VUINT32x2_SPLAT_DEFINED
#endif

#ifndef VUINT32x2_LOAD_DEFINED
VEC_MMX_LOAD(u, 32, 2)
# define VUINT32x2_LOAD_DEFINED
#endif

#ifndef VUINT32x2_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(u, 32, 2)
# define VUINT32x2_LOAD_ALIGNED_DEFINED
#endif

#ifndef VUINT32x2_STORE_DEFINED
VEC_MMX_STORE(u, 32, 2)
# define VUINT32x2_STORE_DEFINED
#endif

#ifndef VUINT32x2_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(u, 32, 2)
# define VUINT32x2_STORE_ALIGNED_DEFINED
#endif

#ifndef VUINT32x2_ADD_DEFINED
VEC_MMX_OP(add, i, add, u, 32, 2)
# define VUINT32x2_ADD_DEFINED
#endif

#ifndef VUINT32x2_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, u, 32, 2)
# define VUINT32x2_SUB_DEFINED
#endif

#ifndef VUINT32x2_AND_DEFINED
VEC_MMX_BITWISE(and, u, 32, 2)
# define VUINT32x2_AND_DEFINED
#endif

#ifndef VUINT32x2_OR_DEFINED
VEC_MMX_BITWISE(or, u, 32, 2)
# define VUINT32x2_OR_DEFINED
#endif

#ifndef VUINT32x2_XOR_DEFINED
VEC_MMX_BITWISE(xor, u, 32, 2)
# define VUINT32x2_XOR_DEFINED
#endif

#ifndef VUINT32x2_CMPEQ_DEFINED
VEC_MMX_CMPEQ(u, 32, 2)
# define VUINT32x2_CMPEQ_DEFINED
#endif

#ifndef VUINT32x2_CMPLT_DEFINED
VEC_MMX_CMPLT(u, 32, 2)
# define VUINT32x2_CMPLT_DEFINED
#endif

#ifndef VUINT32x2_CMPGT_DEFINED
VEC_MMX_CMPGT(u, 32, 2)
# define VUINT32x2_CMPGT_DEFINED
#endif

#endif /* VEC_IMPL_X86_MMX_H_ */