include/vec/impl/x86/mmx.h @ 37:4b5a557aa64f
*: turns out extern is a practical joke. rewrite to be always inline again
the sample benchmark performs about 3x as well with optimizations disabled :)
author    Paper <paper@tflc.us>
date      Sat, 26 Apr 2025 01:04:35 -0400
parents   677c03c382b8
children  fd42f9b1b95e
/**
 * vec - a tiny SIMD vector library in C99
 *
 * Copyright (c) 2024 Paper
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
**/

#ifndef VEC_IMPL_X86_MMX_H_
#define VEC_IMPL_X86_MMX_H_

/* ------------------------------------------------------------------------ */

#define VEC_MMX_OP_EX(name, intlsign, op, sign, bits, size, VARS, TRANS1, TRANS2) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		VARS \
	\
		TRANS1 \
	\
		vec1.mmx = _mm_##op##_p##intlsign##bits(vec1.mmx, vec2.mmx); \
	\
		TRANS2 \
	\
		return vec1; \
	}

#define VEC_MMX_OP(name, intlsign, op, sign, bits, size) \
	VEC_MMX_OP_EX(name, intlsign, op, sign, bits, size, /* nothing */, /* nothing */, /* nothing */)

/* ------------------------------------------------------------------------ */
/* comparison */

/* helper funcs */
#define VEC_xMMX_CMP(name, op, sign, bits, size, first, second, VARS, TRANS1, TRANS2) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		VARS \
	\
		TRANS1 \
	\
		vec1.mmx = _mm_##op##_pi##bits(vec##first.mmx, vec##second.mmx); \
	\
		TRANS2 \
	\
		return vec1; \
	}

#define VEC_MMX_CMP(name, op, bits, size, first, second) \
	VEC_xMMX_CMP(name, op, /* nothing */, bits, size, first, second, /* nothing */, /* nothing */, /* nothing */)
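/* The unsigned comparison variant below reuses the signed compare
 * intrinsics by flipping the sign bit of both operands first: XOR-ing
 * with 1u << (bits - 1) maps the unsigned range onto the signed range
 * while preserving order. For 8-bit lanes, 200u (0xC8) and 100u (0x64)
 * become +72 (0x48) and -28 (0xE4), so a signed greater-than still
 * ranks the original unsigned values correctly. */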
#define VEC_uMMX_CMP(name, op, bits, size, first, second) \
	VEC_xMMX_CMP(name, op, u, bits, size, first, second, \
		__m64 xor_val = _mm_set1_pi##bits((vec_int##bits)(1u << (bits - 1))); \
	, { \
		vec1.mmx = _mm_xor_si64(vec1.mmx, xor_val); \
		vec2.mmx = _mm_xor_si64(vec2.mmx, xor_val); \
	}, \
	{ \
		/* nothing */ \
	})

#define VEC_MMX_CMPEQ(sign, bits, size) VEC_xMMX_CMP(cmpeq, cmpeq, sign, bits, size, 1, 2, , ,)
#define VEC_MMX_CMPLT(sign, bits, size) VEC_##sign##MMX_CMP(cmplt, cmpgt, bits, size, 2, 1)
#define VEC_MMX_CMPGT(sign, bits, size) VEC_##sign##MMX_CMP(cmpgt, cmpgt, bits, size, 1, 2)

/* ------------------------------------------------------------------------ */

#define VEC_MMX_SPLAT(sign, bits, size) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(vec_##sign##int##bits x) \
	{ \
		v##sign##int##bits##x##size vec; \
		vec.mmx = _mm_set1_pi##bits(x); \
		return vec; \
	}

#define VEC_MMX_LOAD_EX(name, sign, bits, size) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(const vec_##sign##int##bits in[size]) \
	{ \
		v##sign##int##bits##x##size vec; \
		memcpy(&vec, in, sizeof(vec)); \
		return vec; \
	}

#define VEC_MMX_LOAD(sign, bits, size) VEC_MMX_LOAD_EX(load, sign, bits, size)
#define VEC_MMX_LOAD_ALIGNED(sign, bits, size) VEC_MMX_LOAD_EX(load_aligned, sign, bits, size)

#define VEC_MMX_STORE_EX(name, sign, bits, size) \
	VEC_FUNC_IMPL void v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
	{ \
		memcpy(out, &vec, sizeof(vec)); \
	}

#define VEC_MMX_STORE(sign, bits, size) VEC_MMX_STORE_EX(store, sign, bits, size)
#define VEC_MMX_STORE_ALIGNED(sign, bits, size) VEC_MMX_STORE_EX(store_aligned, sign, bits, size)

#define VEC_MMX_BITWISE(name, sign, bits, size) \
	VEC_FUNC_IMPL v##sign##int##bits##x##size v##sign##int##bits##x##size##_##name(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
	{ \
		vec1.mmx = _mm_##name##_si64(vec1.mmx, vec2.mmx); \
	\
		return vec1; \
	}

/* ------------------------------------------------------------------------ */

#ifndef VINT8x8_SPLAT_DEFINED
VEC_MMX_SPLAT(, 8, 8)
# define VINT8x8_SPLAT_DEFINED
#endif
#ifndef VINT8x8_LOAD_DEFINED
VEC_MMX_LOAD(, 8, 8)
# define VINT8x8_LOAD_DEFINED
#endif
#ifndef VINT8x8_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(, 8, 8)
# define VINT8x8_LOAD_ALIGNED_DEFINED
#endif
#ifndef VINT8x8_STORE_DEFINED
VEC_MMX_STORE(, 8, 8)
# define VINT8x8_STORE_DEFINED
#endif
#ifndef VINT8x8_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(, 8, 8)
# define VINT8x8_STORE_ALIGNED_DEFINED
#endif
#ifndef VINT8x8_ADD_DEFINED
VEC_MMX_OP(add, i, add, /* nothing */, 8, 8)
# define VINT8x8_ADD_DEFINED
#endif
#ifndef VINT8x8_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, /* nothing */, 8, 8)
# define VINT8x8_SUB_DEFINED
#endif
#ifndef VINT8x8_AND_DEFINED
VEC_MMX_BITWISE(and, /* nothing */, 8, 8)
# define VINT8x8_AND_DEFINED
#endif
#ifndef VINT8x8_OR_DEFINED
VEC_MMX_BITWISE(or, /* nothing */, 8, 8)
# define VINT8x8_OR_DEFINED
#endif
#ifndef VINT8x8_XOR_DEFINED
VEC_MMX_BITWISE(xor, /* nothing */, 8, 8)
# define VINT8x8_XOR_DEFINED
#endif
#ifndef VINT8x8_CMPEQ_DEFINED
VEC_MMX_CMPEQ(, 8, 8)
# define VINT8x8_CMPEQ_DEFINED
#endif
#ifndef VINT8x8_CMPLT_DEFINED
VEC_MMX_CMPLT(, 8, 8)
# define VINT8x8_CMPLT_DEFINED
#endif
#ifndef VINT8x8_CMPGT_DEFINED
VEC_MMX_CMPGT(, 8, 8)
# define VINT8x8_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */

#ifndef VUINT8x8_SPLAT_DEFINED
VEC_MMX_SPLAT(u, 8, 8)
# define VUINT8x8_SPLAT_DEFINED
#endif
#ifndef VUINT8x8_LOAD_DEFINED
VEC_MMX_LOAD(u, 8, 8)
# define VUINT8x8_LOAD_DEFINED
#endif
#ifndef VUINT8x8_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(u, 8, 8)
# define VUINT8x8_LOAD_ALIGNED_DEFINED
#endif
#ifndef VUINT8x8_STORE_DEFINED
VEC_MMX_STORE(u, 8, 8)
# define VUINT8x8_STORE_DEFINED
#endif
#ifndef VUINT8x8_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(u, 8, 8)
# define VUINT8x8_STORE_ALIGNED_DEFINED
#endif
#ifndef VUINT8x8_ADD_DEFINED
VEC_MMX_OP(add, i, add, u, 8, 8)
# define VUINT8x8_ADD_DEFINED
#endif
#ifndef VUINT8x8_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, u, 8, 8)
# define VUINT8x8_SUB_DEFINED
#endif
#ifndef VUINT8x8_AND_DEFINED
VEC_MMX_BITWISE(and, u, 8, 8)
# define VUINT8x8_AND_DEFINED
#endif
#ifndef VUINT8x8_OR_DEFINED
VEC_MMX_BITWISE(or, u, 8, 8)
# define VUINT8x8_OR_DEFINED
#endif
#ifndef VUINT8x8_XOR_DEFINED
VEC_MMX_BITWISE(xor, u, 8, 8)
# define VUINT8x8_XOR_DEFINED
#endif
#ifndef VUINT8x8_CMPEQ_DEFINED
VEC_MMX_CMPEQ(u, 8, 8)
# define VUINT8x8_CMPEQ_DEFINED
#endif
#ifndef VUINT8x8_CMPLT_DEFINED
VEC_MMX_CMPLT(u, 8, 8)
# define VUINT8x8_CMPLT_DEFINED
#endif
#ifndef VUINT8x8_CMPGT_DEFINED
VEC_MMX_CMPGT(u, 8, 8)
# define VUINT8x8_CMPGT_DEFINED
#endif
/* ------------------------------------------------------------------------ */

#define VEC_MMX_MUL_16x4(sign) \
	VEC_FUNC_IMPL v##sign##int16x4 v##sign##int16x4_mul(v##sign##int16x4 vec1, v##sign##int16x4 vec2) \
	{ \
		vec1.mmx = _mm_mullo_pi16(vec1.mmx, vec2.mmx); \
		return vec1; \
	}

#ifndef VINT16x4_SPLAT_DEFINED
VEC_MMX_SPLAT(, 16, 4)
# define VINT16x4_SPLAT_DEFINED
#endif
#ifndef VINT16x4_LOAD_DEFINED
VEC_MMX_LOAD(, 16, 4)
# define VINT16x4_LOAD_DEFINED
#endif
#ifndef VINT16x4_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(, 16, 4)
# define VINT16x4_LOAD_ALIGNED_DEFINED
#endif
#ifndef VINT16x4_STORE_DEFINED
VEC_MMX_STORE(, 16, 4)
# define VINT16x4_STORE_DEFINED
#endif
#ifndef VINT16x4_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(, 16, 4)
# define VINT16x4_STORE_ALIGNED_DEFINED
#endif
#ifndef VINT16x4_ADD_DEFINED
VEC_MMX_OP(add, i, add, /* nothing */, 16, 4)
# define VINT16x4_ADD_DEFINED
#endif
#ifndef VINT16x4_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, /* nothing */, 16, 4)
# define VINT16x4_SUB_DEFINED
#endif
#ifndef VINT16x4_MUL_DEFINED
VEC_MMX_MUL_16x4(/* nothing */)
# define VINT16x4_MUL_DEFINED
#endif
#ifndef VINT16x4_AND_DEFINED
VEC_MMX_BITWISE(and, /* nothing */, 16, 4)
# define VINT16x4_AND_DEFINED
#endif
#ifndef VINT16x4_OR_DEFINED
VEC_MMX_BITWISE(or, /* nothing */, 16, 4)
# define VINT16x4_OR_DEFINED
#endif
#ifndef VINT16x4_XOR_DEFINED
VEC_MMX_BITWISE(xor, /* nothing */, 16, 4)
# define VINT16x4_XOR_DEFINED
#endif
#ifndef VINT16x4_CMPEQ_DEFINED
VEC_MMX_CMPEQ(, 16, 4)
# define VINT16x4_CMPEQ_DEFINED
#endif
#ifndef VINT16x4_CMPLT_DEFINED
VEC_MMX_CMPLT(, 16, 4)
# define VINT16x4_CMPLT_DEFINED
#endif
#ifndef VINT16x4_CMPGT_DEFINED
VEC_MMX_CMPGT(, 16, 4)
# define VINT16x4_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */

#ifndef VUINT16x4_SPLAT_DEFINED
VEC_MMX_SPLAT(u, 16, 4)
# define VUINT16x4_SPLAT_DEFINED
#endif
#ifndef VUINT16x4_LOAD_DEFINED
VEC_MMX_LOAD(u, 16, 4)
# define VUINT16x4_LOAD_DEFINED
#endif
#ifndef VUINT16x4_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(u, 16, 4)
# define VUINT16x4_LOAD_ALIGNED_DEFINED
#endif
#ifndef VUINT16x4_STORE_DEFINED
VEC_MMX_STORE(u, 16, 4)
# define VUINT16x4_STORE_DEFINED
#endif
#ifndef VUINT16x4_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(u, 16, 4)
# define VUINT16x4_STORE_ALIGNED_DEFINED
#endif
#ifndef VUINT16x4_ADD_DEFINED
VEC_MMX_OP(add, i, add, u, 16, 4)
# define VUINT16x4_ADD_DEFINED
#endif
#ifndef VUINT16x4_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, u, 16, 4)
# define VUINT16x4_SUB_DEFINED
#endif
#ifndef VUINT16x4_MUL_DEFINED
VEC_MMX_MUL_16x4(u)
# define VUINT16x4_MUL_DEFINED
#endif
#ifndef VUINT16x4_AND_DEFINED
VEC_MMX_BITWISE(and, u, 16, 4)
# define VUINT16x4_AND_DEFINED
#endif
#ifndef VUINT16x4_OR_DEFINED
VEC_MMX_BITWISE(or, u, 16, 4)
# define VUINT16x4_OR_DEFINED
#endif
#ifndef VUINT16x4_XOR_DEFINED
VEC_MMX_BITWISE(xor, u, 16, 4)
# define VUINT16x4_XOR_DEFINED
#endif
#ifndef VUINT16x4_CMPEQ_DEFINED
VEC_MMX_CMPEQ(u, 16, 4)
# define VUINT16x4_CMPEQ_DEFINED
#endif
#ifndef VUINT16x4_CMPLT_DEFINED
VEC_MMX_CMPLT(u, 16, 4)
# define VUINT16x4_CMPLT_DEFINED
#endif
#ifndef VUINT16x4_CMPGT_DEFINED
VEC_MMX_CMPGT(u, 16, 4)
# define VUINT16x4_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */

#ifndef VINT32x2_SPLAT_DEFINED
VEC_MMX_SPLAT(, 32, 2)
# define VINT32x2_SPLAT_DEFINED
#endif
#ifndef VINT32x2_LOAD_DEFINED
VEC_MMX_LOAD(, 32, 2)
# define VINT32x2_LOAD_DEFINED
#endif
#ifndef VINT32x2_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(, 32, 2)
# define VINT32x2_LOAD_ALIGNED_DEFINED
#endif
#ifndef VINT32x2_STORE_DEFINED
VEC_MMX_STORE(, 32, 2)
# define VINT32x2_STORE_DEFINED
#endif
#ifndef VINT32x2_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(, 32, 2)
# define VINT32x2_STORE_ALIGNED_DEFINED
#endif
#ifndef VINT32x2_ADD_DEFINED
VEC_MMX_OP(add, i, add, /* nothing */, 32, 2)
# define VINT32x2_ADD_DEFINED
#endif
#ifndef VINT32x2_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, /* nothing */, 32, 2)
# define VINT32x2_SUB_DEFINED
#endif
#ifndef VINT32x2_AND_DEFINED
VEC_MMX_BITWISE(and, /* nothing */, 32, 2)
# define VINT32x2_AND_DEFINED
#endif
#ifndef VINT32x2_OR_DEFINED
VEC_MMX_BITWISE(or, /* nothing */, 32, 2)
# define VINT32x2_OR_DEFINED
#endif
#ifndef VINT32x2_XOR_DEFINED
VEC_MMX_BITWISE(xor, /* nothing */, 32, 2)
# define VINT32x2_XOR_DEFINED
#endif
#ifndef VINT32x2_CMPEQ_DEFINED
VEC_MMX_CMPEQ(, 32, 2)
# define VINT32x2_CMPEQ_DEFINED
#endif
#ifndef VINT32x2_CMPLT_DEFINED
VEC_MMX_CMPLT(, 32, 2)
# define VINT32x2_CMPLT_DEFINED
#endif
#ifndef VINT32x2_CMPGT_DEFINED
VEC_MMX_CMPGT(, 32, 2)
# define VINT32x2_CMPGT_DEFINED
#endif

/* ------------------------------------------------------------------------ */

#ifndef VUINT32x2_SPLAT_DEFINED
VEC_MMX_SPLAT(u, 32, 2)
# define VUINT32x2_SPLAT_DEFINED
#endif
#ifndef VUINT32x2_LOAD_DEFINED
VEC_MMX_LOAD(u, 32, 2)
# define VUINT32x2_LOAD_DEFINED
#endif
#ifndef VUINT32x2_LOAD_ALIGNED_DEFINED
VEC_MMX_LOAD_ALIGNED(u, 32, 2)
# define VUINT32x2_LOAD_ALIGNED_DEFINED
#endif
#ifndef VUINT32x2_STORE_DEFINED
VEC_MMX_STORE(u, 32, 2)
# define VUINT32x2_STORE_DEFINED
#endif
#ifndef VUINT32x2_STORE_ALIGNED_DEFINED
VEC_MMX_STORE_ALIGNED(u, 32, 2)
# define VUINT32x2_STORE_ALIGNED_DEFINED
#endif
#ifndef VUINT32x2_ADD_DEFINED
VEC_MMX_OP(add, i, add, u, 32, 2)
# define VUINT32x2_ADD_DEFINED
#endif
#ifndef VUINT32x2_SUB_DEFINED
VEC_MMX_OP(sub, i, sub, u, 32, 2)
# define VUINT32x2_SUB_DEFINED
#endif
#ifndef VUINT32x2_AND_DEFINED
VEC_MMX_BITWISE(and, u, 32, 2)
# define VUINT32x2_AND_DEFINED
#endif
#ifndef VUINT32x2_OR_DEFINED
VEC_MMX_BITWISE(or, u, 32, 2)
# define VUINT32x2_OR_DEFINED
#endif
#ifndef VUINT32x2_XOR_DEFINED
VEC_MMX_BITWISE(xor, u, 32, 2)
# define VUINT32x2_XOR_DEFINED
#endif
#ifndef VUINT32x2_CMPEQ_DEFINED
VEC_MMX_CMPEQ(u, 32, 2)
# define VUINT32x2_CMPEQ_DEFINED
#endif
#ifndef VUINT32x2_CMPLT_DEFINED
VEC_MMX_CMPLT(u, 32, 2)
# define VUINT32x2_CMPLT_DEFINED
#endif
#ifndef VUINT32x2_CMPGT_DEFINED
VEC_MMX_CMPGT(u, 32, 2)
# define VUINT32x2_CMPGT_DEFINED
#endif

#endif /* VEC_IMPL_X86_MMX_H_ */
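For context, the macros above expand to ordinary functions whose names follow the v<sign>int<bits>x<size>_<op> pattern, e.g. vint16x4_load, vint16x4_add and vint16x4_store. A minimal usage sketch under that assumption follows; the include path "vec/vec.h" (assumed to define the vint16x4 type) and the program itself are illustrative, not taken from the repository.

#include <stdio.h>

#include "vec/vec.h" /* hypothetical umbrella header assumed to define vint16x4 and vec_int16 */

int main(void)
{
	vec_int16 a[4] = { 1, 2, 3, 4 };
	vec_int16 b[4] = { 10, 20, 30, 40 };
	vec_int16 out[4];
	int i;

	/* load both operands, add lane-wise, store the result */
	vint16x4 va = vint16x4_load(a);
	vint16x4 vb = vint16x4_load(b);
	vint16x4_store(vint16x4_add(va, vb), out);

	for (i = 0; i < 4; i++)
		printf("%d ", out[i]); /* expected: 11 22 33 44 */
	putchar('\n');

	return 0;
}

Note that code touching MMX registers conventionally executes _mm_empty() before any later x87 floating-point work; whether vec issues it on the caller's behalf is not visible from this header.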