# HG changeset patch # User Paper # Date 1729574561 14400 # Node ID 02a517e4c4920802f62e6463caa1ebe7c8a72c88 *: initial commit diff -r 000000000000 -r 02a517e4c492 LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Tue Oct 22 01:22:41 2024 -0400 @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Paper + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff -r 000000000000 -r 02a517e4c492 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Tue Oct 22 01:22:41 2024 -0400 @@ -0,0 +1,46 @@ +vec - a tiny SIMD vector header-only library written in C99 + +it comes with an extremely basic (and somewhat lacking) API, +where there are eight supported vector types, all 128-bit: + + vint8x16 - 16 signed 8-bit integers + vint16x8 - 8 signed 16-bit integers + vint32x4 - 4 signed 32-bit integers + vint64x2 - 2 signed 64-bit integers + vuint8x16 - 16 unsigned 8-bit integers + vuint16x8 - 8 unsigned 16-bit integers + vuint32x4 - 4 unsigned 32-bit integers + vuint64x2 - 2 unsigned 64-bit integers + +all of these have many operations that are prefixed with the +name of the type and an underscore, for example: + + vint8x16 vint8x16_splat(int8_t x) + - creates a vint8x16 where all of the values are filled + with the value of `x' + +the current supported operations are: + + v[u]intAxB splat([u]intA_t x) + creates a vector with all of the values filled with + the value of `x' + + v[u]intAxB load(const [u]intA_t x[B]) + copies the values from the memory address stored at `x'; + the address is NOT required to be aligned + + void store(v[u]intAxB vec, [u]intA_t x[B]) + copies the values from the vector into the memory address + stored at `x' + + like with load(), this does not require address alignment + + v[u]intAxB add(v[u]intAxB vec1, v[u]intAxB vec2) + adds the value of `vec1' and `vec2' and returns it + + v[u]intAxB sub(v[u]intAxB vec1, v[u]intAxB vec2) + subtracts the value of `vec2' from `vec1' and returns it + + v[u]intAxB mul(v[u]intAxB vec1, v[u]intAxB vec2) + multiplies the values of `vec1' and `vec2' together and + returns it diff -r 000000000000 -r 02a517e4c492 include/vec/impl/altivec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/altivec.h Tue Oct 22 01:22:41 2024 -0400 @@ -0,0 +1,145 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * 
Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +/* Altivec vector support. 
*/ + +#include +#include + +#include + +#define VEC_ALTIVEC_ALIGNMENT 16 + +/* Since altivec conveniently made their API super user friendly, we can just use + * one giant macro to define literally everything (splat, load, store, add, sub, mul) */ +#define VEC_DEFINE_OPERATIONS(sign, bits, size) \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t i) \ + { \ + return vec_splats(i); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + { /* unaligned load: the second vec_ld uses offset 15, not 16, so an already-aligned input never touches the following 16-byte block */ \ + return vec_perm(vec_ld(0, in), vec_ld(VEC_ALTIVEC_ALIGNMENT - 1, in), vec_lvsl(0, in)); \ + } \ + \ + static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { /* vec_st goes to an aligned scratch array, which is then copied to the (possibly unaligned) destination */ \ + VEC_ALIGNED_ARRAY(sign##int##bits##_t, aligned_out, size, VEC_ALTIVEC_ALIGNMENT); \ + vec_st(vec, 0, aligned_out); \ + memcpy(out, aligned_out, size * sizeof(*aligned_out)); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return vec_add(vec1, vec2); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return vec_sub(vec1, vec2); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return vec_mul(vec1, vec2); \ + } + +#ifndef VEC_VUINT8X16 +# define VEC_VUINT8X16 +typedef vector unsigned char vuint8x16; +# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } +VEC_DEFINE_OPERATIONS(u, 8, 16) +# define VUINT8x16_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +#endif /* VEC_VUINT8X16 */ + 
+#ifndef VEC_VINT8X16 +# define VEC_VINT8X16 +typedef vector signed char vint8x16; +# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } +VEC_DEFINE_OPERATIONS(, 8, 16) +# define VINT8x16_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +#endif /* VEC_VINT8X16 */ + +#ifndef VEC_VUINT16X8 +# define VEC_VUINT16X8 +typedef vector unsigned short vuint16x8; +# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (vuint16x8){ a, b, c, d, e, f, g, h } +VEC_DEFINE_OPERATIONS(u, 16, 8) +# define VUINT16x8_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +#endif /* VEC_VUINT16X8 */ + +#ifndef VEC_VINT16X8 +# define VEC_VINT16X8 +typedef vector signed short vint16x8; +# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (vint16x8){ a, b, c, d, e, f, g, h } +VEC_DEFINE_OPERATIONS(, 16, 8) +# define VINT16x8_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +#endif /* VEC_VINT16X8 */ + +#ifndef VEC_VUINT32X4 +# define VEC_VUINT32X4 +typedef vector unsigned int vuint32x4; +# define VUINT32x4_CONSTANT(a, b, c, d) \ + (vuint32x4){ a, b, c, d } +VEC_DEFINE_OPERATIONS(u, 32, 4) +# define VUINT32x4_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +#endif /* VEC_VUINT32X4 */ + +#ifndef VEC_VINT32X4 +# define VEC_VINT32X4 +typedef vector signed int vint32x4; +# define VINT32x4_CONSTANT(a, b, c, d) \ + (vint32x4){ a, b, c, d } +VEC_DEFINE_OPERATIONS(, 32, 4) +# define VINT32x4_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +#endif /* VEC_VINT32X4 */ + +#if defined(__POWER8__) && defined(__VSX__) + +# ifndef VEC_VUINT64X2 +# define VEC_VUINT64X2 +typedef vector unsigned long long vuint64x2; +# define VUINT64x2_CONSTANT(a, b) \ + (vuint64x2){ a, b } +VEC_DEFINE_OPERATIONS(u, 64, 2) +# define VUINT64x2_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# endif /* VEC_VUINT64X2 */ + +# ifndef VEC_VINT64X2 +# define VEC_VINT64X2 +typedef vector signed long long vint64x2; +# define VINT64x2_CONSTANT(a, b) \ + (vint64x2){ 
a, b } +VEC_DEFINE_OPERATIONS(, 64, 2) +# define VINT64x2_ALIGNED(x) ((x) % VEC_ALTIVEC_ALIGNMENT == 0) +# endif /* VEC_VINT64X2 */ + +#endif /* defined(__POWER8__) && defined(__VSX__) */ + +#undef VEC_DEFINE_OPERATIONS diff -r 000000000000 -r 02a517e4c492 include/vec/impl/gcc.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/gcc.h Tue Oct 22 01:22:41 2024 -0400 @@ -0,0 +1,137 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+**/ + +/* GCC built in vectors (__vector_size__ types: lanes are indexed with [] and combined with + - *) */ + +#include +#include + +#define VEC_DEFINE_OPERATIONS(sign, bits, size) \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_splat(sign##int##bits##_t x) \ + { \ + v##sign##int##bits##x##size vec; \ + for (int i = 0; i < size; i++) vec[i] = x; \ + return vec; \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + memcpy(&vec, in, sizeof(vec)); \ + return vec; \ + } \ + \ + static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { \ + memcpy(out, &vec, sizeof(vec)); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return vec1 + vec2; \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return vec1 - vec2; \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return vec1 * vec2; \ + } + +#ifndef VEC_VINT8X16 +# define VEC_VINT8X16 +typedef int8_t vint8x16 __attribute__((__vector_size__(16))); +# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (vint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } +VEC_DEFINE_OPERATIONS(, 8, 16) +# define VINT8x16_ALIGNED 1 +#endif + +#ifndef VEC_VINT16X8 +# define VEC_VINT16X8 +typedef int16_t vint16x8 __attribute__((__vector_size__(16))); +# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (vint16x8){ a, b, c, d, e, f, g, h } +VEC_DEFINE_OPERATIONS(, 16, 8) +# define VINT16x8_ALIGNED 1 +#endif + +#ifndef 
VEC_VINT32X4 +# define VEC_VINT32X4 +typedef int32_t vint32x4 __attribute__((__vector_size__(16))); +# define VINT32x4_CONSTANT(a, b, c, d) \ + (vint32x4){ a, b, c, d } +VEC_DEFINE_OPERATIONS(, 32, 4) +# define VINT32x4_ALIGNED 1 +#endif + +#ifndef VEC_VINT64X2 +# define VEC_VINT64X2 +typedef int64_t vint64x2 __attribute__((__vector_size__(16))); +# define VINT64x2_CONSTANT(a, b) \ + (vint64x2){ a, b } +VEC_DEFINE_OPERATIONS(, 64, 2) +# define VINT64x2_ALIGNED 1 +#endif + +#ifndef VEC_VUINT8X16 +# define VEC_VUINT8X16 +typedef uint8_t vuint8x16 __attribute__((__vector_size__(16))); +# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (vuint8x16){ a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } +VEC_DEFINE_OPERATIONS(u, 8, 16) +# define VUINT8x16_ALIGNED 1 +#endif + +#ifndef VEC_VUINT16X8 +# define VEC_VUINT16X8 +typedef uint16_t vuint16x8 __attribute__((__vector_size__(16))); +# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (vuint16x8){ a, b, c, d, e, f, g, h } +VEC_DEFINE_OPERATIONS(u, 16, 8) +# define VUINT16x8_ALIGNED 1 +#endif + +#ifndef VEC_VUINT32X4 +# define VEC_VUINT32X4 +typedef uint32_t vuint32x4 __attribute__((__vector_size__(16))); +# define VUINT32x4_CONSTANT(a, b, c, d) \ + (vuint32x4){ a, b, c, d } +VEC_DEFINE_OPERATIONS(u, 32, 4) +# define VUINT32x4_ALIGNED 1 +#endif + +#ifndef VEC_VUINT64X2 +# define VEC_VUINT64X2 /* must match the guard above, or generic.h defines vuint64x2 a second time */ +typedef uint64_t vuint64x2 __attribute__((__vector_size__(16))); +# define VUINT64x2_CONSTANT(a, b) \ + (vuint64x2){ a, b } +VEC_DEFINE_OPERATIONS(u, 64, 2) +# define VUINT64x2_ALIGNED 1 +#endif + +#undef VEC_DEFINE_OPERATIONS diff -r 000000000000 -r 02a517e4c492 include/vec/impl/generic.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/generic.h Tue Oct 22 01:22:41 2024 -0400 @@ -0,0 +1,149 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software 
and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. +**/ + +/* Generic array-based implementation. 
*/ + +#include +#include + +#define VEC_DEFINE_STRUCT(sign, bits, size) \ + typedef struct { \ + sign##int##bits##_t arr[size]; \ + } v##sign##int##bits##x##size; +/* add/sub/mul compute each lane in uint64_t and narrow afterwards, so lanes wrap modulo 2^bits instead of hitting signed-overflow UB */ +#define VEC_DEFINE_OPERATIONS(sign, bits, size) \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v ## sign ## int ## bits ## x ## size ## _splat(sign ## int ## bits ## _t x) \ + { \ + v##sign##int##bits##x##size vec; \ + for (int i = 0; i < size; i++) vec.arr[i] = x; \ + return vec; \ + } \ + \ + static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _load(const sign ## int ## bits ## _t in[size]) \ + { \ + v##sign##int##bits##x##size vec; \ + memcpy(vec.arr, in, sizeof(vec.arr)); \ + return vec; \ + } \ + \ + static inline VEC_ALWAYS_INLINE void v ## sign ## int ## bits ## x ## size ## _store(v ## sign ## int ## bits ## x ## size vec, sign ## int ## bits ## _t out[size]) \ + { \ + memcpy(out, vec.arr, sizeof(vec.arr)); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _add(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + for (int i = 0; i < size; i++) vec.arr[i] = (sign##int##bits##_t)((uint64_t)vec1.arr[i] + (uint64_t)vec2.arr[i]); \ + return vec; \ + } \ + \ + static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _sub(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + for (int i = 0; i < size; i++) vec.arr[i] = (sign##int##bits##_t)((uint64_t)vec1.arr[i] - (uint64_t)vec2.arr[i]); \ + return vec; \ + } \ + \ + static inline VEC_ALWAYS_INLINE v ## sign ## int ## bits ## x ## size v ## sign ## int ## bits ## x ## size ## _mul(v ## sign ## int ## bits ## x ## size vec1, v ## sign ## int ## bits ## x ## size vec2) \ + { \ + v##sign##int##bits##x##size vec; \ + for (int i = 0; i < size; i++) vec.arr[i] = (sign##int##bits##_t)((uint64_t)vec1.arr[i] * 
(uint64_t)vec2.arr[i]); \ + return vec; \ + } + +#ifndef VEC_VINT8X16 +# define VEC_VINT8X16 +VEC_DEFINE_STRUCT(, 8, 16) +# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + ((vint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) +VEC_DEFINE_OPERATIONS(, 8, 16) +# define VINT8x16_ALIGNED 1 +#endif + +#ifndef VEC_VINT16X8 +# define VEC_VINT16X8 +VEC_DEFINE_STRUCT(, 16, 8) +# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + ((vint16x8){ .arr = { a, b, c, d, e, f, g, h } }) +VEC_DEFINE_OPERATIONS(, 16, 8) +# define VINT16x8_ALIGNED 1 +#endif + +#ifndef VEC_VINT32X4 +# define VEC_VINT32X4 +VEC_DEFINE_STRUCT(, 32, 4) +# define VINT32x4_CONSTANT(a, b, c, d) \ + ((vint32x4){ .arr = { a, b, c, d } }) +VEC_DEFINE_OPERATIONS(, 32, 4) +# define VINT32x4_ALIGNED 1 +#endif + +#ifndef VEC_VINT64X2 +# define VEC_VINT64X2 +VEC_DEFINE_STRUCT(, 64, 2) +# define VINT64x2_CONSTANT(a, b) \ + ((vint64x2){ .arr = { a, b } }) +VEC_DEFINE_OPERATIONS(, 64, 2) +# define VINT64x2_ALIGNED 1 +#endif + +#ifndef VEC_VUINT8X16 +# define VEC_VUINT8X16 +VEC_DEFINE_STRUCT(u, 8, 16) +# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + ((vuint8x16){ .arr = { a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p } }) +VEC_DEFINE_OPERATIONS(u, 8, 16) +# define VUINT8x16_ALIGNED 1 +#endif + +#ifndef VEC_VUINT16X8 +# define VEC_VUINT16X8 +VEC_DEFINE_STRUCT(u, 16, 8) +# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + ((vuint16x8){ .arr = { a, b, c, d, e, f, g, h } }) +VEC_DEFINE_OPERATIONS(u, 16, 8) +# define VUINT16x8_ALIGNED 1 +#endif + +#ifndef VEC_VUINT32X4 +# define VEC_VUINT32X4 +VEC_DEFINE_STRUCT(u, 32, 4) +# define VUINT32x4_CONSTANT(a, b, c, d) \ + ((vuint32x4){ .arr = { a, b, c, d } }) +VEC_DEFINE_OPERATIONS(u, 32, 4) +# define VUINT32x4_ALIGNED 1 +#endif + +#ifndef VEC_VUINT64X2 +# define VEC_VUINT64X2 +VEC_DEFINE_STRUCT(u, 64, 2) +# define VUINT64x2_CONSTANT(a, b) \ + ((vuint64x2){ .arr = { a, b } }) +VEC_DEFINE_OPERATIONS(u, 64, 2) 
+# define VUINT64x2_ALIGNED 1 +#endif + +#undef VEC_DEFINE_STRUCT +#undef VEC_DEFINE_OPERATIONS diff -r 000000000000 -r 02a517e4c492 include/vec/impl/sse2.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/impl/sse2.h Tue Oct 22 01:22:41 2024 -0400 @@ -0,0 +1,255 @@ +/** + * vec - a tiny SIMD vector library in plain C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+**/ + +#include +/* load/store/add/sub are identical for every lane width; mul and splat are written out per type below */ +#define VEC_DEFINE_OPERATIONS(sign, bits, size) \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_load(const sign##int##bits##_t in[size]) \ + { \ + return _mm_loadu_si128((const __m128i *)in); \ + } \ + \ + static inline VEC_ALWAYS_INLINE void v##sign##int##bits##x##size##_store(v##sign##int##bits##x##size vec, sign##int##bits##_t out[size]) \ + { /* unaligned store, the counterpart of the unaligned load above */ \ + _mm_storeu_si128((__m128i *)out, vec); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return _mm_add_epi##bits(vec1, vec2); \ + } \ + \ + static inline VEC_ALWAYS_INLINE v##sign##int##bits##x##size v##sign##int##bits##x##size##_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \ + { \ + return _mm_sub_epi##bits(vec1, vec2); \ + } + +#ifndef VEC_VINT8X16 +# define VEC_VINT8X16 +typedef __m128i vint8x16; +# define VINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (_mm_setr_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p)) +VEC_DEFINE_OPERATIONS(, 8, 16) +# define VINT8x16_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vint8x16 vint8x16_mul(vint8x16 vec1, vint8x16 vec2) +{ + // SSE2 has no 8-bit multiply: multiply even and odd bytes as 16-bit lanes + __m128i dst_even = _mm_mullo_epi16(vec1, vec2); + __m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8)); + + // repack: keep the low byte of each product + return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8)); +} +static inline VEC_ALWAYS_INLINE vint8x16 vint8x16_splat(int8_t c) +{ + return VINT8x16_CONSTANT(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c); +} +#endif + +#ifndef VEC_VINT16X8 +# define VEC_VINT16X8 +typedef __m128i vint16x8; +# define VINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (_mm_setr_epi16(a, b, c, d, e, f, g, h)) +VEC_DEFINE_OPERATIONS(, 16, 8) +# define VINT16x8_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vint16x8 
vint16x8_mul(vint16x8 vec1, vint16x8 vec2) +{ + return _mm_mullo_epi16(vec1, vec2); +} +static inline VEC_ALWAYS_INLINE vint16x8 vint16x8_splat(int16_t c) +{ + return VINT16x8_CONSTANT(c, c, c, c, c, c, c, c); +} +#endif + +#ifndef VEC_VINT32X4 +# define VEC_VINT32X4 +typedef __m128i vint32x4; +# define VINT32x4_CONSTANT(a, b, c, d) \ + (_mm_setr_epi32(a, b, c, d)) +VEC_DEFINE_OPERATIONS(, 32, 4) +# define VINT32x4_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vint32x4 vint32x4_mul(vint32x4 a, vint32x4 b) +{ + __m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1) + __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1) + __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0) + __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) + __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) + __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) + return _mm_unpacklo_epi64(prod01, prod23); // (ab3,ab2,ab1,ab0) +} +static inline VEC_ALWAYS_INLINE vint32x4 vint32x4_splat(int32_t c) +{ + return VINT32x4_CONSTANT(c, c, c, c); +} +#endif + +#ifndef VEC_VINT64X2 +# define VEC_VINT64X2 +typedef __m128i vint64x2; +static inline VEC_ALWAYS_INLINE vint64x2 VINT64x2_CONSTANT(int64_t a, int64_t b) +{ + return _mm_setr_epi32((int32_t)a, (int32_t)(a >> 32), (int32_t)b, (int32_t)(b >> 32)); /* setr: first argument is the lowest lane, so a is lane 0 like the other *_CONSTANT macros */ +} +VEC_DEFINE_OPERATIONS(, 64, 2) +# define VINT64x2_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vint64x2 vint64x2_mul(vint64x2 ab, vint64x2 cd) +{ + /* ac = (ab & 0xFFFFFFFF) * (cd & 0xFFFFFFFF); */ + __m128i ac = _mm_mul_epu32(ab, cd); + + /* b = ab >> 32; */ + __m128i b = _mm_srli_epi64(ab, 32); + + /* bc = b * (cd & 0xFFFFFFFF); */ + __m128i bc = _mm_mul_epu32(b, cd); + + /* d = cd >> 32; */ + __m128i d = _mm_srli_epi64(cd, 32); + + /* ad = (ab & 0xFFFFFFFF) * d; */ + __m128i ad = _mm_mul_epu32(ab, d); + + /* high = bc + ad; */ + __m128i high = _mm_add_epi64(bc, ad); + + /* high <<= 32; */ + high = _mm_slli_epi64(high, 32); + + /* return ac + high; */ + return _mm_add_epi64(high, 
ac); +} +static inline VEC_ALWAYS_INLINE vint64x2 vint64x2_splat(int64_t c) +{ + return VINT64x2_CONSTANT(c, c); +} +#endif + +#ifndef VEC_VUINT8X16 +# define VEC_VUINT8X16 +typedef __m128i vuint8x16; +# define VUINT8x16_CONSTANT(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + (_mm_setr_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p)) +VEC_DEFINE_OPERATIONS(u, 8, 16) +# define VUINT8x16_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vuint8x16 vuint8x16_mul(vuint8x16 vec1, vuint8x16 vec2) +{ + // SSE2 has no 8-bit multiply: multiply even and odd bytes as 16-bit lanes + __m128i dst_even = _mm_mullo_epi16(vec1, vec2); + __m128i dst_odd = _mm_mullo_epi16(_mm_srli_epi16(vec1, 8), _mm_srli_epi16(vec2, 8)); + + // repack: keep the low byte of each product + return _mm_or_si128(_mm_slli_epi16(dst_odd, 8), _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8)); +} +static inline VEC_ALWAYS_INLINE vuint8x16 vuint8x16_splat(uint8_t c) +{ + return VUINT8x16_CONSTANT(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c); +} +#endif + +#ifndef VEC_VUINT16X8 +# define VEC_VUINT16X8 +typedef __m128i vuint16x8; +# define VUINT16x8_CONSTANT(a, b, c, d, e, f, g, h) \ + (_mm_setr_epi16(a, b, c, d, e, f, g, h)) +VEC_DEFINE_OPERATIONS(u, 16, 8) +# define VUINT16x8_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vuint16x8 vuint16x8_mul(vuint16x8 vec1, vuint16x8 vec2) +{ + return _mm_mullo_epi16(vec1, vec2); +} +static inline VEC_ALWAYS_INLINE vuint16x8 vuint16x8_splat(uint16_t c) +{ + return VUINT16x8_CONSTANT(c, c, c, c, c, c, c, c); +} +#endif + +#ifndef VEC_VUINT32X4 +# define VEC_VUINT32X4 +typedef __m128i vuint32x4; +# define VUINT32x4_CONSTANT(a, b, c, d) \ + (_mm_setr_epi32(a, b, c, d)) +VEC_DEFINE_OPERATIONS(u, 32, 4) +# define VUINT32x4_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vuint32x4 vuint32x4_mul(vuint32x4 a, vuint32x4 b) +{ + /* this was stolen from... 
somewhere :) */ + __m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1) + __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1) + __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0) + __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) + __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) + __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) + return _mm_unpacklo_epi64(prod01, prod23); // (ab3,ab2,ab1,ab0) +} +static inline VEC_ALWAYS_INLINE vuint32x4 vuint32x4_splat(uint32_t c) +{ + return VUINT32x4_CONSTANT(c, c, c, c); +} +#endif + +#ifndef VEC_VUINT64X2 +# define VEC_VUINT64X2 +typedef __m128i vuint64x2; +static inline VEC_ALWAYS_INLINE vuint64x2 VUINT64x2_CONSTANT(uint64_t a, uint64_t b) +{ + return _mm_setr_epi32((int32_t)a, (int32_t)(a >> 32), (int32_t)b, (int32_t)(b >> 32)); /* setr: first argument is the lowest lane, so a is lane 0 like the other *_CONSTANT macros */ +} +VEC_DEFINE_OPERATIONS(u, 64, 2) +# define VUINT64x2_ALIGNED 1 +static inline VEC_ALWAYS_INLINE vuint64x2 vuint64x2_mul(vuint64x2 ab, vuint64x2 cd) +{ + /* ac = (ab & 0xFFFFFFFF) * (cd & 0xFFFFFFFF); */ + __m128i ac = _mm_mul_epu32(ab, cd); + + /* b = ab >> 32; */ + __m128i b = _mm_srli_epi64(ab, 32); + + /* bc = b * (cd & 0xFFFFFFFF); */ + __m128i bc = _mm_mul_epu32(b, cd); + + /* d = cd >> 32; */ + __m128i d = _mm_srli_epi64(cd, 32); + + /* ad = (ab & 0xFFFFFFFF) * d; */ + __m128i ad = _mm_mul_epu32(ab, d); + + /* high = bc + ad; */ + __m128i high = _mm_add_epi64(bc, ad); + + /* high <<= 32; */ + high = _mm_slli_epi64(high, 32); + + /* return ac + high; */ + return _mm_add_epi64(high, ac); +} +static inline VEC_ALWAYS_INLINE vuint64x2 vuint64x2_splat(uint64_t c) +{ + return VUINT64x2_CONSTANT(c, c); +} +#endif + +#undef VEC_DEFINE_OPERATIONS diff -r 000000000000 -r 02a517e4c492 include/vec/vec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/vec/vec.h Tue Oct 22 01:22:41 2024 -0400 @@ -0,0 +1,97 @@ +/** + * vec - a tiny SIMD vector library in C99 + * + * Copyright (c) 2024 Paper + * + * Permission is hereby granted, free of charge, to any person 
obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+**/ + +#ifndef VEC_VEC_H_ +#define VEC_VEC_H_ +/* nonzero when version a.b.c is at least x.y.z (lexicographic comparison) */ +#define VEC_SEMVER_ATLEAST(a, b, c, x, y, z) \ + (((a) >= (x)) && \ + ((a) > (x) || (b) >= (y)) && \ + ((a) > (x) || (b) > (y) || (c) >= (z))) + +#define VEC_GCC_ATLEAST(x, y, z) \ + VEC_SEMVER_ATLEAST(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__, x, y, z) + +/* GCC/clang attributes */ +#if defined(__has_attribute) +# if __has_attribute(__always_inline__) +# define VEC_ALWAYS_INLINE __attribute__((__always_inline__)) +# endif +# if __has_attribute(__aligned__) +# define VEC_ALIGNED(x) __attribute__((__aligned__(x))) +# endif +# if __has_attribute(__vector_size__) +# define VEC_HAVE_GCC_VECTORS +# endif +#endif + +#ifndef VEC_HAVE_GCC_VECTORS +# if __GNUC__ >= 4 || __clang_major__ >= 3 +# define VEC_HAVE_GCC_VECTORS +# endif +#endif + +#ifndef VEC_ALIGNED +# if VEC_GCC_ATLEAST(2, 7, 0) +# define VEC_ALIGNED(x) __attribute__((aligned(x))) +# endif +#endif +/* VEC_ALWAYS_INLINE must be object-like: the impl headers use it without arguments */ +#ifndef VEC_ALWAYS_INLINE +# if VEC_GCC_ATLEAST(3, 1, 0) +# define VEC_ALWAYS_INLINE __attribute__((always_inline)) +# endif +#endif +/* no attribute available: expand to nothing so the declarations still parse */ +#ifndef VEC_ALWAYS_INLINE +# define VEC_ALWAYS_INLINE +#endif + +#ifdef VEC_ALIGNED +# define VEC_ALIGNED_ARRAY(type, var, size, align) \ + VEC_ALIGNED(align) type var[size] +#else +/* allocate more than necessary to align, then round the address up to a multiple of align */ +# define VEC_ALIGNED_ARRAY(type, var, size, align) \ + type var##_unaligned_[(size) + (align) - 1]; \ + type *var = (type *)((((uintptr_t)var##_unaligned_ + (align) - 1) / (align)) * (align)) +#endif + +/* POWER altivec */ +#ifdef __ALTIVEC__ +# include "impl/altivec.h" +#endif + +/* x86_64 SSE2+ */ +#ifdef __SSE2__ +# include "impl/sse2.h" +#endif + +#ifdef VEC_HAVE_GCC_VECTORS +# include "impl/gcc.h" +#endif + +#include "impl/generic.h" + +#endif /* VEC_VEC_H_ */