annotate src/impl/x86/sse2.c @ 25:92156fe32755

impl/ppc/altivec: update to new implementation the signed average function is wrong; it needs to round up the number when only one of them is odd, but that doesn't necessarily seem to be true because altivec is weird, and that's what we need to emulate the quirks for. ugh. also the altivec backend uses the generic functions instead of fallbacks because it does indeed use the exact same memory structure as the generic implementation...
author Paper <paper@tflc.us>
date Sun, 24 Nov 2024 11:15:59 +0000
parents e49e70f7012f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
23
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
1 /**
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
2 * vec - a tiny SIMD vector library in C99
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
3 *
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
4 * Copyright (c) 2024 Paper
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
5 *
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
7 * of this software and associated documentation files (the "Software"), to deal
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
8 * in the Software without restriction, including without limitation the rights
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
10 * copies of the Software, and to permit persons to whom the Software is
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
11 * furnished to do so, subject to the following conditions:
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
12 *
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
13 * The above copyright notice and this permission notice shall be included in all
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
14 * copies or substantial portions of the Software.
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
15 *
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
22 * SOFTWARE.
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
23 **/
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
24
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
25 #include "vec/impl/x86/sse2.h"
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
26 #include "vec/impl/generic.h"
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
27
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
28 #include <emmintrin.h>
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
29
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
30 #define VEC_SSE2_OPERATION_8x16(op, sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
31 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
32 /* unpack and multiply */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
33 union v##sign##int8x16_impl_data *vec1d = (union v##sign##int8x16_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
34 union v##sign##int8x16_impl_data *vec2d = (union v##sign##int8x16_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
35 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
36 __m128i dst_even = _mm_##op##_epi16(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
37 __m128i dst_odd = _mm_##op##_epi16(_mm_srli_epi16(vec1d->sse, 8), _mm_srli_epi16(vec2d->sse, 8)); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
38 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
39 /* repack */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
40 vec1d->sse = _mm_or_si128( \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
41 _mm_slli_epi16(dst_odd, 8), \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
42 _mm_srli_epi16(_mm_slli_epi16(dst_even, 8), 8) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
43 ); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
44 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
45 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
46
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
47 // shared between SSE2 variations
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
48 #define VEC_SSE2_MUL_8x16(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
49 VEC_SSE2_OPERATION_8x16(mullo, sign)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
50
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
51 #define VEC_SSE2_MUL_16x8(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
52 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
53 /* we have a real instruction for this */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
54 union v##sign##int16x8_impl_data *vec1d = (union v##sign##int16x8_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
55 union v##sign##int16x8_impl_data *vec2d = (union v##sign##int16x8_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
56 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
57 vec1d->sse = _mm_mullo_epi16(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
58 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
59 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
60
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
61 #define VEC_SSE2_MUL_32x4(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
62 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
63 /* this was stolen from... somewhere :) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
64 union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
65 union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
66 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
67 __m128i a13 = _mm_shuffle_epi32(vec1d->sse, 0xF5); /* (-,a3,-,a1) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
68 __m128i b13 = _mm_shuffle_epi32(vec2d->sse, 0xF5); /* (-,b3,-,b1) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
69 __m128i prod02 = _mm_mul_epu32(vec1d->sse, vec2d->sse); /* (-,a2*b2,-,a0*b0) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
70 __m128i prod13 = _mm_mul_epu32(a13, b13); /* (-,a3*b3,-,a1*b1) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
71 __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); /* (-,-,a1*b1,a0*b0) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
72 __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); /* (-,-,a3*b3,a2*b2) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
73 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
74 vec1d->sse = _mm_srl_epi64(prod01, prod23); /* (ab3,ab2,ab1,ab0) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
75 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
76 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
77
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
78 #define VEC_SSE2_MUL_64x2(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
79 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
80 union v##sign##int64x2_impl_data *vec1d = (union v##sign##int64x2_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
81 union v##sign##int64x2_impl_data *vec2d = (union v##sign##int64x2_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
82 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
83 __m128i ac = _mm_mul_epu32(vec1d->sse, vec2d->sse); /* ac = (vec1 & UINT32_MAX) * (vec2 & UINT32_MAX); */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
84 __m128i b = _mm_srli_epi64(vec1d->sse, 32); /* b = vec1 >> 32; */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
85 __m128i bc = _mm_mul_epu32(b, vec2d->sse); /* bc = b * (vec2 & UINT32_MAX); */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
86 __m128i d = _mm_srli_epi64(vec2d->sse, 32); /* d = vec2 >> 32; */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
87 __m128i ad = _mm_mul_epu32(vec1d->sse, d); /* ad = (vec1 & UINT32_MAX) * d; */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
88 __m128i hi = _mm_add_epi64(bc, ad); /* hi = bc + ad; */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
89 hi = _mm_slli_epi64(hi, 32); /* hi <<= 32; */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
90 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
91 vec1d->sse = _mm_add_epi64(hi, ac); /* (ab3,ab2,ab1,ab0) */ \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
92 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
93 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
94
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
95 #define VEC_SSE2_CMPEQ_8x16(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
96 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
97 union v##sign##int8x16_impl_data *vec1d = (union v##sign##int8x16_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
98 union v##sign##int8x16_impl_data *vec2d = (union v##sign##int8x16_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
99 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
100 vec1d->sse = _mm_cmpeq_epi8(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
101 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
102 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
103
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
104 #define VEC_SSE2_CMPEQ_16x8(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
105 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
106 union v##sign##int16x8_impl_data *vec1d = (union v##sign##int16x8_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
107 union v##sign##int16x8_impl_data *vec2d = (union v##sign##int16x8_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
108 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
109 vec1d->sse = _mm_cmpeq_epi16(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
110 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
111 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
112
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
113 #define VEC_SSE2_CMPEQ_32x4(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
114 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
115 union v##sign##int32x4_impl_data *vec1d = (union v##sign##int32x4_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
116 union v##sign##int32x4_impl_data *vec2d = (union v##sign##int32x4_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
117 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
118 vec1d->sse = _mm_cmpeq_epi32(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
119 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
120 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
121
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
122 // SSE2 doesn't have an intrinsic for 64x2 equality comparison,
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
123 // so how can we take a 32x4 comparison result and turn it into
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
124 // a 64x2 comparison result?
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
125 //
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
126 // well, Intel conveniently provided an operation where we can
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
127 // shuffle around 32-bit integers (_mm_shuffle_epi32).
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
128 //
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
129 // this means all we have to do is simply do the 32-bit operation,
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
130 // shuffle the parts, and then return a bitwise AND of the result.
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
131
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
132 #define VEC_SSE2_CMPEQ_64x2(sign) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
133 do { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
134 union v##sign##int64x2_impl_data *vec1d = (union v##sign##int64x2_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
135 union v##sign##int64x2_impl_data *vec2d = (union v##sign##int64x2_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
136 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
137 vec1d->sse = _mm_cmpeq_epi32(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
138 vec2d->sse = _mm_shuffle_epi32(vec1d->sse, _MM_SHUFFLE(1, 1, 3, 3)); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
139 vec1d->sse = _mm_shuffle_epi32(vec1d->sse, _MM_SHUFFLE(0, 0, 2, 2)); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
140 vec1d->sse = _mm_and_si128(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
141 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
142 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
143 } while (0)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
144
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
145 #define VEC_SSE2_DEFINE_OPERATIONS_SIGN(sign, bits, size) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
146 union v##sign##int##bits##x##size##_impl_data { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
147 v##sign##int##bits##x##size vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
148 __m128i sse; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
149 }; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
150 \
24
e49e70f7012f impl/x86: add static assertions for alignment and size
Paper <paper@tflc.us>
parents: 23
diff changeset
151 VEC_STATIC_ASSERT(VEC_ALIGNOF(__m128i) <= VEC_ALIGNOF(v##sign##int##bits##x##size), "vec: v" #sign "int" #bits "x" #size " alignment needs to be expanded to fit intrinsic type size"); \
e49e70f7012f impl/x86: add static assertions for alignment and size
Paper <paper@tflc.us>
parents: 23
diff changeset
152 VEC_STATIC_ASSERT(sizeof(__m128i) <= sizeof(v##sign##int##bits##x##size), "vec: v" #sign "int" #bits "x" #size " needs to be expanded to fit intrinsic type size"); \
e49e70f7012f impl/x86: add static assertions for alignment and size
Paper <paper@tflc.us>
parents: 23
diff changeset
153 \
23
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
154 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load_aligned(const vec_##sign##int##bits in[size]) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
155 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
156 union v##sign##int##bits##x##size##_impl_data vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
157 vec.sse = _mm_load_si128((const __m128i *)in); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
158 return vec.vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
159 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
160 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
161 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_load(const vec_##sign##int##bits in[size]) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
162 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
163 union v##sign##int##bits##x##size##_impl_data vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
164 vec.sse = _mm_loadu_si128((const __m128i *)in); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
165 return vec.vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
166 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
167 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
168 void v##sign##int##bits##x##size##_sse2_store_aligned(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
169 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
170 _mm_store_si128((__m128i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
171 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
172 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
173 void v##sign##int##bits##x##size##_sse2_store(v##sign##int##bits##x##size vec, vec_##sign##int##bits out[size]) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
174 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
175 _mm_storeu_si128((__m128i *)out, ((union v##sign##int##bits##x##size##_impl_data *)&vec)->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
176 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
177 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
178 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_add(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
179 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
180 union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
181 union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
182 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
183 vec1d->sse = _mm_add_epi##bits(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
184 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
185 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
186 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
187 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_sub(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
188 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
189 union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
190 union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
191 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
192 vec1d->sse = _mm_sub_epi##bits(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
193 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
194 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
195 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
196 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_mul(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
197 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
198 VEC_SSE2_MUL_##bits##x##size(sign); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
199 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
200 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
201 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_and(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
202 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
203 union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
204 union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
205 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
206 vec1d->sse = _mm_and_si128(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
207 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
208 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
209 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
210 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_or(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
211 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
212 union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
213 union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
214 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
215 vec1d->sse = _mm_or_si128(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
216 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
217 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
218 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
219 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_xor(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
220 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
221 union v##sign##int##bits##x##size##_impl_data *vec1d = (union v##sign##int##bits##x##size##_impl_data *)&vec1; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
222 union v##sign##int##bits##x##size##_impl_data *vec2d = (union v##sign##int##bits##x##size##_impl_data *)&vec2; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
223 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
224 vec1d->sse = _mm_xor_si128(vec1d->sse, vec2d->sse); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
225 return vec1d->vec; \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
226 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
227 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
228 v##sign##int##bits##x##size v##sign##int##bits##x##size##_sse2_cmpeq(v##sign##int##bits##x##size vec1, v##sign##int##bits##x##size vec2) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
229 { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
230 VEC_SSE2_CMPEQ_##bits##x##size(sign); \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
231 } \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
232 \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
233 const v##sign##int##bits##x##size##_impl v##sign##int##bits##x##size##_impl_sse2 = { \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
234 v##sign##int##bits##x##size##_generic_splat, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
235 v##sign##int##bits##x##size##_sse2_load_aligned, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
236 v##sign##int##bits##x##size##_sse2_load, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
237 v##sign##int##bits##x##size##_sse2_store_aligned, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
238 v##sign##int##bits##x##size##_sse2_store, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
239 v##sign##int##bits##x##size##_sse2_add, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
240 v##sign##int##bits##x##size##_sse2_sub, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
241 v##sign##int##bits##x##size##_sse2_mul, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
242 v##sign##int##bits##x##size##_generic_div, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
243 v##sign##int##bits##x##size##_generic_avg, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
244 v##sign##int##bits##x##size##_sse2_and, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
245 v##sign##int##bits##x##size##_sse2_or, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
246 v##sign##int##bits##x##size##_sse2_xor, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
247 v##sign##int##bits##x##size##_generic_not, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
248 v##sign##int##bits##x##size##_generic_lshift, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
249 v##sign##int##bits##x##size##_generic_rshift, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
250 v##sign##int##bits##x##size##_generic_lrshift, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
251 v##sign##int##bits##x##size##_generic_cmplt, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
252 v##sign##int##bits##x##size##_generic_cmple, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
253 v##sign##int##bits##x##size##_sse2_cmpeq, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
254 v##sign##int##bits##x##size##_generic_cmpge, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
255 v##sign##int##bits##x##size##_generic_cmpgt, \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
256 };
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
257
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
258 #define VEC_SSE2_DEFINE_OPERATIONS(bits, size) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
259 VEC_SSE2_DEFINE_OPERATIONS_SIGN(u, bits, size) \
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
260 VEC_SSE2_DEFINE_OPERATIONS_SIGN( , bits, size)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
261
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
262 // SSE is *only* 128-bit
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
263 VEC_SSE2_DEFINE_OPERATIONS(8, 16)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
264 VEC_SSE2_DEFINE_OPERATIONS(16, 8)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
265 VEC_SSE2_DEFINE_OPERATIONS(32, 4)
e26874655738 *: huge refactor, new major release (hahaha)
Paper <paper@tflc.us>
parents:
diff changeset
266 VEC_SSE2_DEFINE_OPERATIONS(64, 2)