1 #include "pfc-lite.h"
|
|
|
2 #include "audio_sample.h"
|
|
|
3 #include "primitives.h"
|
|
|
4 #include "cpuid.h"
|
|
|
5
|
|
|
6
|
|
|
7 #if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__x86_64__) || defined(__SSE2__)
|
|
|
8 #define AUDIO_MATH_SSE
|
|
|
9 #include <xmmintrin.h>
|
|
|
10 #include <tmmintrin.h> // _mm_shuffle_epi8
|
|
|
11 #include <smmintrin.h> // _mm_blend_epi16
|
|
|
12
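// Fallback definitions for older compiler headers that lack these two helpers;
// both simply move a single unaligned 32-bit value through a pointer cast.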
#ifndef _mm_loadu_si32
#define _mm_loadu_si32(p) _mm_cvtsi32_si128(*(unsigned int const*)(p))
#endif
#ifndef _mm_storeu_si32
#define _mm_storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))
#endif

#ifdef __AVX__
#define allowAVX 1
#define haveAVX 1
#elif PFC_HAVE_CPUID
#define allowAVX 1
static const bool haveAVX = pfc::query_cpu_feature_set(pfc::CPU_HAVE_AVX);
#else
#define allowAVX 0
#define haveAVX 0
#endif

#ifdef __SSE4_1__
#define haveSSE41 true
#elif PFC_HAVE_CPUID
static const bool haveSSE41 = pfc::query_cpu_feature_set(pfc::CPU_HAVE_SSE41);
#else
#define haveSSE41 false
#endif
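// haveAVX / haveSSE41 resolve at compile time when the build already targets the
// instruction set, and fall back to a one-time CPUID query at static-init time
// otherwise; on fixed-ISA builds the runtime dispatch below folds away entirely.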

#if allowAVX
#include <immintrin.h> // _mm256_set1_pd
#endif

#endif // end SSE

#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define AUDIO_MATH_ARM64
#endif

#if defined(AUDIO_MATH_ARM64) || defined(__ARM_NEON__)
#define AUDIO_MATH_NEON
#include <arm_neon.h>

// No vcvtnq_s32_f32 (round to nearest) on ARM32; use truncating vcvtq_s32_f32 instead, close enough
#ifdef AUDIO_MATH_ARM64
#define vcvtnq_s32_f32_wrap vcvtnq_s32_f32
#else
#define vcvtnq_s32_f32_wrap vcvtq_s32_f32
#endif

#endif


#if defined(AUDIO_MATH_ARM64) && !defined(__ANDROID__)
// Don't do NEON float64 on Android; it crashes the clang shipped with NDK 25
#define AUDIO_MATH_NEON_FLOAT64
#endif

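// Portable scalar reference implementations; the SIMD paths below also use these
// to mop up the remainder samples that don't fill a whole vector.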
template<typename float_t> inline static float_t noopt_calculate_peak(const float_t* p_src, t_size p_num)
{
    float_t peak = 0;
    t_size num = p_num;
    for (; num; num--)
    {
        float_t temp = (float_t)fabs(*(p_src++));
        peak = fmax(peak, temp);
    }
    return peak;
}


template<typename float_t>
inline static void noopt_convert_to_32bit(const float_t* p_source, t_size p_count, t_int32* p_output, float_t p_scale)
{
    t_size num = p_count;
    for (; num; --num)
    {
        t_int64 val = pfc::audio_math::rint64(*(p_source++) * p_scale);
        if (val < INT32_MIN) val = INT32_MIN;
        else if (val > INT32_MAX) val = INT32_MAX;
        *(p_output++) = (t_int32)val;
    }
}

template<typename float_t>
inline static void noopt_convert_to_16bit(const float_t* p_source, t_size p_count, t_int16* p_output, float_t p_scale) {
    for (t_size n = 0; n < p_count; n++) {
        *(p_output++) = (t_int16)pfc::clip_t<int32_t>(pfc::audio_math::rint32(*(p_source++) * p_scale), INT16_MIN, INT16_MAX);
    }
}

template<typename float_t>
inline static void noopt_convert_from_int16(const t_int16* __restrict p_source, t_size p_count, float_t* __restrict p_output, float_t p_scale)
{
    t_size num = p_count;
    for (; num; num--)
        *(p_output++) = (float_t)*(p_source++) * p_scale;
}



template<typename float_t>
inline static void noopt_convert_from_int32(const t_int32* __restrict p_source, t_size p_count, float_t* __restrict p_output, float_t p_scale)
{
    t_size num = p_count;
    for (; num; num--)
        *(p_output++) = (float_t)(*(p_source++) * p_scale);
}

template<typename in_t, typename out_t, typename scale_t>
inline static void noopt_scale(const in_t* p_source, size_t p_count, out_t* p_output, scale_t p_scale)
{
    for (t_size n = 0; n < p_count; n++)
        p_output[n] = (out_t)(p_source[n] * p_scale);
}
template<typename in_t, typename out_t>
inline static void noopt_convert(const in_t* in, out_t* out, size_t count) {
    for (size_t walk = 0; walk < count; ++walk) out[walk] = (out_t)in[walk];
}

#ifdef AUDIO_MATH_NEON

#ifdef AUDIO_MATH_ARM64
#define _vmaxvq_f32_wrap vmaxvq_f32
#else
inline float _vmaxvq_f32_wrap(float32x4_t arg) {
    return pfc::max_t<float>(pfc::max_t<float>(arg[0], arg[1]), pfc::max_t<float>(arg[2], arg[3]));
}
#endif

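// Peak scan, 8 floats per iteration; two independent max accumulators let the
// vmaxq_f32 dependency chains overlap. _vmaxvq_f32_wrap supplies the horizontal
// max: native vmaxvq_f32 on ARM64, a manual lane reduction on ARM32.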
inline static float neon_calculate_peak(const float* p_source, size_t p_count) {
    size_t num = p_count / 8;
    float32x4_t ret1 = {}, ret2 = {};
    for (; num; --num) {
        float32x4_t f32lo = vld1q_f32(p_source);
        float32x4_t f32hi = vld1q_f32(p_source + 4);
        p_source += 8;
        ret1 = vmaxq_f32(ret1, vabsq_f32(f32lo));
        ret2 = vmaxq_f32(ret2, vabsq_f32(f32hi));
    }

    float ret = _vmaxvq_f32_wrap(vmaxq_f32(ret1, ret2));

    size_t rem = p_count % 8;
    if (rem != 0) {
        float v = noopt_calculate_peak(p_source, rem);
        if (v > ret) ret = v;
    }

    return ret;
}

inline static void neon_scale(const float* p_source, size_t p_count, float* p_output, float p_scale) {
    size_t num = p_count / 8;
    for (; num; --num) {
        float32x4_t lo = vld1q_f32(p_source);
        float32x4_t hi = vld1q_f32(p_source + 4);

        lo = vmulq_n_f32(lo, p_scale);
        hi = vmulq_n_f32(hi, p_scale);

        vst1q_f32(p_output, lo);
        vst1q_f32(p_output + 4, hi);

        p_source += 8;
        p_output += 8;
    }

    noopt_scale(p_source, p_count % 8, p_output, p_scale);
}
inline static void neon_convert_to_int32(const float* __restrict p_source, t_size p_count, int32_t* __restrict p_output, float p_scale)
{
    size_t num = p_count / 8;
    for (; num; --num) {
        float32x4_t f32lo = vld1q_f32(p_source);
        float32x4_t f32hi = vld1q_f32(p_source + 4);

        int32x4_t lo = vcvtnq_s32_f32_wrap(vmulq_n_f32(f32lo, p_scale));
        int32x4_t hi = vcvtnq_s32_f32_wrap(vmulq_n_f32(f32hi, p_scale));

        vst1q_s32(p_output, lo);
        vst1q_s32(p_output + 4, hi);

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_to_32bit(p_source, p_count % 8, p_output, p_scale);
}

inline static void neon_convert_from_int32(const int32_t* __restrict p_source, t_size p_count, float* __restrict p_output, float p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    for (; num; num--) {
        int32x4_t i32lo = vld1q_s32(p_source);
        int32x4_t i32hi = vld1q_s32(p_source + 4);
        float32x4_t f32vl = vcvtq_f32_s32(i32lo);
        float32x4_t f32vh = vcvtq_f32_s32(i32hi);

        vst1q_f32(&p_output[0], vmulq_n_f32(f32vl, p_scale));
        vst1q_f32(&p_output[4], vmulq_n_f32(f32vh, p_scale));

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_from_int32(p_source, rem, p_output, p_scale);
}

inline static void neon_convert_to_int16(const float* __restrict p_source, t_size p_count, int16_t* __restrict p_output, float p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    for (; num; --num) {
        float32x4_t f32lo = vld1q_f32(p_source);
        float32x4_t f32hi = vld1q_f32(p_source + 4);

        int32x4_t lo = vcvtnq_s32_f32_wrap(vmulq_n_f32(f32lo, p_scale));
        int32x4_t hi = vcvtnq_s32_f32_wrap(vmulq_n_f32(f32hi, p_scale));

        vst1q_s16(&p_output[0], vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi)));

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}
inline static void neon_convert_from_int16(const t_int16* __restrict p_source, t_size p_count, float* __restrict p_output, float p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    for (; num; num--) {
        auto i16lo = vld1_s16(p_source);
        auto i16hi = vld1_s16(p_source + 4);

        float32x4_t f32vl = vcvtq_f32_s32(vmovl_s16(i16lo));
        float32x4_t f32vh = vcvtq_f32_s32(vmovl_s16(i16hi));

        vst1q_f32(&p_output[0], vmulq_n_f32(f32vl, p_scale));
        vst1q_f32(&p_output[4], vmulq_n_f32(f32vh, p_scale));

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_from_int16(p_source, rem, p_output, p_scale);
}
#ifdef AUDIO_MATH_NEON_FLOAT64
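// float64 path: widen to s64 so vcvtnq_s64_f64 can round to nearest, then
// saturate back down through s32 to s16 via the vqmovn chain.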
inline static void neon_convert_to_int16(const double* __restrict p_source, t_size p_count, int16_t* __restrict p_output, double p_scale)
{
    size_t num = p_count / 4;
    size_t rem = p_count % 4;
    for (; num; --num) {
        float64x2_t f64lo = vld1q_f64(p_source);
        float64x2_t f64hi = vld1q_f64(p_source + 2);

        f64lo = vmulq_n_f64(f64lo, p_scale);
        f64hi = vmulq_n_f64(f64hi, p_scale);

        int64x2_t lo64 = vcvtnq_s64_f64(f64lo);
        int64x2_t hi64 = vcvtnq_s64_f64(f64hi);

        int32x4_t v32 = vcombine_s32(vqmovn_s64(lo64), vqmovn_s64(hi64));

        vst1_s16(&p_output[0], vqmovn_s32(v32));

        p_source += 4;
        p_output += 4;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}

inline static void neon_convert_from_int16(const t_int16* __restrict p_source, t_size p_count, double* __restrict p_output, double p_scale)
{
    size_t num = p_count / 4;
    size_t rem = p_count % 4;
    for (; num; num--) {
        int32x4_t i32 = vmovl_s16(vld1_s16(p_source));

        int64x2_t lo64 = vmovl_s32(vget_low_s32(i32));
        int64x2_t hi64 = vmovl_s32(vget_high_s32(i32));

        float64x2_t f64vl = vcvtq_f64_s64(lo64);
        float64x2_t f64vh = vcvtq_f64_s64(hi64);

        vst1q_f64(&p_output[0], vmulq_n_f64(f64vl, p_scale));
        vst1q_f64(&p_output[2], vmulq_n_f64(f64vh, p_scale));

        p_source += 4;
        p_output += 4;
    }

    noopt_convert_from_int16(p_source, rem, p_output, p_scale);
}
#endif // AUDIO_MATH_NEON_FLOAT64

#endif // AUDIO_MATH_NEON

#if defined(AUDIO_MATH_SSE)

inline void convert_to_32bit_sse2(const float* p_src, size_t numTotal, t_int32* p_dst, float p_mul)
{
    // Implementation notes
    // There doesn't seem to be a nice and tidy way to convert float to int32 with graceful clipping to the INT32_MIN .. INT32_MAX range.
    // While low clipping at INT32_MIN can be accomplished with _mm_max_ps(), high clipping needs a float compare, THEN substituting the bad int with INT32_MAX.
    // The best we could do with _mm_min_ps() would result in high clipping at 0x7FFFFF80 instead of 0x7FFFFFFF (INT32_MAX):
    // floats in [2^30, 2^31) are spaced 128 apart, so (float)INT32_MAX rounds up to 2^31, and 0x7FFFFF80 is the largest float that still converts to a valid int32.
    // We store masks from the float compare and fix the ints according to the mask later.

    __m128 mul = _mm_set1_ps(p_mul);
    __m128 loF = _mm_set1_ps((float)INT32_MIN);
    __m128 hiF = _mm_set1_ps((float)INT32_MAX);
    // __m128i loI = _mm_set1_epi32(INT32_MIN);
    __m128i hiI = _mm_set1_epi32(INT32_MAX);

    size_t num = numTotal / 4;
    size_t rem = numTotal % 4;
    for (; num; --num) {
        __m128 s = _mm_mul_ps(mul, _mm_loadu_ps(p_src)); p_src += 4;

        s = _mm_max_ps(s, loF);

        // __m128i maskLo = _mm_castps_si128(_mm_cmple_ps(s, loF));
        __m128i maskHi = _mm_castps_si128(_mm_cmpge_ps(s, hiF));

        __m128i i = _mm_cvtps_epi32(s);

        // i = _mm_or_si128(_mm_andnot_si128(maskLo, i), _mm_and_si128(loI, maskLo));
        i = _mm_or_si128(_mm_andnot_si128(maskHi, i), _mm_and_si128(hiI, maskHi));

        _mm_storeu_si128((__m128i*)p_dst, i); p_dst += 4;
    }

    for (; rem; --rem) {
        __m128 s = _mm_mul_ss(_mm_load_ss(p_src++), mul);
        s = _mm_max_ss(s, loF);

        // __m128i maskLo = _mm_castps_si128(_mm_cmple_ss(s, loF));
        __m128i maskHi = _mm_castps_si128(_mm_cmpge_ss(s, hiF));

        __m128i i = _mm_cvtps_epi32(s); // deliberately not _ss: only lane 0 gets stored

        // i = _mm_or_si128(_mm_andnot_si128(maskLo, i), _mm_and_si128(loI, maskLo));
        i = _mm_or_si128(_mm_andnot_si128(maskHi, i), _mm_and_si128(hiI, maskHi));

        _mm_storeu_si32(p_dst++, i);
    }
}

inline void convert_to_32bit_sse2(const double* p_src, size_t numTotal, t_int32* p_dst, double p_mul)
{
    auto mul = _mm_set1_pd(p_mul);
    auto loF = _mm_set1_pd(INT32_MIN);
    auto hiF = _mm_set1_pd(INT32_MAX);

    size_t num = numTotal / 4;
    size_t rem = numTotal % 4;
    for (; num; --num) {
        auto v1 = _mm_loadu_pd(p_src);
        auto v2 = _mm_loadu_pd(p_src + 2);
        p_src += 4;

        v1 = _mm_mul_pd(v1, mul); v2 = _mm_mul_pd(v2, mul);

        v1 = _mm_max_pd(v1, loF); v2 = _mm_max_pd(v2, loF);
        v1 = _mm_min_pd(v1, hiF); v2 = _mm_min_pd(v2, hiF);

        auto i1 = _mm_cvtpd_epi32(v1), i2 = _mm_cvtpd_epi32(v2);

        _mm_storeu_si128((__m128i*)p_dst, _mm_unpacklo_epi64(i1, i2)); p_dst += 4;
    }

    for (; rem; --rem) {
        auto s = _mm_mul_sd(_mm_load_sd(p_src++), mul);
        s = _mm_max_sd(s, loF); s = _mm_min_sd(s, hiF);
        *p_dst++ = _mm_cvtsd_si32(s);
    }
}

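// SSE2 has no sign-extending 16->32 unpack (_mm_cvtepi16_epi32 is SSE4.1), so the
// loop below biases samples to unsigned with xor 0x8000, zero-extends by unpacking
// against zeros, then subtracts the 0x8000 bias back off as 32-bit ints.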
inline void convert_from_int16_sse2(const t_int16* p_source, t_size p_count, float* p_output, float p_scale)
{
    while (!pfc::is_ptr_aligned_t<16>(p_output) && p_count) {
        *(p_output++) = (float)*(p_source++) * p_scale;
        p_count--;
    }

    {
        __m128 mul = _mm_set1_ps(p_scale);
        __m128i nulls = _mm_setzero_si128();
        __m128i delta1 = _mm_set1_epi16((int16_t)0x8000);
        __m128i delta2 = _mm_set1_epi32((int32_t)0x8000);

        for (t_size loop = p_count >> 3; loop; --loop) {
            __m128i source, temp1, temp2; __m128 float1, float2;
            source = _mm_loadu_si128((__m128i*)p_source);
            source = _mm_xor_si128(source, delta1);
            temp1 = _mm_unpacklo_epi16(source, nulls);
            temp2 = _mm_unpackhi_epi16(source, nulls);
            temp1 = _mm_sub_epi32(temp1, delta2);
            temp2 = _mm_sub_epi32(temp2, delta2);
            p_source += 8;
            float1 = _mm_cvtepi32_ps(temp1);
            float2 = _mm_cvtepi32_ps(temp2);
            float1 = _mm_mul_ps(float1, mul);
            float2 = _mm_mul_ps(float2, mul);
            _mm_store_ps(p_output, float1);
            _mm_store_ps(p_output + 4, float2);
            p_output += 8;
        }

        p_count &= 7;
    }

    while (p_count) {
        *(p_output++) = (float)*(p_source++) * p_scale;
        p_count--;
    }
}

inline static void convert_to_16bit_sse2(const float* p_source, t_size p_count, t_int16* p_output, float p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    __m128 mul = _mm_set1_ps(p_scale);
    for (; num; --num)
    {
        __m128 temp1, temp2; __m128i itemp1, itemp2;
        temp1 = _mm_loadu_ps(p_source);
        temp2 = _mm_loadu_ps(p_source + 4);
        temp1 = _mm_mul_ps(temp1, mul);
        temp2 = _mm_mul_ps(temp2, mul);
        p_source += 8;
        itemp1 = _mm_cvtps_epi32(temp1);
        itemp2 = _mm_cvtps_epi32(temp2);
        _mm_storeu_si128((__m128i*)p_output, _mm_packs_epi32(itemp1, itemp2));
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}

inline static void convert_to_16bit_sse2(const double* p_source, t_size p_count, t_int16* p_output, double p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    __m128d mul = _mm_set1_pd(p_scale);
    for (; num; --num)
    {
        __m128d temp1, temp2, temp3, temp4; __m128i itemp1, itemp2;
        temp1 = _mm_loadu_pd(p_source);
        temp2 = _mm_loadu_pd(p_source + 2);
        temp3 = _mm_loadu_pd(p_source + 4);
        temp4 = _mm_loadu_pd(p_source + 6);

        p_source += 8;

        temp1 = _mm_mul_pd(temp1, mul);
        temp2 = _mm_mul_pd(temp2, mul);
        temp3 = _mm_mul_pd(temp3, mul);
        temp4 = _mm_mul_pd(temp4, mul);

        itemp1 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(temp1), _mm_cvtpd_epi32(temp2));
        itemp2 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(temp3), _mm_cvtpd_epi32(temp4));

        _mm_storeu_si128((__m128i*)p_output, _mm_packs_epi32(itemp1, itemp2));
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}
#if allowAVX
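// AVX variant: _mm256_cvtpd_epi32 narrows 4 doubles to 4 int32s in one instruction,
// replacing the cvtpd/unpack pair the SSE2 version needs per 4 samples.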
inline static void avx_convert_to_16bit(const double* p_source, size_t p_count, int16_t* p_output, double p_scale) {
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    auto mul = _mm256_set1_pd(p_scale);
    for (; num; --num)
    {
        auto temp1 = _mm256_loadu_pd(p_source);
        auto temp2 = _mm256_loadu_pd(p_source + 4);

        p_source += 8;

        temp1 = _mm256_mul_pd(temp1, mul);
        temp2 = _mm256_mul_pd(temp2, mul);

        auto itemp1 = _mm256_cvtpd_epi32(temp1);
        auto itemp2 = _mm256_cvtpd_epi32(temp2);

        _mm_storeu_si128((__m128i*)p_output, _mm_packs_epi32(itemp1, itemp2));
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}
#endif

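// The peak scans below take |x| branchlessly by masking off the IEEE sign bit
// (0x7FFFFFFF / 0x7FFFFFFFFFFFFFFF) and reducing with vector max.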
inline float sse_calculate_peak(const float* src, size_t count) {
    size_t num = count / 8;
    size_t rem = count % 8;

    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 acc1 = _mm_setzero_ps(), acc2 = _mm_setzero_ps();

    for (; num; --num) {
        __m128 v1 = _mm_loadu_ps(src);
        __m128 v2 = _mm_loadu_ps(src + 4);
        v1 = _mm_and_ps(v1, mask);
        v2 = _mm_and_ps(v2, mask);
        // Two acc channels so one _mm_max_ps doesn't block the other
        acc1 = _mm_max_ps(acc1, v1);
        acc2 = _mm_max_ps(acc2, v2);
        src += 8;
    }

    float ret;
    {
        float blah[4];
        _mm_storeu_ps(blah, _mm_max_ps(acc1, acc2));
        __m128 acc = _mm_load_ss(&blah[0]);
        acc = _mm_max_ss(acc, _mm_load_ss(&blah[1]));
        acc = _mm_max_ss(acc, _mm_load_ss(&blah[2]));
        acc = _mm_max_ss(acc, _mm_load_ss(&blah[3]));
        ret = _mm_cvtss_f32(acc);
    }
    if (rem > 0) {
        __m128 acc = _mm_set_ss(ret);
        for (; rem; --rem) {
            __m128 v = _mm_load_ss(src++);
            v = _mm_and_ps(v, mask);
            acc = _mm_max_ss(acc, v);
        }
        ret = _mm_cvtss_f32(acc);
    }
    return ret;
}

#if allowAVX
inline double avx_calculate_peak(const double* src, size_t count) {
    size_t num = count / 8;
    size_t rem = count % 8;

    auto mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
    auto acc1 = _mm256_setzero_pd(), acc2 = _mm256_setzero_pd();

    for (; num; --num) {
        auto v1 = _mm256_loadu_pd(src);
        auto v2 = _mm256_loadu_pd(src + 4);

        v1 = _mm256_and_pd(v1, mask);
        v2 = _mm256_and_pd(v2, mask);

        acc1 = _mm256_max_pd(acc1, v1);
        acc2 = _mm256_max_pd(acc2, v2);

        src += 8;
    }

    __m128d acc;
    {
        acc1 = _mm256_max_pd(acc1, acc2);

        acc = _mm_max_pd(_mm256_extractf128_pd(acc1, 0), _mm256_extractf128_pd(acc1, 1));

        acc = _mm_max_sd(acc, _mm_shuffle_pd(acc, acc, _MM_SHUFFLE2(0, 1)));
    }

    if (rem > 0) {
        __m128d mask128 = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
        for (; rem; --rem) {
            __m128d v = _mm_load_sd(src++);
            v = _mm_and_pd(v, mask128);
            acc = _mm_max_sd(acc, v);
        }
    }
    return _mm_cvtsd_f64(acc);
}
#endif // allowAVX

inline double sse_calculate_peak(const double* src, size_t count) {
    size_t num = count / 4;
    size_t rem = count % 4;

    __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
    __m128d acc1 = _mm_setzero_pd(), acc2 = _mm_setzero_pd();

    for (; num; --num) {
        __m128d v1 = _mm_loadu_pd(src);
        __m128d v2 = _mm_loadu_pd(src + 2);
        v1 = _mm_and_pd(v1, mask);
        v2 = _mm_and_pd(v2, mask);
        // Two acc channels so one _mm_max_pd doesn't block the other
        acc1 = _mm_max_pd(acc1, v1);
        acc2 = _mm_max_pd(acc2, v2);
        src += 4;
    }

    {
        acc1 = _mm_max_pd(acc1, acc2);
        acc1 = _mm_max_sd(acc1, _mm_shuffle_pd(acc1, acc1, _MM_SHUFFLE2(0, 1)));
    }
    if (rem > 0) {
        for (; rem; --rem) {
            __m128d v = _mm_load_sd(src++);
            v = _mm_and_pd(v, mask);
            acc1 = _mm_max_sd(acc1, v);
        }
    }
    return _mm_cvtsd_f64(acc1);
}

inline void sse_convert_from_int32(const int32_t* source, size_t count, float* output, float scale) {
    __m128 mul = _mm_set1_ps(scale);
    for (size_t num = count / 8; num; --num)
    {
        __m128i itemp1, itemp2; __m128 temp1, temp2;
        itemp1 = _mm_loadu_si128((__m128i*)source);
        itemp2 = _mm_loadu_si128((__m128i*)source + 1);
        temp1 = _mm_cvtepi32_ps(itemp1);
        temp2 = _mm_cvtepi32_ps(itemp2);
        source += 8;
        temp1 = _mm_mul_ps(temp1, mul);
        temp2 = _mm_mul_ps(temp2, mul);
        _mm_storeu_ps(output, temp1);
        _mm_storeu_ps(output + 4, temp2);
        output += 8;
    }
    for (size_t rem = count % 8; rem; --rem) {
        __m128i i = _mm_loadu_si32(source++);
        __m128 f = _mm_cvtepi32_ps(i);
        f = _mm_mul_ss(f, mul);
        _mm_store_ss(output++, f);
    }
}

inline void sse_convert_from_int32(const int32_t* source, size_t count, double* output, double scale) {
    auto mul = _mm_set1_pd(scale);
    for (size_t num = count / 8; num; --num)
    {
        auto itemp1 = _mm_loadu_si128((__m128i*)source);
        auto itemp2 = _mm_loadu_si128((__m128i*)source + 1);
        auto temp1 = _mm_cvtepi32_pd(itemp1);
        auto temp2 = _mm_cvtepi32_pd(_mm_shuffle_epi32(itemp1, _MM_SHUFFLE(1, 0, 3, 2)));
        auto temp3 = _mm_cvtepi32_pd(itemp2);
        auto temp4 = _mm_cvtepi32_pd(_mm_shuffle_epi32(itemp2, _MM_SHUFFLE(1, 0, 3, 2)));
        source += 8;
        temp1 = _mm_mul_pd(temp1, mul);
        temp2 = _mm_mul_pd(temp2, mul);
        temp3 = _mm_mul_pd(temp3, mul);
        temp4 = _mm_mul_pd(temp4, mul);
        _mm_storeu_pd(output, temp1);
        _mm_storeu_pd(output + 2, temp2);
        _mm_storeu_pd(output + 4, temp3);
        _mm_storeu_pd(output + 6, temp4);
        output += 8;
    }
    for (size_t rem = count % 8; rem; --rem) {
        __m128i i = _mm_loadu_si32(source++);
        auto f = _mm_cvtepi32_pd(i);
        f = _mm_mul_sd(f, mul);
        _mm_store_sd(output++, f);
    }
}
#if allowAVX
inline void convert_from_int16_avx(const t_int16* p_source, t_size p_count, double* p_output, double p_scale) {
    while (!pfc::is_ptr_aligned_t<32>(p_output) && p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }

    {
        __m256d muld = _mm256_set1_pd(p_scale);

        for (t_size loop = p_count >> 3; loop; --loop) {
            auto source = _mm_loadu_si128((__m128i*)p_source);
            auto temp1 = _mm_cvtepi16_epi32(source);
            auto temp2 = _mm_cvtepi16_epi32(_mm_shuffle_epi32(source, _MM_SHUFFLE(0, 0, 3, 2)));
            p_source += 8;

            auto double1 = _mm256_cvtepi32_pd(temp1);
            auto double2 = _mm256_cvtepi32_pd(temp2);

            double1 = _mm256_mul_pd(double1, muld);
            double2 = _mm256_mul_pd(double2, muld);

            _mm256_store_pd(p_output, double1);
            _mm256_store_pd(p_output + 4, double2);

            p_output += 8;
        }

        p_count &= 7;
    }

    while (p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }
}
#endif // allowAVX

inline void convert_from_int16_sse2(const t_int16* p_source, t_size p_count, double* p_output, double p_scale)
{
    while (!pfc::is_ptr_aligned_t<16>(p_output) && p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }

    {
        __m128d muld = _mm_set1_pd(p_scale);
        __m128i nulls = _mm_setzero_si128();
        __m128i delta1 = _mm_set1_epi16((int16_t)0x8000);
        __m128i delta2 = _mm_set1_epi32((int32_t)0x8000);

        for (t_size loop = p_count >> 3; loop; --loop) {
            __m128i source, temp1, temp2; __m128d double1, double2, double3, double4;
            source = _mm_loadu_si128((__m128i*)p_source);
            source = _mm_xor_si128(source, delta1);
            temp1 = _mm_unpacklo_epi16(source, nulls);
            temp2 = _mm_unpackhi_epi16(source, nulls);
            temp1 = _mm_sub_epi32(temp1, delta2);
            temp2 = _mm_sub_epi32(temp2, delta2);
            p_source += 8;

            double1 = _mm_cvtepi32_pd(temp1);
            double2 = _mm_cvtepi32_pd(_mm_shuffle_epi32(temp1, _MM_SHUFFLE(3, 2, 3, 2)));
            double3 = _mm_cvtepi32_pd(temp2);
            double4 = _mm_cvtepi32_pd(_mm_shuffle_epi32(temp2, _MM_SHUFFLE(3, 2, 3, 2)));

            double1 = _mm_mul_pd(double1, muld);
            double2 = _mm_mul_pd(double2, muld);
            double3 = _mm_mul_pd(double3, muld);
            double4 = _mm_mul_pd(double4, muld);
            _mm_store_pd(p_output, double1);
            _mm_store_pd(p_output + 2, double2);
            _mm_store_pd(p_output + 4, double3);
            _mm_store_pd(p_output + 6, double4);

            p_output += 8;
        }

        p_count &= 7;
    }

    while (p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }
}

#endif

namespace pfc {
void audio_math::scale(const float* p_source, size_t p_count, float* p_output, float p_scale) {
#if defined( AUDIO_MATH_NEON )
    neon_scale(p_source, p_count, p_output, p_scale);
#else
    noopt_scale(p_source, p_count, p_output, p_scale);
#endif
}
void audio_math::scale(const double* p_source, size_t p_count, double* p_output, double p_scale) {
    noopt_scale(p_source, p_count, p_output, p_scale);
}

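// Public entry points. Scale factors are premultiplied by the target format's
// full-scale value (2^15 for int16, 2^31 for int32, 2^23 for int24 below) so that
// +/-1.0 maps to full range; each path clips or saturates out-of-range samples.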
void audio_math::convert_to_int16(const float* p_source, t_size p_count, t_int16* p_output, float p_scale)
{
    float scale = (float)(p_scale * 0x8000);
#if defined(AUDIO_MATH_SSE)
    convert_to_16bit_sse2(p_source, p_count, p_output, scale);
#elif defined( AUDIO_MATH_NEON )
    neon_convert_to_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_16bit(p_source, p_count, p_output, scale);
#endif
}
void audio_math::convert_to_int16(const double* p_source, t_size p_count, t_int16* p_output, double p_scale)
{
    double scale = (double)(p_scale * 0x8000);
#if defined(AUDIO_MATH_SSE)
#if allowAVX
    if (haveAVX) {
        avx_convert_to_16bit(p_source, p_count, p_output, scale);
    } else
#endif // allowAVX
    {
        convert_to_16bit_sse2(p_source, p_count, p_output, scale);
    }
#elif defined( AUDIO_MATH_NEON_FLOAT64 )
    neon_convert_to_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_16bit(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int16(const t_int16* p_source, t_size p_count, float* p_output, float p_scale)
{
    float scale = (float)(p_scale / (double)0x8000);
#if defined(AUDIO_MATH_SSE)
    convert_from_int16_sse2(p_source, p_count, p_output, scale);
#elif defined( AUDIO_MATH_NEON )
    neon_convert_from_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int16(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int16(const t_int16* p_source, t_size p_count, double* p_output, double p_scale)
{
    double scale = (double)(p_scale / (double)0x8000);
#if defined(AUDIO_MATH_SSE)
#if allowAVX
    if (haveAVX) {
        convert_from_int16_avx(p_source, p_count, p_output, scale);
    } else
#endif
    {
        convert_from_int16_sse2(p_source, p_count, p_output, scale);
    }
#elif defined( AUDIO_MATH_NEON_FLOAT64 )
    neon_convert_from_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int16(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_to_int32(const float* p_source, t_size p_count, t_int32* p_output, float p_scale)
{
    float scale = (float)(p_scale * 0x80000000ul);
#if defined(AUDIO_MATH_NEON)
    neon_convert_to_int32(p_source, p_count, p_output, scale);
#elif defined(AUDIO_MATH_SSE)
    convert_to_32bit_sse2(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_32bit(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_to_int32(const double* p_source, t_size p_count, t_int32* p_output, double p_scale)
{
    double scale = (double)(p_scale * 0x80000000ul);
#if defined(AUDIO_MATH_SSE)
    convert_to_32bit_sse2(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_32bit(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int32(const t_int32* p_source, t_size p_count, float* p_output, float p_scale)
{
    float scale = (float)(p_scale / (double)0x80000000ul);
    // Note: speed difference here is marginal over compiler output as of Xcode 12
#if defined(AUDIO_MATH_NEON)
    neon_convert_from_int32(p_source, p_count, p_output, scale);
#elif defined(AUDIO_MATH_SSE)
    sse_convert_from_int32(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int32(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int32(const t_int32* p_source, t_size p_count, double* p_output, double p_scale)
{
    double scale = (double)(p_scale / (double)0x80000000ul);
#if defined(AUDIO_MATH_SSE)
    sse_convert_from_int32(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int32(p_source, p_count, p_output, scale);
#endif
}

float audio_math::calculate_peak(const float* p_source, t_size p_count) {
#if defined(AUDIO_MATH_SSE)
    return sse_calculate_peak(p_source, p_count);
#elif defined(AUDIO_MATH_NEON)
    return neon_calculate_peak(p_source, p_count);
#else
    return noopt_calculate_peak(p_source, p_count);
#endif
}
double audio_math::calculate_peak(const double* p_source, t_size p_count) {
#if defined(AUDIO_MATH_SSE)
    // Note that avx_calculate_peak failed to score better than sse_calculate_peak
    return sse_calculate_peak(p_source, p_count);
#else
    return noopt_calculate_peak(p_source, p_count);
#endif
}

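// Denormal flush: an IEEE value with an all-zero exponent field and a nonzero
// mantissa is subnormal; replace it with exact zero to avoid the slow-path
// arithmetic such values trigger in feedback-style DSP code.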
void audio_math::remove_denormals(float* p_buffer, t_size p_count) {
    t_uint32* ptr = reinterpret_cast<t_uint32*>(p_buffer);
    for (; p_count; p_count--)
    {
        t_uint32 t = *ptr;
        if ((t & 0x007FFFFF) && !(t & 0x7F800000)) *ptr = 0;
        ptr++;
    }
}
void audio_math::remove_denormals(double* p_buffer, t_size p_count) {
    t_uint64* ptr = reinterpret_cast<t_uint64*>(p_buffer);
    for (; p_count; p_count--)
    {
        t_uint64 t = *ptr;
        if ((t & 0x000FFFFFFFFFFFFF) && !(t & 0x7FF0000000000000)) *ptr = 0;
        ptr++;
    }
}

void audio_math::add_offset(float* p_buffer, float p_delta, size_t p_count) {
    for (size_t n = 0; n < p_count; ++n) p_buffer[n] += p_delta;
}
void audio_math::add_offset(double* p_buffer, double p_delta, size_t p_count) {
    for (size_t n = 0; n < p_count; ++n) p_buffer[n] += p_delta;
}

void audio_math::convert(const float* in, float* out, size_t count) {
    memcpy(out, in, count * sizeof(float));
}
void audio_math::convert(const float* in, float* out, size_t count, float scale) {
    audio_math::scale(in, count, out, scale);
}
void audio_math::convert(const double* in, double* out, size_t count) {
    memcpy(out, in, count * sizeof(double));
}
void audio_math::convert(const double* in, double* out, size_t count, double scale) {
    audio_math::scale(in, count, out, scale);
}

void audio_math::convert(const float* in, double* out, size_t count) {
    // optimize me
    noopt_convert(in, out, count);
}
void audio_math::convert(const float* in, double* out, size_t count, double scale) {
    // optimize me
    noopt_scale(in, count, out, scale);
}
void audio_math::convert(const double* in, float* out, size_t count) {
    // optimize me
    noopt_convert(in, out, count);
}
void audio_math::convert(const double* in, float* out, size_t count, double scale) {
    // optimize me
    noopt_scale(in, count, out, scale);
}

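// 24-bit packing helpers. store24 writes exactly 3 bytes; store24p writes a full
// 4-byte int but advances only 3, letting the next sample overwrite the spare
// byte. The latter is faster but would write one byte past the end of the buffer
// on the final sample, which is why convert_to_int24_noopt below peels the last
// iteration and finishes with store24.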
typedef char store24_t;
static store24_t* store24(store24_t* out, int32_t in) {
    *(out++) = ((store24_t*)&in)[0];
    *(out++) = ((store24_t*)&in)[1];
    *(out++) = ((store24_t*)&in)[2];
    return out;
}
static store24_t* store24p(store24_t* out, int32_t in) {
    *(int32_t*)out = in;
    return out + 3;
}

static constexpr int32_t INT24_MAX = 0x7FFFFF, INT24_MIN = -0x800000;

template<typename float_t> void convert_to_int24_noopt(float_t const* in, size_t count, void* out, float_t scale) {
    if (count == 0) return;
    --count;
    auto ptr = reinterpret_cast<store24_t*>(out);
    constexpr float_t lo = INT24_MIN, hi = INT24_MAX;
    while (count) {
        auto vf = *in++ * scale;
        if (vf < lo) vf = lo;
        else if (vf > hi) vf = hi;
        ptr = store24p(ptr, audio_math::rint32(vf));
        --count;
    }

    auto vf = *in * scale;
    if (vf < lo) vf = lo;
    else if (vf > hi) vf = hi;
    store24(ptr, audio_math::rint32(vf));
}
#ifdef AUDIO_MATH_SSE
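// The i24 writers below emit 16 samples (48 bytes) per pass: each __m128i holds
// four int32 samples, the pi0..pi3 _mm_shuffle_epi8 masks keep the low 3 bytes of
// every 32-bit lane (-128 zeroes a byte) and pre-rotate the four 12-byte packets
// so that three _mm_blend_epi16 ops can stitch them into three contiguous 16-byte
// stores (4 x 12 = 3 x 16).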
#if allowAVX
static void f64_to_i24_avx(double const* in, size_t n, uint8_t* out, double scale) {
    const __m128i pi0 = _mm_set_epi8(-128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    const __m128i pi1 = _mm_set_epi8(4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5);
    const __m128i pi2 = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10);
    const __m128i pi3 = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128);
    const auto mul = _mm256_set1_pd(scale);

    // PROBLEM: if we want to handle wildly out-of-bounds values, we can't do int clipping!
    // float clipping is sadly considerably slower than int clipping
    const auto lo = _mm256_set1_pd(INT24_MIN);
    const auto hi = _mm256_set1_pd(INT24_MAX);

    while (n >= 4 * 4) {
        auto f0 = _mm256_mul_pd(_mm256_loadu_pd(in + 0), mul);
        auto f1 = _mm256_mul_pd(_mm256_loadu_pd(in + 4), mul);
        auto f2 = _mm256_mul_pd(_mm256_loadu_pd(in + 8), mul);
        auto f3 = _mm256_mul_pd(_mm256_loadu_pd(in + 12), mul);
        f0 = _mm256_max_pd(_mm256_min_pd(f0, hi), lo);
        f1 = _mm256_max_pd(_mm256_min_pd(f1, hi), lo);
        f2 = _mm256_max_pd(_mm256_min_pd(f2, hi), lo);
        f3 = _mm256_max_pd(_mm256_min_pd(f3, hi), lo);
        __m128i w0 = _mm256_cvtpd_epi32(f0);
        __m128i w1 = _mm256_cvtpd_epi32(f1);
        __m128i w2 = _mm256_cvtpd_epi32(f2);
        __m128i w3 = _mm256_cvtpd_epi32(f3);

        // _mm_shuffle_epi8 : SSSE3
        w0 = _mm_shuffle_epi8(w0, pi0);
        w1 = _mm_shuffle_epi8(w1, pi1);
        w2 = _mm_shuffle_epi8(w2, pi2);
        w3 = _mm_shuffle_epi8(w3, pi3);

        // _mm_blend_epi16 : SSE4.1
        __m128i u0 = _mm_blend_epi16(w0, w1, 0xC0);
        __m128i u1 = _mm_blend_epi16(w1, w2, 0xF0);
        __m128i u2 = _mm_blend_epi16(w2, w3, 0xFC);

        _mm_storeu_si128((__m128i*)(out + 0), u0);
        _mm_storeu_si128((__m128i*)(out + 16), u1);
        _mm_storeu_si128((__m128i*)(out + 32), u2);

        in += 4 * 4;
        out += 16 * 3;
        n -= 4 * 4;
    }

    convert_to_int24_noopt(in, n, out, scale);
}
#endif // allowAVX
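// Same byte-packing scheme without AVX: doubles convert two at a time, so pairs of
// _mm_cvtpd_epi32 results are joined with _mm_unpacklo_epi64 to fill each lane set.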
static void f64_to_i24_sse41(double const* in, size_t n, uint8_t* out, double scale) {
    const __m128i pi0 = _mm_set_epi8(-128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    const __m128i pi1 = _mm_set_epi8(4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5);
    const __m128i pi2 = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10);
    const __m128i pi3 = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128);
    const auto mul = _mm_set1_pd(scale);

    // PROBLEM: if we want to handle wildly out-of-bounds values, we can't do int clipping!
    // float clipping is sadly considerably slower than int clipping
    const auto lo = _mm_set1_pd(INT24_MIN);
    const auto hi = _mm_set1_pd(INT24_MAX);

    while (n >= 4 * 4) {
        auto f0 = _mm_mul_pd(_mm_loadu_pd(in + 0), mul);
        auto f1 = _mm_mul_pd(_mm_loadu_pd(in + 2), mul);
        auto f2 = _mm_mul_pd(_mm_loadu_pd(in + 4), mul);
        auto f3 = _mm_mul_pd(_mm_loadu_pd(in + 6), mul);
        auto f4 = _mm_mul_pd(_mm_loadu_pd(in + 8), mul);
        auto f5 = _mm_mul_pd(_mm_loadu_pd(in + 10), mul);
        auto f6 = _mm_mul_pd(_mm_loadu_pd(in + 12), mul);
        auto f7 = _mm_mul_pd(_mm_loadu_pd(in + 14), mul);
        f0 = _mm_max_pd(_mm_min_pd(f0, hi), lo);
        f1 = _mm_max_pd(_mm_min_pd(f1, hi), lo);
        f2 = _mm_max_pd(_mm_min_pd(f2, hi), lo);
        f3 = _mm_max_pd(_mm_min_pd(f3, hi), lo);
        f4 = _mm_max_pd(_mm_min_pd(f4, hi), lo);
        f5 = _mm_max_pd(_mm_min_pd(f5, hi), lo);
        f6 = _mm_max_pd(_mm_min_pd(f6, hi), lo);
        f7 = _mm_max_pd(_mm_min_pd(f7, hi), lo);

        __m128i w0 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f0), _mm_cvtpd_epi32(f1));
        __m128i w1 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f2), _mm_cvtpd_epi32(f3));
        __m128i w2 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f4), _mm_cvtpd_epi32(f5));
        __m128i w3 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f6), _mm_cvtpd_epi32(f7));

        // _mm_shuffle_epi8 : SSSE3
        w0 = _mm_shuffle_epi8(w0, pi0);
        w1 = _mm_shuffle_epi8(w1, pi1);
        w2 = _mm_shuffle_epi8(w2, pi2);
        w3 = _mm_shuffle_epi8(w3, pi3);

        // _mm_blend_epi16 : SSE4.1
        __m128i u0 = _mm_blend_epi16(w0, w1, 0xC0);
        __m128i u1 = _mm_blend_epi16(w1, w2, 0xF0);
        __m128i u2 = _mm_blend_epi16(w2, w3, 0xFC);

        _mm_storeu_si128((__m128i*)(out + 0), u0);
        _mm_storeu_si128((__m128i*)(out + 16), u1);
        _mm_storeu_si128((__m128i*)(out + 32), u2);

        in += 4 * 4;
        out += 16 * 3;
        n -= 4 * 4;
    }

    convert_to_int24_noopt(in, n, out, scale);
}
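// float32 variant of the same scheme: one _mm_cvtps_epi32 converts a whole
// 4-sample lane, the rest of the pipeline is identical.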
static void f32_to_i24_sse41(float const* in, size_t n, uint8_t* out, float scale) {
    const __m128i pi0 = _mm_set_epi8(-128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    const __m128i pi1 = _mm_set_epi8(4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5);
    const __m128i pi2 = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10);
    const __m128i pi3 = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128);
    const __m128 mul = _mm_set1_ps(scale);

    // PROBLEM: if we want to handle wildly out-of-bounds values, we can't do int clipping!
    // float clipping is sadly considerably slower than int clipping
    const auto lo = _mm_set1_ps(INT24_MIN);
    const auto hi = _mm_set1_ps(INT24_MAX);

    while (n >= 4 * 4) {
        auto f0 = _mm_mul_ps(_mm_loadu_ps(in + 0), mul);
        auto f1 = _mm_mul_ps(_mm_loadu_ps(in + 4), mul);
        auto f2 = _mm_mul_ps(_mm_loadu_ps(in + 8), mul);
        auto f3 = _mm_mul_ps(_mm_loadu_ps(in + 12), mul);
        f0 = _mm_min_ps(_mm_max_ps(f0, lo), hi);
        f1 = _mm_min_ps(_mm_max_ps(f1, lo), hi);
        f2 = _mm_min_ps(_mm_max_ps(f2, lo), hi);
        f3 = _mm_min_ps(_mm_max_ps(f3, lo), hi);
        __m128i w0 = _mm_cvtps_epi32(f0);
        __m128i w1 = _mm_cvtps_epi32(f1);
        __m128i w2 = _mm_cvtps_epi32(f2);
        __m128i w3 = _mm_cvtps_epi32(f3);

        // _mm_shuffle_epi8 : SSSE3
        w0 = _mm_shuffle_epi8(w0, pi0);
        w1 = _mm_shuffle_epi8(w1, pi1);
        w2 = _mm_shuffle_epi8(w2, pi2);
        w3 = _mm_shuffle_epi8(w3, pi3);

        // _mm_blend_epi16 : SSE4.1
        __m128i u0 = _mm_blend_epi16(w0, w1, 0xC0);
        __m128i u1 = _mm_blend_epi16(w1, w2, 0xF0);
        __m128i u2 = _mm_blend_epi16(w2, w3, 0xFC);

        _mm_storeu_si128((__m128i*)(out + 0), u0);
        _mm_storeu_si128((__m128i*)(out + 16), u1);
        _mm_storeu_si128((__m128i*)(out + 32), u2);

        in += 4 * 4;
        out += 16 * 3;
        n -= 4 * 4;
    }

    convert_to_int24_noopt(in, n, out, scale);
}

#endif // AUDIO_MATH_SSE

void audio_math::convert_to_int24(const float* in, size_t count, void* out, float scale) {
    scale *= 0x800000;

#ifdef AUDIO_MATH_SSE
    if (haveSSE41) {
        f32_to_i24_sse41(in, count, (uint8_t*)out, scale); return;
    }
#endif
    convert_to_int24_noopt(in, count, out, scale);
}
void audio_math::convert_to_int24(const double* in, size_t count, void* out, double scale) {
    scale *= 0x800000;
#ifdef AUDIO_MATH_SSE
#if allowAVX
    if (haveAVX) {
        f64_to_i24_avx(in, count, (uint8_t*)out, scale); return;
    }
#endif // allowAVX
    if (haveSSE41) {
        f64_to_i24_sse41(in, count, (uint8_t*)out, scale); return;
    }
#endif // AUDIO_MATH_SSE
    convert_to_int24_noopt(in, count, out, scale);
}

}