foo_out_sdl: comparison of foosdk/sdk/pfc/audio_math.cpp @ 1:20d02a178406 (branch default, tip)
*: check in everything else
yay
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 05 Jan 2026 02:15:46 -0500 |
| parents | |
| children | |
comparing 0:e9bb126753e7 with 1:20d02a178406
#include "pfc-lite.h"
#include "audio_sample.h"
#include "primitives.h"
#include "cpuid.h"


#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__x86_64__) || defined(__SSE2__)
#define AUDIO_MATH_SSE
#include <xmmintrin.h>
#include <tmmintrin.h> // _mm_shuffle_epi8
#include <smmintrin.h> // _mm_blend_epi16

#ifndef _mm_loadu_si32
#define _mm_loadu_si32(p) _mm_cvtsi32_si128(*(unsigned int const*)(p))
#endif
#ifndef _mm_storeu_si32
#define _mm_storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))
#endif

#ifdef __AVX__
#define allowAVX 1
#define haveAVX 1
#elif PFC_HAVE_CPUID
#define allowAVX 1
static const bool haveAVX = pfc::query_cpu_feature_set(pfc::CPU_HAVE_AVX);
#else
#define allowAVX 0
#define haveAVX 0
#endif

#ifdef __SSE4_1__
#define haveSSE41 true
#elif PFC_HAVE_CPUID
static const bool haveSSE41 = pfc::query_cpu_feature_set(pfc::CPU_HAVE_SSE41);
#else
#define haveSSE41 false
#endif

#if allowAVX
#include <immintrin.h> // _mm256_set1_pd
#endif

#endif // end SSE

#if defined( __aarch64__ ) || defined( _M_ARM64 ) || defined( _M_ARM64EC )
#define AUDIO_MATH_ARM64
#endif

#if defined( AUDIO_MATH_ARM64 ) || defined( __ARM_NEON__ )
#define AUDIO_MATH_NEON
#include <arm_neon.h>

// No vcvtnq_s32_f32 (round to nearest) on ARM32; use vcvtq_s32_f32 (truncate toward zero), close enough for audio
#ifdef AUDIO_MATH_ARM64
#define vcvtnq_s32_f32_wrap vcvtnq_s32_f32
#else
#define vcvtnq_s32_f32_wrap vcvtq_s32_f32
#endif

#endif


#if defined( AUDIO_MATH_ARM64 ) && !defined( __ANDROID__ )
// Don't do NEON float64 on Android; it crashes clang as of NDK 25
#define AUDIO_MATH_NEON_FLOAT64
#endif

template<typename float_t> inline static float_t noopt_calculate_peak(const float_t *p_src, t_size p_num)
{
    float_t peak = 0;
    t_size num = p_num;
    for(;num;num--)
    {
        float_t temp = (float_t)fabs(*(p_src++));
        peak = fmax(peak, temp);
    }
    return peak;
}


template<typename float_t>
inline static void noopt_convert_to_32bit(const float_t* p_source,t_size p_count,t_int32 * p_output, float_t p_scale)
{
    t_size num = p_count;
    for(;num;--num)
    {
        t_int64 val = pfc::audio_math::rint64( *(p_source++) * p_scale );
        if (val < INT32_MIN) val = INT32_MIN;
        else if (val > INT32_MAX) val = INT32_MAX;
        *(p_output++) = (t_int32) val;
    }
}

template<typename float_t>
inline static void noopt_convert_to_16bit(const float_t* p_source,t_size p_count,t_int16 * p_output, float_t p_scale) {
    for(t_size n=0;n<p_count;n++) {
        *(p_output++) = (t_int16) pfc::clip_t<int32_t>(pfc::audio_math::rint32(*(p_source++)*p_scale),INT16_MIN,INT16_MAX);
    }
}

template<typename float_t>
inline static void noopt_convert_from_int16(const t_int16 * __restrict p_source,t_size p_count, float_t* __restrict p_output, float_t p_scale)
{
    t_size num = p_count;
    for(;num;num--)
        *(p_output++) = (float_t)*(p_source++) * p_scale;
}


template<typename float_t>
inline static void noopt_convert_from_int32(const t_int32 * __restrict p_source,t_size p_count, float_t* __restrict p_output, float_t p_scale)
{
    t_size num = p_count;
    for(;num;num--)
        *(p_output++) = (float_t)( *(p_source++) * p_scale );
}

template<typename in_t, typename out_t, typename scale_t>
inline static void noopt_scale(const in_t * p_source,size_t p_count,out_t * p_output,scale_t p_scale)
{
    for(t_size n=0;n<p_count;n++)
        p_output[n] = (out_t)(p_source[n] * p_scale);
}
template<typename in_t, typename out_t>
inline static void noopt_convert(const in_t* in, out_t* out, size_t count) {
    for (size_t walk = 0; walk < count; ++walk) out[walk] = (out_t)in[walk];
}

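// A minimal differential-test sketch (added; not part of the check-in). The scalar
// noopt_* templates above double as reference implementations for the SIMD kernels
// below, so something along these lines can validate a port; guarded out because it
// assumes a test build and <cstring> for memcmp. The inputs chosen here are exactly
// representable, so all code paths must agree bit-for-bit regardless of rounding mode.
#if 0
static bool selftest_convert_to_16bit() {
    float in[37];
    for (size_t i = 0; i < 37; ++i) in[i] = (float)i / 16.0f - 1.0f; // spans -1.0 .. +1.25, exercises clipping
    t_int16 a[37], b[37];
    noopt_convert_to_16bit(in, 37, a, 32768.0f);         // scalar reference; scale includes the 0x8000 factor
    pfc::audio_math::convert_to_int16(in, 37, b, 1.0f);  // dispatched path; the 0x8000 factor is applied internally
    return memcmp(a, b, sizeof(a)) == 0;
}
#endif
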
#ifdef AUDIO_MATH_NEON

#ifdef AUDIO_MATH_ARM64
#define _vmaxvq_f32_wrap vmaxvq_f32
#else
inline float _vmaxvq_f32_wrap( float32x4_t arg ) {
    return pfc::max_t<float>( pfc::max_t<float>(arg[0], arg[1]), pfc::max_t<float>(arg[2], arg[3]) );
}
#endif

inline static float neon_calculate_peak( const float * p_source, size_t p_count ) {
    size_t num = p_count / 8;
    float32x4_t ret1 = {}, ret2 = {};
    for(;num;--num) {
        float32x4_t f32lo = vld1q_f32( p_source );
        float32x4_t f32hi = vld1q_f32( p_source + 4 );
        p_source += 8;
        ret1 = vmaxq_f32(ret1, vabsq_f32(f32lo));
        ret2 = vmaxq_f32(ret2, vabsq_f32(f32hi));
    }

    float ret = _vmaxvq_f32_wrap(vmaxq_f32( ret1, ret2 ));

    size_t rem = p_count % 8;
    if ( rem != 0 ) {
        float v = noopt_calculate_peak( p_source, rem );
        if (v > ret) ret = v;
    }

    return ret;
}

inline static void neon_scale(const float * p_source,size_t p_count, float * p_output,float p_scale) {
    size_t num = p_count / 8;
    for(;num;--num) {
        float32x4_t lo = vld1q_f32( p_source );
        float32x4_t hi = vld1q_f32( p_source + 4 );

        lo = vmulq_n_f32( lo, p_scale);
        hi = vmulq_n_f32( hi, p_scale);

        vst1q_f32( p_output, lo );
        vst1q_f32( p_output+4, hi );

        p_source += 8;
        p_output += 8;
    }

    noopt_scale( p_source, p_count % 8, p_output, p_scale);
}
inline static void neon_convert_to_int32(const float * __restrict p_source,t_size p_count, int32_t * __restrict p_output,float p_scale)
{
    size_t num = p_count / 8;
    for(;num;--num) {
        float32x4_t f32lo = vld1q_f32( p_source );
        float32x4_t f32hi = vld1q_f32( p_source + 4 );

        int32x4_t lo = vcvtnq_s32_f32_wrap( vmulq_n_f32(f32lo, p_scale) );
        int32x4_t hi = vcvtnq_s32_f32_wrap( vmulq_n_f32(f32hi, p_scale) );

        vst1q_s32(p_output, lo);
        vst1q_s32(p_output+4, hi);

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_to_32bit(p_source, p_count % 8, p_output, p_scale);
}

inline static void neon_convert_from_int32(const int32_t * __restrict p_source,t_size p_count, float * __restrict p_output,float p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    for(;num;num--) {
        int32x4_t i32lo = vld1q_s32( p_source );
        int32x4_t i32hi = vld1q_s32( p_source + 4 );
        float32x4_t f32vl = vcvtq_f32_s32(i32lo);
        float32x4_t f32vh = vcvtq_f32_s32(i32hi);

        vst1q_f32(&p_output[0], vmulq_n_f32(f32vl, p_scale));
        vst1q_f32(&p_output[4], vmulq_n_f32(f32vh, p_scale));

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_from_int32( p_source, rem, p_output, p_scale );
}

inline static void neon_convert_to_int16(const float* __restrict p_source,t_size p_count, int16_t * __restrict p_output,float p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    for(;num;--num) {
        float32x4_t f32lo = vld1q_f32( p_source );
        float32x4_t f32hi = vld1q_f32( p_source + 4 );

        int32x4_t lo = vcvtnq_s32_f32_wrap( vmulq_n_f32(f32lo, p_scale) );
        int32x4_t hi = vcvtnq_s32_f32_wrap( vmulq_n_f32(f32hi, p_scale) );

        vst1q_s16(&p_output[0], vcombine_s16( vqmovn_s32( lo ), vqmovn_s32( hi ) ) );

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}
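// For reference (added): vqmovn_s32 is a saturating narrow, so any sample outside
// [INT16_MIN, INT16_MAX] after scaling clamps without explicit compares; it is the
// NEON counterpart of _mm_packs_epi32 in the SSE2 paths below.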
inline static void neon_convert_from_int16(const t_int16 * __restrict p_source,t_size p_count, float * __restrict p_output,float p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    for(;num;num--) {
        auto i16lo = vld1_s16(p_source);
        auto i16hi = vld1_s16(p_source + 4);

        float32x4_t f32vl = vcvtq_f32_s32(vmovl_s16(i16lo));
        float32x4_t f32vh = vcvtq_f32_s32(vmovl_s16(i16hi));

        vst1q_f32(&p_output[0], vmulq_n_f32(f32vl, p_scale));
        vst1q_f32(&p_output[4], vmulq_n_f32(f32vh, p_scale));

        p_source += 8;
        p_output += 8;
    }

    noopt_convert_from_int16( p_source, rem, p_output, p_scale );
}
#ifdef AUDIO_MATH_NEON_FLOAT64
inline static void neon_convert_to_int16(const double* __restrict p_source, t_size p_count, int16_t* __restrict p_output, double p_scale)
{
    size_t num = p_count / 4;
    size_t rem = p_count % 4;
    for (; num; --num) {
        float64x2_t f64lo = vld1q_f64(p_source);
        float64x2_t f64hi = vld1q_f64(p_source + 2);

        f64lo = vmulq_n_f64(f64lo, p_scale);
        f64hi = vmulq_n_f64(f64hi, p_scale);

        int64x2_t lo64 = vcvtnq_s64_f64(f64lo);
        int64x2_t hi64 = vcvtnq_s64_f64(f64hi);

        int32x4_t v32 = vcombine_s32(vqmovn_s64(lo64), vqmovn_s64(hi64));

        vst1_s16(&p_output[0], vqmovn_s32(v32));

        p_source += 4;
        p_output += 4;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}

inline static void neon_convert_from_int16(const t_int16* __restrict p_source, t_size p_count, double* __restrict p_output, double p_scale)
{
    size_t num = p_count / 4;
    size_t rem = p_count % 4;
    for (; num; num--) {
        int32x4_t i32 = vmovl_s16(vld1_s16(p_source));

        int64x2_t lo64 = vmovl_s32(vget_low_s32(i32));
        int64x2_t hi64 = vmovl_s32(vget_high_s32(i32));

        float64x2_t f64vl = vcvtq_f64_s64(lo64);
        float64x2_t f64vh = vcvtq_f64_s64(hi64);

        vst1q_f64(&p_output[0], vmulq_n_f64(f64vl, p_scale));
        vst1q_f64(&p_output[2], vmulq_n_f64(f64vh, p_scale));

        p_source += 4;
        p_output += 4;
    }

    noopt_convert_from_int16(p_source, rem, p_output, p_scale);
}
#endif // AUDIO_MATH_NEON_FLOAT64

#endif // AUDIO_MATH_NEON

#if defined(AUDIO_MATH_SSE)

inline void convert_to_32bit_sse2(const float* p_src, size_t numTotal, t_int32* p_dst, float p_mul)
{

    // Implementation notes
    // There doesn't seem to be a nice and tidy way to convert float to int32 with graceful clipping to the INT32_MIN .. INT32_MAX range.
    // While low clipping at INT32_MIN can be accomplished with _mm_max_ps(), high clipping needs a float compare THEN substituting the bad int with INT32_MAX.
    // The best we could do with _mm_min_ps() would result in high clipping at 0x7FFFFF80 instead of 0x7FFFFFFF (INT32_MAX).
    // We store masks from the float compare and fix the ints according to the mask afterwards.
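    // Worked example (added for clarity): INT32_MAX is not representable as a float;
    // (float)INT32_MAX rounds up to 2147483648.0f == 2^31. The largest float strictly
    // below 2^31 is 2147483520.0f == 0x7FFFFF80, hence the clipping point quoted above.
    // _mm_cvtps_epi32 turns any out-of-range input into 0x80000000 ("integer
    // indefinite"), which reads as INT32_MIN; that is harmless for low clipping but
    // exactly wrong for high clipping, so the mask fixup below is required.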

    __m128 mul = _mm_set1_ps(p_mul);
    __m128 loF = _mm_set1_ps((float)INT32_MIN);
    __m128 hiF = _mm_set1_ps((float)INT32_MAX);
    // __m128i loI = _mm_set1_epi32(INT32_MIN);
    __m128i hiI = _mm_set1_epi32(INT32_MAX);

    size_t num = numTotal / 4;
    size_t rem = numTotal % 4;
    for (; num; --num) {
        __m128 s = _mm_mul_ps(mul, _mm_loadu_ps(p_src)); p_src += 4;

        s = _mm_max_ps(s, loF);

        // __m128i maskLo = _mm_castps_si128(_mm_cmple_ps(s, loF));
        __m128i maskHi = _mm_castps_si128(_mm_cmpge_ps(s, hiF));

        __m128i i = _mm_cvtps_epi32(s);

        // i = _mm_or_si128(_mm_andnot_si128(maskLo, i), _mm_and_si128(loI, maskLo));
        i = _mm_or_si128(_mm_andnot_si128(maskHi, i), _mm_and_si128(hiI, maskHi));

        _mm_storeu_si128((__m128i*) p_dst, i); p_dst += 4;
    }

    for (; rem; --rem) {
        __m128 s = _mm_mul_ss(_mm_load_ss(p_src++), mul);
        s = _mm_max_ss(s, loF);

        // __m128i maskLo = _mm_castps_si128( _mm_cmple_ss(s, loF) );
        __m128i maskHi = _mm_castps_si128(_mm_cmpge_ss(s, hiF));

        __m128i i = _mm_cvtps_epi32(s); // full-width cvtps, not ss; only lane 0 is stored below

        // i = _mm_or_si128(_mm_andnot_si128(maskLo, i), _mm_and_si128(loI, maskLo));
        i = _mm_or_si128(_mm_andnot_si128(maskHi, i), _mm_and_si128(hiI, maskHi));

        _mm_storeu_si32(p_dst++, i);
    }
}

inline void convert_to_32bit_sse2(const double* p_src, size_t numTotal, t_int32* p_dst, double p_mul)
{
    // Unlike in the float version above, INT32_MIN and INT32_MAX are both exactly
    // representable in double, so plain _mm_min_pd/_mm_max_pd clipping suffices here.
    auto mul = _mm_set1_pd(p_mul);
    auto loF = _mm_set1_pd(INT32_MIN);
    auto hiF = _mm_set1_pd(INT32_MAX);

    size_t num = numTotal / 4;
    size_t rem = numTotal % 4;
    for (; num; --num) {
        auto v1 = _mm_loadu_pd(p_src);
        auto v2 = _mm_loadu_pd(p_src + 2);
        p_src += 4;

        v1 = _mm_mul_pd(v1, mul); v2 = _mm_mul_pd(v2, mul);

        v1 = _mm_max_pd(v1, loF); v2 = _mm_max_pd(v2, loF);
        v1 = _mm_min_pd(v1, hiF); v2 = _mm_min_pd(v2, hiF);

        auto i1 = _mm_cvtpd_epi32(v1), i2 = _mm_cvtpd_epi32(v2);

        _mm_storeu_si128((__m128i*) p_dst, _mm_unpacklo_epi64(i1, i2)); p_dst += 4;
    }

    for (; rem; --rem) {
        auto s = _mm_mul_sd(_mm_load_sd(p_src++), mul);
        s = _mm_max_sd(s, loF); s = _mm_min_sd(s, hiF);
        *p_dst++ = _mm_cvtsd_si32(s);
    }
}

inline void convert_from_int16_sse2(const t_int16 * p_source,t_size p_count,float * p_output,float p_scale)
{
    while(!pfc::is_ptr_aligned_t<16>(p_output) && p_count) {
        *(p_output++) = (float)*(p_source++) * p_scale;
        p_count--;
    }

    {
        __m128 mul = _mm_set1_ps(p_scale);
        __m128i nulls = _mm_setzero_si128();
        __m128i delta1 = _mm_set1_epi16((int16_t)0x8000);
        __m128i delta2 = _mm_set1_epi32((int32_t)0x8000);

        for(t_size loop = p_count >> 3;loop;--loop) {
            __m128i source, temp1, temp2; __m128 float1, float2;
            source = _mm_loadu_si128((__m128i*)p_source);
            source = _mm_xor_si128(source,delta1);
            temp1 = _mm_unpacklo_epi16(source,nulls);
            temp2 = _mm_unpackhi_epi16(source,nulls);
            temp1 = _mm_sub_epi32(temp1,delta2);
            temp2 = _mm_sub_epi32(temp2,delta2);
            p_source += 8;
            float1 = _mm_cvtepi32_ps(temp1);
            float2 = _mm_cvtepi32_ps(temp2);
            float1 = _mm_mul_ps(float1,mul);
            float2 = _mm_mul_ps(float2,mul);
            _mm_store_ps(p_output,float1);
            _mm_store_ps(p_output+4,float2);
            p_output += 8;
        }

        p_count &= 7;
    }

    while(p_count) {
        *(p_output++) = (float)*(p_source++) * p_scale;
        p_count--;
    }
}
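// For reference (added): the xor/unpack/subtract sequence above is a branch-free
// sign extension predating SSE4.1's _mm_cvtepi16_epi32. XORing with 0x8000 biases
// each int16 into unsigned range, the unpack against zeros widens it, and
// subtracting 0x8000 removes the bias; e.g. -1 (0xFFFF) -> 0x7FFF -> 0x00007FFF ->
// 0x7FFF - 0x8000 = -1 again, now as a full int32.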

inline static void convert_to_16bit_sse2(const float * p_source,t_size p_count,t_int16 * p_output,float p_scale)
{
    size_t num = p_count/8;
    size_t rem = p_count%8;
    __m128 mul = _mm_set1_ps(p_scale);
    for(;num;--num)
    {
        __m128 temp1,temp2; __m128i itemp1, itemp2;
        temp1 = _mm_loadu_ps(p_source);
        temp2 = _mm_loadu_ps(p_source+4);
        temp1 = _mm_mul_ps(temp1,mul);
        temp2 = _mm_mul_ps(temp2,mul);
        p_source += 8;
        itemp1 = _mm_cvtps_epi32(temp1);
        itemp2 = _mm_cvtps_epi32(temp2);
        _mm_storeu_si128( (__m128i*)p_output, _mm_packs_epi32(itemp1, itemp2) );
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}

inline static void convert_to_16bit_sse2(const double* p_source, t_size p_count, t_int16* p_output, double p_scale)
{
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    __m128d mul = _mm_set1_pd(p_scale);
    for (; num; --num)
    {
        __m128d temp1, temp2, temp3, temp4; __m128i itemp1, itemp2;
        temp1 = _mm_loadu_pd(p_source);
        temp2 = _mm_loadu_pd(p_source + 2);
        temp3 = _mm_loadu_pd(p_source + 4);
        temp4 = _mm_loadu_pd(p_source + 6);

        p_source += 8;

        temp1 = _mm_mul_pd(temp1, mul);
        temp2 = _mm_mul_pd(temp2, mul);
        temp3 = _mm_mul_pd(temp3, mul);
        temp4 = _mm_mul_pd(temp4, mul);

        itemp1 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(temp1), _mm_cvtpd_epi32(temp2));
        itemp2 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(temp3), _mm_cvtpd_epi32(temp4));

        _mm_storeu_si128((__m128i*)p_output, _mm_packs_epi32(itemp1, itemp2));
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}
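// For reference (added): neither 16-bit path above clips explicitly because
// _mm_packs_epi32 narrows with signed saturation, clamping anything outside
// [-32768, 32767]. That holds as long as the scaled sample still fits in int32
// after conversion; wildly out-of-range input would wrap through cvtps's
// 0x80000000 indefinite value, the same tradeoff the int24 writers below call
// out in their PROBLEM comments.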
#if allowAVX
inline static void avx_convert_to_16bit(const double* p_source, size_t p_count, int16_t* p_output, double p_scale) {
    size_t num = p_count / 8;
    size_t rem = p_count % 8;
    auto mul = _mm256_set1_pd(p_scale);
    for (; num; --num)
    {
        auto temp1 = _mm256_loadu_pd(p_source);
        auto temp2 = _mm256_loadu_pd(p_source + 4);

        p_source += 8;

        temp1 = _mm256_mul_pd(temp1, mul);
        temp2 = _mm256_mul_pd(temp2, mul);

        auto itemp1 = _mm256_cvtpd_epi32(temp1);
        auto itemp2 = _mm256_cvtpd_epi32(temp2);

        _mm_storeu_si128((__m128i*)p_output, _mm_packs_epi32(itemp1, itemp2));
        p_output += 8;
    }

    noopt_convert_to_16bit(p_source, rem, p_output, p_scale);
}
#endif

inline float sse_calculate_peak( const float * src, size_t count ) {
    size_t num = count/8;
    size_t rem = count%8;

    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    __m128 acc1 = _mm_setzero_ps(), acc2 = _mm_setzero_ps();

    for(;num;--num) {
        __m128 v1 = _mm_loadu_ps( src );
        __m128 v2 = _mm_loadu_ps( src + 4 );
        v1 = _mm_and_ps( v1, mask );
        v2 = _mm_and_ps( v2, mask );
        // Two acc channels so one _mm_max_ps doesn't block the other
        acc1 = _mm_max_ps( acc1, v1 );
        acc2 = _mm_max_ps( acc2, v2 );
        src += 8;
    }

    float ret;
    {
        float blah[4];
        _mm_storeu_ps(blah, _mm_max_ps( acc1, acc2 ));
        __m128 acc = _mm_load_ss( &blah[0] );
        acc = _mm_max_ss( acc, _mm_load_ss( &blah[1] ) );
        acc = _mm_max_ss( acc, _mm_load_ss( &blah[2] ) );
        acc = _mm_max_ss( acc, _mm_load_ss( &blah[3] ) );
        ret = _mm_cvtss_f32(acc);
    }
    if ( rem > 0 ) {
        __m128 acc = _mm_set_ss( ret );
        for( ;rem; --rem) {
            __m128 v = _mm_load_ss( src++ );
            v = _mm_and_ps( v, mask );
            acc = _mm_max_ss( acc, v );
        }
        ret = _mm_cvtss_f32(acc);
    }
    return ret;
}
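// For reference (added): ANDing with 0x7FFFFFFF clears the IEEE 754 sign bit,
// which is a branchless fabsf; e.g. -3.0f (0xC0400000) & 0x7FFFFFFF = 0x40400000
// = 3.0f. The 64-bit peak scanners below use the same trick with
// 0x7FFFFFFFFFFFFFFF.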

#if allowAVX
inline double avx_calculate_peak(const double* src, size_t count) {
    size_t num = count / 8;
    size_t rem = count % 8;

    auto mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
    auto acc1 = _mm256_setzero_pd(), acc2 = _mm256_setzero_pd();

    for (; num; --num) {
        auto v1 = _mm256_loadu_pd(src);
        auto v2 = _mm256_loadu_pd(src + 4);

        v1 = _mm256_and_pd(v1, mask);
        v2 = _mm256_and_pd(v2, mask);

        acc1 = _mm256_max_pd(acc1, v1);
        acc2 = _mm256_max_pd(acc2, v2);

        src += 8;
    }

    __m128d acc;
    {
        acc1 = _mm256_max_pd(acc1, acc2);

        acc = _mm_max_pd(_mm256_extractf128_pd(acc1, 0), _mm256_extractf128_pd(acc1, 1));

        acc = _mm_max_sd(acc, _mm_shuffle_pd(acc, acc, _MM_SHUFFLE2(0, 1)));
    }

    if (rem > 0) {
        __m128d mask128 = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
        for (; rem; --rem) {
            __m128d v = _mm_load_sd(src++);
            v = _mm_and_pd(v, mask128);
            acc = _mm_max_sd(acc, v);
        }
    }
    return _mm_cvtsd_f64(acc);
}
#endif // allowAVX

inline double sse_calculate_peak(const double* src, size_t count) {
    size_t num = count / 4;
    size_t rem = count % 4;

    __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFFFFFFFFFFFFFF));
    __m128d acc1 = _mm_setzero_pd(), acc2 = _mm_setzero_pd();

    for (; num; --num) {
        __m128d v1 = _mm_loadu_pd(src);
        __m128d v2 = _mm_loadu_pd(src + 2);
        v1 = _mm_and_pd(v1, mask);
        v2 = _mm_and_pd(v2, mask);
        // Two acc channels so one _mm_max_pd doesn't block the other
        acc1 = _mm_max_pd(acc1, v1);
        acc2 = _mm_max_pd(acc2, v2);
        src += 4;
    }

    {
        acc1 = _mm_max_pd(acc1, acc2);
        acc1 = _mm_max_sd(acc1, _mm_shuffle_pd(acc1, acc1, _MM_SHUFFLE2(0, 1)));
    }
    if (rem > 0) {
        for (; rem; --rem) {
            __m128d v = _mm_load_sd(src++);
            v = _mm_and_pd(v, mask);
            acc1 = _mm_max_sd(acc1, v);
        }
    }
    return _mm_cvtsd_f64(acc1);
}

inline void sse_convert_from_int32(const int32_t* source, size_t count, float* output, float scale) {
    __m128 mul = _mm_set1_ps(scale);
    for (size_t num = count/8; num; --num)
    {
        __m128i itemp1, itemp2; __m128 temp1, temp2;
        itemp1 = _mm_loadu_si128((__m128i*)source);
        itemp2 = _mm_loadu_si128((__m128i*)source + 1);
        temp1 = _mm_cvtepi32_ps(itemp1);
        temp2 = _mm_cvtepi32_ps(itemp2);
        source += 8;
        temp1 = _mm_mul_ps(temp1, mul);
        temp2 = _mm_mul_ps(temp2, mul);
        _mm_storeu_ps(output, temp1);
        _mm_storeu_ps(output + 4, temp2);
        output += 8;
    }
    for (size_t rem = count % 8; rem; --rem) {
        __m128i i = _mm_loadu_si32(source++);
        __m128 f = _mm_cvtepi32_ps(i);
        f = _mm_mul_ss(f, mul);
        _mm_store_ss(output++, f);
    }
}

inline void sse_convert_from_int32(const int32_t* source, size_t count, double* output, double scale) {
    auto mul = _mm_set1_pd(scale);
    for (size_t num = count / 8; num; --num)
    {
        auto itemp1 = _mm_loadu_si128((__m128i*)source);
        auto itemp2 = _mm_loadu_si128((__m128i*)source + 1);
        auto temp1 = _mm_cvtepi32_pd(itemp1);
        auto temp2 = _mm_cvtepi32_pd(_mm_shuffle_epi32(itemp1, _MM_SHUFFLE(1, 0, 3, 2)));
        auto temp3 = _mm_cvtepi32_pd(itemp2);
        auto temp4 = _mm_cvtepi32_pd(_mm_shuffle_epi32(itemp2, _MM_SHUFFLE(1, 0, 3, 2)));
        source += 8;
        temp1 = _mm_mul_pd(temp1, mul);
        temp2 = _mm_mul_pd(temp2, mul);
        temp3 = _mm_mul_pd(temp3, mul);
        temp4 = _mm_mul_pd(temp4, mul);
        _mm_storeu_pd(output, temp1);
        _mm_storeu_pd(output + 2, temp2);
        _mm_storeu_pd(output + 4, temp3);
        _mm_storeu_pd(output + 6, temp4);
        output += 8;
    }
    for (size_t rem = count % 8; rem; --rem) {
        __m128i i = _mm_loadu_si32(source++);
        auto f = _mm_cvtepi32_pd(i);
        f = _mm_mul_sd(f, mul);
        _mm_store_sd(output++, f);
    }
}
#if allowAVX
inline void convert_from_int16_avx(const t_int16* p_source, t_size p_count, double* p_output, double p_scale) {
    while (!pfc::is_ptr_aligned_t<32>(p_output) && p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }

    {
        __m256d muld = _mm256_set1_pd(p_scale);

        for (t_size loop = p_count >> 3; loop; --loop) {
            auto source = _mm_loadu_si128((__m128i*)p_source);
            auto temp1 = _mm_cvtepi16_epi32(source);
            auto temp2 = _mm_cvtepi16_epi32(_mm_shuffle_epi32(source, _MM_SHUFFLE(0, 0, 3, 2)));
            p_source += 8;

            auto double1 = _mm256_cvtepi32_pd(temp1);
            auto double2 = _mm256_cvtepi32_pd(temp2);

            double1 = _mm256_mul_pd(double1, muld);
            double2 = _mm256_mul_pd(double2, muld);

            _mm256_store_pd(p_output, double1);
            _mm256_store_pd(p_output+4, double2);

            p_output += 8;
        }

        p_count &= 7;
    }

    while (p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }
}
#endif // allowAVX

inline void convert_from_int16_sse2(const t_int16* p_source, t_size p_count, double * p_output, double p_scale)
{
    while (!pfc::is_ptr_aligned_t<16>(p_output) && p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }

    {
        __m128d muld = _mm_set1_pd(p_scale);
        __m128i nulls = _mm_setzero_si128();
        __m128i delta1 = _mm_set1_epi16((int16_t)0x8000);
        __m128i delta2 = _mm_set1_epi32((int32_t)0x8000);

        for (t_size loop = p_count >> 3; loop; --loop) {
            __m128i source, temp1, temp2; __m128d double1, double2, double3, double4;
            source = _mm_loadu_si128((__m128i*)p_source);
            source = _mm_xor_si128(source, delta1);
            temp1 = _mm_unpacklo_epi16(source, nulls);
            temp2 = _mm_unpackhi_epi16(source, nulls);
            temp1 = _mm_sub_epi32(temp1, delta2);
            temp2 = _mm_sub_epi32(temp2, delta2);
            p_source += 8;

            double1 = _mm_cvtepi32_pd(temp1);
            double2 = _mm_cvtepi32_pd(_mm_shuffle_epi32(temp1, _MM_SHUFFLE(3, 2, 3, 2)));
            double3 = _mm_cvtepi32_pd(temp2);
            double4 = _mm_cvtepi32_pd(_mm_shuffle_epi32(temp2, _MM_SHUFFLE(3, 2, 3, 2)));

            double1 = _mm_mul_pd(double1, muld);
            double2 = _mm_mul_pd(double2, muld);
            double3 = _mm_mul_pd(double3, muld);
            double4 = _mm_mul_pd(double4, muld);
            _mm_store_pd(p_output, double1);
            _mm_store_pd(p_output + 2, double2);
            _mm_store_pd(p_output + 4, double3);
            _mm_store_pd(p_output + 6, double4);

            p_output += 8;
        }

        p_count &= 7;
    }

    while (p_count) {
        *(p_output++) = (double)*(p_source++) * p_scale;
        p_count--;
    }
}

#endif // AUDIO_MATH_SSE

namespace pfc {

void audio_math::scale(const float* p_source, size_t p_count, float* p_output, float p_scale) {
#if defined( AUDIO_MATH_NEON )
    neon_scale(p_source, p_count, p_output, p_scale);
#else
    noopt_scale(p_source, p_count, p_output, p_scale);
#endif
}
void audio_math::scale(const double* p_source, size_t p_count, double* p_output, double p_scale) {
    noopt_scale(p_source, p_count, p_output, p_scale);
}

void audio_math::convert_to_int16(const float* p_source, t_size p_count, t_int16* p_output, float p_scale)
{
    float scale = (float)(p_scale * 0x8000);
#if defined(AUDIO_MATH_SSE)
    convert_to_16bit_sse2(p_source, p_count, p_output, scale);
#elif defined( AUDIO_MATH_NEON )
    neon_convert_to_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_16bit(p_source, p_count, p_output, scale);
#endif
}
void audio_math::convert_to_int16(const double* p_source, t_size p_count, t_int16* p_output, double p_scale)
{
    double scale = (double)(p_scale * 0x8000);
#if defined(AUDIO_MATH_SSE)
#if allowAVX
    if (haveAVX) {
        avx_convert_to_16bit(p_source, p_count, p_output, scale);
    } else
#endif // allowAVX
    {
        convert_to_16bit_sse2(p_source, p_count, p_output, scale);
    }
#elif defined( AUDIO_MATH_NEON_FLOAT64 )
    neon_convert_to_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_16bit(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int16(const t_int16* p_source, t_size p_count, float* p_output, float p_scale)
{
    float scale = (float)(p_scale / (double)0x8000);
#if defined(AUDIO_MATH_SSE)
    convert_from_int16_sse2(p_source, p_count, p_output, scale);
#elif defined( AUDIO_MATH_NEON )
    neon_convert_from_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int16(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int16(const t_int16* p_source, t_size p_count, double* p_output, double p_scale)
{
    double scale = (double)(p_scale / (double)0x8000);
#if defined(AUDIO_MATH_SSE)
#if allowAVX
    if (haveAVX) {
        convert_from_int16_avx(p_source, p_count, p_output, scale);
    } else
#endif
    {
        convert_from_int16_sse2(p_source, p_count, p_output, scale);
    }
#elif defined( AUDIO_MATH_NEON_FLOAT64 )
    neon_convert_from_int16(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int16(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_to_int32(const float* p_source, t_size p_count, t_int32* p_output, float p_scale)
{
    float scale = (float)(p_scale * 0x80000000ul);
#if defined(AUDIO_MATH_NEON)
    neon_convert_to_int32(p_source, p_count, p_output, scale);
#elif defined(AUDIO_MATH_SSE)
    convert_to_32bit_sse2(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_32bit(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_to_int32(const double* p_source, t_size p_count, t_int32* p_output, double p_scale)
{
    double scale = (double)(p_scale * 0x80000000ul);
#if defined(AUDIO_MATH_SSE)
    convert_to_32bit_sse2(p_source, p_count, p_output, scale);
#else
    noopt_convert_to_32bit(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int32(const t_int32* p_source, t_size p_count, float* p_output, float p_scale)
{
    float scale = (float)(p_scale / (double)0x80000000ul);
    // Note: speed difference here is marginal over compiler output as of Xcode 12
#if defined(AUDIO_MATH_NEON)
    neon_convert_from_int32(p_source, p_count, p_output, scale);
#elif defined(AUDIO_MATH_SSE)
    sse_convert_from_int32(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int32(p_source, p_count, p_output, scale);
#endif
}

void audio_math::convert_from_int32(const t_int32* p_source, t_size p_count, double* p_output, double p_scale)
{
    double scale = (double)(p_scale / (double)0x80000000ul);
#if defined(AUDIO_MATH_SSE)
    sse_convert_from_int32(p_source, p_count, p_output, scale);
#else
    noopt_convert_from_int32(p_source, p_count, p_output, scale);
#endif
}

float audio_math::calculate_peak(const float * p_source, t_size p_count) {
#if defined(AUDIO_MATH_SSE)
    return sse_calculate_peak(p_source, p_count);
#elif defined(AUDIO_MATH_NEON)
    return neon_calculate_peak(p_source, p_count);
#else
    return noopt_calculate_peak(p_source, p_count);
#endif
}
double audio_math::calculate_peak(const double * p_source, t_size p_count) {
#if defined(AUDIO_MATH_SSE)
    // Note that avx_calculate_peak failed to score better than sse_calculate_peak
    return sse_calculate_peak(p_source, p_count);
#else
    return noopt_calculate_peak(p_source, p_count);
#endif
}

void audio_math::remove_denormals(float* p_buffer, t_size p_count) {
    t_uint32* ptr = reinterpret_cast<t_uint32*>(p_buffer);
    for (; p_count; p_count--)
    {
        t_uint32 t = *ptr;
        if ((t & 0x007FFFFF) && !(t & 0x7F800000)) *ptr = 0;
        ptr++;
    }
}
void audio_math::remove_denormals(double* p_buffer, t_size p_count) {
    t_uint64* ptr = reinterpret_cast<t_uint64*>(p_buffer);
    for (; p_count; p_count--)
    {
        t_uint64 t = *ptr;
        if ((t & 0x000FFFFFFFFFFFFF) && !(t & 0x7FF0000000000000)) *ptr = 0;
        ptr++;
    }
}
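// For reference (added): a value is subnormal exactly when its biased exponent
// field is all zeros while its mantissa field is not, which is what the two bit
// tests check; e.g. 0x00000001 (the smallest float subnormal) is flushed to 0,
// while 0x00800000 (FLT_MIN, exponent field 1) passes through untouched. Negative
// subnormals flush to +0 since the whole word is zeroed.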

void audio_math::add_offset(float* p_buffer, float p_delta, size_t p_count) {
    for (size_t n = 0; n < p_count; ++n) p_buffer[n] += p_delta;
}
void audio_math::add_offset(double* p_buffer, double p_delta, size_t p_count) {
    for (size_t n = 0; n < p_count; ++n) p_buffer[n] += p_delta;
}

void audio_math::convert(const float* in, float* out, size_t count) {
    memcpy(out, in, count * sizeof(float));
}
void audio_math::convert(const float* in, float* out, size_t count, float scale) {
    audio_math::scale(in, count, out, scale);
}
void audio_math::convert(const double* in, double* out, size_t count) {
    memcpy(out, in, count * sizeof(double));
}
void audio_math::convert(const double* in, double* out, size_t count, double scale) {
    audio_math::scale(in, count, out, scale);
}

void audio_math::convert(const float* in, double* out, size_t count) {
    // optimize me
    noopt_convert(in, out, count);
}
void audio_math::convert(const float* in, double* out, size_t count, double scale) {
    // optimize me
    noopt_scale(in, count, out, scale);
}
void audio_math::convert(const double* in, float* out, size_t count) {
    // optimize me
    noopt_convert(in, out, count);
}
void audio_math::convert(const double* in, float* out, size_t count, double scale) {
    // optimize me
    noopt_scale(in, count, out, scale);
}


typedef char store24_t;
static store24_t* store24(store24_t* out, int32_t in) {
    *(out++) = ((store24_t*)&in)[0];
    *(out++) = ((store24_t*)&in)[1];
    *(out++) = ((store24_t*)&in)[2];
    return out;
}
static store24_t* store24p(store24_t* out, int32_t in) {
    *(int32_t*)out = in;
    return out + 3;
}
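// For reference (added): both helpers assume little-endian byte order. store24p
// stores a full 4-byte int32 but advances only 3 bytes, so successive writes
// overwrite the stray high byte; this is faster than three byte stores but spills
// one byte past the current sample. convert_to_int24_noopt below therefore peels
// off the final sample and writes it with the exact 3-byte store24 so the last
// write stays inside the output buffer.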

static constexpr int32_t INT24_MAX = 0x7FFFFF, INT24_MIN = -0x800000;

template<typename float_t> void convert_to_int24_noopt(float_t const* in, size_t count, void* out, float_t scale) {
    if (count == 0) return;
    --count;
    auto ptr = reinterpret_cast<store24_t*>(out);
    constexpr float_t lo = INT24_MIN, hi = INT24_MAX;
    while (count) {
        auto vf = *in++ * scale;
        if (vf < lo) vf = lo;
        else if (vf > hi) vf = hi;
        ptr = store24p(ptr, audio_math::rint32(vf));
        --count;
    }

    auto vf = *in * scale;
    if (vf < lo) vf = lo;
    else if (vf > hi) vf = hi;
    store24(ptr, audio_math::rint32(vf));
}
#ifdef AUDIO_MATH_SSE
#if allowAVX
static void f64_to_i24_avx(double const* in, size_t n, uint8_t* out, double scale) {
    const __m128i pi0 = _mm_set_epi8(-128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    const __m128i pi1 = _mm_set_epi8(4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5);
    const __m128i pi2 = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10);
    const __m128i pi3 = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128);
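    // For reference (added): each shuffle mask keeps the low three bytes of every
    // 32-bit lane (byte indices 0,1,2 / 4,5,6 / 8,9,10 / 12,13,14; -128 zeroes a
    // byte) and pre-rotates the 12 surviving bytes so that three _mm_blend_epi16
    // merges can assemble sixteen 24-bit samples into three contiguous 16-byte
    // stores. The SSE4.1 variants below reuse the same masks.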
    const auto mul = _mm256_set1_pd(scale);

    // PROBLEM: if we want to handle wildly out-of-bounds values, we can't do int clipping!
    // float clipping is sadly considerably slower than int clipping
    const auto lo = _mm256_set1_pd(INT24_MIN);
    const auto hi = _mm256_set1_pd(INT24_MAX);

    while (n >= 4 * 4) {
        auto f0 = _mm256_mul_pd(_mm256_loadu_pd(in + 0), mul);
        auto f1 = _mm256_mul_pd(_mm256_loadu_pd(in + 4), mul);
        auto f2 = _mm256_mul_pd(_mm256_loadu_pd(in + 8), mul);
        auto f3 = _mm256_mul_pd(_mm256_loadu_pd(in + 12), mul);
        f0 = _mm256_max_pd(_mm256_min_pd(f0, hi), lo);
        f1 = _mm256_max_pd(_mm256_min_pd(f1, hi), lo);
        f2 = _mm256_max_pd(_mm256_min_pd(f2, hi), lo);
        f3 = _mm256_max_pd(_mm256_min_pd(f3, hi), lo);
        __m128i w0 = _mm256_cvtpd_epi32(f0);
        __m128i w1 = _mm256_cvtpd_epi32(f1);
        __m128i w2 = _mm256_cvtpd_epi32(f2);
        __m128i w3 = _mm256_cvtpd_epi32(f3);

        // _mm_shuffle_epi8 : SSSE3
        w0 = _mm_shuffle_epi8(w0, pi0);
        w1 = _mm_shuffle_epi8(w1, pi1);
        w2 = _mm_shuffle_epi8(w2, pi2);
        w3 = _mm_shuffle_epi8(w3, pi3);

        // _mm_blend_epi16 : SSE4.1
        __m128i u0 = _mm_blend_epi16(w0, w1, 0xC0);
        __m128i u1 = _mm_blend_epi16(w1, w2, 0xF0);
        __m128i u2 = _mm_blend_epi16(w2, w3, 0xFC);

        _mm_storeu_si128((__m128i*)(out + 0), u0);
        _mm_storeu_si128((__m128i*)(out + 16), u1);
        _mm_storeu_si128((__m128i*)(out + 32), u2);

        in += 4 * 4;
        out += 16 * 3;
        n -= 4 * 4;
    }

    convert_to_int24_noopt(in, n, out, scale);
}
#endif // allowAVX
static void f64_to_i24_sse41(double const* in, size_t n, uint8_t* out, double scale) {
    const __m128i pi0 = _mm_set_epi8(-128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    const __m128i pi1 = _mm_set_epi8(4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5);
    const __m128i pi2 = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10);
    const __m128i pi3 = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128);
    const auto mul = _mm_set1_pd(scale);

    // PROBLEM: if we want to handle wildly out-of-bounds values, we can't do int clipping!
    // float clipping is sadly considerably slower than int clipping
    const auto lo = _mm_set1_pd(INT24_MIN);
    const auto hi = _mm_set1_pd(INT24_MAX);

    while (n >= 4 * 4) {
        auto f0 = _mm_mul_pd(_mm_loadu_pd(in + 0), mul);
        auto f1 = _mm_mul_pd(_mm_loadu_pd(in + 2), mul);
        auto f2 = _mm_mul_pd(_mm_loadu_pd(in + 4), mul);
        auto f3 = _mm_mul_pd(_mm_loadu_pd(in + 6), mul);
        auto f4 = _mm_mul_pd(_mm_loadu_pd(in + 8), mul);
        auto f5 = _mm_mul_pd(_mm_loadu_pd(in + 10), mul);
        auto f6 = _mm_mul_pd(_mm_loadu_pd(in + 12), mul);
        auto f7 = _mm_mul_pd(_mm_loadu_pd(in + 14), mul);
        f0 = _mm_max_pd(_mm_min_pd(f0, hi), lo);
        f1 = _mm_max_pd(_mm_min_pd(f1, hi), lo);
        f2 = _mm_max_pd(_mm_min_pd(f2, hi), lo);
        f3 = _mm_max_pd(_mm_min_pd(f3, hi), lo);
        f4 = _mm_max_pd(_mm_min_pd(f4, hi), lo);
        f5 = _mm_max_pd(_mm_min_pd(f5, hi), lo);
        f6 = _mm_max_pd(_mm_min_pd(f6, hi), lo);
        f7 = _mm_max_pd(_mm_min_pd(f7, hi), lo);

        __m128i w0 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f0), _mm_cvtpd_epi32(f1));
        __m128i w1 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f2), _mm_cvtpd_epi32(f3));
        __m128i w2 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f4), _mm_cvtpd_epi32(f5));
        __m128i w3 = _mm_unpacklo_epi64(_mm_cvtpd_epi32(f6), _mm_cvtpd_epi32(f7));

        // _mm_shuffle_epi8 : SSSE3
        w0 = _mm_shuffle_epi8(w0, pi0);
        w1 = _mm_shuffle_epi8(w1, pi1);
        w2 = _mm_shuffle_epi8(w2, pi2);
        w3 = _mm_shuffle_epi8(w3, pi3);

        // _mm_blend_epi16 : SSE4.1
        __m128i u0 = _mm_blend_epi16(w0, w1, 0xC0);
        __m128i u1 = _mm_blend_epi16(w1, w2, 0xF0);
        __m128i u2 = _mm_blend_epi16(w2, w3, 0xFC);

        _mm_storeu_si128((__m128i*)(out + 0), u0);
        _mm_storeu_si128((__m128i*)(out + 16), u1);
        _mm_storeu_si128((__m128i*)(out + 32), u2);

        in += 4 * 4;
        out += 16 * 3;
        n -= 4 * 4;
    }

    convert_to_int24_noopt(in, n, out, scale);
}
static void f32_to_i24_sse41(float const* in, size_t n, uint8_t* out, float scale) {
    const __m128i pi0 = _mm_set_epi8(-128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
    const __m128i pi1 = _mm_set_epi8(4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10, 9, 8, 6, 5);
    const __m128i pi2 = _mm_set_epi8(9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128, 14, 13, 12, 10);
    const __m128i pi3 = _mm_set_epi8(14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, -128, -128, -128, -128);
    const __m128 mul = _mm_set1_ps(scale);

    // PROBLEM: if we want to handle wildly out-of-bounds values, we can't do int clipping!
    // float clipping is sadly considerably slower than int clipping
    const auto lo = _mm_set1_ps(INT24_MIN);
    const auto hi = _mm_set1_ps(INT24_MAX);

    while (n >= 4 * 4) {
        auto f0 = _mm_mul_ps(_mm_loadu_ps(in + 0), mul);
        auto f1 = _mm_mul_ps(_mm_loadu_ps(in + 4), mul);
        auto f2 = _mm_mul_ps(_mm_loadu_ps(in + 8), mul);
        auto f3 = _mm_mul_ps(_mm_loadu_ps(in + 12), mul);
        f0 = _mm_min_ps(_mm_max_ps(f0, lo), hi);
        f1 = _mm_min_ps(_mm_max_ps(f1, lo), hi);
        f2 = _mm_min_ps(_mm_max_ps(f2, lo), hi);
        f3 = _mm_min_ps(_mm_max_ps(f3, lo), hi);
        __m128i w0 = _mm_cvtps_epi32(f0);
        __m128i w1 = _mm_cvtps_epi32(f1);
        __m128i w2 = _mm_cvtps_epi32(f2);
        __m128i w3 = _mm_cvtps_epi32(f3);

        // _mm_shuffle_epi8 : SSSE3
        w0 = _mm_shuffle_epi8(w0, pi0);
        w1 = _mm_shuffle_epi8(w1, pi1);
        w2 = _mm_shuffle_epi8(w2, pi2);
        w3 = _mm_shuffle_epi8(w3, pi3);

        // _mm_blend_epi16 : SSE4.1
        __m128i u0 = _mm_blend_epi16(w0, w1, 0xC0);
        __m128i u1 = _mm_blend_epi16(w1, w2, 0xF0);
        __m128i u2 = _mm_blend_epi16(w2, w3, 0xFC);

        _mm_storeu_si128((__m128i*)(out + 0), u0);
        _mm_storeu_si128((__m128i*)(out + 16), u1);
        _mm_storeu_si128((__m128i*)(out + 32), u2);

        in += 4 * 4;
        out += 16 * 3;
        n -= 4 * 4;
    }

    convert_to_int24_noopt(in, n, out, scale);
}

#endif // AUDIO_MATH_SSE

void audio_math::convert_to_int24(const float* in, size_t count, void* out, float scale) {
    scale *= 0x800000;

#ifdef AUDIO_MATH_SSE
    if (haveSSE41) {
        f32_to_i24_sse41(in, count, (uint8_t*)out, scale); return;
    }
#endif
    convert_to_int24_noopt(in, count, out, scale);
}
void audio_math::convert_to_int24(const double* in, size_t count, void* out, double scale) {
    scale *= 0x800000;
#ifdef AUDIO_MATH_SSE
#if allowAVX
    if (haveAVX) {
        f64_to_i24_avx(in, count, (uint8_t*)out, scale); return;
    }
#endif // allowAVX
    if (haveSSE41) {
        f64_to_i24_sse41(in, count, (uint8_t*)out, scale); return;
    }
#endif // AUDIO_MATH_SSE
    convert_to_int24_noopt(in, count, out, scale);
}

}
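
A minimal usage sketch (not part of the check-in; the call below matches the dispatcher signatures above, everything around it is illustrative):

    #include "pfc-lite.h" // assumed to pull in the audio_math declarations

    // Write n float samples as 16-bit PCM; scale 1.0 maps full-scale +/-1.0 to
    // the full int16 range (the dispatcher multiplies by 0x8000 internally and
    // picks the fastest available path: SSE2/AVX, NEON, or scalar).
    void render_to_s16(const float* samples, size_t n, t_int16* out) {
        pfc::audio_math::convert_to_int16(samples, n, out, 1.0f);
    }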
