Merge changes I48db8476,Ieb222293,If3bb87da,I22ca2c60
* changes:
audio_utils: Biquad state space implementation
audio_utils: Biquad refactorization
audio_utils: Biquad test update to 2nd order IIR
audio_utils: Biquad update benchmark for 24 channels
diff --git a/audio_utils/benchmarks/biquad_filter_benchmark.cpp b/audio_utils/benchmarks/biquad_filter_benchmark.cpp
index 0a61593..e912b18 100644
--- a/audio_utils/benchmarks/biquad_filter_benchmark.cpp
+++ b/audio_utils/benchmarks/biquad_filter_benchmark.cpp
@@ -25,6 +25,8 @@
#include <audio_utils/BiquadFilter.h>
#include <audio_utils/format.h>
+#pragma GCC diagnostic ignored "-Wunused-function" // we use override array assignment
+
static constexpr size_t DATA_SIZE = 1024;
// The coefficients is a HPF with sampling frequency as 48000, center frequency as 600,
// and Q as 0.707. As all the coefficients are not zero, they can be used to benchmark
@@ -82,138 +84,327 @@
BENCHMARK(BM_BiquadFilter1D)->Apply(BiquadFilter1DArgs);
/*******************************************************************
- * A test result running on Pixel 4 for comparison.
- * The first parameter indicates the input data is subnormal or not.
- * 0 for normal input data, 1 for subnormal input data.
- * The second parameter indicates the channel count.
- * The third parameter indicates the occupancy of the coefficients.
- * -----------------------------------------------------------------
- * Benchmark Time CPU Iterations
- * -----------------------------------------------------------------
- * BM_BiquadFilter/0/1/1 734 ns 732 ns 740671
- * BM_BiquadFilter/0/1/2 554 ns 553 ns 1266836
- * BM_BiquadFilter/0/1/3 647 ns 645 ns 1085314
- * BM_BiquadFilter/0/1/4 834 ns 832 ns 841345
- * BM_BiquadFilter/0/1/5 1068 ns 1065 ns 657343
- * BM_BiquadFilter/0/1/6 767 ns 765 ns 915223
- * BM_BiquadFilter/0/1/7 929 ns 926 ns 756405
- * BM_BiquadFilter/0/1/8 2173 ns 2168 ns 322949
- * BM_BiquadFilter/0/1/9 1831 ns 1826 ns 383304
- * BM_BiquadFilter/0/1/10 1931 ns 1927 ns 363386
- * BM_BiquadFilter/0/1/11 2536 ns 2529 ns 276783
- * BM_BiquadFilter/0/1/12 2535 ns 2529 ns 276826
- * BM_BiquadFilter/0/1/13 2174 ns 2169 ns 322755
- * BM_BiquadFilter/0/1/14 3257 ns 3250 ns 215383
- * BM_BiquadFilter/0/1/15 2175 ns 2169 ns 322711
- * BM_BiquadFilter/0/1/16 1494 ns 1491 ns 469625
- * BM_BiquadFilter/0/1/17 2176 ns 2171 ns 322423
- * BM_BiquadFilter/0/1/18 1568 ns 1564 ns 447467
- * BM_BiquadFilter/0/1/19 1325 ns 1322 ns 529762
- * BM_BiquadFilter/0/1/20 1926 ns 1921 ns 364534
- * BM_BiquadFilter/0/1/21 2630 ns 2623 ns 266027
- * BM_BiquadFilter/0/1/22 1998 ns 1993 ns 351210
- * BM_BiquadFilter/0/1/23 1882 ns 1877 ns 373028
- * BM_BiquadFilter/0/1/24 2536 ns 2529 ns 276818
- * BM_BiquadFilter/0/1/25 2176 ns 2170 ns 322627
- * BM_BiquadFilter/0/1/26 3258 ns 3250 ns 215440
- * BM_BiquadFilter/0/1/27 2175 ns 2169 ns 322724
- * BM_BiquadFilter/0/1/28 2535 ns 2529 ns 276823
- * BM_BiquadFilter/0/1/29 2175 ns 2170 ns 322560
- * BM_BiquadFilter/0/1/30 3259 ns 3250 ns 215354
- * BM_BiquadFilter/0/1/31 2176 ns 2171 ns 322492
- * BM_BiquadFilter/0/2/1 1470 ns 1466 ns 477411
- * BM_BiquadFilter/0/2/2 1109 ns 1107 ns 632666
- * BM_BiquadFilter/0/2/3 1297 ns 1294 ns 540804
- * BM_BiquadFilter/0/2/4 1681 ns 1677 ns 417675
- * BM_BiquadFilter/0/2/5 2137 ns 2132 ns 328721
- * BM_BiquadFilter/0/2/6 1572 ns 1569 ns 447114
- * BM_BiquadFilter/0/2/7 1736 ns 1732 ns 385563
- * BM_BiquadFilter/0/2/8 4331 ns 4320 ns 162048
- * BM_BiquadFilter/0/2/9 3795 ns 3785 ns 184166
- * BM_BiquadFilter/0/2/10 3832 ns 3823 ns 183015
- * BM_BiquadFilter/0/2/11 5060 ns 5047 ns 138660
- * BM_BiquadFilter/0/2/12 5046 ns 5034 ns 139033
- * BM_BiquadFilter/0/2/13 4333 ns 4322 ns 161952
- * BM_BiquadFilter/0/2/14 6500 ns 6482 ns 108022
- * BM_BiquadFilter/0/2/15 4335 ns 4324 ns 161915
- * BM_BiquadFilter/0/2/16 2965 ns 2957 ns 236771
- * BM_BiquadFilter/0/2/17 4368 ns 4358 ns 160829
- * BM_BiquadFilter/0/2/18 3193 ns 3186 ns 219766
- * BM_BiquadFilter/0/2/19 2804 ns 2798 ns 250201
- * BM_BiquadFilter/0/2/20 3839 ns 3830 ns 182731
- * BM_BiquadFilter/0/2/21 5310 ns 5296 ns 133012
- * BM_BiquadFilter/0/2/22 3995 ns 3984 ns 175672
- * BM_BiquadFilter/0/2/23 3755 ns 3745 ns 186960
- * BM_BiquadFilter/0/2/24 5060 ns 5045 ns 138733
- * BM_BiquadFilter/0/2/25 4343 ns 4330 ns 161632
- * BM_BiquadFilter/0/2/26 6505 ns 6489 ns 107871
- * BM_BiquadFilter/0/2/27 4348 ns 4336 ns 161436
- * BM_BiquadFilter/0/2/28 5068 ns 5054 ns 138515
- * BM_BiquadFilter/0/2/29 4401 ns 4389 ns 158834
- * BM_BiquadFilter/0/2/30 6514 ns 6497 ns 107752
- * BM_BiquadFilter/0/2/31 4352 ns 4341 ns 161242
- * BM_BiquadFilter/1/1/1 734 ns 732 ns 955891
- * BM_BiquadFilter/1/1/2 554 ns 552 ns 1267096
- * BM_BiquadFilter/1/1/3 647 ns 645 ns 1084919
- * BM_BiquadFilter/1/1/4 834 ns 832 ns 841505
- * BM_BiquadFilter/1/1/5 1068 ns 1065 ns 657299
- * BM_BiquadFilter/1/1/6 767 ns 765 ns 915192
- * BM_BiquadFilter/1/1/7 927 ns 924 ns 761606
- * BM_BiquadFilter/1/1/8 2174 ns 2168 ns 322888
- * BM_BiquadFilter/1/1/9 1832 ns 1826 ns 383311
- * BM_BiquadFilter/1/1/10 1932 ns 1926 ns 363384
- * BM_BiquadFilter/1/1/11 2536 ns 2529 ns 276796
- * BM_BiquadFilter/1/1/12 2536 ns 2529 ns 276843
- * BM_BiquadFilter/1/1/13 2175 ns 2169 ns 322743
- * BM_BiquadFilter/1/1/14 3259 ns 3250 ns 215420
- * BM_BiquadFilter/1/1/15 2175 ns 2169 ns 322745
- * BM_BiquadFilter/1/1/16 1495 ns 1491 ns 469661
- * BM_BiquadFilter/1/1/17 2177 ns 2171 ns 322388
- * BM_BiquadFilter/1/1/18 1569 ns 1564 ns 447468
- * BM_BiquadFilter/1/1/19 1325 ns 1322 ns 529736
- * BM_BiquadFilter/1/1/20 1927 ns 1922 ns 363962
- * BM_BiquadFilter/1/1/21 2624 ns 2617 ns 267102
- * BM_BiquadFilter/1/1/22 1999 ns 1993 ns 351169
- * BM_BiquadFilter/1/1/23 1882 ns 1877 ns 372944
- * BM_BiquadFilter/1/1/24 2536 ns 2529 ns 276751
- * BM_BiquadFilter/1/1/25 2176 ns 2170 ns 322560
- * BM_BiquadFilter/1/1/26 3259 ns 3250 ns 215389
- * BM_BiquadFilter/1/1/27 2175 ns 2169 ns 322640
- * BM_BiquadFilter/1/1/28 2536 ns 2529 ns 276786
- * BM_BiquadFilter/1/1/29 2176 ns 2170 ns 322533
- * BM_BiquadFilter/1/1/30 3260 ns 3251 ns 215325
- * BM_BiquadFilter/1/1/31 2177 ns 2171 ns 322425
- * BM_BiquadFilter/1/2/1 1471 ns 1467 ns 477222
- * BM_BiquadFilter/1/2/2 1109 ns 1107 ns 632565
- * BM_BiquadFilter/1/2/3 1298 ns 1294 ns 541051
- * BM_BiquadFilter/1/2/4 1682 ns 1678 ns 417354
- * BM_BiquadFilter/1/2/5 2136 ns 2129 ns 328967
- * BM_BiquadFilter/1/2/6 1572 ns 1568 ns 446095
- * BM_BiquadFilter/1/2/7 1792 ns 1788 ns 399598
- * BM_BiquadFilter/1/2/8 4333 ns 4320 ns 162032
- * BM_BiquadFilter/1/2/9 3807 ns 3797 ns 184097
- * BM_BiquadFilter/1/2/10 3842 ns 3831 ns 182711
- * BM_BiquadFilter/1/2/11 5062 ns 5048 ns 138636
- * BM_BiquadFilter/1/2/12 5050 ns 5036 ns 139031
- * BM_BiquadFilter/1/2/13 4335 ns 4323 ns 161943
- * BM_BiquadFilter/1/2/14 6500 ns 6483 ns 108020
- * BM_BiquadFilter/1/2/15 4336 ns 4324 ns 161873
- * BM_BiquadFilter/1/2/16 2966 ns 2957 ns 236709
- * BM_BiquadFilter/1/2/17 4359 ns 4348 ns 160581
- * BM_BiquadFilter/1/2/18 3196 ns 3187 ns 219532
- * BM_BiquadFilter/1/2/19 2805 ns 2798 ns 250157
- * BM_BiquadFilter/1/2/20 3839 ns 3829 ns 182628
- * BM_BiquadFilter/1/2/21 5291 ns 5276 ns 131153
- * BM_BiquadFilter/1/2/22 3994 ns 3984 ns 175699
- * BM_BiquadFilter/1/2/23 3757 ns 3747 ns 186864
- * BM_BiquadFilter/1/2/24 5060 ns 5046 ns 138754
- * BM_BiquadFilter/1/2/25 4343 ns 4331 ns 161614
- * BM_BiquadFilter/1/2/26 6512 ns 6491 ns 107852
- * BM_BiquadFilter/1/2/27 4348 ns 4336 ns 161419
- * BM_BiquadFilter/1/2/28 5068 ns 5055 ns 138481
- * BM_BiquadFilter/1/2/29 4386 ns 4374 ns 159635
- * BM_BiquadFilter/1/2/30 6514 ns 6498 ns 107752
- * BM_BiquadFilter/1/2/31 4353 ns 4342 ns 161227
+ A test result running on Pixel 4XL for comparison.
+
+ Parameterized Test BM_BiquadFilter1D/A
+ <A> is 0 for normal input data, 1 for subnormal input data.
+
+ Parameterized Test BM_BiquadFilter<TYPE>/A/B/C
+ <A> is 0 for normal input data, 1 for subnormal input data.
+ <B> is the channel count, starting from 1
+ <C> indicates the occupancy of the coefficients as a bitmask (1 - 31) representing
+ b0, b1, b2, a1, a2. 31 indicates all Biquad coefficients are non-zero.
+
+-----------------------------------------------------------------------------------
+Benchmark Time CPU Iterations
+-----------------------------------------------------------------------------------
+BM_BiquadFilter1D/0 559 ns 558 ns 1254596
+BM_BiquadFilter1D/1 563 ns 561 ns 1246670
+BM_BiquadFilterFloatOptimized/0/1/31 2263 ns 2257 ns 310326
+BM_BiquadFilterFloatOptimized/0/2/31 2471 ns 2465 ns 289044
+BM_BiquadFilterFloatOptimized/0/3/31 2827 ns 2818 ns 248148
+BM_BiquadFilterFloatOptimized/0/4/31 2443 ns 2436 ns 287402
+BM_BiquadFilterFloatOptimized/0/5/31 2843 ns 2836 ns 246912
+BM_BiquadFilterFloatOptimized/0/6/31 3188 ns 3180 ns 220494
+BM_BiquadFilterFloatOptimized/0/7/31 3910 ns 3898 ns 179652
+BM_BiquadFilterFloatOptimized/0/8/31 3167 ns 3157 ns 221764
+BM_BiquadFilterFloatOptimized/0/9/31 3947 ns 3936 ns 177855
+BM_BiquadFilterFloatOptimized/0/10/31 4204 ns 4193 ns 166963
+BM_BiquadFilterFloatOptimized/0/11/31 5313 ns 5298 ns 131865
+BM_BiquadFilterFloatOptimized/0/12/31 4205 ns 4193 ns 166925
+BM_BiquadFilterFloatOptimized/0/13/31 5303 ns 5289 ns 132321
+BM_BiquadFilterFloatOptimized/0/14/31 5501 ns 5487 ns 127623
+BM_BiquadFilterFloatOptimized/0/15/31 6572 ns 6551 ns 106827
+BM_BiquadFilterFloatOptimized/0/16/31 5438 ns 5426 ns 129016
+BM_BiquadFilterFloatOptimized/0/17/31 7729 ns 7708 ns 90812
+BM_BiquadFilterFloatOptimized/0/18/31 8007 ns 7984 ns 87910
+BM_BiquadFilterFloatOptimized/0/19/31 8586 ns 8560 ns 81801
+BM_BiquadFilterFloatOptimized/0/20/31 7811 ns 7789 ns 89871
+BM_BiquadFilterFloatOptimized/0/21/31 8714 ns 8691 ns 80518
+BM_BiquadFilterFloatOptimized/0/22/31 8851 ns 8820 ns 79287
+BM_BiquadFilterFloatOptimized/0/23/31 9606 ns 9576 ns 73108
+BM_BiquadFilterFloatOptimized/0/24/31 8676 ns 8647 ns 80944
+BM_BiquadFilterFloatOptimized/0/1/1 559 ns 557 ns 1255379
+BM_BiquadFilterFloatOptimized/0/1/2 649 ns 647 ns 1081153
+BM_BiquadFilterFloatOptimized/0/1/3 649 ns 647 ns 1081862
+BM_BiquadFilterFloatOptimized/0/1/4 848 ns 846 ns 827513
+BM_BiquadFilterFloatOptimized/0/1/5 847 ns 844 ns 829861
+BM_BiquadFilterFloatOptimized/0/1/6 844 ns 841 ns 829051
+BM_BiquadFilterFloatOptimized/0/1/7 848 ns 845 ns 827492
+BM_BiquadFilterFloatOptimized/0/1/8 2289 ns 2282 ns 307077
+BM_BiquadFilterFloatOptimized/0/1/9 2296 ns 2288 ns 303532
+BM_BiquadFilterFloatOptimized/0/1/10 2305 ns 2299 ns 304972
+BM_BiquadFilterFloatOptimized/0/1/11 2287 ns 2281 ns 307199
+BM_BiquadFilterFloatOptimized/0/1/12 2262 ns 2256 ns 310300
+BM_BiquadFilterFloatOptimized/0/1/13 2263 ns 2257 ns 310400
+BM_BiquadFilterFloatOptimized/0/1/14 2262 ns 2257 ns 310236
+BM_BiquadFilterFloatOptimized/0/1/15 2262 ns 2256 ns 309767
+BM_BiquadFilterFloatOptimized/0/1/16 2262 ns 2256 ns 310116
+BM_BiquadFilterFloatOptimized/0/1/17 2262 ns 2256 ns 310345
+BM_BiquadFilterFloatOptimized/0/1/18 2261 ns 2255 ns 310106
+BM_BiquadFilterFloatOptimized/0/1/19 2263 ns 2256 ns 310242
+BM_BiquadFilterFloatOptimized/0/1/20 2262 ns 2255 ns 310131
+BM_BiquadFilterFloatOptimized/0/1/21 2262 ns 2256 ns 310250
+BM_BiquadFilterFloatOptimized/0/1/22 2264 ns 2257 ns 310329
+BM_BiquadFilterFloatOptimized/0/1/23 2264 ns 2257 ns 310011
+BM_BiquadFilterFloatOptimized/0/1/24 2264 ns 2258 ns 310008
+BM_BiquadFilterFloatOptimized/0/1/25 2263 ns 2256 ns 310203
+BM_BiquadFilterFloatOptimized/0/1/26 2264 ns 2258 ns 310016
+BM_BiquadFilterFloatOptimized/0/1/27 2264 ns 2258 ns 310046
+BM_BiquadFilterFloatOptimized/0/1/28 2263 ns 2257 ns 310325
+BM_BiquadFilterFloatOptimized/0/1/29 2264 ns 2257 ns 310048
+BM_BiquadFilterFloatOptimized/0/1/30 2265 ns 2258 ns 310008
+BM_BiquadFilterFloatOptimized/0/1/31 2263 ns 2257 ns 310046
+BM_BiquadFilterFloatOptimized/0/2/1 613 ns 611 ns 1143690
+BM_BiquadFilterFloatOptimized/0/2/2 801 ns 799 ns 877788
+BM_BiquadFilterFloatOptimized/0/2/3 799 ns 797 ns 875867
+BM_BiquadFilterFloatOptimized/0/2/4 1507 ns 1502 ns 465824
+BM_BiquadFilterFloatOptimized/0/2/5 1505 ns 1501 ns 466685
+BM_BiquadFilterFloatOptimized/0/2/6 1500 ns 1496 ns 468398
+BM_BiquadFilterFloatOptimized/0/2/7 1505 ns 1501 ns 466086
+BM_BiquadFilterFloatOptimized/0/2/8 2237 ns 2231 ns 313969
+BM_BiquadFilterFloatOptimized/0/2/9 2233 ns 2227 ns 314388
+BM_BiquadFilterFloatOptimized/0/2/10 2234 ns 2228 ns 314181
+BM_BiquadFilterFloatOptimized/0/2/11 2236 ns 2230 ns 313480
+BM_BiquadFilterFloatOptimized/0/2/12 2474 ns 2467 ns 287311
+BM_BiquadFilterFloatOptimized/0/2/13 2448 ns 2441 ns 284113
+BM_BiquadFilterFloatOptimized/0/2/14 2512 ns 2504 ns 283932
+BM_BiquadFilterFloatOptimized/0/2/15 2448 ns 2442 ns 285294
+BM_BiquadFilterFloatOptimized/0/2/16 2459 ns 2451 ns 283455
+BM_BiquadFilterFloatOptimized/0/2/17 2466 ns 2459 ns 282706
+BM_BiquadFilterFloatOptimized/0/2/18 2493 ns 2486 ns 279019
+BM_BiquadFilterFloatOptimized/0/2/19 2453 ns 2446 ns 285647
+BM_BiquadFilterFloatOptimized/0/2/20 2475 ns 2468 ns 285345
+BM_BiquadFilterFloatOptimized/0/2/21 2506 ns 2499 ns 286030
+BM_BiquadFilterFloatOptimized/0/2/22 2512 ns 2505 ns 280230
+BM_BiquadFilterFloatOptimized/0/2/23 2466 ns 2458 ns 284371
+BM_BiquadFilterFloatOptimized/0/2/24 2454 ns 2446 ns 286968
+BM_BiquadFilterFloatOptimized/0/2/25 2478 ns 2470 ns 284297
+BM_BiquadFilterFloatOptimized/0/2/26 2509 ns 2501 ns 281390
+BM_BiquadFilterFloatOptimized/0/2/27 2477 ns 2469 ns 282065
+BM_BiquadFilterFloatOptimized/0/2/28 2484 ns 2476 ns 284814
+BM_BiquadFilterFloatOptimized/0/2/29 2443 ns 2435 ns 283155
+BM_BiquadFilterFloatOptimized/0/2/30 2488 ns 2481 ns 283433
+BM_BiquadFilterFloatOptimized/0/2/31 2452 ns 2444 ns 286627
+BM_BiquadFilterFloatOptimized/0/3/1 1015 ns 1012 ns 691448
+BM_BiquadFilterFloatOptimized/0/3/2 1362 ns 1357 ns 516073
+BM_BiquadFilterFloatOptimized/0/3/3 1358 ns 1354 ns 516419
+BM_BiquadFilterFloatOptimized/0/3/4 2305 ns 2298 ns 304561
+BM_BiquadFilterFloatOptimized/0/3/5 2303 ns 2296 ns 304712
+BM_BiquadFilterFloatOptimized/0/3/6 2292 ns 2285 ns 306149
+BM_BiquadFilterFloatOptimized/0/3/7 2304 ns 2297 ns 304959
+BM_BiquadFilterFloatOptimized/0/3/8 2318 ns 2311 ns 301399
+BM_BiquadFilterFloatOptimized/0/3/9 2310 ns 2303 ns 303435
+BM_BiquadFilterFloatOptimized/0/3/10 2313 ns 2306 ns 303399
+BM_BiquadFilterFloatOptimized/0/3/11 2318 ns 2311 ns 302755
+BM_BiquadFilterFloatOptimized/0/3/12 2831 ns 2823 ns 247948
+BM_BiquadFilterFloatOptimized/0/3/13 2830 ns 2822 ns 248677
+BM_BiquadFilterFloatOptimized/0/3/14 2850 ns 2842 ns 246677
+BM_BiquadFilterFloatOptimized/0/3/15 2827 ns 2819 ns 248340
+BM_BiquadFilterFloatOptimized/0/3/16 2834 ns 2825 ns 247686
+BM_BiquadFilterFloatOptimized/0/3/17 2828 ns 2819 ns 247784
+BM_BiquadFilterFloatOptimized/0/3/18 2842 ns 2833 ns 248007
+BM_BiquadFilterFloatOptimized/0/3/19 2843 ns 2834 ns 248472
+BM_BiquadFilterFloatOptimized/0/3/20 2829 ns 2820 ns 248677
+BM_BiquadFilterFloatOptimized/0/3/21 2833 ns 2824 ns 248061
+BM_BiquadFilterFloatOptimized/0/3/22 2849 ns 2841 ns 246611
+BM_BiquadFilterFloatOptimized/0/3/23 2827 ns 2818 ns 247863
+BM_BiquadFilterFloatOptimized/0/3/24 2830 ns 2820 ns 247550
+BM_BiquadFilterFloatOptimized/0/3/25 2836 ns 2827 ns 248435
+BM_BiquadFilterFloatOptimized/0/3/26 2859 ns 2850 ns 245727
+BM_BiquadFilterFloatOptimized/0/3/27 2828 ns 2818 ns 247914
+BM_BiquadFilterFloatOptimized/0/3/28 2818 ns 2809 ns 248637
+BM_BiquadFilterFloatOptimized/0/3/29 2820 ns 2810 ns 249170
+BM_BiquadFilterFloatOptimized/0/3/30 2858 ns 2849 ns 245760
+BM_BiquadFilterFloatOptimized/0/3/31 2832 ns 2823 ns 247969
+BM_BiquadFilterFloatOptimized/0/4/1 650 ns 648 ns 1080613
+BM_BiquadFilterFloatOptimized/0/4/2 808 ns 805 ns 868597
+BM_BiquadFilterFloatOptimized/0/4/3 809 ns 807 ns 867460
+BM_BiquadFilterFloatOptimized/0/4/4 844 ns 841 ns 831865
+BM_BiquadFilterFloatOptimized/0/4/5 845 ns 842 ns 833152
+BM_BiquadFilterFloatOptimized/0/4/6 844 ns 842 ns 831276
+BM_BiquadFilterFloatOptimized/0/4/7 843 ns 841 ns 831231
+BM_BiquadFilterFloatOptimized/0/4/8 2193 ns 2186 ns 320234
+BM_BiquadFilterFloatOptimized/0/4/9 2194 ns 2187 ns 320273
+BM_BiquadFilterFloatOptimized/0/4/10 2193 ns 2185 ns 320448
+BM_BiquadFilterFloatOptimized/0/4/11 2192 ns 2185 ns 320000
+BM_BiquadFilterFloatOptimized/0/4/12 2448 ns 2440 ns 285447
+BM_BiquadFilterFloatOptimized/0/4/13 2454 ns 2447 ns 282923
+BM_BiquadFilterFloatOptimized/0/4/14 2359 ns 2352 ns 297360
+BM_BiquadFilterFloatOptimized/0/4/15 2467 ns 2459 ns 286699
+BM_BiquadFilterFloatOptimized/0/4/16 2442 ns 2435 ns 286231
+BM_BiquadFilterFloatOptimized/0/4/17 2446 ns 2438 ns 287105
+BM_BiquadFilterFloatOptimized/0/4/18 2361 ns 2354 ns 298082
+BM_BiquadFilterFloatOptimized/0/4/19 2452 ns 2444 ns 284776
+BM_BiquadFilterFloatOptimized/0/4/20 2459 ns 2451 ns 284338
+BM_BiquadFilterFloatOptimized/0/4/21 2437 ns 2429 ns 289188
+BM_BiquadFilterFloatOptimized/0/4/22 2358 ns 2350 ns 297415
+BM_BiquadFilterFloatOptimized/0/4/23 2459 ns 2451 ns 287279
+BM_BiquadFilterFloatOptimized/0/4/24 2452 ns 2444 ns 281625
+BM_BiquadFilterFloatOptimized/0/4/25 2389 ns 2382 ns 290665
+BM_BiquadFilterFloatOptimized/0/4/26 2360 ns 2354 ns 298354
+BM_BiquadFilterFloatOptimized/0/4/27 2433 ns 2425 ns 288930
+BM_BiquadFilterFloatOptimized/0/4/28 2447 ns 2440 ns 285827
+BM_BiquadFilterFloatOptimized/0/4/29 2450 ns 2444 ns 282857
+BM_BiquadFilterFloatOptimized/0/4/30 2360 ns 2353 ns 297327
+BM_BiquadFilterFloatOptimized/0/4/31 2463 ns 2455 ns 281644
+BM_BiquadFilterFloatOptimized/1/1/1 559 ns 558 ns 1255286
+BM_BiquadFilterFloatOptimized/1/1/2 649 ns 647 ns 1081171
+BM_BiquadFilterFloatOptimized/1/1/3 649 ns 647 ns 1081604
+BM_BiquadFilterFloatOptimized/1/1/4 848 ns 845 ns 828382
+BM_BiquadFilterFloatOptimized/1/1/5 847 ns 844 ns 827653
+BM_BiquadFilterFloatOptimized/1/1/6 848 ns 845 ns 831001
+BM_BiquadFilterFloatOptimized/1/1/7 848 ns 845 ns 828299
+BM_BiquadFilterFloatOptimized/1/1/8 2291 ns 2284 ns 306513
+BM_BiquadFilterFloatOptimized/1/1/9 2297 ns 2290 ns 305585
+BM_BiquadFilterFloatOptimized/1/1/10 2310 ns 2303 ns 304022
+BM_BiquadFilterFloatOptimized/1/1/11 2286 ns 2279 ns 307592
+BM_BiquadFilterFloatOptimized/1/1/12 2263 ns 2256 ns 310010
+BM_BiquadFilterFloatOptimized/1/1/13 2264 ns 2258 ns 310167
+BM_BiquadFilterFloatOptimized/1/1/14 2264 ns 2257 ns 310392
+BM_BiquadFilterFloatOptimized/1/1/15 2264 ns 2257 ns 310298
+BM_BiquadFilterFloatOptimized/1/1/16 2263 ns 2256 ns 310058
+BM_BiquadFilterFloatOptimized/1/1/17 2264 ns 2257 ns 310262
+BM_BiquadFilterFloatOptimized/1/1/18 2263 ns 2256 ns 310337
+BM_BiquadFilterFloatOptimized/1/1/19 2266 ns 2258 ns 310162
+BM_BiquadFilterFloatOptimized/1/1/20 2264 ns 2257 ns 309968
+BM_BiquadFilterFloatOptimized/1/1/21 2264 ns 2256 ns 310341
+BM_BiquadFilterFloatOptimized/1/1/22 2266 ns 2258 ns 310193
+BM_BiquadFilterFloatOptimized/1/1/23 2263 ns 2257 ns 310310
+BM_BiquadFilterFloatOptimized/1/1/24 2265 ns 2257 ns 310182
+BM_BiquadFilterFloatOptimized/1/1/25 2263 ns 2256 ns 309999
+BM_BiquadFilterFloatOptimized/1/1/26 2265 ns 2258 ns 309943
+BM_BiquadFilterFloatOptimized/1/1/27 2264 ns 2257 ns 310175
+BM_BiquadFilterFloatOptimized/1/1/28 2265 ns 2258 ns 310119
+BM_BiquadFilterFloatOptimized/1/1/29 2265 ns 2258 ns 310231
+BM_BiquadFilterFloatOptimized/1/1/30 2266 ns 2258 ns 309976
+BM_BiquadFilterFloatOptimized/1/1/31 2266 ns 2258 ns 310062
+BM_BiquadFilterFloatOptimized/1/2/1 613 ns 611 ns 1144258
+BM_BiquadFilterFloatOptimized/1/2/2 801 ns 798 ns 877198
+BM_BiquadFilterFloatOptimized/1/2/3 800 ns 798 ns 877731
+BM_BiquadFilterFloatOptimized/1/2/4 1508 ns 1503 ns 465854
+BM_BiquadFilterFloatOptimized/1/2/5 1505 ns 1500 ns 466533
+BM_BiquadFilterFloatOptimized/1/2/6 1502 ns 1497 ns 468915
+BM_BiquadFilterFloatOptimized/1/2/7 1505 ns 1501 ns 466371
+BM_BiquadFilterFloatOptimized/1/2/8 2238 ns 2231 ns 313808
+BM_BiquadFilterFloatOptimized/1/2/9 2236 ns 2228 ns 313914
+BM_BiquadFilterFloatOptimized/1/2/10 2235 ns 2228 ns 314155
+BM_BiquadFilterFloatOptimized/1/2/11 2237 ns 2231 ns 313280
+BM_BiquadFilterFloatOptimized/1/2/12 2482 ns 2474 ns 287057
+BM_BiquadFilterFloatOptimized/1/2/13 2491 ns 2483 ns 280960
+BM_BiquadFilterFloatOptimized/1/2/14 2520 ns 2513 ns 283093
+BM_BiquadFilterFloatOptimized/1/2/15 2452 ns 2445 ns 285828
+BM_BiquadFilterFloatOptimized/1/2/16 2503 ns 2495 ns 286339
+BM_BiquadFilterFloatOptimized/1/2/17 2499 ns 2491 ns 285757
+BM_BiquadFilterFloatOptimized/1/2/18 2509 ns 2502 ns 281775
+BM_BiquadFilterFloatOptimized/1/2/19 2491 ns 2483 ns 281628
+BM_BiquadFilterFloatOptimized/1/2/20 2466 ns 2458 ns 289553
+BM_BiquadFilterFloatOptimized/1/2/21 2456 ns 2447 ns 284222
+BM_BiquadFilterFloatOptimized/1/2/22 2502 ns 2494 ns 277208
+BM_BiquadFilterFloatOptimized/1/2/23 2458 ns 2450 ns 281951
+BM_BiquadFilterFloatOptimized/1/2/24 2474 ns 2466 ns 290638
+BM_BiquadFilterFloatOptimized/1/2/25 2473 ns 2465 ns 285659
+BM_BiquadFilterFloatOptimized/1/2/26 2504 ns 2497 ns 278294
+BM_BiquadFilterFloatOptimized/1/2/27 2467 ns 2459 ns 279088
+BM_BiquadFilterFloatOptimized/1/2/28 2487 ns 2479 ns 281668
+BM_BiquadFilterFloatOptimized/1/2/29 2469 ns 2462 ns 274237
+BM_BiquadFilterFloatOptimized/1/2/30 2502 ns 2495 ns 282294
+BM_BiquadFilterFloatOptimized/1/2/31 2438 ns 2430 ns 285546
+BM_BiquadFilterFloatOptimized/1/3/1 1016 ns 1012 ns 691507
+BM_BiquadFilterFloatOptimized/1/3/2 1361 ns 1357 ns 514162
+BM_BiquadFilterFloatOptimized/1/3/3 1359 ns 1355 ns 516211
+BM_BiquadFilterFloatOptimized/1/3/4 2304 ns 2297 ns 304380
+BM_BiquadFilterFloatOptimized/1/3/5 2305 ns 2297 ns 304978
+BM_BiquadFilterFloatOptimized/1/3/6 2291 ns 2283 ns 306726
+BM_BiquadFilterFloatOptimized/1/3/7 2305 ns 2297 ns 304572
+BM_BiquadFilterFloatOptimized/1/3/8 2332 ns 2325 ns 303627
+BM_BiquadFilterFloatOptimized/1/3/9 2313 ns 2305 ns 303341
+BM_BiquadFilterFloatOptimized/1/3/10 2317 ns 2310 ns 303791
+BM_BiquadFilterFloatOptimized/1/3/11 2325 ns 2317 ns 302256
+BM_BiquadFilterFloatOptimized/1/3/12 2828 ns 2819 ns 248120
+BM_BiquadFilterFloatOptimized/1/3/13 2831 ns 2821 ns 248747
+BM_BiquadFilterFloatOptimized/1/3/14 2849 ns 2840 ns 247287
+BM_BiquadFilterFloatOptimized/1/3/15 2829 ns 2821 ns 247794
+BM_BiquadFilterFloatOptimized/1/3/16 2830 ns 2821 ns 248154
+BM_BiquadFilterFloatOptimized/1/3/17 2833 ns 2823 ns 249270
+BM_BiquadFilterFloatOptimized/1/3/18 2853 ns 2843 ns 247248
+BM_BiquadFilterFloatOptimized/1/3/19 2839 ns 2830 ns 248141
+BM_BiquadFilterFloatOptimized/1/3/20 2830 ns 2820 ns 248693
+BM_BiquadFilterFloatOptimized/1/3/21 2832 ns 2823 ns 247991
+BM_BiquadFilterFloatOptimized/1/3/22 2850 ns 2841 ns 246602
+BM_BiquadFilterFloatOptimized/1/3/23 2828 ns 2819 ns 248859
+BM_BiquadFilterFloatOptimized/1/3/24 2835 ns 2826 ns 247837
+BM_BiquadFilterFloatOptimized/1/3/25 2837 ns 2828 ns 246757
+BM_BiquadFilterFloatOptimized/1/3/26 2860 ns 2851 ns 245739
+BM_BiquadFilterFloatOptimized/1/3/27 2830 ns 2821 ns 248257
+BM_BiquadFilterFloatOptimized/1/3/28 2826 ns 2817 ns 249220
+BM_BiquadFilterFloatOptimized/1/3/29 2820 ns 2811 ns 248787
+BM_BiquadFilterFloatOptimized/1/3/30 2853 ns 2844 ns 245666
+BM_BiquadFilterFloatOptimized/1/3/31 2829 ns 2821 ns 248423
+BM_BiquadFilterFloatOptimized/1/4/1 650 ns 648 ns 1080678
+BM_BiquadFilterFloatOptimized/1/4/2 808 ns 806 ns 868623
+BM_BiquadFilterFloatOptimized/1/4/3 810 ns 807 ns 867092
+BM_BiquadFilterFloatOptimized/1/4/4 844 ns 842 ns 831679
+BM_BiquadFilterFloatOptimized/1/4/5 843 ns 841 ns 830964
+BM_BiquadFilterFloatOptimized/1/4/6 845 ns 842 ns 831367
+BM_BiquadFilterFloatOptimized/1/4/7 845 ns 842 ns 830992
+BM_BiquadFilterFloatOptimized/1/4/8 2193 ns 2186 ns 320186
+BM_BiquadFilterFloatOptimized/1/4/9 2194 ns 2187 ns 320621
+BM_BiquadFilterFloatOptimized/1/4/10 2194 ns 2187 ns 320577
+BM_BiquadFilterFloatOptimized/1/4/11 2192 ns 2184 ns 320221
+BM_BiquadFilterFloatOptimized/1/4/12 2471 ns 2462 ns 286817
+BM_BiquadFilterFloatOptimized/1/4/13 2450 ns 2442 ns 283590
+BM_BiquadFilterFloatOptimized/1/4/14 2359 ns 2351 ns 297716
+BM_BiquadFilterFloatOptimized/1/4/15 2472 ns 2464 ns 286187
+BM_BiquadFilterFloatOptimized/1/4/16 2450 ns 2442 ns 291296
+BM_BiquadFilterFloatOptimized/1/4/17 2460 ns 2453 ns 287031
+BM_BiquadFilterFloatOptimized/1/4/18 2358 ns 2350 ns 297376
+BM_BiquadFilterFloatOptimized/1/4/19 2469 ns 2462 ns 286443
+BM_BiquadFilterFloatOptimized/1/4/20 2470 ns 2462 ns 286582
+BM_BiquadFilterFloatOptimized/1/4/21 2420 ns 2412 ns 288366
+BM_BiquadFilterFloatOptimized/1/4/22 2359 ns 2351 ns 297356
+BM_BiquadFilterFloatOptimized/1/4/23 2448 ns 2440 ns 286534
+BM_BiquadFilterFloatOptimized/1/4/24 2458 ns 2450 ns 286994
+BM_BiquadFilterFloatOptimized/1/4/25 2420 ns 2412 ns 287406
+BM_BiquadFilterFloatOptimized/1/4/26 2363 ns 2354 ns 298029
+BM_BiquadFilterFloatOptimized/1/4/27 2421 ns 2414 ns 288812
+BM_BiquadFilterFloatOptimized/1/4/28 2449 ns 2442 ns 286253
+BM_BiquadFilterFloatOptimized/1/4/29 2448 ns 2441 ns 287234
+BM_BiquadFilterFloatOptimized/1/4/30 2361 ns 2354 ns 297402
+BM_BiquadFilterFloatOptimized/1/4/31 2465 ns 2457 ns 287662
+BM_BiquadFilterFloatNonOptimized/0/1/31 2261 ns 2257 ns 310189
+BM_BiquadFilterFloatNonOptimized/0/2/31 4525 ns 4510 ns 155178
+BM_BiquadFilterFloatNonOptimized/0/3/31 6781 ns 6760 ns 103524
+BM_BiquadFilterFloatNonOptimized/0/4/31 9037 ns 9008 ns 77684
+BM_BiquadFilterFloatNonOptimized/0/5/31 11298 ns 11259 ns 62169
+BM_BiquadFilterFloatNonOptimized/0/6/31 13575 ns 13534 ns 51708
+BM_BiquadFilterFloatNonOptimized/0/7/31 15841 ns 15796 ns 44308
+BM_BiquadFilterFloatNonOptimized/0/8/31 18105 ns 18047 ns 38797
+BM_BiquadFilterFloatNonOptimized/0/9/31 20335 ns 20270 ns 34532
+BM_BiquadFilterFloatNonOptimized/0/10/31 22603 ns 22534 ns 31063
+BM_BiquadFilterFloatNonOptimized/0/11/31 24875 ns 24798 ns 28225
+BM_BiquadFilterFloatNonOptimized/0/12/31 27122 ns 27038 ns 25889
+BM_BiquadFilterFloatNonOptimized/0/13/31 29416 ns 29326 ns 23869
+BM_BiquadFilterFloatNonOptimized/0/14/31 31703 ns 31602 ns 22151
+BM_BiquadFilterFloatNonOptimized/0/15/31 33945 ns 33849 ns 20678
+BM_BiquadFilterFloatNonOptimized/0/16/31 36232 ns 36123 ns 19379
+BM_BiquadFilterFloatNonOptimized/0/17/31 38577 ns 38464 ns 18205
+BM_BiquadFilterFloatNonOptimized/0/18/31 41054 ns 40924 ns 17106
+BM_BiquadFilterFloatNonOptimized/0/19/31 43278 ns 43133 ns 16228
+BM_BiquadFilterFloatNonOptimized/0/20/31 45549 ns 45414 ns 15411
+BM_BiquadFilterFloatNonOptimized/0/21/31 48024 ns 47867 ns 14625
+BM_BiquadFilterFloatNonOptimized/0/22/31 50268 ns 50109 ns 13965
+BM_BiquadFilterFloatNonOptimized/0/23/31 52268 ns 52090 ns 13440
+BM_BiquadFilterFloatNonOptimized/0/24/31 54528 ns 54350 ns 12883
+BM_BiquadFilterDoubleOptimized/0/1/31 2264 ns 2258 ns 309965
+BM_BiquadFilterDoubleOptimized/0/2/31 2503 ns 2495 ns 270274
+BM_BiquadFilterDoubleOptimized/0/3/31 2787 ns 2778 ns 251685
+BM_BiquadFilterDoubleOptimized/0/4/31 3175 ns 3165 ns 221339
+BM_BiquadFilterDoubleNonOptimized/0/1/31 2264 ns 2257 ns 310154
+BM_BiquadFilterDoubleNonOptimized/0/2/31 4523 ns 4508 ns 155292
+BM_BiquadFilterDoubleNonOptimized/0/3/31 6778 ns 6756 ns 103599
+BM_BiquadFilterDoubleNonOptimized/0/4/31 9063 ns 9033 ns 77461
+
*******************************************************************/
template <typename F>
@@ -299,10 +490,11 @@
}
}
+BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterQuickArgs);
+BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterFullArgs);
+// Other tests of interest
+BENCHMARK(BM_BiquadFilterFloatNonOptimized)->Apply(BiquadFilterQuickArgs);
BENCHMARK(BM_BiquadFilterDoubleOptimized)->Apply(BiquadFilterDoubleArgs);
BENCHMARK(BM_BiquadFilterDoubleNonOptimized)->Apply(BiquadFilterDoubleArgs);
-BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterQuickArgs);
-BENCHMARK(BM_BiquadFilterFloatNonOptimized)->Apply(BiquadFilterQuickArgs);
-BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterFullArgs);
BENCHMARK_MAIN();
diff --git a/audio_utils/include/audio_utils/BiquadFilter.h b/audio_utils/include/audio_utils/BiquadFilter.h
index c2f481b..b3683e3 100644
--- a/audio_utils/include/audio_utils/BiquadFilter.h
+++ b/audio_utils/include/audio_utils/BiquadFilter.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
-#define ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
+#pragma once
#include "intrinsic_utils.h"
@@ -36,12 +35,332 @@
#define USE_NEON
#endif
+// Use dither to prevent subnormals for CPUs that raise an exception.
+#pragma push_macro("USE_DITHER")
+#undef USE_DITHER
+
+#if defined(__i386__) || defined(__x86_64__)
+#define USE_DITHER
+#endif
+
namespace android::audio_utils {
static constexpr size_t kBiquadNumCoefs = 5;
static constexpr size_t kBiquadNumDelays = 2;
+/**
+ * The BiquadDirect2Transpose is a low overhead
+ * Biquad filter with coefficients b0, b1, b2, a1, a2.
+ *
+ * This can be used by itself, but it is preferred for best data management
+ * to use the BiquadFilter abstraction below.
+ *
+ * T is the data type (scalar or vector).
+ * F is the filter coefficient type. It is either a scalar or vector (matching T).
+ */
+template <typename T, typename F>
+struct BiquadDirect2Transpose {
+ F coef_[5]; // these are stored with the denominator a's negated.
+ T s1_; // delay state 1
+ T s2_; // delay state 2
+
+ // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
+ // as expressed by a bitmask.
+ static inline constexpr size_t required_occupancies_[] = {
+ 0x1, // constant scale
+ 0x3, // single zero
+ 0x7, // double zero
+ 0x9, // single pole
+ 0xb, // (11) first order IIR
+ 0x1b, // (27) double pole + single zero
+ 0x1f, // (31) second order IIR (full Biquad)
+ };
+
+ // Take care the order of arguments - starts with b's then goes to a's.
+ // The a's are "positive" reference, some filters take negative.
+ BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
+ const T& s1 = {}, const T& s2 = {})
+ // : coef_{b0, b1, b2, -a1, -a2}
+ : coef_{ b0,
+ b1,
+ b2,
+ intrinsics::vneg(a1),
+ intrinsics::vneg(a2) }
+ , s1_{s1}
+ , s2_{s2} {
+ }
+
+ // D is the data type. It must be the same element type of T or F.
+ // Take care the order of input and output.
+ template<typename D, size_t OCCUPANCY = 0x1f>
+ __attribute__((always_inline)) // required for 1ch speedup (30% faster)
+ void process(D* output, const D* input, size_t frames, size_t stride) {
+ using namespace intrinsics;
+ // For SSE it is possible to vdup F to T if F is scalar.
+ const F b0 = coef_[0]; // b0
+ const F b1 = coef_[1]; // b1
+ const F b2 = coef_[2]; // b2
+ const F negativeA1 = coef_[3]; // -a1
+ const F negativeA2 = coef_[4]; // -a2
+ T s1 = s1_;
+ T s2 = s2_;
+ T xn, yn; // OK to declare temps outside loop rather than at the point of initialization.
+#ifdef USE_DITHER
+ constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
+ T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
+#endif
+
+ // Unroll control. Make sure the constexpr remains constexpr :-).
+ constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
+ constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2; // below this won't be unrolled.
+ constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16; // above this won't be unrolled.
+ constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
+ CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
+ size_t remainder = 0;
+ if constexpr (UNROLL_LOOPS > 1) {
+ remainder = frames % UNROLL_LOOPS;
+ frames /= UNROLL_LOOPS;
+ }
+
+ // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
+ // The other alternative is to use a MACRO, but that doesn't read as well.
+ const auto KERNEL = [&]() __attribute__((always_inline)) {
+ xn = vld1<T>(input);
+ input += stride;
+#ifdef USE_DITHER
+ xn = vadd(xn, dither);
+ dither = vneg(dither);
+#endif
+
+ yn = s1;
+ if constexpr (OCCUPANCY >> 0 & 1) {
+ yn = vmla(yn, b0, xn);
+ }
+ vst1(output, yn);
+ output += stride;
+
+ s1 = s2;
+ if constexpr (OCCUPANCY >> 3 & 1) {
+ s1 = vmla(s1, negativeA1, yn);
+ }
+ if constexpr (OCCUPANCY >> 1 & 1) {
+ s1 = vmla(s1, b1, xn);
+ }
+ if constexpr (OCCUPANCY >> 2 & 1) {
+ s2 = vmul(b2, xn);
+ } else {
+ s2 = vdupn<T>(0.f);
+ }
+ if constexpr (OCCUPANCY >> 4 & 1) {
+ s2 = vmla(s2, negativeA2, yn);
+ }
+ };
+
+ while (frames > 0) {
+ #pragma unroll
+ for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
+ KERNEL();
+ }
+ frames--;
+ }
+ if constexpr (UNROLL_LOOPS > 1) {
+ for (size_t i = 0; i < remainder; ++i) {
+ KERNEL();
+ }
+ }
+ s1_ = s1;
+ s2_ = s2;
+ }
+};
+
+/**
+ * A state space formulation of filtering converts a n-th order difference equation update
+ * to a first order vector difference equation. For the Biquad filter, the state space form
+ * has improved numerical precision properties with poles near the unit circle as well as
+ * increased speed due to better parallelization of the state update [1][2].
+ *
+ * [1] Raph Levien: (observable canonical form)
+ * https://github.com/google/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
+ *
+ * [2] Julius O Smith III: (controllable canonical form)
+ * https://ccrma.stanford.edu/~jos/filters/State_Space_Filters.html
+ *
+ * The signal flow is as follows, where for scalar x and y, the matrix D is a scalar d.
+ *
+ *
+ * +------[ d ]--------------------------+
+ * | S |
+ * x ----+--[ B ]--(+)--[ z^-1 ]---+---[ C ]--(+)--- y
+ * | |
+ * +----[ A ]-----+
+ *
+ * The 2nd order Biquad IIR coefficients are as follows in observable canonical form:
+ *
+ * y = [C_1 C_2] | S_1 | + d x
+ * | S_2 |
+ *
+ *
+ * | S_1 | = | A_11 A_12 | | S_1 | + | B_1 | x
+ * | S_2 | | A_21 A_22 | | S_2 | | B_2 |
+ *
+ *
+ * A_11 = -a1
+ * A_12 = 1
+ * A_21 = -a2
+ * A_22 = 0
+ *
+ * B_1 = b1 - b0 * a1
+ * B_2 = b2 - b0 * a2
+ *
+ * C_1 = 1
+ * C_2 = 0
+ *
+ * d = b0
+ *
+ * Implementation details: The state space filter is typically expressed in either observable or
+ * controllable canonical form [3]. We follow the observable form here.
+ * Raph [4] discovered that the single channel Biquad implementation can be further optimized
+ * by doing 2 filter updates at once (improving speed on NEON by about 20%).
+ * Doing 2 updates at once costs 8 multiplies / sample instead of 5 multiples / sample,
+ * but uses a 4 x 4 matrix multiply, exploiting single cycle multiply-add SIMD throughput.
+ * TODO: consider this optimization.
+ *
+ * [3] Mathworks
+ * https://www.mathworks.com/help/control/ug/canonical-state-space-realizations.html
+ * [4] Raph Levien
+ * https://github.com/kysko/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
+ *
+ * The template variables
+ * T is the data type (scalar or vector).
+ * F is the filter coefficient type. It is either a scalar or vector (matching T).
+ */
+template <typename T, typename F>
+struct BiquadStateSpace {
+ F coef_[5]; // these are stored as state-space converted.
+ T s1_; // delay state 1
+ T s2_; // delay state 2
+
+ // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
+ // as expressed by a bitmask. This must include 31.
+ static inline constexpr size_t required_occupancies_[] = {
+ 1, // constant scale
+ 3, // single zero
+ 7, // double zero
+ 9, // single pole
+ 11, // first order IIR
+ 27, // double pole + single zero
+ 31, // second order IIR (full Biquad)
+ };
+
+ // Take care the order of arguments - starts with b's then goes to a's.
+ // The a's are "positive" reference, some filters take negative.
+ BiquadStateSpace(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
+ const T& s1 = {}, const T& s2 = {})
+ // : coef_{b0, b1 - b0 * a1, b2 - b0 * a2, -a1, -a2}
+ : coef_{ b0,
+ intrinsics::vsub(b1, intrinsics::vmul(b0, a1)),
+ intrinsics::vsub(b2, intrinsics::vmul(b0, a2)),
+ intrinsics::vneg(a1),
+ intrinsics::vneg(a2) }
+ , s1_{s1}
+ , s2_{s2} {
+ }
+
+ // D is the data type. It must be the same element type of T or F.
+ // Take care the order of input and output.
+ template<typename D, size_t OCCUPANCY = 0x1f>
+ void process(D* output, const D* input, size_t frames, size_t stride) {
+ using namespace intrinsics;
+ const F b0 = coef_[0]; // b0
+ const F b1ss = coef_[1]; // b1 - b0 * a1,
+ const F b2ss = coef_[2]; // b2 - b0 * a2,
+ const F negativeA1 = coef_[3]; // -a1
+ const F negativeA2 = coef_[4]; // -a2
+ T s1 = s1_;
+ T s2 = s2_;
+ T x, new_s1; // OK to declare temps here rather than at the point of initialization.
+#ifdef USE_DITHER
+ constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
+ T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
+#endif
+ constexpr bool b0_present = (OCCUPANCY & 0x1) != 0;
+ constexpr bool a1_present = (OCCUPANCY & 0x8) != 0;
+ constexpr bool a2_present = (OCCUPANCY & 0x10) != 0;
+ constexpr bool b1ss_present = (OCCUPANCY & 0x2) != 0 ||
+ (b0_present && a1_present);
+ constexpr bool b2ss_present = (OCCUPANCY & 0x4) != 0 ||
+ (b0_present && a2_present);
+
+
+ // Unroll control. Make sure the constexpr remains constexpr :-).
+ constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
+ constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 1; // below this won't be unrolled.
+ constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16; // above this won't be unrolled.
+ constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
+ CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
+ size_t remainder = 0;
+ if constexpr (UNROLL_LOOPS > 1) {
+ remainder = frames % UNROLL_LOOPS;
+ frames /= UNROLL_LOOPS;
+ }
+
+ // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
+ // The other alternative is to use a MACRO, but that doesn't read as well.
+ const auto KERNEL = [&]() __attribute__((always_inline)) {
+ x = vld1<T>(input);
+ input += stride;
+#ifdef USE_DITHER
+ x = vadd(x, dither);
+ dither = vneg(dither);
+#endif
+ // vst1(output, vadd(s1, vmul(b0, x)));
+ // output += stride;
+ // new_s1 = vadd(vadd(vmul(b1ss, x), vmul(negativeA1, s1)), s2);
+ // s2 = vadd(vmul(b2ss, x), vmul(negativeA2, s1));
+
+ if constexpr (b0_present) {
+ vst1(output, vadd(s1, vmul(b0, x)));
+ } else /* constexpr */ {
+ vst1(output, s1);
+ }
+ output += stride;
+ new_s1 = s2;
+ if constexpr (b1ss_present) {
+ new_s1 = vadd(new_s1, vmul(b1ss, x));
+ }
+ if constexpr (a1_present) {
+ new_s1 = vadd(new_s1, vmul(negativeA1, s1));
+ }
+ if constexpr (b2ss_present) {
+ s2 = vmul(b2ss, x);
+ if constexpr (a2_present) {
+ s2 = vadd(s2, vmul(negativeA2, s1));
+ }
+ } else if constexpr (a2_present) {
+ s2 = vmul(negativeA2, s1);
+ }
+ s1 = new_s1;
+ };
+
+ while (frames > 0) {
+ #pragma unroll
+ for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
+ KERNEL();
+ }
+ frames--;
+ }
+ if constexpr (UNROLL_LOOPS > 1) {
+ for (size_t i = 0; i < remainder; ++i) {
+ KERNEL();
+ }
+ }
+ s1_ = s1;
+ s2_ = s2;
+ }
+};
+
namespace details {
+
// Helper methods for constructing a constexpr array of function pointers.
// As function pointers are efficient and have no constructor/destructor
// this is preferred over std::function.
@@ -105,67 +424,6 @@
}
}
-// For biquad_filter_fast, we template based on whether coef[i] is nonzero - this should be
-// determined in a constexpr fashion for optimization.
-
-// Helper which takes a stride to allow column processing of interleaved audio streams.
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_1fast(D *out, const D *in, size_t frames, size_t stride,
- size_t channelCount, D *delays, const D *coefs, size_t localStride) {
-#if defined(__i386__) || defined(__x86_x64__)
- D delta = std::numeric_limits<float>::min() * (1 << 24);
-#endif
- D b0, b1, b2, negativeA1, negativeA2;
-
- if constexpr (SAME_COEF_PER_CHANNEL) {
- b0 = coefs[0];
- b1 = coefs[1];
- b2 = coefs[2];
- negativeA1 = -coefs[3];
- negativeA2 = -coefs[4];
- }
- for (size_t i = 0; i < channelCount; ++i) {
- if constexpr (!SAME_COEF_PER_CHANNEL) {
- b0 = coefs[0];
- b1 = coefs[localStride];
- b2 = coefs[2 * localStride];
- negativeA1 = -coefs[3 * localStride];
- negativeA2 = -coefs[4 * localStride];
- ++coefs;
- }
-
- D s1n1 = delays[0];
- D s2n1 = delays[localStride];
- const D *input = &in[i];
- D *output = &out[i];
- for (size_t j = frames; j > 0; --j) {
- // Adding a delta to avoid subnormal exception handling on the x86/x64 platform;
- // this is not a problem with the ARM platform. The delta will not affect the
- // precision of the result.
-#if defined(__i386__) || defined(__x86_x64__)
- const D xn = *input + delta;
-#else
- const D xn = *input;
-#endif
- D yn = (OCCUPANCY >> 0 & 1) * b0 * xn + s1n1;
- s1n1 = (OCCUPANCY >> 1 & 1) * b1 * xn + (OCCUPANCY >> 3 & 1) * negativeA1 * yn + s2n1;
- s2n1 = (OCCUPANCY >> 2 & 1) * b2 * xn + (OCCUPANCY >> 4 & 1) * negativeA2 * yn;
-
- input += stride;
-
- *output = yn;
- output += stride;
-
-#if defined(__i386__) || defined(__x86_x64__)
- delta = -delta;
-#endif
- }
- delays[0] = s1n1;
- delays[localStride] = s2n1;
- ++delays;
- }
-}
-
// Helper function to zero channels in the input buffer.
// This is used for the degenerate coefficient case which results in all zeroes.
template <typename D>
@@ -180,90 +438,69 @@
}
}
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_fast(D *out, const D *in, size_t frames, size_t stride,
- size_t channelCount, D *delays, const D *coefs, size_t localStride) {
- if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
- zeroChannels(out, frames, stride, channelCount);
- return;
- }
- biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>(
- out, in, frames, stride, channelCount, delays, coefs, localStride);
-}
-
-#ifdef USE_NEON
-
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
-void biquad_filter_neon_impl(F *out, const F *in, size_t frames, size_t stride,
+template <template <typename, typename> typename FilterType,
+ size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
+void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride,
size_t channelCount, F *delays, const F *coefs, size_t localStride) {
using namespace android::audio_utils::intrinsics;
constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
- T b0, b1, b2, negativeA1, negativeA2;
- if constexpr (SAME_COEF_PER_CHANNEL) {
- b0 = vdupn<T>(coefs[0]);
- b1 = vdupn<T>(coefs[1]);
- b2 = vdupn<T>(coefs[2]);
- negativeA1 = vneg(vdupn<T>(coefs[3]));
- negativeA2 = vneg(vdupn<T>(coefs[4]));
- }
+ const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride;
+ using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>;
+
for (size_t i = 0; i < channelCount; i += elements) {
- if constexpr (!SAME_COEF_PER_CHANNEL) {
- b0 = vld1<T>(coefs);
- b1 = vld1<T>(coefs + localStride);
- b2 = vld1<T>(coefs + localStride * 2);
- negativeA1 = vneg(vld1<T>(coefs + localStride * 3));
- negativeA2 = vneg(vld1<T>(coefs + localStride * 4));
- coefs += elements;
- }
T s1 = vld1<T>(&delays[0]);
T s2 = vld1<T>(&delays[localStride]);
- const F *input = &in[i];
- F *output = &out[i];
- for (size_t j = frames; j > 0; --j) {
- T xn = vld1<T>(input);
- T yn = s1;
- if constexpr (OCCUPANCY >> 0 & 1) {
- yn = vmla(yn, b0, xn);
- }
- s1 = s2;
- if constexpr (OCCUPANCY >> 3 & 1) {
- s1 = vmla(s1, negativeA1, yn);
- }
- if constexpr (OCCUPANCY >> 1 & 1) {
- s1 = vmla(s1, b1, xn);
- }
- if constexpr (OCCUPANCY >> 2 & 1) {
- s2 = vmul(b2, xn);
- } else {
- s2 = vdupn<T>(0.f);
- }
- if constexpr (OCCUPANCY >> 4 & 1) {
- s2 = vmla(s2, negativeA2, yn);
- }
-
- input += stride;
- vst1(output, yn);
- output += stride;
- }
- vst1(&delays[0], s1);
- vst1(&delays[localStride], s2);
+ FilterType<T, CoefType> kernel(
+ vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride),
+ vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3),
+ vld1<CoefType>(coefs + coefStride * 4),
+ s1, s2);
+ if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements;
+ kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride);
+ vst1(&delays[0], kernel.s1_);
+ vst1(&delays[localStride], kernel.s2_);
delays += elements;
}
}
-#define BIQUAD_FILTER_CASE(N, ... /* type */) \
+// Find the nearest occupancy mask that includes all the desired bits.
+template <typename T, size_t N>
+static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
+ if (occupancy < 32) {
+ for (auto test : occupancies) {
+ if ((occupancy & test) == occupancy) return test;
+ }
+ }
+ return 31;
+}
+
+enum FILTER_OPTION {
+ FILTER_OPTION_SCALAR_ONLY = (1 << 0),
+};
+
+// Default biquad type.
+template <typename T, typename F>
+using BiquadFilterType = BiquadStateSpace<T, F>;
+
+#define BIQUAD_FILTER_CASE(N, FilterType, ... /* type */) \
case N: { \
- biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, __VA_ARGS__>( \
+ using VectorType = __VA_ARGS__; \
+ biquad_filter_func_impl< \
+ FilterType, \
+ nearestOccupancy(OCCUPANCY, \
+ FilterType<VectorType, D>::required_occupancies_), \
+ SAME_COEF_PER_CHANNEL, VectorType>( \
out + offset, in + offset, frames, stride, remaining, \
delays + offset, c, localStride); \
goto exit; \
}
template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride,
- size_t channelCount, D *delays, const D *coefs, size_t localStride) {
+void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride,
+ size_t channelCount, D *delays, const D *coefs, size_t localStride,
+ FILTER_OPTION filterOptions) {
if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
zeroChannels(out, frames, stride, channelCount);
return;
@@ -274,41 +511,53 @@
// using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
// using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
+#ifdef USE_NEON
+ // use NEON types to ensure we have the proper intrinsic acceleration.
+ using alt_16_t = float32x4x4_t;
+ using alt_8_t = float32x4x2_t;
+ using alt_4_t = float32x4_t;
+#else
+ // Use C++ types, no NEON needed.
+ using alt_16_t = intrinsics::internal_array_t<float, 16>;
+ using alt_8_t = intrinsics::internal_array_t<float, 8>;
+ using alt_4_t = intrinsics::internal_array_t<float, 4>;
+#endif
+
for (size_t offset = 0; offset < channelCount; ) {
size_t remaining = channelCount - offset;
auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
+ if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar;
if constexpr (std::is_same_v<D, float>) {
switch (remaining) {
default:
if (remaining >= 16) {
remaining &= ~15;
- biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, float32x4x4_t>(
+ biquad_filter_func_impl<
+ BiquadFilterType,
+ nearestOccupancy(OCCUPANCY,
+ BiquadFilterType<D, D>::required_occupancies_),
+ SAME_COEF_PER_CHANNEL, alt_16_t>(
out + offset, in + offset, frames, stride, remaining,
delays + offset, c, localStride);
offset += remaining;
continue;
}
break; // case 1 handled at bottom.
- BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
- BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
- BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
- BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
- BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
- BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
- BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
- // We choose the NEON intrinsic type over internal_array for 8 to
- // check if there is any performance difference in benchmark (should be similar).
- // BIQUAD_FILTER_CASE(8, intrinsics::internal_array_t<float, 8>)
- BIQUAD_FILTER_CASE(8, float32x4x2_t)
- BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
- BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
- BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
- BIQUAD_FILTER_CASE(4, float32x4_t)
- // We choose the NEON intrinsic type over internal_array for 4 to
- // check if there is any performance difference in benchmark (should be similar).
- // BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<float, 4>)
- BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
- BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
+ BIQUAD_FILTER_CASE(15, BiquadFilterType, intrinsics::internal_array_t<float, 15>)
+ BIQUAD_FILTER_CASE(14, BiquadFilterType, intrinsics::internal_array_t<float, 14>)
+ BIQUAD_FILTER_CASE(13, BiquadFilterType, intrinsics::internal_array_t<float, 13>)
+ BIQUAD_FILTER_CASE(12, BiquadFilterType, intrinsics::internal_array_t<float, 12>)
+ BIQUAD_FILTER_CASE(11, BiquadFilterType, intrinsics::internal_array_t<float, 11>)
+ BIQUAD_FILTER_CASE(10, BiquadFilterType, intrinsics::internal_array_t<float, 10>)
+ BIQUAD_FILTER_CASE(9, BiquadFilterType, intrinsics::internal_array_t<float, 9>)
+ BIQUAD_FILTER_CASE(8, BiquadFilterType, alt_8_t)
+ BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<float, 7>)
+ BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<float, 6>)
+ BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<float, 5>)
+ BIQUAD_FILTER_CASE(4, BiquadFilterType, alt_4_t)
+ BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<float, 3>)
+ BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<float, 2>)
+ // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>)
}
} else if constexpr (std::is_same_v<D, double>) {
#if defined(__aarch64__)
@@ -316,27 +565,34 @@
default:
if (remaining >= 8) {
remaining &= ~7;
- biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL,
- intrinsics::internal_array_t<double, 8>>(
+ biquad_filter_func_impl<BiquadFilterType,
+ nearestOccupancy(OCCUPANCY,
+ BiquadFilterType<D, D>::required_occupancies_),
+ SAME_COEF_PER_CHANNEL,
+ intrinsics::internal_array_t<double, 8>>(
out + offset, in + offset, frames, stride, remaining,
delays + offset, c, localStride);
offset += remaining;
continue;
}
break; // case 1 handled at bottom.
- BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
- BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
- BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
- BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
- BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
- BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
+ BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<double, 7>)
+ BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<double, 6>)
+ BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<double, 5>)
+ BIQUAD_FILTER_CASE(4, BiquadFilterType, intrinsics::internal_array_t<double, 4>)
+ BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<double, 3>)
+ BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<double, 2>)
};
#endif
}
+ scalar:
// Essentially the code below is scalar, the same as
// biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
// but formulated with NEON intrinsic-like call pattern.
- biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, D>(
+ biquad_filter_func_impl<BiquadFilterType,
+ nearestOccupancy(OCCUPANCY,
+ BiquadFilterType<D, D>::required_occupancies_),
+ SAME_COEF_PER_CHANNEL, D>(
out + offset, in + offset, frames, stride, remaining,
delays + offset, c, localStride);
offset += remaining;
@@ -344,8 +600,6 @@
exit:;
}
-#endif // USE_NEON
-
} // namespace details
/**
@@ -584,16 +838,14 @@
}
// Select the proper filtering function from our array.
- (void)optimized; // avoid unused variable warning.
- mFunc = mFilterFast[category]; // default if we don't have processor optimization.
-
-#ifdef USE_NEON
- /* if constexpr (std::is_same_v<D, float>) */ {
- if (optimized) {
- mFunc = mFilterNeon[category];
- }
+ if (optimized) {
+ mFilterOptions = (details::FILTER_OPTION)
+ (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY);
+ } else {
+ mFilterOptions = (details::FILTER_OPTION)
+ (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY);
}
-#endif
+ mFunc = mFilterFuncs[category];
}
/**
@@ -603,7 +855,7 @@
* \param in pointer to the input data
* \param frames number of audio frames to be processed
*/
- void process(D* out, const D *in, size_t frames) {
+ void process(D* out, const D* in, size_t frames) {
process(out, in, frames, mChannelCount);
}
@@ -615,10 +867,10 @@
* \param frames number of audio frames to be processed
* \param stride the total number of samples associated with a frame, if not channelCount.
*/
- void process(D* out, const D *in, size_t frames, size_t stride) {
+ void process(D* out, const D* in, size_t frames, size_t stride) {
assert(stride >= mChannelCount);
mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
- mCoefs.data(), mChannelCount);
+ mCoefs.data(), mChannelCount, mFilterOptions);
}
/**
@@ -655,7 +907,7 @@
auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
auto delays = mDelays.data() + fromEnd;
mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
- delays, coefs, mChannelCount);
+ delays, coefs, mChannelCount, mFilterOptions);
}
auto delays = mDelays.data() + baseIdx;
@@ -664,13 +916,13 @@
// sliding one audio sample at a time.
mFunc(inout, inout,
frames - channelBlock + 1, 1 /* stride */, channelBlock,
- delays, coefs, mChannelCount);
+ delays, coefs, mChannelCount, mFilterOptions);
// drain data pipe.
for (size_t i = 1; i < channelBlock; ++i) {
mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
1 /* frames */, 1 /* stride */, channelBlock - i,
- delays, coefs, mChannelCount);
+ delays, coefs, mChannelCount, mFilterOptions);
}
}
}
@@ -681,7 +933,7 @@
auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
mFunc(inout, inout,
frames, 1 /* stride */, 1 /* channelCount */,
- mDelays.data() + fromEnd, coefs, mChannelCount);
+ mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions);
}
}
@@ -746,121 +998,57 @@
*/
std::vector<D> mDelays;
- using filter_func = decltype(details::biquad_filter_fast<0, true, D>);
+ details::FILTER_OPTION mFilterOptions{};
- /**
- * \var filter_func* mFunc
+ // Consider making a separate delegation class.
+ /*
+ * We store an array of functions based on the occupancy.
*
- * The current filter function selected for the channel occupancy of the Biquad.
- */
- filter_func *mFunc;
-
- // Create a functional wrapper to feed "biquad_filter_fast" to
- // make_functional_array() to populate the array.
- //
- // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
- // b0 b1 b2 a1 a2 (from lsb to msb)
- template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
- struct FuncWrap {
- template<typename T>
- static constexpr size_t nearest() {
- // Combine cases to both improve expected performance and reduce code space.
- // Some occupancy masks provide worse performance than more occupied masks.
- constexpr size_t required_occupancies[] = {
- 1, // constant scale
- 3, // single zero
- 7, // double zero
- 9, // single pole
- // 11, // first order IIR (unnecessary optimization, close enough to 31).
- 27, // double pole + single zero
- 31, // second order IIR (full Biquad)
- };
- if constexpr (OCCUPANCY < 32) {
- for (auto test : required_occupancies) {
- if ((OCCUPANCY & test) == OCCUPANCY) return test;
- }
- } else {
- static_assert(intrinsics::dependent_false_v<T>);
- }
- return 0; // never gets here.
- }
-
- static void func(D* out, const D *in, size_t frames, size_t stride,
- size_t channelCount, D *delays, const D *coef, size_t localStride) {
- constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
- details::biquad_filter_fast<NEAREST_OCCUPANCY, SC>(
- out, in, frames, stride, channelCount, delays, coef, localStride);
- }
- };
-
- /**
- * \var mFilterFast
- *
- * std::array of functions based on coefficient occupancy.
+ * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
+ * b0 b1 b2 a1 a2 (from lsb to msb)
*
* static inline constexpr std::array<filter_func*, M> mArray = {
- * biquad_filter_fast<0>,
- * biquad_filter_fast<1>,
- * biquad_filter_fast<2>,
+ * biquad_filter_func<0>,
+ * biquad_filter_func<1>,
+ * biquad_filter_func<2>,
* ...
- * biquad_filter_fast<(1 << kBiquadNumCoefs) - 1>,
+ * biquad_filter_func<(1 << kBiquadNumCoefs) - 1>,
* };
*
* Every time the coefficients are changed, we select the processing function from
* this table.
*/
- static inline constexpr auto mFilterFast =
- details::make_functional_array<
- FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
-#ifdef USE_NEON
- // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
- // b0 b1 b2 a1 a2 (from lsb to msb)
-
+ // Used to build the functional array.
template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
- struct FuncWrapNeon {
- template<typename T>
- static constexpr size_t nearest() {
- // combine cases to both improve expected performance and reduce code space.
- //
- // This lists the occupancies we will specialize functions for.
- constexpr size_t required_occupancies[] = {
- 1, // constant scale
- 3, // single zero
- 7, // double zero
- 9, // single pole
- 11, // first order IIR
- 27, // double pole + single zero
- 31, // second order IIR (full Biquad)
- };
- if constexpr (OCCUPANCY < 32) {
- for (auto test : required_occupancies) {
- if ((OCCUPANCY & test) == OCCUPANCY) return test;
- }
- } else {
- static_assert(intrinsics::dependent_false_v<T>);
- }
- return 0; // never gets here.
- }
-
+ struct FuncWrap {
static void func(D* out, const D *in, size_t frames, size_t stride,
- size_t channelCount, D *delays, const D *coef, size_t localStride) {
- constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
- details::biquad_filter_neon<NEAREST_OCCUPANCY, SC>(
- out, in, frames, stride, channelCount, delays, coef, localStride);
+ size_t channelCount, D *delays, const D *coef, size_t localStride,
+ details::FILTER_OPTION filterOptions) {
+ constexpr size_t NEAREST_OCCUPANCY =
+ details::nearestOccupancy(
+ OCCUPANCY, details::BiquadFilterType<D, D>::required_occupancies_);
+ details::biquad_filter_func<NEAREST_OCCUPANCY, SC>(
+ out, in, frames, stride, channelCount, delays, coef, localStride,
+ filterOptions);
}
};
- // Neon optimized array of functions.
- static inline constexpr auto mFilterNeon =
+ // Vector optimized array of functions.
+ static inline constexpr auto mFilterFuncs =
details::make_functional_array<
- FuncWrapNeon, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
-#endif // USE_NEON
+ FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
+ /**
+ * \var filter_func* mFunc
+ *
+ * The current filter function selected for the channel occupancy of the Biquad.
+ * It will be one of mFilterFuncs.
+ */
+ std::decay_t<decltype(mFilterFuncs[0])> mFunc;
};
} // namespace android::audio_utils
+#pragma pop_macro("USE_DITHER")
#pragma pop_macro("USE_NEON")
-
-#endif // !ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
diff --git a/audio_utils/include/audio_utils/intrinsic_utils.h b/audio_utils/include/audio_utils/intrinsic_utils.h
index ed2b2bb..0c333e0 100644
--- a/audio_utils/include/audio_utils/intrinsic_utils.h
+++ b/audio_utils/include/audio_utils/intrinsic_utils.h
@@ -78,6 +78,45 @@
using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
*/
+// add a + b
+template<typename T>
+static inline T vadd(T a, T b) {
+ if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+ return a + b;
+
+#ifdef USE_NEON
+ } else if constexpr (std::is_same_v<T, float32x2_t>) {
+ return vadd_f32(a, b);
+ } else if constexpr (std::is_same_v<T, float32x4_t>) {
+ return vaddq_f32(a, b);
+#if defined(__aarch64__)
+ } else if constexpr (std::is_same_v<T, float64x2_t>) {
+ return vaddq_f64(a, b);
+#endif
+#endif // USE_NEON
+
+ } else /* constexpr */ {
+ T ret;
+ auto &[retval] = ret; // single-member struct
+ const auto &[aval] = a;
+ const auto &[bval] = b;
+ if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+ for (size_t i = 0; i < std::size(aval); ++i) {
+ retval[i] = vadd(aval[i], bval[i]);
+ }
+ return ret;
+ } else /* constexpr */ {
+ auto &[r1, r2] = retval;
+ const auto &[a1, a2] = aval;
+ const auto &[b1, b2] = bval;
+ r1 = vadd(a1, b1);
+ r2 = vadd(a2, b2);
+ return ret;
+ }
+ }
+}
+
// duplicate float into all elements.
template<typename T, typename F>
static inline T vdupn(F f) {
@@ -156,6 +195,73 @@
}
}
+/**
+ * Returns the multiply-accumulate a + b * c as follows:
+ * ret_i = a_i + b_i * c_i if a, b and c are the same vector type or
+ * ret_i = a_i + b_i * c if a and b are vectors and c is scalar or
+ * ret_i = a_i + b * c_i if a and c are vectors and b is scalar.
+ */
+template<typename T, typename S, typename F>
+static inline T vmla(T a, S b, F c) {
+ // Both types T and S are non-primitive and they are not equal. T == S handled below.
+ (void) a;
+ (void) b;
+ (void) c;
+ static_assert(dependent_false_v<T>);
+}
+
+template<typename T, typename F>
+static inline T vmla(T a, T b, F c) {
+ if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+ if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+ return a + b * c;
+ } else {
+ static_assert(dependent_false_v<T>);
+ }
+ } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+ // handle the lane variant
+#ifdef USE_NEON
+ if constexpr (std::is_same_v<T, float32x2_t>) {
+ return vmla_n_f32(a, b, c);
+ } else if constexpr (std::is_same_v<T, float32x4_t>) {
+ return vmlaq_n_f32(a, b, c);
+#if defined(__aarch64__)
+ } else if constexpr (std::is_same_v<T, float64x2_t>) {
+ return vmlaq_f64(a, b, vdupq_n_f64(c));
+#endif
+ } else
+#endif // USE_NEON
+ {
+ T ret;
+ auto &[retval] = ret; // single-member struct
+ const auto &[aval] = a;
+ const auto &[bval] = b;
+ if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+ for (size_t i = 0; i < std::size(aval); ++i) {
+ retval[i] = vmla(aval[i], bval[i], c);
+ }
+ return ret;
+ } else /* constexpr */ {
+ auto &[r1, r2] = retval;
+ const auto &[a1, a2] = aval;
+ const auto &[b1, b2] = bval;
+ r1 = vmla(a1, b1, c);
+ r2 = vmla(a2, b2, c);
+ return ret;
+ }
+ }
+ } else {
+ // Both types T and F are non-primitive and they are not equal.
+ static_assert(dependent_false_v<T>);
+ }
+}
+
+template<typename T, typename F>
+static inline T vmla(T a, F b, T c) {
+ return vmla(a, c, b);
+}
+
// fused multiply-add a + b * c
template<typename T>
static inline T vmla(T a, T b, T c) {
@@ -197,7 +303,57 @@
}
}
-// multiply a * b
+/**
+ * Returns the element-wise product c as follows:
+ * c_i = a_i * b_i if a and b are the same vector type or
+ * c_i = a_i * b if a is a vector and b is scalar or
+ * c_i = a * b_i if a is scalar and b is a vector.
+ */
+template<typename T, typename F>
+static inline auto vmul(T a, F b) {
+ if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+ if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+ return a * b;
+ } else /* constexpr */ {
+ return vmul(b, a); // we prefer T to be the vector/struct form.
+ }
+ } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+ // handle the lane variant
+#ifdef USE_NEON
+ if constexpr (std::is_same_v<T, float32x2_t>) {
+ return vmul_n_f32(a, b);
+ } else if constexpr (std::is_same_v<T, float32x4_t>) {
+ return vmulq_n_f32(a, b);
+#if defined(__aarch64__)
+ } else if constexpr (std::is_same_v<T, float64x2_t>) {
+ return vmulq_n_f64(a, b);
+#endif
+ } else
+#endif // USE_NEON
+ {
+ T ret;
+ auto &[retval] = ret; // single-member struct
+ const auto &[aval] = a;
+ if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+ for (size_t i = 0; i < std::size(aval); ++i) {
+ retval[i] = vmul(aval[i], b);
+ }
+ return ret;
+ } else /* constexpr */ {
+ auto &[r1, r2] = retval;
+ const auto &[a1, a2] = aval;
+ r1 = vmul(a1, b);
+ r2 = vmul(a2, b);
+ return ret;
+ }
+ }
+ } else {
+ // Both types T and F are non-primitive and they are not equal.
+ static_assert(dependent_false_v<T>);
+ }
+}
+
template<typename T>
static inline T vmul(T a, T b) {
if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
@@ -308,6 +464,45 @@
}
}
+// subtract a - b
+template<typename T>
+static inline T vsub(T a, T b) {
+ if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+ return a - b;
+
+#ifdef USE_NEON
+ } else if constexpr (std::is_same_v<T, float32x2_t>) {
+ return vsub_f32(a, b);
+ } else if constexpr (std::is_same_v<T, float32x4_t>) {
+ return vsubq_f32(a, b);
+#if defined(__aarch64__)
+ } else if constexpr (std::is_same_v<T, float64x2_t>) {
+ return vsubq_f64(a, b);
+#endif
+#endif // USE_NEON
+
+ } else /* constexpr */ {
+ T ret;
+ auto &[retval] = ret; // single-member struct
+ const auto &[aval] = a;
+ const auto &[bval] = b;
+ if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+ for (size_t i = 0; i < std::size(aval); ++i) {
+ retval[i] = vsub(aval[i], bval[i]);
+ }
+ return ret;
+ } else /* constexpr */ {
+ auto &[r1, r2] = retval;
+ const auto &[a1, a2] = aval;
+ const auto &[b1, b2] = bval;
+ r1 = vsub(a1, b1);
+ r2 = vsub(a2, b2);
+ return ret;
+ }
+ }
+}
+
} // namespace android::audio_utils::intrinsics
#pragma pop_macro("USE_NEON")
diff --git a/audio_utils/tests/biquad_filter_tests.cpp b/audio_utils/tests/biquad_filter_tests.cpp
index 3be2a5d..f90b863 100644
--- a/audio_utils/tests/biquad_filter_tests.cpp
+++ b/audio_utils/tests/biquad_filter_tests.cpp
@@ -29,12 +29,13 @@
/************************************************************************************
* Reference data, must not change.
- * The reference output data is from running in matlab y = filter(b, a, x), where
- * b = [2.0f, 3.0f]
- * a = [1.0f, 0.2f]
- * x = [-0.1f, -0.2f, -0.3f, -0.4f, -0.5f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f]
- * The output y = [-0.2f, -0.66f, -1.068f, -1.4864f, -1.9027f,
- * -0.9195f, 0.8839f, 1.0232f, 1.4954f, 1.9009f].
+ * The reference output data is from running in matlab or octave y = filter(b, a, x), where
+ * b = [2.0 3.0 4.0]
+ * a = [1.0 0.2 0.3]
+ * x = [-0.1 -0.2 -0.3 -0.4 -0.5 0.1 0.2 0.3 0.4 0.5]
+ * filter(b, a, x)
+ *
+ * The output y = [-0.2 -0.66 -1.4080 -2.0204 -2.5735 -1.7792 -0.1721 2.1682 2.1180 2.3259].
* The reference data construct the input and output as 2D array so that it can be
* use to practice calling BiquadFilter::process multiple times.
************************************************************************************/
@@ -43,11 +44,12 @@
constexpr float INPUT[PERIOD][FRAME_COUNT] = {
{-0.1f, -0.2f, -0.3f, -0.4f, -0.5f},
{0.1f, 0.2f, 0.3f, 0.4f, 0.5f}};
+// COEFS in order of [ b0 b1 b2 a1 a2 ], normalized form where a0 = 1.
constexpr std::array<float, kBiquadNumCoefs> COEFS = {
- 2.0f, 3.0f, 0.0f, 0.2f, 0.0f };
+ 2.0f, 3.0f, 4.0f, 0.2f, 0.3f };
constexpr float OUTPUT[PERIOD][FRAME_COUNT] = {
- {-0.2f, -0.66f, -1.068f, -1.4864f, -1.9027f},
- {-0.9195f, 0.8839f, 1.0232f, 1.4954f, 1.9009f}};
+ {-0.2f, -0.66f, -1.4080f, -2.0204f, -2.5735f},
+ {-1.7792f, -0.1721f, 2.1682f, 2.1180f, 2.3259f}};
constexpr float EPS = 1e-4f;
template <typename S, typename D>
diff --git a/audio_utils/tests/intrinsic_tests.cpp b/audio_utils/tests/intrinsic_tests.cpp
index 6a16747..d9686ef 100644
--- a/audio_utils/tests/intrinsic_tests.cpp
+++ b/audio_utils/tests/intrinsic_tests.cpp
@@ -25,6 +25,13 @@
using FloatTypes = ::testing::Types<float, double>;
TYPED_TEST_CASE(IntrisicUtilsTest, FloatTypes);
+TYPED_TEST(IntrisicUtilsTest, vadd) {
+ constexpr TypeParam a = 0.25f;
+ constexpr TypeParam b = 0.5f;
+ constexpr TypeParam result = a + b;
+ ASSERT_EQ(result, android::audio_utils::intrinsics::vadd(a, b));
+}
+
TYPED_TEST(IntrisicUtilsTest, vdupn) {
constexpr TypeParam value = 1.f;
ASSERT_EQ(value, android::audio_utils::intrinsics::vdupn<TypeParam>(value));
@@ -62,3 +69,10 @@
&destination, android::audio_utils::intrinsics::vdupn<TypeParam>(value));
ASSERT_EQ(value, destination);
}
+
+TYPED_TEST(IntrisicUtilsTest, vsub) {
+ constexpr TypeParam a = 1.25f;
+ constexpr TypeParam b = 1.5f;
+ constexpr TypeParam result = a - b;
+ ASSERT_EQ(result, android::audio_utils::intrinsics::vsub(a, b));
+}