Merge changes I48db8476,Ieb222293,If3bb87da,I22ca2c60

* changes:
  audio_utils: Biquad state space implementation
  audio_utils: Biquad refactorization
  audio_utils: Biquad test update to 2nd order IIR
  audio_utils: Biquad update benchmark for 24 channels
diff --git a/audio_utils/benchmarks/biquad_filter_benchmark.cpp b/audio_utils/benchmarks/biquad_filter_benchmark.cpp
index 0a61593..e912b18 100644
--- a/audio_utils/benchmarks/biquad_filter_benchmark.cpp
+++ b/audio_utils/benchmarks/biquad_filter_benchmark.cpp
@@ -25,6 +25,8 @@
 #include <audio_utils/BiquadFilter.h>
 #include <audio_utils/format.h>
 
+#pragma GCC diagnostic ignored "-Wunused-function"  // we use override array assignment
+
 static constexpr size_t DATA_SIZE = 1024;
 // The coefficients are for an HPF with a sampling frequency of 48000 Hz, a center frequency of 600 Hz,
 // and a Q of 0.707. As none of the coefficients are zero, they can be used to benchmark
@@ -82,138 +84,327 @@
 BENCHMARK(BM_BiquadFilter1D)->Apply(BiquadFilter1DArgs);
 
 /*******************************************************************
- * A test result running on Pixel 4 for comparison.
- * The first parameter indicates the input data is subnormal or not.
- * 0 for normal input data, 1 for subnormal input data.
- * The second parameter indicates the channel count.
- * The third parameter indicates the occupancy of the coefficients.
- * -----------------------------------------------------------------
- * Benchmark                       Time             CPU   Iterations
- * -----------------------------------------------------------------
- * BM_BiquadFilter/0/1/1         734 ns          732 ns       740671
- * BM_BiquadFilter/0/1/2         554 ns          553 ns      1266836
- * BM_BiquadFilter/0/1/3         647 ns          645 ns      1085314
- * BM_BiquadFilter/0/1/4         834 ns          832 ns       841345
- * BM_BiquadFilter/0/1/5        1068 ns         1065 ns       657343
- * BM_BiquadFilter/0/1/6         767 ns          765 ns       915223
- * BM_BiquadFilter/0/1/7         929 ns          926 ns       756405
- * BM_BiquadFilter/0/1/8        2173 ns         2168 ns       322949
- * BM_BiquadFilter/0/1/9        1831 ns         1826 ns       383304
- * BM_BiquadFilter/0/1/10       1931 ns         1927 ns       363386
- * BM_BiquadFilter/0/1/11       2536 ns         2529 ns       276783
- * BM_BiquadFilter/0/1/12       2535 ns         2529 ns       276826
- * BM_BiquadFilter/0/1/13       2174 ns         2169 ns       322755
- * BM_BiquadFilter/0/1/14       3257 ns         3250 ns       215383
- * BM_BiquadFilter/0/1/15       2175 ns         2169 ns       322711
- * BM_BiquadFilter/0/1/16       1494 ns         1491 ns       469625
- * BM_BiquadFilter/0/1/17       2176 ns         2171 ns       322423
- * BM_BiquadFilter/0/1/18       1568 ns         1564 ns       447467
- * BM_BiquadFilter/0/1/19       1325 ns         1322 ns       529762
- * BM_BiquadFilter/0/1/20       1926 ns         1921 ns       364534
- * BM_BiquadFilter/0/1/21       2630 ns         2623 ns       266027
- * BM_BiquadFilter/0/1/22       1998 ns         1993 ns       351210
- * BM_BiquadFilter/0/1/23       1882 ns         1877 ns       373028
- * BM_BiquadFilter/0/1/24       2536 ns         2529 ns       276818
- * BM_BiquadFilter/0/1/25       2176 ns         2170 ns       322627
- * BM_BiquadFilter/0/1/26       3258 ns         3250 ns       215440
- * BM_BiquadFilter/0/1/27       2175 ns         2169 ns       322724
- * BM_BiquadFilter/0/1/28       2535 ns         2529 ns       276823
- * BM_BiquadFilter/0/1/29       2175 ns         2170 ns       322560
- * BM_BiquadFilter/0/1/30       3259 ns         3250 ns       215354
- * BM_BiquadFilter/0/1/31       2176 ns         2171 ns       322492
- * BM_BiquadFilter/0/2/1        1470 ns         1466 ns       477411
- * BM_BiquadFilter/0/2/2        1109 ns         1107 ns       632666
- * BM_BiquadFilter/0/2/3        1297 ns         1294 ns       540804
- * BM_BiquadFilter/0/2/4        1681 ns         1677 ns       417675
- * BM_BiquadFilter/0/2/5        2137 ns         2132 ns       328721
- * BM_BiquadFilter/0/2/6        1572 ns         1569 ns       447114
- * BM_BiquadFilter/0/2/7        1736 ns         1732 ns       385563
- * BM_BiquadFilter/0/2/8        4331 ns         4320 ns       162048
- * BM_BiquadFilter/0/2/9        3795 ns         3785 ns       184166
- * BM_BiquadFilter/0/2/10       3832 ns         3823 ns       183015
- * BM_BiquadFilter/0/2/11       5060 ns         5047 ns       138660
- * BM_BiquadFilter/0/2/12       5046 ns         5034 ns       139033
- * BM_BiquadFilter/0/2/13       4333 ns         4322 ns       161952
- * BM_BiquadFilter/0/2/14       6500 ns         6482 ns       108022
- * BM_BiquadFilter/0/2/15       4335 ns         4324 ns       161915
- * BM_BiquadFilter/0/2/16       2965 ns         2957 ns       236771
- * BM_BiquadFilter/0/2/17       4368 ns         4358 ns       160829
- * BM_BiquadFilter/0/2/18       3193 ns         3186 ns       219766
- * BM_BiquadFilter/0/2/19       2804 ns         2798 ns       250201
- * BM_BiquadFilter/0/2/20       3839 ns         3830 ns       182731
- * BM_BiquadFilter/0/2/21       5310 ns         5296 ns       133012
- * BM_BiquadFilter/0/2/22       3995 ns         3984 ns       175672
- * BM_BiquadFilter/0/2/23       3755 ns         3745 ns       186960
- * BM_BiquadFilter/0/2/24       5060 ns         5045 ns       138733
- * BM_BiquadFilter/0/2/25       4343 ns         4330 ns       161632
- * BM_BiquadFilter/0/2/26       6505 ns         6489 ns       107871
- * BM_BiquadFilter/0/2/27       4348 ns         4336 ns       161436
- * BM_BiquadFilter/0/2/28       5068 ns         5054 ns       138515
- * BM_BiquadFilter/0/2/29       4401 ns         4389 ns       158834
- * BM_BiquadFilter/0/2/30       6514 ns         6497 ns       107752
- * BM_BiquadFilter/0/2/31       4352 ns         4341 ns       161242
- * BM_BiquadFilter/1/1/1         734 ns          732 ns       955891
- * BM_BiquadFilter/1/1/2         554 ns          552 ns      1267096
- * BM_BiquadFilter/1/1/3         647 ns          645 ns      1084919
- * BM_BiquadFilter/1/1/4         834 ns          832 ns       841505
- * BM_BiquadFilter/1/1/5        1068 ns         1065 ns       657299
- * BM_BiquadFilter/1/1/6         767 ns          765 ns       915192
- * BM_BiquadFilter/1/1/7         927 ns          924 ns       761606
- * BM_BiquadFilter/1/1/8        2174 ns         2168 ns       322888
- * BM_BiquadFilter/1/1/9        1832 ns         1826 ns       383311
- * BM_BiquadFilter/1/1/10       1932 ns         1926 ns       363384
- * BM_BiquadFilter/1/1/11       2536 ns         2529 ns       276796
- * BM_BiquadFilter/1/1/12       2536 ns         2529 ns       276843
- * BM_BiquadFilter/1/1/13       2175 ns         2169 ns       322743
- * BM_BiquadFilter/1/1/14       3259 ns         3250 ns       215420
- * BM_BiquadFilter/1/1/15       2175 ns         2169 ns       322745
- * BM_BiquadFilter/1/1/16       1495 ns         1491 ns       469661
- * BM_BiquadFilter/1/1/17       2177 ns         2171 ns       322388
- * BM_BiquadFilter/1/1/18       1569 ns         1564 ns       447468
- * BM_BiquadFilter/1/1/19       1325 ns         1322 ns       529736
- * BM_BiquadFilter/1/1/20       1927 ns         1922 ns       363962
- * BM_BiquadFilter/1/1/21       2624 ns         2617 ns       267102
- * BM_BiquadFilter/1/1/22       1999 ns         1993 ns       351169
- * BM_BiquadFilter/1/1/23       1882 ns         1877 ns       372944
- * BM_BiquadFilter/1/1/24       2536 ns         2529 ns       276751
- * BM_BiquadFilter/1/1/25       2176 ns         2170 ns       322560
- * BM_BiquadFilter/1/1/26       3259 ns         3250 ns       215389
- * BM_BiquadFilter/1/1/27       2175 ns         2169 ns       322640
- * BM_BiquadFilter/1/1/28       2536 ns         2529 ns       276786
- * BM_BiquadFilter/1/1/29       2176 ns         2170 ns       322533
- * BM_BiquadFilter/1/1/30       3260 ns         3251 ns       215325
- * BM_BiquadFilter/1/1/31       2177 ns         2171 ns       322425
- * BM_BiquadFilter/1/2/1        1471 ns         1467 ns       477222
- * BM_BiquadFilter/1/2/2        1109 ns         1107 ns       632565
- * BM_BiquadFilter/1/2/3        1298 ns         1294 ns       541051
- * BM_BiquadFilter/1/2/4        1682 ns         1678 ns       417354
- * BM_BiquadFilter/1/2/5        2136 ns         2129 ns       328967
- * BM_BiquadFilter/1/2/6        1572 ns         1568 ns       446095
- * BM_BiquadFilter/1/2/7        1792 ns         1788 ns       399598
- * BM_BiquadFilter/1/2/8        4333 ns         4320 ns       162032
- * BM_BiquadFilter/1/2/9        3807 ns         3797 ns       184097
- * BM_BiquadFilter/1/2/10       3842 ns         3831 ns       182711
- * BM_BiquadFilter/1/2/11       5062 ns         5048 ns       138636
- * BM_BiquadFilter/1/2/12       5050 ns         5036 ns       139031
- * BM_BiquadFilter/1/2/13       4335 ns         4323 ns       161943
- * BM_BiquadFilter/1/2/14       6500 ns         6483 ns       108020
- * BM_BiquadFilter/1/2/15       4336 ns         4324 ns       161873
- * BM_BiquadFilter/1/2/16       2966 ns         2957 ns       236709
- * BM_BiquadFilter/1/2/17       4359 ns         4348 ns       160581
- * BM_BiquadFilter/1/2/18       3196 ns         3187 ns       219532
- * BM_BiquadFilter/1/2/19       2805 ns         2798 ns       250157
- * BM_BiquadFilter/1/2/20       3839 ns         3829 ns       182628
- * BM_BiquadFilter/1/2/21       5291 ns         5276 ns       131153
- * BM_BiquadFilter/1/2/22       3994 ns         3984 ns       175699
- * BM_BiquadFilter/1/2/23       3757 ns         3747 ns       186864
- * BM_BiquadFilter/1/2/24       5060 ns         5046 ns       138754
- * BM_BiquadFilter/1/2/25       4343 ns         4331 ns       161614
- * BM_BiquadFilter/1/2/26       6512 ns         6491 ns       107852
- * BM_BiquadFilter/1/2/27       4348 ns         4336 ns       161419
- * BM_BiquadFilter/1/2/28       5068 ns         5055 ns       138481
- * BM_BiquadFilter/1/2/29       4386 ns         4374 ns       159635
- * BM_BiquadFilter/1/2/30       6514 ns         6498 ns       107752
- * BM_BiquadFilter/1/2/31       4353 ns         4342 ns       161227
+ A test result running on Pixel 4 XL for comparison.
+
+ Parameterized Test BM_BiquadFilter1D/A
+ <A> indicates whether the input data is subnormal: 0 for normal input, 1 for subnormal input.
+
+ Parameterized Test BM_BiquadFilter<TYPE>/A/B/C
+ <A> indicates whether the input data is subnormal: 0 for normal input, 1 for subnormal input.
+ <B> is the channel count, starting from 1.
+ <C> indicates the occupancy of the coefficients as a bitmask (1 - 31) over
+     b0, b1, b2, a1, a2.  31 indicates all Biquad coefficients are non-zero.
+     (A decoding sketch follows this results block.)
+
+-----------------------------------------------------------------------------------
+Benchmark                                         Time             CPU   Iterations
+-----------------------------------------------------------------------------------
+BM_BiquadFilter1D/0                             559 ns          558 ns      1254596
+BM_BiquadFilter1D/1                             563 ns          561 ns      1246670
+BM_BiquadFilterFloatOptimized/0/1/31           2263 ns         2257 ns       310326
+BM_BiquadFilterFloatOptimized/0/2/31           2471 ns         2465 ns       289044
+BM_BiquadFilterFloatOptimized/0/3/31           2827 ns         2818 ns       248148
+BM_BiquadFilterFloatOptimized/0/4/31           2443 ns         2436 ns       287402
+BM_BiquadFilterFloatOptimized/0/5/31           2843 ns         2836 ns       246912
+BM_BiquadFilterFloatOptimized/0/6/31           3188 ns         3180 ns       220494
+BM_BiquadFilterFloatOptimized/0/7/31           3910 ns         3898 ns       179652
+BM_BiquadFilterFloatOptimized/0/8/31           3167 ns         3157 ns       221764
+BM_BiquadFilterFloatOptimized/0/9/31           3947 ns         3936 ns       177855
+BM_BiquadFilterFloatOptimized/0/10/31          4204 ns         4193 ns       166963
+BM_BiquadFilterFloatOptimized/0/11/31          5313 ns         5298 ns       131865
+BM_BiquadFilterFloatOptimized/0/12/31          4205 ns         4193 ns       166925
+BM_BiquadFilterFloatOptimized/0/13/31          5303 ns         5289 ns       132321
+BM_BiquadFilterFloatOptimized/0/14/31          5501 ns         5487 ns       127623
+BM_BiquadFilterFloatOptimized/0/15/31          6572 ns         6551 ns       106827
+BM_BiquadFilterFloatOptimized/0/16/31          5438 ns         5426 ns       129016
+BM_BiquadFilterFloatOptimized/0/17/31          7729 ns         7708 ns        90812
+BM_BiquadFilterFloatOptimized/0/18/31          8007 ns         7984 ns        87910
+BM_BiquadFilterFloatOptimized/0/19/31          8586 ns         8560 ns        81801
+BM_BiquadFilterFloatOptimized/0/20/31          7811 ns         7789 ns        89871
+BM_BiquadFilterFloatOptimized/0/21/31          8714 ns         8691 ns        80518
+BM_BiquadFilterFloatOptimized/0/22/31          8851 ns         8820 ns        79287
+BM_BiquadFilterFloatOptimized/0/23/31          9606 ns         9576 ns        73108
+BM_BiquadFilterFloatOptimized/0/24/31          8676 ns         8647 ns        80944
+BM_BiquadFilterFloatOptimized/0/1/1             559 ns          557 ns      1255379
+BM_BiquadFilterFloatOptimized/0/1/2             649 ns          647 ns      1081153
+BM_BiquadFilterFloatOptimized/0/1/3             649 ns          647 ns      1081862
+BM_BiquadFilterFloatOptimized/0/1/4             848 ns          846 ns       827513
+BM_BiquadFilterFloatOptimized/0/1/5             847 ns          844 ns       829861
+BM_BiquadFilterFloatOptimized/0/1/6             844 ns          841 ns       829051
+BM_BiquadFilterFloatOptimized/0/1/7             848 ns          845 ns       827492
+BM_BiquadFilterFloatOptimized/0/1/8            2289 ns         2282 ns       307077
+BM_BiquadFilterFloatOptimized/0/1/9            2296 ns         2288 ns       303532
+BM_BiquadFilterFloatOptimized/0/1/10           2305 ns         2299 ns       304972
+BM_BiquadFilterFloatOptimized/0/1/11           2287 ns         2281 ns       307199
+BM_BiquadFilterFloatOptimized/0/1/12           2262 ns         2256 ns       310300
+BM_BiquadFilterFloatOptimized/0/1/13           2263 ns         2257 ns       310400
+BM_BiquadFilterFloatOptimized/0/1/14           2262 ns         2257 ns       310236
+BM_BiquadFilterFloatOptimized/0/1/15           2262 ns         2256 ns       309767
+BM_BiquadFilterFloatOptimized/0/1/16           2262 ns         2256 ns       310116
+BM_BiquadFilterFloatOptimized/0/1/17           2262 ns         2256 ns       310345
+BM_BiquadFilterFloatOptimized/0/1/18           2261 ns         2255 ns       310106
+BM_BiquadFilterFloatOptimized/0/1/19           2263 ns         2256 ns       310242
+BM_BiquadFilterFloatOptimized/0/1/20           2262 ns         2255 ns       310131
+BM_BiquadFilterFloatOptimized/0/1/21           2262 ns         2256 ns       310250
+BM_BiquadFilterFloatOptimized/0/1/22           2264 ns         2257 ns       310329
+BM_BiquadFilterFloatOptimized/0/1/23           2264 ns         2257 ns       310011
+BM_BiquadFilterFloatOptimized/0/1/24           2264 ns         2258 ns       310008
+BM_BiquadFilterFloatOptimized/0/1/25           2263 ns         2256 ns       310203
+BM_BiquadFilterFloatOptimized/0/1/26           2264 ns         2258 ns       310016
+BM_BiquadFilterFloatOptimized/0/1/27           2264 ns         2258 ns       310046
+BM_BiquadFilterFloatOptimized/0/1/28           2263 ns         2257 ns       310325
+BM_BiquadFilterFloatOptimized/0/1/29           2264 ns         2257 ns       310048
+BM_BiquadFilterFloatOptimized/0/1/30           2265 ns         2258 ns       310008
+BM_BiquadFilterFloatOptimized/0/1/31           2263 ns         2257 ns       310046
+BM_BiquadFilterFloatOptimized/0/2/1             613 ns          611 ns      1143690
+BM_BiquadFilterFloatOptimized/0/2/2             801 ns          799 ns       877788
+BM_BiquadFilterFloatOptimized/0/2/3             799 ns          797 ns       875867
+BM_BiquadFilterFloatOptimized/0/2/4            1507 ns         1502 ns       465824
+BM_BiquadFilterFloatOptimized/0/2/5            1505 ns         1501 ns       466685
+BM_BiquadFilterFloatOptimized/0/2/6            1500 ns         1496 ns       468398
+BM_BiquadFilterFloatOptimized/0/2/7            1505 ns         1501 ns       466086
+BM_BiquadFilterFloatOptimized/0/2/8            2237 ns         2231 ns       313969
+BM_BiquadFilterFloatOptimized/0/2/9            2233 ns         2227 ns       314388
+BM_BiquadFilterFloatOptimized/0/2/10           2234 ns         2228 ns       314181
+BM_BiquadFilterFloatOptimized/0/2/11           2236 ns         2230 ns       313480
+BM_BiquadFilterFloatOptimized/0/2/12           2474 ns         2467 ns       287311
+BM_BiquadFilterFloatOptimized/0/2/13           2448 ns         2441 ns       284113
+BM_BiquadFilterFloatOptimized/0/2/14           2512 ns         2504 ns       283932
+BM_BiquadFilterFloatOptimized/0/2/15           2448 ns         2442 ns       285294
+BM_BiquadFilterFloatOptimized/0/2/16           2459 ns         2451 ns       283455
+BM_BiquadFilterFloatOptimized/0/2/17           2466 ns         2459 ns       282706
+BM_BiquadFilterFloatOptimized/0/2/18           2493 ns         2486 ns       279019
+BM_BiquadFilterFloatOptimized/0/2/19           2453 ns         2446 ns       285647
+BM_BiquadFilterFloatOptimized/0/2/20           2475 ns         2468 ns       285345
+BM_BiquadFilterFloatOptimized/0/2/21           2506 ns         2499 ns       286030
+BM_BiquadFilterFloatOptimized/0/2/22           2512 ns         2505 ns       280230
+BM_BiquadFilterFloatOptimized/0/2/23           2466 ns         2458 ns       284371
+BM_BiquadFilterFloatOptimized/0/2/24           2454 ns         2446 ns       286968
+BM_BiquadFilterFloatOptimized/0/2/25           2478 ns         2470 ns       284297
+BM_BiquadFilterFloatOptimized/0/2/26           2509 ns         2501 ns       281390
+BM_BiquadFilterFloatOptimized/0/2/27           2477 ns         2469 ns       282065
+BM_BiquadFilterFloatOptimized/0/2/28           2484 ns         2476 ns       284814
+BM_BiquadFilterFloatOptimized/0/2/29           2443 ns         2435 ns       283155
+BM_BiquadFilterFloatOptimized/0/2/30           2488 ns         2481 ns       283433
+BM_BiquadFilterFloatOptimized/0/2/31           2452 ns         2444 ns       286627
+BM_BiquadFilterFloatOptimized/0/3/1            1015 ns         1012 ns       691448
+BM_BiquadFilterFloatOptimized/0/3/2            1362 ns         1357 ns       516073
+BM_BiquadFilterFloatOptimized/0/3/3            1358 ns         1354 ns       516419
+BM_BiquadFilterFloatOptimized/0/3/4            2305 ns         2298 ns       304561
+BM_BiquadFilterFloatOptimized/0/3/5            2303 ns         2296 ns       304712
+BM_BiquadFilterFloatOptimized/0/3/6            2292 ns         2285 ns       306149
+BM_BiquadFilterFloatOptimized/0/3/7            2304 ns         2297 ns       304959
+BM_BiquadFilterFloatOptimized/0/3/8            2318 ns         2311 ns       301399
+BM_BiquadFilterFloatOptimized/0/3/9            2310 ns         2303 ns       303435
+BM_BiquadFilterFloatOptimized/0/3/10           2313 ns         2306 ns       303399
+BM_BiquadFilterFloatOptimized/0/3/11           2318 ns         2311 ns       302755
+BM_BiquadFilterFloatOptimized/0/3/12           2831 ns         2823 ns       247948
+BM_BiquadFilterFloatOptimized/0/3/13           2830 ns         2822 ns       248677
+BM_BiquadFilterFloatOptimized/0/3/14           2850 ns         2842 ns       246677
+BM_BiquadFilterFloatOptimized/0/3/15           2827 ns         2819 ns       248340
+BM_BiquadFilterFloatOptimized/0/3/16           2834 ns         2825 ns       247686
+BM_BiquadFilterFloatOptimized/0/3/17           2828 ns         2819 ns       247784
+BM_BiquadFilterFloatOptimized/0/3/18           2842 ns         2833 ns       248007
+BM_BiquadFilterFloatOptimized/0/3/19           2843 ns         2834 ns       248472
+BM_BiquadFilterFloatOptimized/0/3/20           2829 ns         2820 ns       248677
+BM_BiquadFilterFloatOptimized/0/3/21           2833 ns         2824 ns       248061
+BM_BiquadFilterFloatOptimized/0/3/22           2849 ns         2841 ns       246611
+BM_BiquadFilterFloatOptimized/0/3/23           2827 ns         2818 ns       247863
+BM_BiquadFilterFloatOptimized/0/3/24           2830 ns         2820 ns       247550
+BM_BiquadFilterFloatOptimized/0/3/25           2836 ns         2827 ns       248435
+BM_BiquadFilterFloatOptimized/0/3/26           2859 ns         2850 ns       245727
+BM_BiquadFilterFloatOptimized/0/3/27           2828 ns         2818 ns       247914
+BM_BiquadFilterFloatOptimized/0/3/28           2818 ns         2809 ns       248637
+BM_BiquadFilterFloatOptimized/0/3/29           2820 ns         2810 ns       249170
+BM_BiquadFilterFloatOptimized/0/3/30           2858 ns         2849 ns       245760
+BM_BiquadFilterFloatOptimized/0/3/31           2832 ns         2823 ns       247969
+BM_BiquadFilterFloatOptimized/0/4/1             650 ns          648 ns      1080613
+BM_BiquadFilterFloatOptimized/0/4/2             808 ns          805 ns       868597
+BM_BiquadFilterFloatOptimized/0/4/3             809 ns          807 ns       867460
+BM_BiquadFilterFloatOptimized/0/4/4             844 ns          841 ns       831865
+BM_BiquadFilterFloatOptimized/0/4/5             845 ns          842 ns       833152
+BM_BiquadFilterFloatOptimized/0/4/6             844 ns          842 ns       831276
+BM_BiquadFilterFloatOptimized/0/4/7             843 ns          841 ns       831231
+BM_BiquadFilterFloatOptimized/0/4/8            2193 ns         2186 ns       320234
+BM_BiquadFilterFloatOptimized/0/4/9            2194 ns         2187 ns       320273
+BM_BiquadFilterFloatOptimized/0/4/10           2193 ns         2185 ns       320448
+BM_BiquadFilterFloatOptimized/0/4/11           2192 ns         2185 ns       320000
+BM_BiquadFilterFloatOptimized/0/4/12           2448 ns         2440 ns       285447
+BM_BiquadFilterFloatOptimized/0/4/13           2454 ns         2447 ns       282923
+BM_BiquadFilterFloatOptimized/0/4/14           2359 ns         2352 ns       297360
+BM_BiquadFilterFloatOptimized/0/4/15           2467 ns         2459 ns       286699
+BM_BiquadFilterFloatOptimized/0/4/16           2442 ns         2435 ns       286231
+BM_BiquadFilterFloatOptimized/0/4/17           2446 ns         2438 ns       287105
+BM_BiquadFilterFloatOptimized/0/4/18           2361 ns         2354 ns       298082
+BM_BiquadFilterFloatOptimized/0/4/19           2452 ns         2444 ns       284776
+BM_BiquadFilterFloatOptimized/0/4/20           2459 ns         2451 ns       284338
+BM_BiquadFilterFloatOptimized/0/4/21           2437 ns         2429 ns       289188
+BM_BiquadFilterFloatOptimized/0/4/22           2358 ns         2350 ns       297415
+BM_BiquadFilterFloatOptimized/0/4/23           2459 ns         2451 ns       287279
+BM_BiquadFilterFloatOptimized/0/4/24           2452 ns         2444 ns       281625
+BM_BiquadFilterFloatOptimized/0/4/25           2389 ns         2382 ns       290665
+BM_BiquadFilterFloatOptimized/0/4/26           2360 ns         2354 ns       298354
+BM_BiquadFilterFloatOptimized/0/4/27           2433 ns         2425 ns       288930
+BM_BiquadFilterFloatOptimized/0/4/28           2447 ns         2440 ns       285827
+BM_BiquadFilterFloatOptimized/0/4/29           2450 ns         2444 ns       282857
+BM_BiquadFilterFloatOptimized/0/4/30           2360 ns         2353 ns       297327
+BM_BiquadFilterFloatOptimized/0/4/31           2463 ns         2455 ns       281644
+BM_BiquadFilterFloatOptimized/1/1/1             559 ns          558 ns      1255286
+BM_BiquadFilterFloatOptimized/1/1/2             649 ns          647 ns      1081171
+BM_BiquadFilterFloatOptimized/1/1/3             649 ns          647 ns      1081604
+BM_BiquadFilterFloatOptimized/1/1/4             848 ns          845 ns       828382
+BM_BiquadFilterFloatOptimized/1/1/5             847 ns          844 ns       827653
+BM_BiquadFilterFloatOptimized/1/1/6             848 ns          845 ns       831001
+BM_BiquadFilterFloatOptimized/1/1/7             848 ns          845 ns       828299
+BM_BiquadFilterFloatOptimized/1/1/8            2291 ns         2284 ns       306513
+BM_BiquadFilterFloatOptimized/1/1/9            2297 ns         2290 ns       305585
+BM_BiquadFilterFloatOptimized/1/1/10           2310 ns         2303 ns       304022
+BM_BiquadFilterFloatOptimized/1/1/11           2286 ns         2279 ns       307592
+BM_BiquadFilterFloatOptimized/1/1/12           2263 ns         2256 ns       310010
+BM_BiquadFilterFloatOptimized/1/1/13           2264 ns         2258 ns       310167
+BM_BiquadFilterFloatOptimized/1/1/14           2264 ns         2257 ns       310392
+BM_BiquadFilterFloatOptimized/1/1/15           2264 ns         2257 ns       310298
+BM_BiquadFilterFloatOptimized/1/1/16           2263 ns         2256 ns       310058
+BM_BiquadFilterFloatOptimized/1/1/17           2264 ns         2257 ns       310262
+BM_BiquadFilterFloatOptimized/1/1/18           2263 ns         2256 ns       310337
+BM_BiquadFilterFloatOptimized/1/1/19           2266 ns         2258 ns       310162
+BM_BiquadFilterFloatOptimized/1/1/20           2264 ns         2257 ns       309968
+BM_BiquadFilterFloatOptimized/1/1/21           2264 ns         2256 ns       310341
+BM_BiquadFilterFloatOptimized/1/1/22           2266 ns         2258 ns       310193
+BM_BiquadFilterFloatOptimized/1/1/23           2263 ns         2257 ns       310310
+BM_BiquadFilterFloatOptimized/1/1/24           2265 ns         2257 ns       310182
+BM_BiquadFilterFloatOptimized/1/1/25           2263 ns         2256 ns       309999
+BM_BiquadFilterFloatOptimized/1/1/26           2265 ns         2258 ns       309943
+BM_BiquadFilterFloatOptimized/1/1/27           2264 ns         2257 ns       310175
+BM_BiquadFilterFloatOptimized/1/1/28           2265 ns         2258 ns       310119
+BM_BiquadFilterFloatOptimized/1/1/29           2265 ns         2258 ns       310231
+BM_BiquadFilterFloatOptimized/1/1/30           2266 ns         2258 ns       309976
+BM_BiquadFilterFloatOptimized/1/1/31           2266 ns         2258 ns       310062
+BM_BiquadFilterFloatOptimized/1/2/1             613 ns          611 ns      1144258
+BM_BiquadFilterFloatOptimized/1/2/2             801 ns          798 ns       877198
+BM_BiquadFilterFloatOptimized/1/2/3             800 ns          798 ns       877731
+BM_BiquadFilterFloatOptimized/1/2/4            1508 ns         1503 ns       465854
+BM_BiquadFilterFloatOptimized/1/2/5            1505 ns         1500 ns       466533
+BM_BiquadFilterFloatOptimized/1/2/6            1502 ns         1497 ns       468915
+BM_BiquadFilterFloatOptimized/1/2/7            1505 ns         1501 ns       466371
+BM_BiquadFilterFloatOptimized/1/2/8            2238 ns         2231 ns       313808
+BM_BiquadFilterFloatOptimized/1/2/9            2236 ns         2228 ns       313914
+BM_BiquadFilterFloatOptimized/1/2/10           2235 ns         2228 ns       314155
+BM_BiquadFilterFloatOptimized/1/2/11           2237 ns         2231 ns       313280
+BM_BiquadFilterFloatOptimized/1/2/12           2482 ns         2474 ns       287057
+BM_BiquadFilterFloatOptimized/1/2/13           2491 ns         2483 ns       280960
+BM_BiquadFilterFloatOptimized/1/2/14           2520 ns         2513 ns       283093
+BM_BiquadFilterFloatOptimized/1/2/15           2452 ns         2445 ns       285828
+BM_BiquadFilterFloatOptimized/1/2/16           2503 ns         2495 ns       286339
+BM_BiquadFilterFloatOptimized/1/2/17           2499 ns         2491 ns       285757
+BM_BiquadFilterFloatOptimized/1/2/18           2509 ns         2502 ns       281775
+BM_BiquadFilterFloatOptimized/1/2/19           2491 ns         2483 ns       281628
+BM_BiquadFilterFloatOptimized/1/2/20           2466 ns         2458 ns       289553
+BM_BiquadFilterFloatOptimized/1/2/21           2456 ns         2447 ns       284222
+BM_BiquadFilterFloatOptimized/1/2/22           2502 ns         2494 ns       277208
+BM_BiquadFilterFloatOptimized/1/2/23           2458 ns         2450 ns       281951
+BM_BiquadFilterFloatOptimized/1/2/24           2474 ns         2466 ns       290638
+BM_BiquadFilterFloatOptimized/1/2/25           2473 ns         2465 ns       285659
+BM_BiquadFilterFloatOptimized/1/2/26           2504 ns         2497 ns       278294
+BM_BiquadFilterFloatOptimized/1/2/27           2467 ns         2459 ns       279088
+BM_BiquadFilterFloatOptimized/1/2/28           2487 ns         2479 ns       281668
+BM_BiquadFilterFloatOptimized/1/2/29           2469 ns         2462 ns       274237
+BM_BiquadFilterFloatOptimized/1/2/30           2502 ns         2495 ns       282294
+BM_BiquadFilterFloatOptimized/1/2/31           2438 ns         2430 ns       285546
+BM_BiquadFilterFloatOptimized/1/3/1            1016 ns         1012 ns       691507
+BM_BiquadFilterFloatOptimized/1/3/2            1361 ns         1357 ns       514162
+BM_BiquadFilterFloatOptimized/1/3/3            1359 ns         1355 ns       516211
+BM_BiquadFilterFloatOptimized/1/3/4            2304 ns         2297 ns       304380
+BM_BiquadFilterFloatOptimized/1/3/5            2305 ns         2297 ns       304978
+BM_BiquadFilterFloatOptimized/1/3/6            2291 ns         2283 ns       306726
+BM_BiquadFilterFloatOptimized/1/3/7            2305 ns         2297 ns       304572
+BM_BiquadFilterFloatOptimized/1/3/8            2332 ns         2325 ns       303627
+BM_BiquadFilterFloatOptimized/1/3/9            2313 ns         2305 ns       303341
+BM_BiquadFilterFloatOptimized/1/3/10           2317 ns         2310 ns       303791
+BM_BiquadFilterFloatOptimized/1/3/11           2325 ns         2317 ns       302256
+BM_BiquadFilterFloatOptimized/1/3/12           2828 ns         2819 ns       248120
+BM_BiquadFilterFloatOptimized/1/3/13           2831 ns         2821 ns       248747
+BM_BiquadFilterFloatOptimized/1/3/14           2849 ns         2840 ns       247287
+BM_BiquadFilterFloatOptimized/1/3/15           2829 ns         2821 ns       247794
+BM_BiquadFilterFloatOptimized/1/3/16           2830 ns         2821 ns       248154
+BM_BiquadFilterFloatOptimized/1/3/17           2833 ns         2823 ns       249270
+BM_BiquadFilterFloatOptimized/1/3/18           2853 ns         2843 ns       247248
+BM_BiquadFilterFloatOptimized/1/3/19           2839 ns         2830 ns       248141
+BM_BiquadFilterFloatOptimized/1/3/20           2830 ns         2820 ns       248693
+BM_BiquadFilterFloatOptimized/1/3/21           2832 ns         2823 ns       247991
+BM_BiquadFilterFloatOptimized/1/3/22           2850 ns         2841 ns       246602
+BM_BiquadFilterFloatOptimized/1/3/23           2828 ns         2819 ns       248859
+BM_BiquadFilterFloatOptimized/1/3/24           2835 ns         2826 ns       247837
+BM_BiquadFilterFloatOptimized/1/3/25           2837 ns         2828 ns       246757
+BM_BiquadFilterFloatOptimized/1/3/26           2860 ns         2851 ns       245739
+BM_BiquadFilterFloatOptimized/1/3/27           2830 ns         2821 ns       248257
+BM_BiquadFilterFloatOptimized/1/3/28           2826 ns         2817 ns       249220
+BM_BiquadFilterFloatOptimized/1/3/29           2820 ns         2811 ns       248787
+BM_BiquadFilterFloatOptimized/1/3/30           2853 ns         2844 ns       245666
+BM_BiquadFilterFloatOptimized/1/3/31           2829 ns         2821 ns       248423
+BM_BiquadFilterFloatOptimized/1/4/1             650 ns          648 ns      1080678
+BM_BiquadFilterFloatOptimized/1/4/2             808 ns          806 ns       868623
+BM_BiquadFilterFloatOptimized/1/4/3             810 ns          807 ns       867092
+BM_BiquadFilterFloatOptimized/1/4/4             844 ns          842 ns       831679
+BM_BiquadFilterFloatOptimized/1/4/5             843 ns          841 ns       830964
+BM_BiquadFilterFloatOptimized/1/4/6             845 ns          842 ns       831367
+BM_BiquadFilterFloatOptimized/1/4/7             845 ns          842 ns       830992
+BM_BiquadFilterFloatOptimized/1/4/8            2193 ns         2186 ns       320186
+BM_BiquadFilterFloatOptimized/1/4/9            2194 ns         2187 ns       320621
+BM_BiquadFilterFloatOptimized/1/4/10           2194 ns         2187 ns       320577
+BM_BiquadFilterFloatOptimized/1/4/11           2192 ns         2184 ns       320221
+BM_BiquadFilterFloatOptimized/1/4/12           2471 ns         2462 ns       286817
+BM_BiquadFilterFloatOptimized/1/4/13           2450 ns         2442 ns       283590
+BM_BiquadFilterFloatOptimized/1/4/14           2359 ns         2351 ns       297716
+BM_BiquadFilterFloatOptimized/1/4/15           2472 ns         2464 ns       286187
+BM_BiquadFilterFloatOptimized/1/4/16           2450 ns         2442 ns       291296
+BM_BiquadFilterFloatOptimized/1/4/17           2460 ns         2453 ns       287031
+BM_BiquadFilterFloatOptimized/1/4/18           2358 ns         2350 ns       297376
+BM_BiquadFilterFloatOptimized/1/4/19           2469 ns         2462 ns       286443
+BM_BiquadFilterFloatOptimized/1/4/20           2470 ns         2462 ns       286582
+BM_BiquadFilterFloatOptimized/1/4/21           2420 ns         2412 ns       288366
+BM_BiquadFilterFloatOptimized/1/4/22           2359 ns         2351 ns       297356
+BM_BiquadFilterFloatOptimized/1/4/23           2448 ns         2440 ns       286534
+BM_BiquadFilterFloatOptimized/1/4/24           2458 ns         2450 ns       286994
+BM_BiquadFilterFloatOptimized/1/4/25           2420 ns         2412 ns       287406
+BM_BiquadFilterFloatOptimized/1/4/26           2363 ns         2354 ns       298029
+BM_BiquadFilterFloatOptimized/1/4/27           2421 ns         2414 ns       288812
+BM_BiquadFilterFloatOptimized/1/4/28           2449 ns         2442 ns       286253
+BM_BiquadFilterFloatOptimized/1/4/29           2448 ns         2441 ns       287234
+BM_BiquadFilterFloatOptimized/1/4/30           2361 ns         2354 ns       297402
+BM_BiquadFilterFloatOptimized/1/4/31           2465 ns         2457 ns       287662
+BM_BiquadFilterFloatNonOptimized/0/1/31        2261 ns         2257 ns       310189
+BM_BiquadFilterFloatNonOptimized/0/2/31        4525 ns         4510 ns       155178
+BM_BiquadFilterFloatNonOptimized/0/3/31        6781 ns         6760 ns       103524
+BM_BiquadFilterFloatNonOptimized/0/4/31        9037 ns         9008 ns        77684
+BM_BiquadFilterFloatNonOptimized/0/5/31       11298 ns        11259 ns        62169
+BM_BiquadFilterFloatNonOptimized/0/6/31       13575 ns        13534 ns        51708
+BM_BiquadFilterFloatNonOptimized/0/7/31       15841 ns        15796 ns        44308
+BM_BiquadFilterFloatNonOptimized/0/8/31       18105 ns        18047 ns        38797
+BM_BiquadFilterFloatNonOptimized/0/9/31       20335 ns        20270 ns        34532
+BM_BiquadFilterFloatNonOptimized/0/10/31      22603 ns        22534 ns        31063
+BM_BiquadFilterFloatNonOptimized/0/11/31      24875 ns        24798 ns        28225
+BM_BiquadFilterFloatNonOptimized/0/12/31      27122 ns        27038 ns        25889
+BM_BiquadFilterFloatNonOptimized/0/13/31      29416 ns        29326 ns        23869
+BM_BiquadFilterFloatNonOptimized/0/14/31      31703 ns        31602 ns        22151
+BM_BiquadFilterFloatNonOptimized/0/15/31      33945 ns        33849 ns        20678
+BM_BiquadFilterFloatNonOptimized/0/16/31      36232 ns        36123 ns        19379
+BM_BiquadFilterFloatNonOptimized/0/17/31      38577 ns        38464 ns        18205
+BM_BiquadFilterFloatNonOptimized/0/18/31      41054 ns        40924 ns        17106
+BM_BiquadFilterFloatNonOptimized/0/19/31      43278 ns        43133 ns        16228
+BM_BiquadFilterFloatNonOptimized/0/20/31      45549 ns        45414 ns        15411
+BM_BiquadFilterFloatNonOptimized/0/21/31      48024 ns        47867 ns        14625
+BM_BiquadFilterFloatNonOptimized/0/22/31      50268 ns        50109 ns        13965
+BM_BiquadFilterFloatNonOptimized/0/23/31      52268 ns        52090 ns        13440
+BM_BiquadFilterFloatNonOptimized/0/24/31      54528 ns        54350 ns        12883
+BM_BiquadFilterDoubleOptimized/0/1/31          2264 ns         2258 ns       309965
+BM_BiquadFilterDoubleOptimized/0/2/31          2503 ns         2495 ns       270274
+BM_BiquadFilterDoubleOptimized/0/3/31          2787 ns         2778 ns       251685
+BM_BiquadFilterDoubleOptimized/0/4/31          3175 ns         3165 ns       221339
+BM_BiquadFilterDoubleNonOptimized/0/1/31       2264 ns         2257 ns       310154
+BM_BiquadFilterDoubleNonOptimized/0/2/31       4523 ns         4508 ns       155292
+BM_BiquadFilterDoubleNonOptimized/0/3/31       6778 ns         6756 ns       103599
+BM_BiquadFilterDoubleNonOptimized/0/4/31       9063 ns         9033 ns        77461
+
  *******************************************************************/
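For readers decoding the benchmark names above, here is a hypothetical helper (not part of the patch, illustration only) that maps an occupancy bitmask to the coefficients it marks as non-zero, with bit 0 = b0 through bit 4 = a2:

// Hypothetical helper: print which Biquad coefficients an occupancy
// bitmask marks as non-zero.
#include <cstdio>

static void printOccupancy(unsigned occupancy) {
    static const char* const kNames[] = {"b0", "b1", "b2", "a1", "a2"};
    std::printf("occupancy %u:", occupancy);
    for (int i = 0; i < 5; ++i) {
        if (occupancy >> i & 1) std::printf(" %s", kNames[i]);
    }
    std::printf("\n");
}

// printOccupancy(27) prints "occupancy 27: b0 b1 a1 a2" (double pole + single zero).
// printOccupancy(31) prints "occupancy 31: b0 b1 b2 a1 a2" (full Biquad).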
 
 template <typename F>
@@ -299,10 +490,11 @@
     }
 }
 
+BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterQuickArgs);
+BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterFullArgs);
+// Other tests of interest
+BENCHMARK(BM_BiquadFilterFloatNonOptimized)->Apply(BiquadFilterQuickArgs);
 BENCHMARK(BM_BiquadFilterDoubleOptimized)->Apply(BiquadFilterDoubleArgs);
 BENCHMARK(BM_BiquadFilterDoubleNonOptimized)->Apply(BiquadFilterDoubleArgs);
-BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterQuickArgs);
-BENCHMARK(BM_BiquadFilterFloatNonOptimized)->Apply(BiquadFilterQuickArgs);
-BENCHMARK(BM_BiquadFilterFloatOptimized)->Apply(BiquadFilterFullArgs);
 
 BENCHMARK_MAIN();
diff --git a/audio_utils/include/audio_utils/BiquadFilter.h b/audio_utils/include/audio_utils/BiquadFilter.h
index c2f481b..b3683e3 100644
--- a/audio_utils/include/audio_utils/BiquadFilter.h
+++ b/audio_utils/include/audio_utils/BiquadFilter.h
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#ifndef ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
-#define ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
+#pragma once
 
 #include "intrinsic_utils.h"
 
@@ -36,12 +35,332 @@
 #define USE_NEON
 #endif
 
+// Use dither to prevent subnormals for CPUs that raise an exception.
+#pragma push_macro("USE_DITHER")
+#undef USE_DITHER
+
+#if defined(__i386__) || defined(__x86_64__)
+#define USE_DITHER
+#endif
+
 namespace android::audio_utils {
 
 static constexpr size_t kBiquadNumCoefs  = 5;
 static constexpr size_t kBiquadNumDelays = 2;
 
+/**
+ * The BiquadDirect2Transpose is a low overhead
+ * Biquad filter with coefficients b0, b1, b2, a1, a2.
+ *
+ * This can be used by itself, but for best data management it is preferable
+ * to use the BiquadFilter abstraction below.  (A standalone usage sketch
+ * follows this struct.)
+ *
+ * T is the data type (scalar or vector).
+ * F is the filter coefficient type.  It is either a scalar or vector (matching T).
+ */
+template <typename T, typename F>
+struct BiquadDirect2Transpose {
+    F coef_[5]; // these are stored with the denominator a's negated.
+    T s1_; // delay state 1
+    T s2_; // delay state 2
+
+    // These are the coefficient occupancies we optimize for (over b0, b1, b2, a1, a2),
+    // expressed as a bitmask.
+    static inline constexpr size_t required_occupancies_[] = {
+        0x1,  // constant scale
+        0x3,  // single zero
+        0x7,  // double zero
+        0x9,  // single pole
+        0xb,  // (11) first order IIR
+        0x1b, // (27) double pole + single zero
+        0x1f, // (31) second order IIR (full Biquad)
+    };
+
+    // Note the argument order: the b coefficients come first, then the a coefficients.
+    // The a's use a "positive" sign convention; some filter implementations take them negated.
+    BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
+            const T& s1 = {}, const T& s2 = {})
+        // : coef_{b0, b1, b2, -a1, -a2}
+        : coef_{ b0,
+            b1,
+            b2,
+            intrinsics::vneg(a1),
+            intrinsics::vneg(a2) }
+        , s1_{s1}
+        , s2_{s2} {
+    }
+
+    // D is the data type.  It must be the same as the element type of T and F.
+    // Note the argument order: output comes before input.
+    template<typename D, size_t OCCUPANCY = 0x1f>
+    __attribute__((always_inline)) // required for 1ch speedup (30% faster)
+    void process(D* output, const D* input, size_t frames, size_t stride) {
+        using namespace intrinsics;
+        // For SSE it is possible to vdup F to T if F is scalar.
+        const F b0 = coef_[0];         // b0
+        const F b1 = coef_[1];         // b1
+        const F b2 = coef_[2];         // b2
+        const F negativeA1 = coef_[3]; // -a1
+        const F negativeA2 = coef_[4]; // -a2
+        T s1 = s1_;
+        T s2 = s2_;
+        T xn, yn; // OK to declare temps outside loop rather than at the point of initialization.
+#ifdef USE_DITHER
+        constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
+        T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
+#endif
+
+        // Unroll control.  Make sure the constexpr remains constexpr :-).
+        constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
+        constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2;   // below this won't be unrolled.
+        constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
+        constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
+                CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
+        size_t remainder = 0;
+        if constexpr (UNROLL_LOOPS > 1) {
+            remainder = frames % UNROLL_LOOPS;
+            frames /= UNROLL_LOOPS;
+        }
+
+        // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
+        // The other alternative is to use a MACRO, but that doesn't read as well.
+        const auto KERNEL = [&]() __attribute__((always_inline)) {
+            xn = vld1<T>(input);
+            input += stride;
+#ifdef USE_DITHER
+            xn = vadd(xn, dither);
+            dither = vneg(dither);
+#endif
+
+            yn = s1;
+            if constexpr (OCCUPANCY >> 0 & 1) {
+                yn = vmla(yn, b0, xn);
+            }
+            vst1(output, yn);
+            output += stride;
+
+            s1 = s2;
+            if constexpr (OCCUPANCY >> 3 & 1) {
+                s1 = vmla(s1, negativeA1, yn);
+            }
+            if constexpr (OCCUPANCY >> 1 & 1) {
+                s1 = vmla(s1, b1, xn);
+            }
+            if constexpr (OCCUPANCY >> 2 & 1) {
+                s2 = vmul(b2, xn);
+            } else {
+                s2 = vdupn<T>(0.f);
+            }
+            if constexpr (OCCUPANCY >> 4 & 1) {
+                s2 = vmla(s2, negativeA2, yn);
+            }
+        };
+
+        while (frames > 0) {
+            #pragma unroll
+            for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
+                KERNEL();
+            }
+            frames--;
+        }
+        if constexpr (UNROLL_LOOPS > 1) {
+            for (size_t i = 0; i < remainder; ++i) {
+                KERNEL();
+            }
+        }
+        s1_ = s1;
+        s2_ = s2;
+    }
+};
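As a rough illustration of the standalone use mentioned in the comment above, the following hypothetical snippet (not part of the patch) filters a contiguous mono float buffer with scalar types (T = F = D = float) and made-up coefficient values; it assumes the intrinsics helpers accept plain float scalars, as the struct comments suggest:

// Minimal sketch, assuming scalar float data and illustrative coefficients.
#include <audio_utils/BiquadFilter.h>
#include <cstddef>

void filterMono(float* out, const float* in, size_t frames) {
    using android::audio_utils::BiquadDirect2Transpose;
    // b0, b1, b2, a1, a2 -- the a's are passed positive; the struct negates them.
    BiquadDirect2Transpose<float, float> filter(
            0.9460f, -1.8919f, 0.9460f, -1.8890f, 0.8949f);
    // Mono, contiguous data: stride 1.  Default OCCUPANCY 0x1f uses all coefficients.
    filter.process<float>(out, in, frames, 1 /* stride */);
}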
+
+/**
+ * A state space formulation of filtering converts an n-th order difference equation update
+ * to a first order vector difference equation. For the Biquad filter, the state space form
+ * has improved numerical precision properties with poles near the unit circle as well as
+ * increased speed due to better parallelization of the state update [1][2].
+ *
+ * [1] Raph Levien: (observable canonical form)
+ * https://github.com/google/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
+ *
+ * [2] Julius O Smith III: (controllable canonical form)
+ * https://ccrma.stanford.edu/~jos/filters/State_Space_Filters.html
+ *
+ * The signal flow is as follows, where for scalar x and y, the matrix D is a scalar d.
+ *
+ *
+ *        +------[ d ]--------------------------+
+ *        |                         S           |
+ *  x ----+--[ B ]--(+)--[ z^-1 ]---+---[ C ]--(+)--- y
+ *                   |              |
+ *                   +----[ A ]-----+
+ *
+ * The 2nd order Biquad IIR coefficients are as follows in observable canonical form:
+ *
+ * y = [C_1 C_2] | S_1 | + d x
+ *               | S_2 |
+ *
+ *
+ * | S_1 | = | A_11 A_12 | | S_1 | + | B_1 | x
+ * | S_2 |   | A_21 A_22 | | S_2 |   | B_2 |
+ *
+ *
+ * A_11 = -a1
+ * A_12 = 1
+ * A_21 = -a2
+ * A_22 = 0
+ *
+ * B_1 = b1 - b0 * a1
+ * B_2 = b2 - b0 * a2
+ *
+ * C_1 = 1
+ * C_2 = 0
+ *
+ * d = b0
+ *
+ * Implementation details: The state space filter is typically expressed in either observable or
+ * controllable canonical form [3].  We follow the observable form here.
+ * Raph [4] discovered that the single channel Biquad implementation can be further optimized
+ * by doing 2 filter updates at once (improving speed on NEON by about 20%).
+ * Doing 2 updates at once costs 8 multiplies / sample instead of 5 multiplies / sample,
+ * but uses a 4 x 4 matrix multiply, exploiting single cycle multiply-add SIMD throughput.
+ * TODO: consider this optimization.
+ * (A numerical check of the state space form against the direct form follows this struct.)
+ *
+ * [3] Mathworks
+ * https://www.mathworks.com/help/control/ug/canonical-state-space-realizations.html
+ * [4] Raph Levien
+ * https://github.com/kysko/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
+ *
+ * The template variables
+ * T is the data type (scalar or vector).
+ * F is the filter coefficient type.  It is either a scalar or vector (matching T).
+ */
+template <typename T, typename F>
+struct BiquadStateSpace {
+    F coef_[5]; // these are stored converted to state space form.
+    T s1_; // delay state 1
+    T s2_; // delay state 2
+
+    // These are the coefficient occupancies we optimize for (over b0, b1, b2, a1, a2),
+    // expressed as a bitmask.  This must include 31.
+    static inline constexpr size_t required_occupancies_[] = {
+        1,  // constant scale
+        3,  // single zero
+        7,  // double zero
+        9,  // single pole
+        11, // first order IIR
+        27, // double pole + single zero
+        31, // second order IIR (full Biquad)
+    };
+
+    // Note the argument order: the b coefficients come first, then the a coefficients.
+    // The a's use a "positive" sign convention; some filter implementations take them negated.
+    BiquadStateSpace(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
+            const T& s1 = {}, const T& s2 = {})
+        // : coef_{b0, b1 - b0 * a1, b2 - b0 * a2, -a1, -a2}
+        : coef_{ b0,
+            intrinsics::vsub(b1, intrinsics::vmul(b0, a1)),
+            intrinsics::vsub(b2, intrinsics::vmul(b0, a2)),
+            intrinsics::vneg(a1),
+            intrinsics::vneg(a2) }
+        , s1_{s1}
+        , s2_{s2} {
+    }
+
+    // D is the data type.  It must be the same as the element type of T and F.
+    // Note the argument order: output comes before input.
+    template<typename D, size_t OCCUPANCY = 0x1f>
+    void process(D* output, const D* input, size_t frames, size_t stride) {
+        using namespace intrinsics;
+        const F b0 = coef_[0];         // b0
+        const F b1ss = coef_[1];       // b1 - b0 * a1,
+        const F b2ss = coef_[2];       // b2 - b0 * a2,
+        const F negativeA1 = coef_[3]; // -a1
+        const F negativeA2 = coef_[4]; // -a2
+        T s1 = s1_;
+        T s2 = s2_;
+        T x, new_s1; // OK to declare temps here rather than at the point of initialization.
+#ifdef USE_DITHER
+        constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
+        T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
+#endif
+        constexpr bool b0_present = (OCCUPANCY & 0x1) != 0;
+        constexpr bool a1_present = (OCCUPANCY & 0x8) != 0;
+        constexpr bool a2_present = (OCCUPANCY & 0x10) != 0;
+        constexpr bool b1ss_present = (OCCUPANCY & 0x2) != 0 ||
+                (b0_present && a1_present);
+        constexpr bool b2ss_present = (OCCUPANCY & 0x4) != 0 ||
+                (b0_present && a2_present);
+
+
+        // Unroll control.  Make sure the constexpr remains constexpr :-).
+        constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
+        constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 1;   // below this won't be unrolled.
+        constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
+        constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
+                CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
+        size_t remainder = 0;
+        if constexpr (UNROLL_LOOPS > 1) {
+            remainder = frames % UNROLL_LOOPS;
+            frames /= UNROLL_LOOPS;
+        }
+
+        // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
+        // The other alternative is to use a MACRO, but that doesn't read as well.
+        const auto KERNEL = [&]() __attribute__((always_inline)) {
+            x = vld1<T>(input);
+            input += stride;
+#ifdef USE_DITHER
+            x = vadd(x, dither);
+            dither = vneg(dither);
+#endif
+            // vst1(output, vadd(s1, vmul(b0, x)));
+            // output += stride;
+            // new_s1 = vadd(vadd(vmul(b1ss, x), vmul(negativeA1, s1)), s2);
+            // s2 = vadd(vmul(b2ss, x), vmul(negativeA2, s1));
+
+            if constexpr (b0_present) {
+                vst1(output, vadd(s1, vmul(b0, x)));
+            } else /* constexpr */ {
+                vst1(output, s1);
+            }
+            output += stride;
+            new_s1 = s2;
+            if constexpr (b1ss_present) {
+                new_s1 = vadd(new_s1, vmul(b1ss, x));
+            }
+            if constexpr (a1_present) {
+                new_s1 = vadd(new_s1, vmul(negativeA1, s1));
+            }
+            if constexpr (b2ss_present) {
+                s2 = vmul(b2ss, x);
+                if constexpr (a2_present) {
+                    s2 = vadd(s2, vmul(negativeA2, s1));
+                }
+            } else if constexpr (a2_present) {
+                s2 = vmul(negativeA2, s1);
+            }
+            s1 = new_s1;
+        };
+
+        while (frames > 0) {
+            #pragma unroll
+            for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
+                KERNEL();
+            }
+            frames--;
+        }
+        if constexpr (UNROLL_LOOPS > 1) {
+            for (size_t i = 0; i < remainder; ++i) {
+                KERNEL();
+            }
+        }
+        s1_ = s1;
+        s2_ = s2;
+    }
+};
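The comment above claims the state space form is equivalent to the direct form. The following hypothetical standalone check (not part of the patch, arbitrary illustrative coefficients) runs both recurrences side by side and asserts that their outputs agree:

// Sanity-check sketch: direct form 2 transposed vs. observable canonical
// state space form, both written out explicitly as in the structs above.
#include <cassert>
#include <cmath>
#include <vector>

int main() {
    const float b0 = 0.3f, b1 = 0.2f, b2 = 0.1f, a1 = -0.5f, a2 = 0.25f;
    const std::vector<float> x{1.f, 0.5f, -0.25f, 0.75f, 0.f, -1.f};

    // Direct form 2 transposed reference (as in BiquadDirect2Transpose).
    float d1 = 0.f, d2 = 0.f;
    std::vector<float> yRef;
    for (float xn : x) {
        const float yn = b0 * xn + d1;
        d1 = b1 * xn - a1 * yn + d2;
        d2 = b2 * xn - a2 * yn;
        yRef.push_back(yn);
    }

    // Observable canonical state space form (as in BiquadStateSpace).
    float s1 = 0.f, s2 = 0.f;
    for (size_t n = 0; n < x.size(); ++n) {
        const float yn = s1 + b0 * x[n];
        const float newS1 = (b1 - b0 * a1) * x[n] - a1 * s1 + s2;
        s2 = (b2 - b0 * a2) * x[n] - a2 * s1;
        s1 = newS1;
        assert(std::fabs(yn - yRef[n]) < 1e-6f);
    }
    return 0;
}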
+
 namespace details {
+
 // Helper methods for constructing a constexpr array of function pointers.
 // As function pointers are efficient and have no constructor/destructor
 // this is preferred over std::function.
@@ -105,67 +424,6 @@
     }
 }
 
-// For biquad_filter_fast, we template based on whether coef[i] is nonzero - this should be
-// determined in a constexpr fashion for optimization.
-
-// Helper which takes a stride to allow column processing of interleaved audio streams.
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_1fast(D *out, const D *in, size_t frames, size_t stride,
-                         size_t channelCount, D *delays, const D *coefs, size_t localStride) {
-#if defined(__i386__) || defined(__x86_x64__)
-    D delta = std::numeric_limits<float>::min() * (1 << 24);
-#endif
-    D b0, b1, b2, negativeA1, negativeA2;
-
-    if constexpr (SAME_COEF_PER_CHANNEL) {
-        b0 = coefs[0];
-        b1 = coefs[1];
-        b2 = coefs[2];
-        negativeA1 = -coefs[3];
-        negativeA2 = -coefs[4];
-    }
-    for (size_t i = 0; i < channelCount; ++i) {
-        if constexpr (!SAME_COEF_PER_CHANNEL) {
-            b0 = coefs[0];
-            b1 = coefs[localStride];
-            b2 = coefs[2 * localStride];
-            negativeA1 = -coefs[3 * localStride];
-            negativeA2 = -coefs[4 * localStride];
-            ++coefs;
-        }
-
-        D s1n1 = delays[0];
-        D s2n1 = delays[localStride];
-        const D *input = &in[i];
-        D *output = &out[i];
-        for (size_t j = frames; j > 0; --j) {
-            // Adding a delta to avoid subnormal exception handling on the x86/x64 platform;
-            // this is not a problem with the ARM platform. The delta will not affect the
-            // precision of the result.
-#if defined(__i386__) || defined(__x86_x64__)
-            const D xn = *input + delta;
-#else
-            const D xn = *input;
-#endif
-            D yn = (OCCUPANCY >> 0 & 1) * b0 * xn + s1n1;
-            s1n1 = (OCCUPANCY >> 1 & 1) * b1 * xn + (OCCUPANCY >> 3 & 1) * negativeA1 * yn + s2n1;
-            s2n1 = (OCCUPANCY >> 2 & 1) * b2 * xn + (OCCUPANCY >> 4 & 1) * negativeA2 * yn;
-
-            input += stride;
-
-            *output = yn;
-            output += stride;
-
-#if defined(__i386__) || defined(__x86_x64__)
-            delta = -delta;
-#endif
-        }
-        delays[0] = s1n1;
-        delays[localStride] = s2n1;
-        ++delays;
-    }
-}
-
 // Helper function to zero channels in the input buffer.
 // This is used for the degenerate coefficient case which results in all zeroes.
 template <typename D>
@@ -180,90 +438,69 @@
     }
 }
 
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_fast(D *out, const D *in, size_t frames, size_t stride,
-        size_t channelCount, D *delays, const D *coefs, size_t localStride) {
-    if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
-        zeroChannels(out, frames, stride, channelCount);
-        return;
-    }
-    biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>(
-            out, in, frames, stride, channelCount, delays, coefs, localStride);
-}
-
-#ifdef USE_NEON
-
-template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
-void biquad_filter_neon_impl(F *out, const F *in, size_t frames, size_t stride,
+template <template <typename, typename> typename FilterType,
+        size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
+void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride,
         size_t channelCount, F *delays, const F *coefs, size_t localStride) {
     using namespace android::audio_utils::intrinsics;
 
     constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
-    T b0, b1, b2, negativeA1, negativeA2;
-    if constexpr (SAME_COEF_PER_CHANNEL) {
-        b0 = vdupn<T>(coefs[0]);
-        b1 = vdupn<T>(coefs[1]);
-        b2 = vdupn<T>(coefs[2]);
-        negativeA1 = vneg(vdupn<T>(coefs[3]));
-        negativeA2 = vneg(vdupn<T>(coefs[4]));
-    }
+    const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride;
+    using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>;
+
     for (size_t i = 0; i < channelCount; i += elements) {
-        if constexpr (!SAME_COEF_PER_CHANNEL) {
-            b0 = vld1<T>(coefs);
-            b1 = vld1<T>(coefs + localStride);
-            b2 = vld1<T>(coefs + localStride * 2);
-            negativeA1 = vneg(vld1<T>(coefs + localStride * 3));
-            negativeA2 = vneg(vld1<T>(coefs + localStride * 4));
-            coefs += elements;
-        }
         T s1 = vld1<T>(&delays[0]);
         T s2 = vld1<T>(&delays[localStride]);
-        const F *input = &in[i];
-        F *output = &out[i];
-        for (size_t j = frames; j > 0; --j) {
-            T xn = vld1<T>(input);
-            T yn = s1;
 
-            if constexpr (OCCUPANCY >> 0 & 1) {
-                yn = vmla(yn, b0, xn);
-            }
-            s1 = s2;
-            if constexpr (OCCUPANCY >> 3 & 1) {
-                s1 = vmla(s1, negativeA1, yn);
-            }
-            if constexpr (OCCUPANCY >> 1 & 1) {
-                s1 = vmla(s1, b1, xn);
-            }
-            if constexpr (OCCUPANCY >> 2 & 1) {
-                s2 = vmul(b2, xn);
-            } else {
-                s2 = vdupn<T>(0.f);
-            }
-            if constexpr (OCCUPANCY >> 4 & 1) {
-                s2 = vmla(s2, negativeA2, yn);
-            }
-
-            input += stride;
-            vst1(output, yn);
-            output += stride;
-        }
-        vst1(&delays[0], s1);
-        vst1(&delays[localStride], s2);
+        FilterType<T, CoefType> kernel(
+                vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride),
+                vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3),
+                vld1<CoefType>(coefs + coefStride * 4),
+                s1, s2);
+        if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements;
+        kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride);
+        vst1(&delays[0], kernel.s1_);
+        vst1(&delays[localStride], kernel.s2_);
         delays += elements;
     }
 }
 
-#define BIQUAD_FILTER_CASE(N, ... /* type */) \
+// Find the nearest occupancy mask that includes all the desired bits.
+template <typename T, size_t N>
+static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
+    if (occupancy < 32) {
+        for (auto test : occupancies) {
+            if ((occupancy & test) == occupancy) return test;
+        }
+    }
+    return 31;
+}
+
+enum FILTER_OPTION {
+    FILTER_OPTION_SCALAR_ONLY = (1 << 0),
+};
+
+// Default biquad type.
+template <typename T, typename F>
+using BiquadFilterType = BiquadStateSpace<T, F>;
+
+#define BIQUAD_FILTER_CASE(N, FilterType, ... /* type */) \
             case N: { \
-                biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, __VA_ARGS__>( \
+                using VectorType = __VA_ARGS__; \
+                biquad_filter_func_impl< \
+                        FilterType, \
+                        nearestOccupancy(OCCUPANCY, \
+                                FilterType<VectorType, D>::required_occupancies_), \
+                        SAME_COEF_PER_CHANNEL, VectorType>( \
                         out + offset, in + offset, frames, stride, remaining, \
                         delays + offset, c, localStride); \
                 goto exit; \
             }
 
 template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
-void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride,
-        size_t channelCount, D *delays, const D *coefs, size_t localStride) {
+void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride,
+        size_t channelCount, D *delays, const D *coefs, size_t localStride,
+        FILTER_OPTION filterOptions) {
     if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
         zeroChannels(out, frames, stride, channelCount);
         return;
@@ -274,41 +511,53 @@
     // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
     // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
 
+#ifdef USE_NEON
+    // use NEON types to ensure we have the proper intrinsic acceleration.
+    using alt_16_t = float32x4x4_t;
+    using alt_8_t = float32x4x2_t;
+    using alt_4_t = float32x4_t;
+#else
+    // Use C++ types, no NEON needed.
+    using alt_16_t = intrinsics::internal_array_t<float, 16>;
+    using alt_8_t = intrinsics::internal_array_t<float, 8>;
+    using alt_4_t = intrinsics::internal_array_t<float, 4>;
+#endif
+
     for (size_t offset = 0; offset < channelCount; ) {
         size_t remaining = channelCount - offset;
         auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
+        if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar;
         if constexpr (std::is_same_v<D, float>) {
             switch (remaining) {
             default:
                 if (remaining >= 16) {
                     remaining &= ~15;
-                    biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, float32x4x4_t>(
+                    biquad_filter_func_impl<
+                            BiquadFilterType,
+                            nearestOccupancy(OCCUPANCY,
+                                    BiquadFilterType<D, D>::required_occupancies_),
+                            SAME_COEF_PER_CHANNEL, alt_16_t>(
                             out + offset, in + offset, frames, stride, remaining,
                             delays + offset, c, localStride);
                     offset += remaining;
                     continue;
                 }
                 break;  // case 1 handled at bottom.
-            BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
-            BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
-            BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
-            BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
-            BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
-            BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
-            BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
-            // We choose the NEON intrinsic type over internal_array for 8 to
-            // check if there is any performance difference in benchmark (should be similar).
-            // BIQUAD_FILTER_CASE(8, intrinsics::internal_array_t<float, 8>)
-            BIQUAD_FILTER_CASE(8, float32x4x2_t)
-            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
-            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
-            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
-            BIQUAD_FILTER_CASE(4, float32x4_t)
-            // We choose the NEON intrinsic type over internal_array for 4 to
-            // check if there is any performance difference in benchmark (should be similar).
-            // BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<float, 4>)
-            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
-            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
+            BIQUAD_FILTER_CASE(15, BiquadFilterType, intrinsics::internal_array_t<float, 15>)
+            BIQUAD_FILTER_CASE(14, BiquadFilterType, intrinsics::internal_array_t<float, 14>)
+            BIQUAD_FILTER_CASE(13, BiquadFilterType, intrinsics::internal_array_t<float, 13>)
+            BIQUAD_FILTER_CASE(12, BiquadFilterType, intrinsics::internal_array_t<float, 12>)
+            BIQUAD_FILTER_CASE(11, BiquadFilterType, intrinsics::internal_array_t<float, 11>)
+            BIQUAD_FILTER_CASE(10, BiquadFilterType, intrinsics::internal_array_t<float, 10>)
+            BIQUAD_FILTER_CASE(9, BiquadFilterType, intrinsics::internal_array_t<float, 9>)
+            BIQUAD_FILTER_CASE(8, BiquadFilterType, alt_8_t)
+            BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<float, 7>)
+            BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<float, 6>)
+            BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<float, 5>)
+            BIQUAD_FILTER_CASE(4, BiquadFilterType, alt_4_t)
+            BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<float, 3>)
+            BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<float, 2>)
+            // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>)
             }
         } else if constexpr (std::is_same_v<D, double>) {
 #if defined(__aarch64__)
@@ -316,27 +565,34 @@
             default:
                 if (remaining >= 8) {
                     remaining &= ~7;
-                    biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL,
-                              intrinsics::internal_array_t<double, 8>>(
+                    biquad_filter_func_impl<BiquadFilterType,
+                            nearestOccupancy(OCCUPANCY,
+                                    BiquadFilterType<D, D>::required_occupancies_),
+                            SAME_COEF_PER_CHANNEL,
+                            intrinsics::internal_array_t<double, 8>>(
                             out + offset, in + offset, frames, stride, remaining,
                             delays + offset, c, localStride);
                     offset += remaining;
                     continue;
                 }
                 break; // case 1 handled at bottom.
-            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
-            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
-            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
-            BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
-            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
-            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
+            BIQUAD_FILTER_CASE(7, BiquadFilterType, intrinsics::internal_array_t<double, 7>)
+            BIQUAD_FILTER_CASE(6, BiquadFilterType, intrinsics::internal_array_t<double, 6>)
+            BIQUAD_FILTER_CASE(5, BiquadFilterType, intrinsics::internal_array_t<double, 5>)
+            BIQUAD_FILTER_CASE(4, BiquadFilterType, intrinsics::internal_array_t<double, 4>)
+            BIQUAD_FILTER_CASE(3, BiquadFilterType, intrinsics::internal_array_t<double, 3>)
+            BIQUAD_FILTER_CASE(2, BiquadFilterType, intrinsics::internal_array_t<double, 2>)
             };
 #endif
         }
+        scalar:
         // Essentially the code below is scalar, the same as
         // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
         // but formulated with NEON intrinsic-like call pattern.
-        biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, D>(
+        biquad_filter_func_impl<BiquadFilterType,
+                nearestOccupancy(OCCUPANCY,
+                        BiquadFilterType<D, D>::required_occupancies_),
+                SAME_COEF_PER_CHANNEL, D>(
                 out + offset, in + offset, frames, stride, remaining,
                 delays + offset, c, localStride);
         offset += remaining;
@@ -344,8 +600,6 @@
     exit:;
 }
 
-#endif // USE_NEON
-
 } // namespace details
 
 /**
@@ -584,16 +838,14 @@
         }
 
         // Select the proper filtering function from our array.
-        (void)optimized;                // avoid unused variable warning.
-        mFunc = mFilterFast[category];  // default if we don't have processor optimization.
-
-#ifdef USE_NEON
-        /* if constexpr (std::is_same_v<D, float>) */ {
-            if (optimized) {
-                mFunc = mFilterNeon[category];
-            }
+        if (optimized) {
+            mFilterOptions = (details::FILTER_OPTION)
+                    (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY);
+        } else {
+            mFilterOptions = (details::FILTER_OPTION)
+                    (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY);
         }
-#endif
+        mFunc = mFilterFuncs[category];
     }
 
     /**
@@ -603,7 +855,7 @@
      * \param in      pointer to the input data
      * \param frames  number of audio frames to be processed
      */
-    void process(D* out, const D *in, size_t frames) {
+    void process(D* out, const D* in, size_t frames) {
         process(out, in, frames, mChannelCount);
     }
 
@@ -615,10 +867,10 @@
      * \param frames  number of audio frames to be processed
      * \param stride  the total number of samples associated with a frame, if not channelCount.
      */
-    void process(D* out, const D *in, size_t frames, size_t stride) {
+    void process(D* out, const D* in, size_t frames, size_t stride) {
         assert(stride >= mChannelCount);
         mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
-                mCoefs.data(), mChannelCount);
+                mCoefs.data(), mChannelCount, mFilterOptions);
     }
 
     /**
@@ -655,7 +907,7 @@
                     auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
                     auto delays = mDelays.data() + fromEnd;
                     mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
-                            delays, coefs, mChannelCount);
+                            delays, coefs, mChannelCount, mFilterOptions);
                 }
 
                 auto delays = mDelays.data() + baseIdx;
@@ -664,13 +916,13 @@
                 // sliding one audio sample at a time.
                 mFunc(inout, inout,
                         frames - channelBlock + 1, 1 /* stride */, channelBlock,
-                        delays, coefs, mChannelCount);
+                        delays, coefs, mChannelCount, mFilterOptions);
 
                 // drain data pipe.
                 for (size_t i = 1; i < channelBlock; ++i) {
                     mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
                             1 /* frames */, 1 /* stride */, channelBlock - i,
-                            delays, coefs, mChannelCount);
+                            delays, coefs, mChannelCount, mFilterOptions);
                 }
             }
         }
@@ -681,7 +933,7 @@
             auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
             mFunc(inout, inout,
                     frames, 1 /* stride */, 1 /* channelCount */,
-                    mDelays.data() + fromEnd, coefs, mChannelCount);
+                    mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions);
         }
     }
 
@@ -746,121 +998,57 @@
      */
     std::vector<D> mDelays;
 
-    using filter_func = decltype(details::biquad_filter_fast<0, true, D>);
+    details::FILTER_OPTION mFilterOptions{};
 
-    /**
-     * \var filter_func* mFunc
+    // Consider making a separate delegation class.
+    /*
+     * We store an array of functions based on the occupancy.
      *
-     * The current filter function selected for the channel occupancy of the Biquad.
-     */
-    filter_func *mFunc;
-
-    // Create a functional wrapper to feed "biquad_filter_fast" to
-    // make_functional_array() to populate the array.
-    //
-    // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
-    // b0 b1 b2 a1 a2  (from lsb to msb)
-    template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
-    struct FuncWrap {
-        template<typename T>
-        static constexpr size_t nearest() {
-            // Combine cases to both improve expected performance and reduce code space.
-            // Some occupancy masks provide worse performance than more occupied masks.
-            constexpr size_t required_occupancies[] = {
-                1,  // constant scale
-                3,  // single zero
-                7,  // double zero
-                9,  // single pole
-                // 11, // first order IIR (unnecessary optimization, close enough to 31).
-                27, // double pole + single zero
-                31, // second order IIR (full Biquad)
-            };
-            if constexpr (OCCUPANCY < 32) {
-                for (auto test : required_occupancies) {
-                    if ((OCCUPANCY & test) == OCCUPANCY) return test;
-                }
-            } else {
-                static_assert(intrinsics::dependent_false_v<T>);
-            }
-            return 0; // never gets here.
-        }
-
-        static void func(D* out, const D *in, size_t frames, size_t stride,
-                size_t channelCount, D *delays, const D *coef, size_t localStride) {
-            constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
-            details::biquad_filter_fast<NEAREST_OCCUPANCY, SC>(
-                    out, in, frames, stride, channelCount, delays, coef, localStride);
-        }
-    };
-
-    /**
-     * \var mFilterFast
-     *
-     * std::array of functions based on coefficient occupancy.
+     * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
+     * b0 b1 b2 a1 a2  (from lsb to msb)
      *
      *  static inline constexpr std::array<filter_func*, M> mArray = {
-     *     biquad_filter_fast<0>,
-     *     biquad_filter_fast<1>,
-     *     biquad_filter_fast<2>,
+     *     biquad_filter_func<0>,
+     *     biquad_filter_func<1>,
+     *     biquad_filter_func<2>,
      *      ...
-     *     biquad_filter_fast<(1 << kBiquadNumCoefs) - 1>,
+     *     biquad_filter_func<(1 << kBiquadNumCoefs) - 1>,
      *  };
      *
      * Every time the coefficients are changed, we select the processing function from
      * this table.
      */
-    static inline constexpr auto mFilterFast =
-            details::make_functional_array<
-                    FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
 
-#ifdef USE_NEON
-    // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
-    // b0 b1 b2 a1 a2  (from lsb to msb)
-
+    // Used to build the functional array.
     template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
-    struct FuncWrapNeon {
-        template<typename T>
-        static constexpr size_t nearest() {
-            // combine cases to both improve expected performance and reduce code space.
-            //
-            // This lists the occupancies we will specialize functions for.
-            constexpr size_t required_occupancies[] = {
-                1,  // constant scale
-                3,  // single zero
-                7,  // double zero
-                9,  // single pole
-                11, // first order IIR
-                27, // double pole + single zero
-                31, // second order IIR (full Biquad)
-            };
-            if constexpr (OCCUPANCY < 32) {
-                for (auto test : required_occupancies) {
-                    if ((OCCUPANCY & test) == OCCUPANCY) return test;
-                }
-            } else {
-                static_assert(intrinsics::dependent_false_v<T>);
-            }
-            return 0; // never gets here.
-        }
-
+    struct FuncWrap {
         static void func(D* out, const D *in, size_t frames, size_t stride,
-                size_t channelCount, D *delays, const D *coef, size_t localStride) {
-            constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
-            details::biquad_filter_neon<NEAREST_OCCUPANCY, SC>(
-                    out, in, frames, stride, channelCount, delays, coef, localStride);
+                size_t channelCount, D *delays, const D *coef, size_t localStride,
+                details::FILTER_OPTION filterOptions) {
+            constexpr size_t NEAREST_OCCUPANCY =
+                details::nearestOccupancy(
+                        OCCUPANCY, details::BiquadFilterType<D, D>::required_occupancies_);
+            details::biquad_filter_func<NEAREST_OCCUPANCY, SC>(
+                    out, in, frames, stride, channelCount, delays, coef, localStride,
+                    filterOptions);
         }
     };
 
-    // Neon optimized array of functions.
-    static inline constexpr auto mFilterNeon =
+    // Vector optimized array of functions.
+    static inline constexpr auto mFilterFuncs =
             details::make_functional_array<
-                    FuncWrapNeon, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
-#endif // USE_NEON
+                    FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
 
+    /**
+     * \var mFunc
+     *
+     * The current filter function selected for the channel occupancy of the Biquad.
+     * It will be one of mFilterFuncs.
+     */
+    std::decay_t<decltype(mFilterFuncs[0])> mFunc;
 };
 
 } // namespace android::audio_utils
 
+#pragma pop_macro("USE_DITHER")
 #pragma pop_macro("USE_NEON")
-
-#endif  // !ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
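
For illustration, here is a standalone sketch (not part of the patch) of how the
occupancy bitmask over the coefficients b0 b1 b2 a1 a2 (lsb to msb) collapses onto
one of the masks a kernel is specialized for. The mask list below mirrors the set
the earlier FuncWrap helpers used; the actual values come from the selected
kernel's required_occupancies_ member and may differ.

    // nearest_occupancy_demo.cpp -- standalone illustration, hypothetical file.
    #include <cstddef>
    #include <cstdio>

    template <typename T, size_t N>
    constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
        if (occupancy < 32) {
            for (auto test : occupancies) {
                // The first listed mask that covers every set bit wins.
                if ((occupancy & test) == occupancy) return test;
            }
        }
        return 31;  // fall back to the full second order IIR.
    }

    int main() {
        constexpr size_t kMasks[] = { 1, 3, 7, 9, 27, 31 };
        // 0b00101 (b0, b2 nonzero) is served by the "double zero" kernel, mask 7.
        std::printf("occupancy 5 -> %zu\n", nearestOccupancy<size_t>(5, kMasks));
        // 0b01001 (b0, a1 nonzero) is served by the "single pole" kernel, mask 9.
        std::printf("occupancy 9 -> %zu\n", nearestOccupancy<size_t>(9, kMasks));
        return 0;
    }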
diff --git a/audio_utils/include/audio_utils/intrinsic_utils.h b/audio_utils/include/audio_utils/intrinsic_utils.h
index ed2b2bb..0c333e0 100644
--- a/audio_utils/include/audio_utils/intrinsic_utils.h
+++ b/audio_utils/include/audio_utils/intrinsic_utils.h
@@ -78,6 +78,45 @@
   using alternative_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
 */
 
+// add a + b
+template<typename T>
+static inline T vadd(T a, T b) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        return a + b;
+
+#ifdef USE_NEON
+    } else if constexpr (std::is_same_v<T, float32x2_t>) {
+        return vadd_f32(a, b);
+    } else if constexpr (std::is_same_v<T, float32x4_t>) {
+        return vaddq_f32(a, b);
+#if defined(__aarch64__)
+    } else if constexpr (std::is_same_v<T, float64x2_t>) {
+        return vaddq_f64(a, b);
+#endif
+#endif // USE_NEON
+
+    } else /* constexpr */ {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        const auto &[bval] = b;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vadd(aval[i], bval[i]);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             const auto &[b1, b2] = bval;
+             r1 = vadd(a1, b1);
+             r2 = vadd(a2, b2);
+             return ret;
+        }
+    }
+}
+
 // duplicate float into all elements.
 template<typename T, typename F>
 static inline T vdupn(F f) {
@@ -156,6 +195,73 @@
     }
 }
 
+/**
+ * Multiply-accumulate: returns a + b * c, where
+ * r_i = a_i + b_i * c_i if b and c are vectors of the same type as a, or
+ * r_i = a_i + b_i * c   if b is a vector and c is a scalar, or
+ * r_i = a_i + b * c_i   if b is a scalar and c is a vector.
+ */
+template<typename T, typename S, typename F>
+static inline T vmla(T a, S b, F c) {
+    // Both types T and S are non-primitive and they are not equal.  T == S handled below.
+    (void) a;
+    (void) b;
+    (void) c;
+    static_assert(dependent_false_v<T>);
+}
+
+template<typename T, typename F>
+static inline T vmla(T a, T b, F c) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+            return a + b * c;
+        } else {
+            static_assert(dependent_false_v<T>);
+        }
+    } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+        // handle the lane variant
+#ifdef USE_NEON
+        if constexpr (std::is_same_v<T, float32x2_t>) {
+            return vmla_n_f32(a, b, c);
+        } else if constexpr (std::is_same_v<T, float32x4_t>) {
+            return vmlaq_n_f32(a, b, c);
+#if defined(__aarch64__)
+        } else if constexpr (std::is_same_v<T, float64x2_t>) {
+            return vmlaq_f64(a, b, vdupq_n_f64(c));  // no _n_ lane variant for f64.
+#endif
+        } else
+#endif // USE_NEON
+        {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        const auto &[bval] = b;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vmla(aval[i], bval[i], c);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             const auto &[b1, b2] = bval;
+             r1 = vmla(a1, b1, c);
+             r2 = vmla(a2, b2, c);
+             return ret;
+        }
+        }
+    } else {
+        // Both types T and F are non-primitive and they are not equal.
+        static_assert(dependent_false_v<T>);
+    }
+}
+
+template<typename T, typename F>
+static inline T vmla(T a, F b, T c) {
+    return vmla(a, c, b);
+}
+
 // fused multiply-add a + b * c
 template<typename T>
 static inline T vmla(T a, T b, T c) {
@@ -197,7 +303,57 @@
     }
 }
 
-// multiply a * b
+/**
+ * Multiply: returns r = a * b, where
+ * r_i = a_i * b_i if a and b are vectors of the same type, or
+ * r_i = a_i * b   if a is a vector and b is a scalar, or
+ * r_i = a * b_i   if a is a scalar and b is a vector.
+ */
+template<typename T, typename F>
+static inline auto vmul(T a, F b) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+            return a * b;
+        } else /* constexpr */ {
+            return vmul(b, a); // we prefer T to be the vector/struct form.
+        }
+    } else if constexpr (std::is_same_v<F, float> || std::is_same_v<F, double>) {
+        // handle the lane variant
+#ifdef USE_NEON
+        if constexpr (std::is_same_v<T, float32x2_t>) {
+            return vmul_n_f32(a, b);
+        } else if constexpr (std::is_same_v<T, float32x4_t>) {
+            return vmulq_n_f32(a, b);
+#if defined(__aarch64__)
+        } else if constexpr (std::is_same_v<T, float64x2_t>) {
+            return vmulq_n_f64(a, b);
+#endif
+        } else
+#endif // USE_NEON
+        {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vmul(aval[i], b);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             r1 = vmul(a1, b);
+             r2 = vmul(a2, b);
+             return ret;
+        }
+        }
+    } else {
+        // Both types T and F are non-primitive and they are not equal.
+        static_assert(dependent_false_v<T>);
+    }
+}
+
 template<typename T>
 static inline T vmul(T a, T b) {
     if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
@@ -308,6 +464,45 @@
     }
 }
 
+// subtract a - b
+template<typename T>
+static inline T vsub(T a, T b) {
+    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
+        return a - b;
+
+#ifdef USE_NEON
+    } else if constexpr (std::is_same_v<T, float32x2_t>) {
+        return vsub_f32(a, b);
+    } else if constexpr (std::is_same_v<T, float32x4_t>) {
+        return vsubq_f32(a, b);
+#if defined(__aarch64__)
+    } else if constexpr (std::is_same_v<T, float64x2_t>) {
+        return vsubq_f64(a, b);
+#endif
+#endif // USE_NEON
+
+    } else /* constexpr */ {
+        T ret;
+        auto &[retval] = ret;  // single-member struct
+        const auto &[aval] = a;
+        const auto &[bval] = b;
+        if constexpr (std::is_array_v<decltype(retval)>) {
+#pragma unroll
+            for (size_t i = 0; i < std::size(aval); ++i) {
+                retval[i] = vsub(aval[i], bval[i]);
+            }
+            return ret;
+        } else /* constexpr */ {
+             auto &[r1, r2] = retval;
+             const auto &[a1, a2] = aval;
+             const auto &[b1, b2] = bval;
+             r1 = vsub(a1, b1);
+             r2 = vsub(a2, b2);
+             return ret;
+        }
+    }
+}
+
 } // namespace android::audio_utils::intrinsics
 
 #pragma pop_macro("USE_NEON")
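
As a usage illustration, the sketch below (not part of the patch) runs one channel
of the 2nd order section through the same per-sample recurrence the vector kernels
use, but with the scalar float branches of vmla()/vmul() above; it assumes the
audio_utils include path is available. The first three outputs match the reference
test vectors (-0.2, -0.66, -1.408).

    // scalar_biquad_demo.cpp -- standalone illustration, hypothetical file.
    #include <cstdio>
    #include <audio_utils/intrinsic_utils.h>

    int main() {
        using namespace android::audio_utils::intrinsics;
        // Normalized coefficients [b0 b1 b2 a1 a2], a0 == 1.
        const float b0 = 2.f, b1 = 3.f, b2 = 4.f, a1 = 0.2f, a2 = 0.3f;
        const float negA1 = -a1, negA2 = -a2;
        float s1 = 0.f, s2 = 0.f;                       // filter state (delays).
        const float x[] = { -0.1f, -0.2f, -0.3f };
        for (float xn : x) {
            const float yn = vmla(s1, b0, xn);          // yn = s1 + b0 * xn
            s1 = vmla(vmla(s2, b1, xn), negA1, yn);     // s1 = s2 + b1 * xn - a1 * yn
            s2 = vmla(vmul(b2, xn), negA2, yn);         // s2 = b2 * xn - a2 * yn
            std::printf("%g\n", yn);
        }
        return 0;
    }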
diff --git a/audio_utils/tests/biquad_filter_tests.cpp b/audio_utils/tests/biquad_filter_tests.cpp
index 3be2a5d..f90b863 100644
--- a/audio_utils/tests/biquad_filter_tests.cpp
+++ b/audio_utils/tests/biquad_filter_tests.cpp
@@ -29,12 +29,13 @@
 
 /************************************************************************************
  * Reference data, must not change.
- * The reference output data is from running in matlab y = filter(b, a, x), where
- *     b = [2.0f, 3.0f]
- *     a = [1.0f, 0.2f]
- *     x = [-0.1f, -0.2f, -0.3f, -0.4f, -0.5f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f]
- * The output y = [-0.2f, -0.66f, -1.068f, -1.4864f, -1.9027f,
- *                 -0.9195f, 0.8839f, 1.0232f, 1.4954f, 1.9009f].
+ * The reference output data is from running y = filter(b, a, x) in MATLAB or Octave, where
+ *     b = [2.0 3.0 4.0]
+ *     a = [1.0 0.2 0.3]
+ *     x = [-0.1 -0.2 -0.3 -0.4 -0.5 0.1 0.2 0.3 0.4 0.5]
+ *     filter(b, a, x)
+ *
+ * The output y = [-0.2 -0.66 -1.4080 -2.0204 -2.5735 -1.7792 -0.1721 2.1682 2.1180 2.3259].
  * The reference data constructs the input and output as 2D arrays so that they can be
  * used to exercise calling BiquadFilter::process multiple times.
  ************************************************************************************/
@@ -43,11 +44,12 @@
 constexpr float INPUT[PERIOD][FRAME_COUNT] = {
         {-0.1f, -0.2f, -0.3f, -0.4f, -0.5f},
         {0.1f, 0.2f, 0.3f, 0.4f, 0.5f}};
+// COEFS in order of [ b0 b1 b2 a1 a2 ], normalized form where a0 = 1.
 constexpr std::array<float, kBiquadNumCoefs> COEFS = {
-        2.0f, 3.0f, 0.0f, 0.2f, 0.0f };
+        2.0f, 3.0f, 4.0f, 0.2f, 0.3f };
 constexpr float OUTPUT[PERIOD][FRAME_COUNT] = {
-        {-0.2f, -0.66f, -1.068f, -1.4864f, -1.9027f},
-        {-0.9195f, 0.8839f, 1.0232f, 1.4954f, 1.9009f}};
+        {-0.2f, -0.66f, -1.4080f, -2.0204f, -2.5735f},
+        {-1.7792f, -0.1721f, 2.1682f, 2.1180f, 2.3259f}};
 constexpr float EPS = 1e-4f;
 
 template <typename S, typename D>
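
For reference, the expected values above can be reproduced without MATLAB/Octave.
The standalone check below (not part of the test) evaluates the normalized
difference equation that filter(b, a, x) implements,
    y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2],
and compares each sample against the table within EPS.

    // reference_check.cpp -- standalone illustration, hypothetical file.
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <iterator>

    int main() {
        const float b[] = { 2.0f, 3.0f, 4.0f };        // b0 b1 b2
        const float a[] = { 1.0f, 0.2f, 0.3f };        // a0 a1 a2, already normalized.
        const float x[] = { -0.1f, -0.2f, -0.3f, -0.4f, -0.5f,
                             0.1f,  0.2f,  0.3f,  0.4f,  0.5f };
        const float expected[] = { -0.2f, -0.66f, -1.4080f, -2.0204f, -2.5735f,
                                   -1.7792f, -0.1721f, 2.1682f, 2.1180f, 2.3259f };
        constexpr float EPS = 1e-4f;
        float x1 = 0.f, x2 = 0.f, y1 = 0.f, y2 = 0.f;  // zero initial conditions.
        for (std::size_t n = 0; n < std::size(x); ++n) {
            const float y = b[0] * x[n] + b[1] * x1 + b[2] * x2 - a[1] * y1 - a[2] * y2;
            std::printf("y[%zu] = %.4f (%s)\n", n, y,
                    std::fabs(y - expected[n]) < EPS ? "ok" : "MISMATCH");
            x2 = x1; x1 = x[n];
            y2 = y1; y1 = y;
        }
        return 0;
    }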
diff --git a/audio_utils/tests/intrinsic_tests.cpp b/audio_utils/tests/intrinsic_tests.cpp
index 6a16747..d9686ef 100644
--- a/audio_utils/tests/intrinsic_tests.cpp
+++ b/audio_utils/tests/intrinsic_tests.cpp
@@ -25,6 +25,13 @@
 using FloatTypes = ::testing::Types<float, double>;
 TYPED_TEST_CASE(IntrisicUtilsTest, FloatTypes);
 
+TYPED_TEST(IntrisicUtilsTest, vadd) {
+    constexpr TypeParam a = 0.25f;
+    constexpr TypeParam b = 0.5f;
+    constexpr TypeParam result = a + b;
+    ASSERT_EQ(result, android::audio_utils::intrinsics::vadd(a, b));
+}
+
 TYPED_TEST(IntrisicUtilsTest, vdupn) {
     constexpr TypeParam value = 1.f;
     ASSERT_EQ(value, android::audio_utils::intrinsics::vdupn<TypeParam>(value));
@@ -62,3 +69,10 @@
             &destination, android::audio_utils::intrinsics::vdupn<TypeParam>(value));
     ASSERT_EQ(value, destination);
 }
+
+TYPED_TEST(IntrisicUtilsTest, vsub) {
+    constexpr TypeParam a = 1.25f;
+    constexpr TypeParam b = 1.5f;
+    constexpr TypeParam result = a - b;
+    ASSERT_EQ(result, android::audio_utils::intrinsics::vsub(a, b));
+}