Snap for 10447354 from fb61b35abb4b7d3f9d6efde2879fa0da61c14f3b to mainline-cellbroadcast-release

Change-Id: Ica62d58c23a1d8707e149046a38e67423c8369ea
diff --git a/Android.bp b/Android.bp
index f2f09f8..29a421c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -383,7 +383,7 @@
     apex_available: [
         "//apex_available:platform", // used by libstagefright_soft_opusdec
         "com.android.media.swcodec",
-        "com.android.bluetooth",
+        "com.android.btservices",
     ],
     min_sdk_version: "29",
 }
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a28f441..9d824cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -366,11 +366,23 @@
 endif()
 
 if(NOT OPUS_DISABLE_INTRINSICS)
-  if((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR
+  if(((OPUS_X86_MAY_HAVE_SSE AND NOT OPUS_X86_PRESUME_SSE) OR
      (OPUS_X86_MAY_HAVE_SSE2 AND NOT OPUS_X86_PRESUME_SSE2) OR
      (OPUS_X86_MAY_HAVE_SSE4_1 AND NOT OPUS_X86_PRESUME_SSE4_1) OR
-     (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX))
+     (OPUS_X86_MAY_HAVE_AVX AND NOT OPUS_X86_PRESUME_AVX)) AND
+      RUNTIME_CPU_CAPABILITY_DETECTION)
     target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD)
+    if(NOT MSVC)
+      if(CPU_INFO_BY_ASM_SUPPORTED)
+        target_compile_definitions(opus PRIVATE CPU_INFO_BY_ASM)
+      elseif(CPU_INFO_BY_C_SUPPORTED)
+        target_compile_definitions(opus PRIVATE CPU_INFO_BY_C)
+      else() # no CPU_INFO method available: abort configuration (ERROR is not a message() mode)
+        message(FATAL_ERROR "Runtime cpu capability detection is enabled while CPU_INFO is not supported")
+      endif()
+    endif()
+    add_sources_group(opus celt ${celt_sources_x86_rtcd})
+    add_sources_group(opus silk ${silk_sources_x86_rtcd})
   endif()
 
   if(SSE1_SUPPORTED)
@@ -455,15 +467,13 @@
     endif()
   endif()
 
-  if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
-    add_sources_group(opus celt ${celt_sources_arm})
-  endif()
-
   if(COMPILER_SUPPORT_NEON)
     if(OPUS_MAY_HAVE_NEON)
       if(RUNTIME_CPU_CAPABILITY_DETECTION)
         message(STATUS "OPUS_MAY_HAVE_NEON enabling runtime detection")
         target_compile_definitions(opus PRIVATE OPUS_HAVE_RTCD)
+        add_sources_group(opus celt ${celt_sources_arm_rtcd})
+        add_sources_group(opus silk ${silk_sources_arm_rtcd})
       else()
         message(ERROR "Runtime cpu capability detection needed for MAY_HAVE_NEON")
       endif()
@@ -565,6 +575,7 @@
     target_include_directories(opus_custom_demo
                                PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
     target_link_libraries(opus_custom_demo PRIVATE opus)
+    target_compile_definitions(opus_custom_demo PRIVATE OPUS_BUILD)
   endif()
 
   add_executable(opus_demo ${opus_demo_sources})
@@ -572,14 +583,19 @@
   target_include_directories(opus_demo PRIVATE silk) # debug.h
   target_include_directories(opus_demo PRIVATE celt) # arch.h
   target_link_libraries(opus_demo PRIVATE opus ${OPUS_REQUIRED_LIBRARIES})
+  target_compile_definitions(opus_demo PRIVATE OPUS_BUILD)
 
   # compare
   add_executable(opus_compare ${opus_compare_sources})
   target_include_directories(opus_compare PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_link_libraries(opus_compare PRIVATE opus ${OPUS_REQUIRED_LIBRARIES})
+  if(MSVC)
+    # move cosmetic warning to level 4 for opus_compare
+    target_compile_options(opus_compare PRIVATE /w44244)
+  endif()
 endif()
 
-if(BUILD_TESTING)
+if(BUILD_TESTING AND NOT BUILD_SHARED_LIBS)
   enable_testing()
 
   # tests
@@ -587,32 +603,44 @@
   target_include_directories(test_opus_decode
                              PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_link_libraries(test_opus_decode PRIVATE opus)
+  target_compile_definitions(test_opus_decode PRIVATE OPUS_BUILD)
   if(OPUS_FIXED_POINT)
     target_compile_definitions(test_opus_decode PRIVATE DISABLE_FLOAT_API)
   endif()
-  add_test(NAME test_opus_decode COMMAND $<TARGET_FILE:test_opus_decode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  add_test(NAME test_opus_decode COMMAND ${CMAKE_COMMAND}
+           -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_decode>
+           -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+           -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
 
   add_executable(test_opus_padding ${test_opus_padding_sources})
   target_include_directories(test_opus_padding
                              PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
   target_link_libraries(test_opus_padding PRIVATE opus)
-  add_test(NAME test_opus_padding COMMAND $<TARGET_FILE:test_opus_padding> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  add_test(NAME test_opus_padding COMMAND ${CMAKE_COMMAND}
+           -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_padding>
+           -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+           -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
 
-  if(NOT BUILD_SHARED_LIBS)
-    # disable tests that depends on private API when building shared lib
-    add_executable(test_opus_api ${test_opus_api_sources})
-    target_include_directories(test_opus_api
-                               PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
-    target_link_libraries(test_opus_api PRIVATE opus)
-    if(OPUS_FIXED_POINT)
-      target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API)
-    endif()
-    add_test(NAME test_opus_api COMMAND $<TARGET_FILE:test_opus_api> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
-
-    add_executable(test_opus_encode ${test_opus_encode_sources})
-    target_include_directories(test_opus_encode
-                               PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
-    target_link_libraries(test_opus_encode PRIVATE opus)
-    add_test(NAME test_opus_encode COMMAND $<TARGET_FILE:test_opus_encode> WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
+  add_executable(test_opus_api ${test_opus_api_sources})
+  target_include_directories(test_opus_api
+                            PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
+  target_link_libraries(test_opus_api PRIVATE opus)
+  target_compile_definitions(test_opus_api PRIVATE OPUS_BUILD)
+  if(OPUS_FIXED_POINT)
+    target_compile_definitions(test_opus_api PRIVATE DISABLE_FLOAT_API)
   endif()
+  add_test(NAME test_opus_api COMMAND ${CMAKE_COMMAND}
+        -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_api>
+        -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+        -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
+
+  add_executable(test_opus_encode ${test_opus_encode_sources})
+  target_include_directories(test_opus_encode
+                            PRIVATE ${CMAKE_CURRENT_BINARY_DIR} celt)
+  target_link_libraries(test_opus_encode PRIVATE opus)
+  target_compile_definitions(test_opus_encode PRIVATE OPUS_BUILD)
+  add_test(NAME test_opus_encode COMMAND ${CMAKE_COMMAND}
+        -DTEST_EXECUTABLE=$<TARGET_FILE:test_opus_encode>
+        -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}
+        -P "${PROJECT_SOURCE_DIR}/cmake/RunTest.cmake")
 endif()
diff --git a/METADATA b/METADATA
index cb40ec6..616c521 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update libopus
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
 name: "libopus"
 description: "Android fork of the opus library."
 third_party {
@@ -5,14 +9,14 @@
     type: GIT
     value: "https://gitlab.xiph.org/xiph/opus.git"
   }
-  version: "d633f523e36e3b6d01cc6d57386458d770d618be"
+  version: "8cf872a186b96085b1bb3a547afd598354ebeb87"
   license_type: NOTICE
-  last_upgrade_date {
-    year: 2021
-    month: 2
-    day: 5
-  }
   security {
     tag: "NVD-CPE2.3:cpe:/a:opus-codec:opus:1.0.3"
   }
+  last_upgrade_date {
+    year: 2023
+    month: 1
+    day: 18
+  }
 }
diff --git a/Makefile.am b/Makefile.am
index 83beaa3..492fc09 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -36,6 +36,11 @@
 OPUS_SOURCES += $(OPUS_SOURCES_FLOAT)
 endif
 
+if CPU_X86
+if HAVE_RTCD
+CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
+SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
+endif
 if HAVE_SSE
 CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
@@ -45,10 +50,13 @@
 if HAVE_SSE4_1
 CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
 endif
+endif
 
 if CPU_ARM
-CELT_SOURCES += $(CELT_SOURCES_ARM)
-SILK_SOURCES += $(SILK_SOURCES_ARM)
+if HAVE_RTCD
+CELT_SOURCES += $(CELT_SOURCES_ARM_RTCD)
+SILK_SOURCES += $(SILK_SOURCES_ARM_RTCD)
+endif
 
 if HAVE_ARM_NEON_INTR
 CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
@@ -222,8 +230,11 @@
              cmake/OpusFunctions.cmake \
              cmake/OpusPackageVersion.cmake \
              cmake/OpusSources.cmake \
+             cmake/RunTest.cmake \
              cmake/config.h.cmake.in \
              cmake/vla.c \
+             cmake/cpu_info_by_asm.c \
+             cmake/cpu_info_by_c.c \
              meson/get-version.py \
              meson/read-sources-list.py \
              meson.build \
diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c
index cce3ae3..c7d16e6 100644
--- a/celt/arm/armcpu.c
+++ b/celt/arm/armcpu.c
@@ -156,7 +156,7 @@
    "your platform.  Reconfigure with --disable-rtcd (or send patches)."
 #endif
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
   opus_uint32 flags = opus_cpu_capabilities();
   int arch = 0;
@@ -184,4 +184,11 @@
   return arch;
 }
 
+int opus_select_arch(void) {
+  int arch = opus_select_arch_impl();
+#ifdef FUZZING
+  arch = rand()%(arch+1);
+#endif
+  return arch;
+}
 #endif
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
index 1ac38c4..35cc46e 100644
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -137,22 +137,27 @@
 /* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
 /* operations of celt_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
-static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, float *err, int N)
 {
    int i;
+   *err = 0;
    opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
    for (i = 0; i < N - 3; i += 4) {
       xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
       xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
       xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
       xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+      *err += ABS32(xy0)+ABS32(xy1)+ABS32(xy2)+ABS32(xy3);
    }
    xy0 += xy2;
    xy1 += xy3;
    xy = xy0 + xy1;
+   *err += ABS32(xy1)+ABS32(xy0)+ABS32(xy);
    for (; i < N; i++) {
       xy = MAC16_16(xy, x[i], y[i]);
+      *err += ABS32(xy);
    }
+   *err = *err*2e-7 + N*1e-37;
    return xy;
 }
 
@@ -160,32 +165,10 @@
 /* operations of dual_inner_prod_neon(), and both functions should have bit */
 /* exact output.                                                            */
 static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
-      int N, opus_val32 *xy1, opus_val32 *xy2)
+      int N, opus_val32 *xy1, opus_val32 *xy2, float *err)
 {
-   int i;
-   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
-   for (i = 0; i < N - 3; i += 4) {
-      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
-      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
-      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
-      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
-      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
-      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
-      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
-      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
-   }
-   xy01_0 += xy01_2;
-   xy02_0 += xy02_2;
-   xy01_1 += xy01_3;
-   xy02_1 += xy02_3;
-   xy01 = xy01_0 + xy01_1;
-   xy02 = xy02_0 + xy02_1;
-   for (; i < N; i++) {
-      xy01 = MAC16_16(xy01, x[i], y01[i]);
-      xy02 = MAC16_16(xy02, x[i], y02[i]);
-   }
-   *xy1 = xy01;
-   *xy2 = xy02;
+   *xy1 = celt_inner_prod_neon_float_c_simulation(x, y01, &err[0], N);
+   *xy2 = celt_inner_prod_neon_float_c_simulation(x, y02, &err[1], N);
 }
 
 #endif /* OPUS_CHECK_ASM */
@@ -225,7 +208,12 @@
     }
 
 #ifdef OPUS_CHECK_ASM
-    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+    {
+        float err, res;
+        res = celt_inner_prod_neon_float_c_simulation(x, y, &err, N);
+        /*if (ABS32(res - xy) > err) fprintf(stderr, "%g %g %g\n", res, xy, err);*/
+        celt_assert(ABS32(res - xy) <= err);
+    }
 #endif
 
     return xy;
@@ -280,9 +268,12 @@
 #ifdef OPUS_CHECK_ASM
     {
         opus_val32 xy1_c, xy2_c;
-        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
-        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
-        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+        float err[2];
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c, err);
+        /*if (ABS32(xy1_c - *xy1) > err[0]) fprintf(stderr, "dual1 fail: %g %g %g\n", xy1_c, *xy1, err[0]);
+        if (ABS32(xy2_c - *xy2) > err[1]) fprintf(stderr, "dual2 fail: %g %g %g\n", xy2_c, *xy2, err[1]);*/
+        celt_assert(ABS32(xy1_c - *xy1) <= err[0]);
+        celt_assert(ABS32(xy2_c - *xy2) <= err[1]);
     }
 #endif
 }
diff --git a/celt/bands.c b/celt/bands.c
index 2702963..5320ffa 100644
--- a/celt/bands.c
+++ b/celt/bands.c
@@ -901,7 +901,7 @@
    sctx->itheta = itheta;
    sctx->qalloc = qalloc;
 }
-static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y, int b,
+static unsigned quant_band_n1(struct band_ctx *ctx, celt_norm *X, celt_norm *Y,
       celt_norm *lowband_out)
 {
    int c;
@@ -926,7 +926,6 @@
             sign = ec_dec_bits(ec, 1);
          }
          ctx->remaining_bits -= 1<<BITRES;
-         b-=1<<BITRES;
       }
       if (ctx->resynth)
          x[0] = sign ? -NORM_SCALING : NORM_SCALING;
@@ -1134,7 +1133,7 @@
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, NULL, b, lowband_out);
+      return quant_band_n1(ctx, X, NULL, lowband_out);
    }
 
    if (tf_change>0)
@@ -1256,7 +1255,7 @@
    /* Special case for one sample */
    if (N==1)
    {
-      return quant_band_n1(ctx, X, Y, b, lowband_out);
+      return quant_band_n1(ctx, X, Y, lowband_out);
    }
 
    orig_fill = fill;
@@ -1381,6 +1380,7 @@
    return cm;
 }
 
+#ifndef DISABLE_UPDATE_DRAFT
 static void special_hybrid_folding(const CELTMode *m, celt_norm *norm, celt_norm *norm2, int start, int M, int dual_stereo)
 {
    int n1, n2;
@@ -1393,6 +1393,7 @@
    if (dual_stereo)
       OPUS_COPY(&norm2[n1], &norm2[2*n1 - n2], n2-n1);
 }
+#endif
 
 void quant_all_bands(int encode, const CELTMode *m, int start, int end,
       celt_norm *X_, celt_norm *Y_, unsigned char *collapse_masks,
diff --git a/celt/celt_decoder.c b/celt/celt_decoder.c
index 74ca3b7..883dae1 100644
--- a/celt/celt_decoder.c
+++ b/celt/celt_decoder.c
@@ -90,7 +90,7 @@
    opus_uint32 rng;
    int error;
    int last_pitch_index;
-   int loss_count;
+   int loss_duration;
    int skip_plc;
    int postfilter_period;
    int postfilter_period_old;
@@ -512,7 +512,7 @@
    int nbEBands;
    int overlap;
    int start;
-   int loss_count;
+   int loss_duration;
    int noise_based;
    const opus_int16 *eBands;
    SAVE_STACK;
@@ -532,9 +532,9 @@
    oldLogE2 = oldLogE + 2*nbEBands;
    backgroundLogE = oldLogE2  + 2*nbEBands;
 
-   loss_count = st->loss_count;
+   loss_duration = st->loss_duration;
    start = st->start;
-   noise_based = loss_count >= 5 || start != 0 || st->skip_plc;
+   noise_based = loss_duration >= 40 || start != 0 || st->skip_plc;
    if (noise_based)
    {
       /* Noise-based PLC/CNG */
@@ -557,9 +557,13 @@
 #else
       ALLOC(X, C*N, celt_norm);   /**< Interleaved normalised MDCTs */
 #endif
+      c=0; do {
+         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
+               DECODE_BUFFER_SIZE-N+(overlap>>1));
+      } while (++c<C);
 
       /* Energy decay */
-      decay = loss_count==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
+      decay = loss_duration==0 ? QCONST16(1.5f, DB_SHIFT) : QCONST16(.5f, DB_SHIFT);
       c=0; do
       {
          for (i=start;i<end;i++)
@@ -585,11 +589,6 @@
       }
       st->rng = seed;
 
-      c=0; do {
-         OPUS_MOVE(decode_mem[c], decode_mem[c]+N,
-               DECODE_BUFFER_SIZE-N+(overlap>>1));
-      } while (++c<C);
-
       celt_synthesis(mode, X, out_syn, oldBandE, start, effEnd, C, C, 0, LM, st->downsample, 0, st->arch);
    } else {
       int exc_length;
@@ -602,7 +601,7 @@
       VARDECL(opus_val16, _exc);
       VARDECL(opus_val16, fir_tmp);
 
-      if (loss_count == 0)
+      if (loss_duration == 0)
       {
          st->last_pitch_index = pitch_index = celt_plc_pitch_search(decode_mem, C, st->arch);
       } else {
@@ -630,9 +629,9 @@
 
          buf = decode_mem[c];
          for (i=0;i<MAX_PERIOD+LPC_ORDER;i++)
-            exc[i-LPC_ORDER] = ROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
+            exc[i-LPC_ORDER] = SROUND16(buf[DECODE_BUFFER_SIZE-MAX_PERIOD-LPC_ORDER+i], SIG_SHIFT);
 
-         if (loss_count == 0)
+         if (loss_duration == 0)
          {
             opus_val32 ac[LPC_ORDER+1];
             /* Compute LPC coefficients for the last MAX_PERIOD samples before
@@ -732,7 +731,7 @@
                         exc[extrapolation_offset+j])), SIG_SHIFT);
             /* Compute the energy of the previously decoded signal whose
                excitation we're copying. */
-            tmp = ROUND16(
+            tmp = SROUND16(
                   buf[DECODE_BUFFER_SIZE-MAX_PERIOD-N+extrapolation_offset+j],
                   SIG_SHIFT);
             S1 += SHR32(MULT16_16(tmp, tmp), 10);
@@ -742,7 +741,7 @@
             /* Copy the last decoded samples (prior to the overlap region) to
                synthesis filter memory so we can have a continuous signal. */
             for (i=0;i<LPC_ORDER;i++)
-               lpc_mem[i] = ROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
+               lpc_mem[i] = SROUND16(buf[DECODE_BUFFER_SIZE-N-1-i], SIG_SHIFT);
             /* Apply the synthesis filter to convert the excitation back into
                the signal domain. */
             celt_iir(buf+DECODE_BUFFER_SIZE-N, lpc+c*LPC_ORDER,
@@ -761,7 +760,7 @@
             opus_val32 S2=0;
             for (i=0;i<extrapolation_len;i++)
             {
-               opus_val16 tmp = ROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
+               opus_val16 tmp = SROUND16(buf[DECODE_BUFFER_SIZE-N+i], SIG_SHIFT);
                S2 += SHR32(MULT16_16(tmp, tmp), 10);
             }
             /* This checks for an "explosion" in the synthesis. */
@@ -812,7 +811,8 @@
       } while (++c<C);
    }
 
-   st->loss_count = loss_count+1;
+   /* Saturate to something large to avoid wrap-around. */
+   st->loss_duration = IMIN(10000, loss_duration+(1<<LM));
 
    RESTORE_STACK;
 }
@@ -868,6 +868,7 @@
    int nbEBands;
    int overlap;
    const opus_int16 *eBands;
+   opus_val16 max_background_increase;
    ALLOC_STACK;
 
    VALIDATE_CELT_DECODER(st);
@@ -942,7 +943,7 @@
 
    /* Check if there are at least two packets received consecutively before
     * turning on the pitch-based PLC */
-   st->skip_plc = st->loss_count != 0;
+   st->skip_plc = st->loss_duration != 0;
 
    if (dec == NULL)
    {
@@ -1140,25 +1141,21 @@
    if (C==1)
       OPUS_COPY(&oldBandE[nbEBands], oldBandE, nbEBands);
 
-   /* In case start or end were to change */
    if (!isTransient)
    {
-      opus_val16 max_background_increase;
       OPUS_COPY(oldLogE2, oldLogE, 2*nbEBands);
       OPUS_COPY(oldLogE, oldBandE, 2*nbEBands);
-      /* In normal circumstances, we only allow the noise floor to increase by
-         up to 2.4 dB/second, but when we're in DTX, we allow up to 6 dB
-         increase for each update.*/
-      if (st->loss_count < 10)
-         max_background_increase = M*QCONST16(0.001f,DB_SHIFT);
-      else
-         max_background_increase = QCONST16(1.f,DB_SHIFT);
-      for (i=0;i<2*nbEBands;i++)
-         backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
    } else {
       for (i=0;i<2*nbEBands;i++)
          oldLogE[i] = MIN16(oldLogE[i], oldBandE[i]);
    }
+   /* In normal circumstances, we only allow the noise floor to increase by
+      up to 2.4 dB/second, but when we're in DTX we give the weight of
+      all missing packets to the update packet. */
+   max_background_increase = IMIN(160, st->loss_duration+M)*QCONST16(0.001f,DB_SHIFT);
+   for (i=0;i<2*nbEBands;i++)
+      backgroundLogE[i] = MIN16(backgroundLogE[i] + max_background_increase, oldBandE[i]);
+   /* In case start or end were to change */
    c=0; do
    {
       for (i=0;i<start;i++)
@@ -1175,7 +1172,7 @@
    st->rng = dec->rng;
 
    deemphasis(out_syn, pcm, N, CC, st->downsample, mode->preemph, st->preemph_memD, accum);
-   st->loss_count = 0;
+   st->loss_duration = 0;
    RESTORE_STACK;
    if (ec_tell(dec) > 8*len)
       return OPUS_INTERNAL_ERROR;
diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
index d6f8afc..637d442 100644
--- a/celt/celt_encoder.c
+++ b/celt/celt_encoder.c
@@ -1719,8 +1719,11 @@
       compute_mdcts(mode, 0, in, freq, C, CC, LM, st->upsample, st->arch);
       compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
       amp2Log2(mode, effEnd, end, bandE, bandLogE2, C);
-      for (i=0;i<C*nbEBands;i++)
-         bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+      for (c=0;c<C;c++)
+      {
+         for (i=0;i<end;i++)
+            bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+      }
    }
 
    compute_mdcts(mode, shortBlocks, in, freq, C, CC, LM, st->upsample, st->arch);
@@ -1856,8 +1859,11 @@
          compute_band_energies(mode, freq, bandE, effEnd, C, LM, st->arch);
          amp2Log2(mode, effEnd, end, bandE, bandLogE, C);
          /* Compensate for the scaling of short vs long mdcts */
-         for (i=0;i<C*nbEBands;i++)
-            bandLogE2[i] += HALF16(SHL16(LM, DB_SHIFT));
+         for (c=0;c<C;c++)
+         {
+            for (i=0;i<end;i++)
+               bandLogE2[nbEBands*c+i] += HALF16(SHL16(LM, DB_SHIFT));
+         }
          tf_estimate = QCONST16(.2f,14);
       }
    }
diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index 8ecb693..f91721b 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -50,17 +50,21 @@
 #endif
 
    OPUS_CLEAR(lpc, p);
+#ifdef FIXED_POINT
    if (ac[0] != 0)
+#else
+   if (ac[0] > 1e-10f)
+#endif
    {
       for (i = 0; i < p; i++) {
          /* Sum up this iteration's reflection coefficient */
          opus_val32 rr = 0;
          for (j = 0; j < i; j++)
             rr += MULT32_32_Q31(lpc[j],ac[i - j]);
-         rr += SHR32(ac[i + 1],3);
-         r = -frac_div32(SHL32(rr,3), error);
+         rr += SHR32(ac[i + 1],6);
+         r = -frac_div32(SHL32(rr,6), error);
          /*  Update LPC coefficients and total error */
-         lpc[i] = SHR32(r,3);
+         lpc[i] = SHR32(r,6);
          for (j = 0; j < (i+1)>>1; j++)
          {
             opus_val32 tmp1, tmp2;
@@ -73,17 +77,61 @@
          error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error);
          /* Bail out once we get 30 dB gain */
 #ifdef FIXED_POINT
-         if (error<SHR32(ac[0],10))
+         if (error<=SHR32(ac[0],10))
             break;
 #else
-         if (error<.001f*ac[0])
+         if (error<=.001f*ac[0])
             break;
 #endif
       }
    }
 #ifdef FIXED_POINT
-   for (i=0;i<p;i++)
-      _lpc[i] = ROUND16(lpc[i],16);
+   {
+      /* Convert the int32 lpcs to int16 and ensure there are no wrap-arounds.
+         This reuses the logic in silk_LPC_fit() and silk_bwexpander_32(). Any bug
+         fixes should also be applied there. */
+      int iter, idx = 0;
+      opus_val32 maxabs, absval, chirp_Q16, chirp_minus_one_Q16;
+
+      for (iter = 0; iter < 10; iter++) {
+         maxabs = 0;
+         for (i = 0; i < p; i++) {
+            absval = ABS32(lpc[i]);
+            if (absval > maxabs) {
+               maxabs = absval;
+               idx = i;
+            }
+         }
+         maxabs = PSHR32(maxabs, 13);  /* Q25->Q12 */
+
+         if (maxabs > 32767) {
+            maxabs = MIN32(maxabs, 163838);
+            chirp_Q16 = QCONST32(0.999, 16) - DIV32(SHL32(maxabs - 32767, 14),
+                                                    SHR32(MULT32_32_32(maxabs, idx + 1), 2));
+            chirp_minus_one_Q16 = chirp_Q16 - 65536;
+
+            /* Apply bandwidth expansion. */
+            for (i = 0; i < p - 1; i++) {
+               lpc[i] = MULT32_32_Q16(chirp_Q16, lpc[i]);
+               chirp_Q16 += PSHR32(MULT32_32_32(chirp_Q16, chirp_minus_one_Q16), 16);
+            }
+            lpc[p - 1] = MULT32_32_Q16(chirp_Q16, lpc[p - 1]);
+         } else {
+            break;
+         }
+      }
+
+      if (iter == 10) {
+         /* If the coeffs still do not fit into the 16 bit range after 10 iterations, fall back to
+            the A(z)=1 filter. Clear the output array itself so no stale coefficients remain. */
+         OPUS_CLEAR(_lpc, p);
+         _lpc[0] = 4096;  /* Q12 */
+      } else {
+         for (i = 0; i < p; i++) {
+            _lpc[i] = EXTRACT16(PSHR32(lpc[i], 13));  /* Q25->Q12 */
+         }
+      }
+   }
 #endif
 }
 
@@ -111,17 +159,17 @@
       sum[2] = SHL32(EXTEND32(x[i+2]), SIG_SHIFT);
       sum[3] = SHL32(EXTEND32(x[i+3]), SIG_SHIFT);
       xcorr_kernel(rnum, x+i-ord, sum, ord, arch);
-      y[i  ] = ROUND16(sum[0], SIG_SHIFT);
-      y[i+1] = ROUND16(sum[1], SIG_SHIFT);
-      y[i+2] = ROUND16(sum[2], SIG_SHIFT);
-      y[i+3] = ROUND16(sum[3], SIG_SHIFT);
+      y[i  ] = SROUND16(sum[0], SIG_SHIFT);
+      y[i+1] = SROUND16(sum[1], SIG_SHIFT);
+      y[i+2] = SROUND16(sum[2], SIG_SHIFT);
+      y[i+3] = SROUND16(sum[3], SIG_SHIFT);
    }
    for (;i<N;i++)
    {
       opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
          sum = MAC16_16(sum,rnum[j],x[i+j-ord]);
-      y[i] = ROUND16(sum, SIG_SHIFT);
+      y[i] = SROUND16(sum, SIG_SHIFT);
    }
    RESTORE_STACK;
 }
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index 68fc606..7b5c56c 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -43,10 +43,11 @@
  */
 #define OPUS_ARCHMASK 3
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#elif defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
 
 #include "x86/x86cpu.h"
 /* We currently support 5 x86 variants:
diff --git a/celt/fixed_debug.h b/celt/fixed_debug.h
index f435295..ef2e5d0 100644
--- a/celt/fixed_debug.h
+++ b/celt/fixed_debug.h
@@ -167,7 +167,7 @@
 #define SHL16(a, shift) SHL16_(a, shift, __FILE__, __LINE__)
 static OPUS_INLINE short SHL16_(int a, int shift, char *file, int line)
 {
-   int res;
+   opus_int32 res;
    if (!VERIFY_SHORT(a) || !VERIFY_SHORT(shift))
    {
       fprintf (stderr, "SHL16: inputs are not short: %d %d in %s: line %d\n", a, shift, file, line);
@@ -175,7 +175,7 @@
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int32)((opus_uint32)a<<shift);
    if (!VERIFY_SHORT(res))
    {
       fprintf (stderr, "SHL16: output is not short: %d in %s: line %d\n", res, file, line);
@@ -214,15 +214,15 @@
    opus_int64  res;
    if (!VERIFY_INT(a) || !VERIFY_SHORT(shift))
    {
-      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", a, shift, file, line);
+      fprintf (stderr, "SHL32: inputs are not int: %lld %d in %s: line %d\n", (long long)a, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
    }
-   res = a<<shift;
+   res = (opus_int64)((opus_uint64)a<<shift);
    if (!VERIFY_INT(res))
    {
-      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", a, shift, res, file, line);
+      fprintf (stderr, "SHL32: output is not int: %lld<<%d = %lld in %s: line %d\n", (long long)a, shift, (long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -339,7 +339,7 @@
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "UADD32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -347,7 +347,7 @@
    res = a+b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", res, file, line);
+      fprintf (stderr, "UADD32: output is not uint32: %llu in %s: line %d\n", (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -363,14 +363,14 @@
    opus_uint64 res;
    if (!VERIFY_UINT(a) || !VERIFY_UINT(b))
    {
-      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs are not uint32: %llu %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
    }
    if (a<b)
    {
-      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", a, b, file, line);
+      fprintf (stderr, "USUB32: inputs underflow: %llu < %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -378,7 +378,7 @@
    res = a-b;
    if (!VERIFY_UINT(res))
    {
-      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", a, b, res, file, line);
+      fprintf (stderr, "USUB32: output is not uint32: %llu - %llu = %llu in %s: line %d\n", (unsigned long long)a, (unsigned long long)b, (unsigned long long)res, file, line);
 #ifdef FIXED_DEBUG_ASSERT
       celt_assert(0);
 #endif
@@ -410,6 +410,51 @@
    return res;
 }
 
+/* result fits in 32 bits */
+static OPUS_INLINE int MULT32_32_32(opus_int64 a, opus_int64 b)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_32: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = a*b;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_32: output is not int: %lld\n", (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5;
+   return res;
+}
+
+static OPUS_INLINE int MULT32_32_Q16(opus_int64 a, opus_int64 b)
+{
+   opus_int64 res;
+   if (!VERIFY_INT(a) || !VERIFY_INT(b))
+   {
+      fprintf (stderr, "MULT32_32_Q16: inputs are not int: %lld %lld\n", (long long)a, (long long)b);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   res = ((opus_int64)(a)*(opus_int64)(b)) >> 16;
+   if (!VERIFY_INT(res))
+   {
+      fprintf (stderr, "MULT32_32_Q16: output is not int: %lld*%lld=%lld\n", (long long)a, (long long)b, (long long)res);
+#ifdef FIXED_DEBUG_ASSERT
+      celt_assert(0);
+#endif
+   }
+   celt_mips+=5;
+   return res;
+}
+
 #define MULT16_16(a, b) MULT16_16_(a, b, __FILE__, __LINE__)
 static OPUS_INLINE int MULT16_16_(int a, int b, char *file, int line)
 {
@@ -446,7 +491,7 @@
       celt_assert(0);
 #endif
    }
-   if (ABS32(b)>=((opus_val32)(1)<<(15+Q)))
+   if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
    {
       fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n", Q, (int)a, (int)b, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -479,7 +524,7 @@
       celt_assert(0);
 #endif
    }
-   if (ABS32(b)>=((opus_int64)(1)<<(15+Q)))
+   if (ABS32(b)>=((opus_int64)(1)<<(16+Q)))
    {
       fprintf (stderr, "MULT16_32_Q%d: second operand too large: %d %d in %s: line %d\n\n", Q, (int)a, (int)b,file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -786,6 +831,6 @@
 
 
 #undef PRINT_MIPS
-#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", celt_mips);} while (0);
+#define PRINT_MIPS(file) do {fprintf (file, "total complexity = %llu MIPS\n", (unsigned long long)celt_mips);} while (0);
 
 #endif
diff --git a/celt/fixed_generic.h b/celt/fixed_generic.h
index 0ecbb89..8f29d46 100644
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -57,6 +57,13 @@
 #define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),((b)&0x0000ffff)),15))
 #endif
 
+/** 32x32 multiplication, followed by a 16-bit shift right. Result fits in 32 bits */
+#if OPUS_FAST_INT64
+#define MULT32_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),16))
+#else
+#define MULT32_32_Q16(a,b) (ADD32(ADD32(ADD32((opus_val32)(SHR32(((opus_uint32)((a)&0x0000ffff)*(opus_uint32)((b)&0x0000ffff)),16)), MULT16_16SU(SHR32(a,16),((b)&0x0000ffff))), MULT16_16SU(SHR32(b,16),((a)&0x0000ffff))), SHL32(MULT16_16(SHR32(a,16),SHR32(b,16)),16)))
+#endif
+
 /** 32x32 multiplication, followed by a 31-bit shift right. Results fits in 32 bits */
 #if OPUS_FAST_INT64
 #define MULT32_32_Q31(a,b) ((opus_val32)SHR((opus_int64)(a)*(opus_int64)(b),31))
@@ -131,6 +138,9 @@
 /** 16x16 multiplication where the result fits in 16 bits */
 #define MULT16_16_16(a,b)     ((((opus_val16)(a))*((opus_val16)(b))))
 
+/** 32x32 multiplication where the result fits in 32 bits */
+#define MULT32_32_32(a,b)     ((((opus_val32)(a))*((opus_val32)(b))))
+
 /* (opus_val32)(opus_val16) gives TI compiler a hint that it's 16x16->32 multiply */
 /** 16x16 multiplication where the result fits in 32 bits */
 #define MULT16_16(a,b)     (((opus_val32)(opus_val16)(a))*((opus_val32)(opus_val16)(b)))
diff --git a/celt/float_cast.h b/celt/float_cast.h
index 9d34976..8915a5f 100644
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@@ -99,7 +99,7 @@
                 return intgr ;
         }
 
-#elif defined(HAVE_LRINTF)
+#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 /*      These defines enable functionality introduced with the 1999 ISO C
 **      standard. They must be defined before the inclusion of math.h to
@@ -117,7 +117,7 @@
 #include <math.h>
 #define float2int(x) lrintf(x)
 
-#elif (defined(HAVE_LRINT))
+#elif defined(HAVE_LRINT) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 
 #define _ISOC9X_SOURCE 1
 #define _ISOC99_SOURCE 1
diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h
index bffa2bf..267f72f 100644
--- a/celt/kiss_fft.h
+++ b/celt/kiss_fft.h
@@ -52,6 +52,10 @@
 #  define kiss_fft_scalar opus_int32
 #  define kiss_twiddle_scalar opus_int16
 
+/* Some 32-bit CPUs would load/store a kiss_twiddle_cpx with a single memory
+ * access, and could benefit from additional alignment.
+ */
+#  define KISS_TWIDDLE_CPX_ALIGNMENT (sizeof(opus_int32))
 
 #else
 # ifndef kiss_fft_scalar
@@ -62,6 +66,12 @@
 # endif
 #endif
 
+#if defined(__GNUC__) && defined(KISS_TWIDDLE_CPX_ALIGNMENT)
+#define KISS_TWIDDLE_CPX_ALIGNED __attribute__((aligned(KISS_TWIDDLE_CPX_ALIGNMENT)))
+#else
+#define KISS_TWIDDLE_CPX_ALIGNED
+#endif
+
 typedef struct {
     kiss_fft_scalar r;
     kiss_fft_scalar i;
@@ -70,7 +80,7 @@
 typedef struct {
    kiss_twiddle_scalar r;
    kiss_twiddle_scalar i;
-}kiss_twiddle_cpx;
+} KISS_TWIDDLE_CPX_ALIGNED kiss_twiddle_cpx;
 
 #define MAXFACTORS 8
 /* e.g. an fft of length 128 has 4 factors
diff --git a/celt/mathops.h b/celt/mathops.h
index fe29dac..478ac91 100644
--- a/celt/mathops.h
+++ b/celt/mathops.h
@@ -153,7 +153,7 @@
       float f;
       opus_uint32 i;
    } res;
-   integer = floor(x);
+   integer = (int)floor(x);
    if (integer < -50)
       return 0;
    frac = x-integer;
diff --git a/celt/meson.build b/celt/meson.build
index 370ea1f..ad95d94 100644
--- a/celt/meson.build
+++ b/celt/meson.build
@@ -10,6 +10,10 @@
 
 celt_static_libs = []
 
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+  celt_sources +=  sources['CELT_SOURCES_X86_RTCD']
+endif
+
 foreach intr_name : ['sse', 'sse2', 'sse4_1', 'neon_intr']
   have_intr = get_variable('have_' + intr_name)
   if not have_intr
@@ -30,7 +34,9 @@
 endif
 
 if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
-  celt_sources +=  sources['CELT_SOURCES_ARM']
+  if opus_conf.has('OPUS_HAVE_RTCD')
+    celt_sources +=  sources['CELT_SOURCES_ARM_RTCD']
+  endif
   if have_arm_ne10
     celt_sources += sources['CELT_SOURCES_ARM_NE10']
   endif
diff --git a/celt/modes.c b/celt/modes.c
index 390c5e8..23f7cde 100644
--- a/celt/modes.c
+++ b/celt/modes.c
@@ -173,7 +173,10 @@
    mode->nbAllocVectors = BITALLOC_SIZE;
    allocVectors = opus_alloc(sizeof(unsigned char)*(BITALLOC_SIZE*mode->nbEBands));
    if (allocVectors==NULL)
+   {
+      mode->allocVectors = NULL;
       return;
+   }
 
    /* Check for standard mode */
    if (mode->Fs == 400*(opus_int32)mode->shortMdctSize)
diff --git a/celt/pitch.c b/celt/pitch.c
index 872582a..7998db4 100644
--- a/celt/pitch.c
+++ b/celt/pitch.c
@@ -161,17 +161,26 @@
       shift=0;
    if (C==2)
       shift++;
-#endif
    for (i=1;i<len>>1;i++)
-      x_lp[i] = SHR32(HALF32(HALF32(x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]), shift);
-   x_lp[0] = SHR32(HALF32(HALF32(x[0][1])+x[0][0]), shift);
+      x_lp[i] = SHR32(x[0][(2*i-1)], shift+2) + SHR32(x[0][(2*i+1)], shift+2) + SHR32(x[0][2*i], shift+1);
+   x_lp[0] = SHR32(x[0][1], shift+2) + SHR32(x[0][0], shift+1);
    if (C==2)
    {
       for (i=1;i<len>>1;i++)
-         x_lp[i] += SHR32(HALF32(HALF32(x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]), shift);
-      x_lp[0] += SHR32(HALF32(HALF32(x[1][1])+x[1][0]), shift);
+         x_lp[i] += SHR32(x[1][(2*i-1)], shift+2) + SHR32(x[1][(2*i+1)], shift+2) + SHR32(x[1][2*i], shift+1);
+      x_lp[0] += SHR32(x[1][1], shift+2) + SHR32(x[1][0], shift+1);
    }
-
+#else
+   for (i=1;i<len>>1;i++)
+      x_lp[i] = .25f*x[0][(2*i-1)] + .25f*x[0][(2*i+1)] + .5f*x[0][2*i];
+   x_lp[0] = .25f*x[0][1] + .5f*x[0][0];
+   if (C==2)
+   {
+      for (i=1;i<len>>1;i++)
+         x_lp[i] += .25f*x[1][(2*i-1)] + .25f*x[1][(2*i+1)] + .5f*x[1][2*i];
+      x_lp[0] += .25f*x[1][1] + .5f*x[1][0];
+   }
+#endif
    _celt_autocorr(x_lp, ac, NULL, 0,
                   4, len>>1, arch);
 
@@ -249,7 +258,7 @@
    opus_val32 maxcorr=1;
 #endif
    celt_assert(max_pitch>0);
-   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert(((size_t)_x&3)==0);
    for (i=0;i<max_pitch-3;i+=4)
    {
       opus_val32 sum[4]={0,0,0,0};
diff --git a/celt/rate.c b/celt/rate.c
index 465e1ba..7f7ad3f 100644
--- a/celt/rate.c
+++ b/celt/rate.c
@@ -356,6 +356,8 @@
             else
                depth_threshold = 0;
 #ifdef FUZZING
+            (void)signalBandwidth;
+            (void)depth_threshold;
             if ((rand()&0x1) == 0)
 #else
             if (codedBands<=start+2 || (band_bits > (depth_threshold*band_width<<LM<<BITRES)>>4 && j<=signalBandwidth))
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 70f8f49..ae9a7b5 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -144,8 +144,9 @@
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
diff --git a/celt/tests/test_unit_entropy.c b/celt/tests/test_unit_entropy.c
index 7f67452..b1619b7 100644
--- a/celt/tests/test_unit_entropy.c
+++ b/celt/tests/test_unit_entropy.c
@@ -104,7 +104,7 @@
   nbits=ec_tell_frac(&enc);
   ec_enc_done(&enc);
   fprintf(stderr,
-   "Encoded %0.2lf bits of entropy to %0.2lf bits (%0.3lf%% wasted).\n",
+   "Encoded %0.2f bits of entropy to %0.2f bits (%0.3f%% wasted).\n",
    entropy,ldexp(nbits,-3),100*(nbits-ldexp(entropy,3))/nbits);
   fprintf(stderr,"Packed to %li bytes.\n",(long)ec_range_bytes(&enc));
   ec_dec_init(&dec,ptr,DATA_SIZE);
@@ -129,7 +129,7 @@
   nbits2=ec_tell_frac(&dec);
   if(nbits!=nbits2){
     fprintf(stderr,
-     "Reported number of bits used was %0.2lf, should be %0.2lf.\n",
+     "Reported number of bits used was %0.2f, should be %0.2f.\n",
      ldexp(nbits2,-3),ldexp(nbits,-3));
     ret=-1;
   }
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 4a563cc..844c5b4 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -184,8 +184,9 @@
 
 int main(int argc,char ** argv)
 {
+    int arch;
     ALLOC_STACK;
-    int arch = opus_select_arch();
+    arch = opus_select_arch();
 
     if (argc>1) {
         int k;
diff --git a/celt/x86/celt_lpc_sse.h b/celt/x86/celt_lpc_sse.h
index 7d1ecf7..90e69ec 100644
--- a/celt/x86/celt_lpc_sse.h
+++ b/celt/x86/celt_lpc_sse.h
@@ -33,7 +33,6 @@
 #endif
 
 #if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
-#define OVERRIDE_CELT_FIR
 
 void celt_fir_sse4_1(
          const opus_val16 *x,
@@ -44,10 +43,11 @@
          int arch);
 
 #if defined(OPUS_X86_PRESUME_SSE4_1)
+#define OVERRIDE_CELT_FIR
 #define celt_fir(x, num, y, N, ord, arch) \
     ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch))
 
-#else
+#elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
          const opus_val16 *x,
@@ -57,6 +57,7 @@
          int ord,
          int arch);
 
+#define OVERRIDE_CELT_FIR
 #  define celt_fir(x, num, y, N, ord, arch) \
     ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch))
 
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index f7a014b..964aef5 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -63,7 +63,7 @@
 #define xcorr_kernel(x, y, sum, len, arch) \
     ((void)arch, xcorr_kernel_sse(x, y, sum, len))
 
-#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) &&  ((defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
@@ -115,8 +115,8 @@
     ((void)arch, celt_inner_prod_sse(x, y, N))
 
 
-#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
-    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+#elif defined(OPUS_HAVE_RTCD) && (((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+    (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)))
 
 extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_val16 *x,
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index a092c68..2bc5783 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,14 @@
     __m128i sum0, sum1, sum2, sum3, vecSum;
     __m128i initSum;
 
+#ifdef OPUS_CHECK_ASM
+    opus_val32 sum_c[4];
+    for (j=0;j<4;j++) {
+      sum_c[j] = sum[j];
+    }
+    xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
     celt_assert(len >= 3);
 
     sum0 = _mm_setzero_si128();
@@ -177,19 +185,56 @@
         vecSum = _mm_add_epi32(vecSum, sum2);
     }
 
-    for (;j<len;j++)
+    vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+    if (len - j == 3)
     {
-        vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
-        vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+        vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX2 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+        vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+        sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+        vecSum = _mm_add_epi32(vecSum, sum2);
+    }
+    else if (len - j == 2)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+        vecX1 = _mm_shuffle_epi32(vecX, 0xff);
+
+        vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+        vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+
+        sum0 = _mm_mullo_epi32(vecX0, vecY0);
+        sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+        vecSum = _mm_add_epi32(vecSum, sum0);
+        vecSum = _mm_add_epi32(vecSum, sum1);
+    }
+    else if (len - j == 1)
+    {
+        vecX0 = _mm_shuffle_epi32(vecX, 0xff);
 
         vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
 
         sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
         vecSum = _mm_add_epi32(vecSum, sum0);
     }
 
     initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
     initSum = _mm_add_epi32(initSum, vecSum);
     _mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
 }
 #endif
diff --git a/celt/x86/x86cpu.c b/celt/x86/x86cpu.c
index 080eb25..6a1914d 100644
--- a/celt/x86/x86cpu.c
+++ b/celt/x86/x86cpu.c
@@ -35,11 +35,11 @@
 #include "pitch.h"
 #include "x86cpu.h"
 
-#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+#if defined(OPUS_HAVE_RTCD) && \
+  ((defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
   (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
-  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
-
+  (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)))
 
 #if defined(_MSC_VER)
 
@@ -68,7 +68,8 @@
         "=r" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #else
     __asm__ __volatile__ (
@@ -77,11 +78,22 @@
         "=b" (CPUInfo[1]),
         "=c" (CPUInfo[2]),
         "=d" (CPUInfo[3]) :
-        "0" (InfoType)
+        /* We clear ECX to avoid a valgrind false-positive prior to v3.17.0. */
+        "0" (InfoType), "2" (0)
     );
 #endif
 #elif defined(CPU_INFO_BY_C)
-    __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+    /* We use __get_cpuid_count to clear ECX to avoid a valgrind false-positive
+        prior to v3.17.0.*/
+    if (!__get_cpuid_count(InfoType, 0, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]))) {
+        /* Our function cannot fail, but __get_cpuid{_count} can.
+           Returning all zeroes will effectively disable all SIMD, which is
+            what we want on CPUs that don't support CPUID. */
+        CPUInfo[3] = CPUInfo[2] = CPUInfo[1] = CPUInfo[0] = 0;
+    }
+#else
+# error "Configured to use x86 RTCD, but no CPU detection method available. " \
+ "Reconfigure with --disable-rtcd (or send patches)."
 #endif
 }
 
@@ -98,7 +110,7 @@
 
 static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
 {
-    unsigned int info[4] = {0};
+    unsigned int info[4];
     unsigned int nIds = 0;
 
     cpuid(info, 0);
@@ -119,7 +131,7 @@
     }
 }
 
-int opus_select_arch(void)
+static int opus_select_arch_impl(void)
 {
     CPU_Feature cpu_feature;
     int arch;
@@ -154,4 +166,13 @@
     return arch;
 }
 
+int opus_select_arch(void) {
+    int arch = opus_select_arch_impl(); /* value reported by the static detection helper */
+#ifdef FUZZING
+    /* Randomly downgrade the architecture: pick a value from 0 to arch (arch assumed non-negative). */
+    arch = rand()%(arch+1);
+#endif
+    return arch;
+}
+
 #endif
diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h
index 1e2bf17..04e8048 100644
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -56,40 +56,18 @@
 int opus_select_arch(void);
 # endif
 
-/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
-  or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
-  actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
-  reference, these require 16-byte alignment and load a full 16 bytes (instead
-  of 4 or 8), possibly reading out of bounds.
+/*MOVD should not impose any alignment restrictions, but the C standard does,
+   and UBSan will report errors if we actually make unaligned accesses.
+  Use this to work around those restrictions (which should hopefully all get
+   optimized to a single MOVD instruction).*/
+#define OP_LOADU_EPI32(x) \
+  (int)((*(unsigned char *)(x) | *((unsigned char *)(x) + 1) << 8U |\
+   *((unsigned char *)(x) + 2) << 16U | (opus_uint32)*((unsigned char *)(x) + 3) << 24U))
 
-  We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
-  _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
-  reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
-  optimize this out when optimizations ARE enabled.
+#define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(OP_LOADU_EPI32(x))))
 
-  Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
-  (which is fair, since technically the compiler is always allowed to do the
-  dereference before invoking the function implementing the intrinsic).
-  However, it is smart enough to eliminate the extra MOVD instruction.
-  For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
-  the extra MOVQ if it's specified explicitly */
-
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
-# else
-#  define OP_CVTEPI8_EPI32_M32(x) \
- (_mm_cvtepi8_epi32(*(__m128i *)(x)))
-#endif
-
-/* similar reasoning about the instruction sequence as in the 32-bit macro above,
- */
-# if defined(__clang__) || !defined(__OPTIMIZE__)
-#  define OP_CVTEPI16_EPI32_M64(x) \
+#define OP_CVTEPI16_EPI32_M64(x) \
  (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
-# else
-#  define OP_CVTEPI16_EPI32_M64(x) \
- (_mm_cvtepi16_epi32(*(__m128i *)(x)))
-# endif
 
 #endif
diff --git a/celt_sources.mk b/celt_sources.mk
index c9dab06..d6b6765 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -18,9 +18,11 @@
 celt/rate.c \
 celt/vq.c
 
-CELT_SOURCES_SSE = \
+CELT_SOURCES_X86_RTCD = \
 celt/x86/x86cpu.c \
-celt/x86/x86_celt_map.c \
+celt/x86/x86_celt_map.c
+
+CELT_SOURCES_SSE = \
 celt/x86/pitch_sse.c
 
 CELT_SOURCES_SSE2 = \
@@ -31,7 +33,7 @@
 celt/x86/celt_lpc_sse4_1.c \
 celt/x86/pitch_sse4_1.c
 
-CELT_SOURCES_ARM = \
+CELT_SOURCES_ARM_RTCD = \
 celt/arm/armcpu.c \
 celt/arm/arm_celt_map.c
 
diff --git a/cmake/OpusConfig.cmake b/cmake/OpusConfig.cmake
index 8d19a53..b82307a 100644
--- a/cmake/OpusConfig.cmake
+++ b/cmake/OpusConfig.cmake
@@ -9,18 +9,20 @@
 add_definitions(-DHAVE_CONFIG_H)
 
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-set_property(GLOBAL PROPERTY C_STANDARD 99)
+
+if(MSVC)
+  # For compilers that have no notion of a C standard level,
+  # such as Microsoft Visual C++ before VS 16.7,
+  # this property has no effect.
+  set(CMAKE_C_STANDARD 11)
+else()
+  set(CMAKE_C_STANDARD 99)
+endif()
 
 if(MSVC)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 endif()
 
-include(CheckLibraryExists)
-check_library_exists(m floor "" HAVE_LIBM)
-if(HAVE_LIBM)
-  list(APPEND OPUS_REQUIRED_LIBRARIES m)
-endif()
-
 include(CFeatureCheck)
 c_feature_check(VLA)
 
@@ -35,9 +37,18 @@
   check_symbol_exists(alloca "stdlib.h;malloc.h" USE_ALLOCA_SUPPORTED)
 endif()
 
-include(CheckFunctionExists)
-check_function_exists(lrintf HAVE_LRINTF)
-check_function_exists(lrint HAVE_LRINT)
+include(CMakePushCheckState)
+cmake_push_check_state(RESET)
+include(CheckLibraryExists)
+check_library_exists(m floor "" HAVE_LIBM)
+if(HAVE_LIBM)
+  list(APPEND OPUS_REQUIRED_LIBRARIES m)
+  set(CMAKE_REQUIRED_LIBRARIES m)
+endif()
+
+check_symbol_exists(lrintf "math.h" HAVE_LRINTF)
+check_symbol_exists(lrint "math.h" HAVE_LRINT)
+cmake_pop_check_state()
 
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "(i[0-9]86|x86|X86|amd64|AMD64|x86_64)")
   if(CMAKE_SIZEOF_VOID_P EQUAL 8)
diff --git a/cmake/OpusFunctions.cmake b/cmake/OpusFunctions.cmake
index fcf3351..3f22ad8 100644
--- a/cmake/OpusFunctions.cmake
+++ b/cmake/OpusFunctions.cmake
@@ -142,14 +142,28 @@
 endfunction()
 
 function(opus_supports_cpu_detection RUNTIME_CPU_CAPABILITY_DETECTION)
-  if(MSVC)
-    check_include_file(intrin.h HAVE_INTRIN_H)
-  else()
-    check_include_file(cpuid.h HAVE_CPUID_H)
-  endif()
-  if(HAVE_INTRIN_H OR HAVE_CPUID_H)
-    set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
-  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
+  set(RUNTIME_CPU_CAPABILITY_DETECTION 0 PARENT_SCOPE)
+  if(OPUS_CPU_X86 OR OPUS_CPU_X64)
+    if(MSVC)
+      check_include_file(intrin.h HAVE_INTRIN_H)
+      if(HAVE_INTRIN_H)
+        # if intrin.h is available we assume __cpuid is there
+        set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
+      endif()
+    else()
+      include(CFeatureCheck)
+      c_feature_check(CPU_INFO_BY_ASM)
+      set(CPU_INFO_BY_ASM_SUPPORTED ${CPU_INFO_BY_ASM_SUPPORTED} PARENT_SCOPE)
+      check_include_file(cpuid.h HAVE_CPUID_H)
+      if(HAVE_CPUID_H)
+        c_feature_check(CPU_INFO_BY_C)
+        set(CPU_INFO_BY_C_SUPPORTED ${CPU_INFO_BY_C_SUPPORTED} PARENT_SCOPE)
+      endif()
+      if(CPU_INFO_BY_ASM_SUPPORTED OR CPU_INFO_BY_C_SUPPORTED)
+        set(RUNTIME_CPU_CAPABILITY_DETECTION 1 PARENT_SCOPE)
+      endif()
+    endif()
+  elseif(OPUS_CPU_ARM)
     # ARM cpu detection is implemented for Windows and anything
     # using a Linux kernel (such as Android).
     if (CMAKE_SYSTEM_NAME MATCHES "(Windows|Linux|Android)")
diff --git a/cmake/OpusSources.cmake b/cmake/OpusSources.cmake
index 01e75d1..b47f8c6 100644
--- a/cmake/OpusSources.cmake
+++ b/cmake/OpusSources.cmake
@@ -9,9 +9,11 @@
 get_opus_sources(SILK_SOURCES silk_sources.mk silk_sources)
 get_opus_sources(SILK_SOURCES_FLOAT silk_sources.mk silk_sources_float)
 get_opus_sources(SILK_SOURCES_FIXED silk_sources.mk silk_sources_fixed)
+get_opus_sources(SILK_SOURCES_X86_RTCD silk_sources.mk silk_sources_x86_rtcd)
 get_opus_sources(SILK_SOURCES_SSE4_1 silk_sources.mk silk_sources_sse4_1)
 get_opus_sources(SILK_SOURCES_FIXED_SSE4_1 silk_sources.mk
                  silk_sources_fixed_sse4_1)
+get_opus_sources(SILK_SOURCES_ARM_RTCD silk_sources.mk silk_sources_arm_rtcd)
 get_opus_sources(SILK_SOURCES_ARM_NEON_INTR silk_sources.mk
                  silk_sources_arm_neon_intr)
 get_opus_sources(SILK_SOURCES_FIXED_ARM_NEON_INTR silk_sources.mk
@@ -23,10 +25,11 @@
 
 get_opus_sources(CELT_HEAD celt_headers.mk celt_headers)
 get_opus_sources(CELT_SOURCES celt_sources.mk celt_sources)
+get_opus_sources(CELT_SOURCES_X86_RTCD celt_sources.mk celt_sources_x86_rtcd)
 get_opus_sources(CELT_SOURCES_SSE celt_sources.mk celt_sources_sse)
 get_opus_sources(CELT_SOURCES_SSE2 celt_sources.mk celt_sources_sse2)
 get_opus_sources(CELT_SOURCES_SSE4_1 celt_sources.mk celt_sources_sse4_1)
-get_opus_sources(CELT_SOURCES_ARM celt_sources.mk celt_sources_arm)
+get_opus_sources(CELT_SOURCES_ARM_RTCD celt_sources.mk celt_sources_arm_rtcd)
 get_opus_sources(CELT_SOURCES_ARM_ASM celt_sources.mk celt_sources_arm_asm)
 get_opus_sources(CELT_AM_SOURCES_ARM_ASM celt_sources.mk
                  celt_am_sources_arm_asm)
diff --git a/cmake/RunTest.cmake b/cmake/RunTest.cmake
new file mode 100644
index 0000000..f6f8b4a
--- /dev/null
+++ b/cmake/RunTest.cmake
@@ -0,0 +1,61 @@
+# Runs ${TEST_EXECUTABLE} and raises FATAL_ERROR on a non-zero result (Android: via adb; iOS: unsupported).
+if(NOT EXISTS "${TEST_EXECUTABLE}")
+    message(FATAL_ERROR "Error could not find ${TEST_EXECUTABLE}, ensure that you built the test binary")
+endif()
+
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+  # support to run plain old binary on android devices
+  # requires android debug bridge to be installed
+
+  find_program(adb_executable adb)
+  if(NOT adb_executable)
+    message(FATAL_ERROR "Error could not find adb")
+  endif()
+
+  # check if any device emulator is attached
+  execute_process(COMMAND "${adb_executable}" shell echo RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error adb: no devices/emulators found")
+  endif()
+
+  # push binary
+  set(android_path /data/local/tmp)
+  execute_process(COMMAND "${adb_executable}" push "${TEST_EXECUTABLE}" ${android_path} RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} push ${TEST_EXECUTABLE} ${android_path} failed with result ${CMD_RESULT}")
+  endif()
+
+  # set permissions (the on-device path is derived from android_path so push and run always agree)
+  get_filename_component(test_executable "${TEST_EXECUTABLE}" NAME)
+  set(test_executable_on_android ${android_path}/${test_executable})
+  execute_process(COMMAND "${adb_executable}" shell chmod 555 ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell chmod 555 ${test_executable_on_android} failed with result ${CMD_RESULT}")
+  endif()
+
+  # run executable; its result is checked only after cleanup so a failing test never leaves the binary on the device
+  execute_process(COMMAND "${adb_executable}" shell ${test_executable_on_android} RESULT_VARIABLE TEST_RESULT)
+
+  # clean up binary
+  execute_process(COMMAND "${adb_executable}" shell rm ${test_executable_on_android} RESULT_VARIABLE CMD_RESULT)
+  if(TEST_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell ${test_executable_on_android} failed with result ${TEST_RESULT}")
+  endif()
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${adb_executable} shell rm ${test_executable_on_android} failed with result ${CMD_RESULT}")
+  endif()
+
+elseif(CMAKE_SYSTEM_NAME STREQUAL "iOS")
+  # CTest doesn't support iOS
+
+  message(FATAL_ERROR "Error CTest is not supported on iOS")
+
+else()
+  # for other platforms just execute test binary on host
+
+  execute_process(COMMAND "${TEST_EXECUTABLE}" RESULT_VARIABLE CMD_RESULT)
+  if(CMD_RESULT)
+    message(FATAL_ERROR "Error running ${TEST_EXECUTABLE} failed with result ${CMD_RESULT}")
+  endif()
+
+endif()
\ No newline at end of file
diff --git a/cmake/cpu_info_by_asm.c b/cmake/cpu_info_by_asm.c
new file mode 100644
index 0000000..1a70a81
--- /dev/null
+++ b/cmake/cpu_info_by_asm.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+/* Probe for the CPU_INFO_BY_ASM feature check: builds only if the inline-asm CPUID sequence is accepted. */
+int main(void) {
+    unsigned int CPUInfo0, CPUInfo1;
+    unsigned int CPUInfo2, CPUInfo3;
+    /* Initialized because it is an asm input below; reading it indeterminate can trip -Wuninitialized. */
+    unsigned int InfoType = 0;
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is PIC register in 32-bit, so mustn't clobber it. */
+    __asm__ __volatile__ (
+        "xchg %%ebx, %1\n"
+        "cpuid\n"
+        "xchg %%ebx, %1\n":
+        "=a" (CPUInfo0),
+        "=r" (CPUInfo1),
+        "=c" (CPUInfo2),
+        "=d" (CPUInfo3) :
+        "0" (InfoType), "2" (0)
+    );
+#else
+    __asm__ __volatile__ (
+        "cpuid":
+        "=a" (CPUInfo0),
+        "=b" (CPUInfo1),
+        "=c" (CPUInfo2),
+        "=d" (CPUInfo3) :
+        "0" (InfoType), "2" (0)
+    );
+#endif
+    return 0;
+}
diff --git a/cmake/cpu_info_by_c.c b/cmake/cpu_info_by_c.c
new file mode 100644
index 0000000..117084e
--- /dev/null
+++ b/cmake/cpu_info_by_c.c
@@ -0,0 +1,9 @@
+#include <cpuid.h>
+/* Probe for the CPU_INFO_BY_C feature check: builds only if <cpuid.h> provides __get_cpuid_count(). */
+int main(void) {
+    unsigned int CPUInfo0, CPUInfo1;
+    unsigned int CPUInfo2, CPUInfo3;
+    /* Initialized so the probe never passes an indeterminate value as the CPUID leaf. */
+    unsigned int InfoType = 0;
+    return __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+}
diff --git a/configure.ac b/configure.ac
index f12f0aa..1d426f2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -195,6 +195,7 @@
 
 rtcd_support=no
 cpu_arm=no
+cpu_x86=no
 
 AS_IF([test x"${enable_asm}" = x"yes"],[
     inline_optimization="No inline ASM for your platform, please send patches"
@@ -535,6 +536,7 @@
    ],
    [i?86|x86_64],
    [
+      cpu_x86=yes
       OPUS_CHECK_INTRINSICS(
          [SSE],
          [$X86_SSE_CFLAGS],
@@ -724,7 +726,7 @@
                  unsigned int CPUInfo2;
                  unsigned int CPUInfo3;
                  unsigned int InfoType;
-                 __get_cpuid(InfoType, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
+                 __get_cpuid_count(InfoType, 0, &CPUInfo0, &CPUInfo1, &CPUInfo2, &CPUInfo3);
             ]])],
             [AC_MSG_RESULT([C method])
                  AC_DEFINE([CPU_INFO_BY_C], [1], [Get CPU Info by c method])],
@@ -744,6 +746,7 @@
     [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"])
 AM_CONDITIONAL([HAVE_ARM_NE10],
     [test x"$HAVE_ARM_NE10" = x"1"])
+AM_CONDITIONAL([CPU_X86], [test "$cpu_x86" = "yes"])
 AM_CONDITIONAL([HAVE_SSE],
     [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"])
 AM_CONDITIONAL([HAVE_SSE2],
@@ -753,6 +756,8 @@
 AM_CONDITIONAL([HAVE_AVX],
     [test x"$OPUS_X86_MAY_HAVE_AVX" = x"1"])
 
+AM_CONDITIONAL([HAVE_RTCD],
+ [test x"$enable_rtcd" = x"yes" -a x"$rtcd_support" != x"no"])
 AS_IF([test x"$enable_rtcd" = x"yes"],[
     AS_IF([test x"$rtcd_support" != x"no"],[
         AC_DEFINE([OPUS_HAVE_RTCD], [1],
diff --git a/fuzzer/Android.bp b/fuzzer/Android.bp
index 45ce6ab..be47f44 100644
--- a/fuzzer/Android.bp
+++ b/fuzzer/Android.bp
@@ -39,6 +39,14 @@
             "android-media-fuzzing-reports@google.com",
         ],
         componentid: 155276,
+        hotlists: [
+            "4593311",
+        ],
+        description: "The fuzzer targets the APIs of libopus",
+        vector: "remote",
+        service_privilege: "constrained",
+        users: "multi_user",
+        fuzzed_code_usage: "shipped",
     },
 }
 
diff --git a/include/opus.h b/include/opus.h
index d282f21..0c69c62 100644
--- a/include/opus.h
+++ b/include/opus.h
@@ -198,7 +198,7 @@
  *                                     This must be one of 8000, 12000, 16000,
  *                                     24000, or 48000.
  * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
- * @param [in] application <tt>int</tt>: Coding mode (@ref OPUS_APPLICATION_VOIP/@ref OPUS_APPLICATION_AUDIO/@ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+ * @param [in] application <tt>int</tt>: Coding mode (one of @ref OPUS_APPLICATION_VOIP, @ref OPUS_APPLICATION_AUDIO, or @ref OPUS_APPLICATION_RESTRICTED_LOWDELAY)
  * @param [out] error <tt>int*</tt>: @ref opus_errorcodes
  * @note Regardless of the sampling rate and number channels selected, the Opus encoder
  * can switch to a lower audio bandwidth or number of channels if the bitrate
@@ -222,7 +222,7 @@
  *                                      This must be one of 8000, 12000, 16000,
  *                                      24000, or 48000.
   * @param [in] channels <tt>int</tt>: Number of channels (1 or 2) in input signal
-  * @param [in] application <tt>int</tt>: Coding mode (OPUS_APPLICATION_VOIP/OPUS_APPLICATION_AUDIO/OPUS_APPLICATION_RESTRICTED_LOWDELAY)
+  * @param [in] application <tt>int</tt>: Coding mode (one of OPUS_APPLICATION_VOIP, OPUS_APPLICATION_AUDIO, or OPUS_APPLICATION_RESTRICTED_LOWDELAY)
   * @retval #OPUS_OK Success or @ref opus_errorcodes
   */
 OPUS_EXPORT int opus_encoder_init(
diff --git a/include/opus_custom.h b/include/opus_custom.h
index 2227be0..2f22d4b 100644
--- a/include/opus_custom.h
+++ b/include/opus_custom.h
@@ -104,7 +104,8 @@
 /** The mode contains all the information necessary to create an
     encoder. Both the encoder and decoder need to be initialized
     with exactly the same mode, otherwise the output will be
-    corrupted.
+    corrupted. The mode MUST NOT BE DESTROYED until the encoders and
+    decoders that use it are destroyed as well.
    @brief Mode configuration
  */
 typedef struct OpusCustomMode OpusCustomMode;
diff --git a/include/opus_defines.h b/include/opus_defines.h
index ceee5b8..94b9e0d 100644
--- a/include/opus_defines.h
+++ b/include/opus_defines.h
@@ -482,7 +482,8 @@
   * @param[in] x <tt>opus_int32</tt>: Allowed values:
   * <dl>
   * <dt>0</dt><dd>Disable inband FEC (default).</dd>
-  * <dt>1</dt><dd>Enable inband FEC.</dd>
+  * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+  * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
   * </dl>
   * @hideinitializer */
 #define OPUS_SET_INBAND_FEC(x) OPUS_SET_INBAND_FEC_REQUEST, __opus_check_int(x)
@@ -491,7 +492,8 @@
   * @param[out] x <tt>opus_int32 *</tt>: Returns one of the following values:
   * <dl>
   * <dt>0</dt><dd>Inband FEC disabled (default).</dd>
-  * <dt>1</dt><dd>Inband FEC enabled.</dd>
+  * <dt>1</dt><dd>Inband FEC enabled. If the packet loss rate is sufficiently high, Opus will automatically switch to SILK even at high rates to enable use of that FEC.</dd>
+  * <dt>2</dt><dd>Inband FEC enabled, but does not necessarily switch to SILK if we have music.</dd>
   * </dl>
   * @hideinitializer */
 #define OPUS_GET_INBAND_FEC(x) OPUS_GET_INBAND_FEC_REQUEST, __opus_check_int_ptr(x)
diff --git a/libopus_blocklist.txt b/libopus_blocklist.txt
index 51db611..84c19ee 100644
--- a/libopus_blocklist.txt
+++ b/libopus_blocklist.txt
@@ -24,7 +24,13 @@
 fun:silk_noise_shape_quantizer_del_dec
 # silk/NSQ.c:265:25: 1318152552 + 1068143768 cannot be represented in type 'int'
 fun:silk_noise_shape_quantizer
-
+# silk/x86/NSQ_del_dec_sse4_1.c:571:28: 1162446838 - -1165932966 cannot be represented in type 'int'
+fun:silk_noise_shape_quantizer_del_dec_sse4_1
+# silk/fixed/x86/burg_modified_FIX_sse4_1.c:277: 1940085720 + 252655088 cannot be represented
+# in type 'int'
+fun:silk_burg_modified_sse4_1
+# silk/fixed/burg_modified_FIX.c:181 1940085720 + 252655088 cannot be represented in type 'int'
+fun:silk_burg_modified_c
 src:*/celt/kiss_fft.c
 
 # assembly optimizations that know what they are doing
diff --git a/meson.build b/meson.build
index 41f6935..ed66d38 100644
--- a/meson.build
+++ b/meson.build
@@ -532,9 +532,9 @@
     endif # opt_rtcd
   else
     if opt_intrinsics.enabled()
-      error('intrinsics option enabled, but no intrinsics support for ' + host_machine.get_cpu())
+      error('intrinsics option enabled, but no intrinsics support for ' + host_cpu_family)
     endif
-    warning('No intrinsics support for ' + host_machine.get_cpu())
+    warning('No intrinsics support for ' + host_cpu_family)
   endif
 endif
 
diff --git a/meson/get-version.py b/meson/get-version.py
index 0e8b862..d3835f1 100755
--- a/meson/get-version.py
+++ b/meson/get-version.py
@@ -31,7 +31,7 @@
 
         # check if git checkout
         git_dir = os.path.join(srcroot, '.git')
-        is_git = os.path.isdir(git_dir)
+        is_git = os.path.isdir(git_dir) or os.path.isfile(git_dir)
         have_git = shutil.which('git') is not None
 
         if is_git and have_git:
diff --git a/opus.m4 b/opus.m4
index 47f5ec4..263470d 100644
--- a/opus.m4
+++ b/opus.m4
@@ -63,7 +63,7 @@
 #include <string.h>
 #include <opus.h>
 
-int main ()
+int main (void)
 {
   system("touch conf.opustest");
   return 0;
diff --git a/silk/LPC_fit.c b/silk/LPC_fit.c
index cdea4f3..c0690a1 100644
--- a/silk/LPC_fit.c
+++ b/silk/LPC_fit.c
@@ -31,7 +31,8 @@
 
 #include "SigProc_FIX.h"
 
-/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */
+/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around.
+   This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
 void silk_LPC_fit(
     opus_int16                  *a_QOUT,            /* O    Output signal                                               */
     opus_int32                    *a_QIN,             /* I/O  Input signal                                                */
diff --git a/silk/MacroDebug.h b/silk/MacroDebug.h
index 8dd4ce2..3110da9 100644
--- a/silk/MacroDebug.h
+++ b/silk/MacroDebug.h
@@ -55,7 +55,7 @@
 static OPUS_INLINE opus_int32 silk_ADD32_(opus_int32 a, opus_int32 b, char *file, int line){
     opus_int32 ret;
 
-    ret = a + b;
+    ret = (opus_int32)((opus_uint32)a + (opus_uint32)b);
     if ( ret != silk_ADD_SAT32( a, b ) )
     {
         fprintf (stderr, "silk_ADD32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -101,9 +101,9 @@
 #undef silk_SUB32
 #define silk_SUB32(a,b) silk_SUB32_((a), (b), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB32_(opus_int32 a, opus_int32 b, char *file, int line){
-    opus_int32 ret;
+    opus_int64 ret;
 
-    ret = a - b;
+    ret = a - (opus_int64)b;
     if ( ret != silk_SUB_SAT32( a, b ) )
     {
         fprintf (stderr, "silk_SUB32(%d, %d) in %s: line %d\n", a, b, file, line);
@@ -257,7 +257,7 @@
 static OPUS_INLINE opus_int32 silk_MUL_(opus_int32 a32, opus_int32 b32, char *file, int line){
     opus_int32 ret;
     opus_int64 ret64;
-    ret = a32 * b32;
+    ret = (opus_int32)((opus_uint32)a32 * (opus_uint32)b32);
     ret64 = (opus_int64)a32 * (opus_int64)b32;
     if ( (opus_int64)ret != ret64 )
     {
@@ -333,8 +333,8 @@
 #define silk_SMLAWB(a,b,c) silk_SMLAWB_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SMLAWB_(opus_int32 a32, opus_int32 b32, opus_int32 c32, char *file, int line){
     opus_int32 ret;
-    ret = silk_ADD32( a32, silk_SMULWB( b32, c32 ) );
-    if ( silk_ADD32( a32, silk_SMULWB( b32, c32 ) ) != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
+    ret = silk_ADD32_ovflw( a32, silk_SMULWB( b32, c32 ) );
+    if ( ret != silk_ADD_SAT32( a32, silk_SMULWB( b32, c32 ) ) )
     {
         fprintf (stderr, "silk_SMLAWB(%d, %d, %d) in %s: line %d\n", a32, b32, c32, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -465,7 +465,7 @@
 
     if ( fail )
     {
-        fprintf (stderr, "silk_SMULWT(%d, %d) in %s: line %d\n", a32, b32, file, line);
+        fprintf (stderr, "silk_SMULWW(%d, %d) in %s: line %d\n", a32, b32, file, line);
 #ifdef FIXED_DEBUG_ASSERT
         silk_assert( 0 );
 #endif
@@ -491,12 +491,6 @@
     return ret;
 }
 
-/* Multiply-accumulate macros that allow overflow in the addition (ie, no asserts in debug mode) */
-#undef  silk_MLA_ovflw
-#define silk_MLA_ovflw(a32, b32, c32)    ((a32) + ((b32) * (c32)))
-#undef  silk_SMLABB_ovflw
-#define silk_SMLABB_ovflw(a32, b32, c32)    ((a32) + ((opus_int32)((opus_int16)(b32))) * (opus_int32)((opus_int16)(c32)))
-
 /* no checking needed for silk_SMULL
    no checking needed for silk_SMLAL
    no checking needed for silk_SMLALBB
@@ -546,10 +540,10 @@
 static OPUS_INLINE opus_int8 silk_LSHIFT8_(opus_int8 a, opus_int32 shift, char *file, int line){
     opus_int8 ret;
     int       fail = 0;
-    ret = a << shift;
+    ret = (opus_int8)((opus_uint8)a << shift);
     fail |= shift < 0;
     fail |= shift >= 8;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT8(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -565,10 +559,10 @@
 static OPUS_INLINE opus_int16 silk_LSHIFT16_(opus_int16 a, opus_int32 shift, char *file, int line){
     opus_int16 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int16)((opus_uint16)a << shift);
     fail |= shift < 0;
     fail |= shift >= 16;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT16(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -584,10 +578,10 @@
 static OPUS_INLINE opus_int32 silk_LSHIFT32_(opus_int32 a, opus_int32 shift, char *file, int line){
     opus_int32 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int32)((opus_uint32)a << shift);
     fail |= shift < 0;
     fail |= shift >= 32;
-    fail |= (opus_int64)ret != ((opus_int64)a) << shift;
+    fail |= (opus_int64)ret != (opus_int64)(((opus_uint64)a) << shift);
     if ( fail )
     {
         fprintf (stderr, "silk_LSHIFT32(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -603,7 +597,7 @@
 static OPUS_INLINE opus_int64 silk_LSHIFT64_(opus_int64 a, opus_int shift, char *file, int line){
     opus_int64 ret;
     int        fail = 0;
-    ret = a << shift;
+    ret = (opus_int64)((opus_uint64)a << shift);
     fail |= shift < 0;
     fail |= shift >= 64;
     fail |= (ret>>shift) != ((opus_int64)a);
@@ -714,8 +708,8 @@
 #define silk_ADD_LSHIFT(a,b,c) silk_ADD_LSHIFT_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE int silk_ADD_LSHIFT_(int a, int b, int shift, char *file, int line){
     opus_int16 ret;
-    ret = a + (b << shift);
-    if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+    ret = a + (opus_int16)((opus_uint16)b << shift);
+    if ( (shift < 0) || (shift>15) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_ADD_LSHIFT(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -729,8 +723,8 @@
 #define silk_ADD_LSHIFT32(a,b,c) silk_ADD_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_ADD_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a + (b << shift);
-    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) << shift)) )
+    ret = silk_ADD32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
+    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_ADD_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -774,7 +768,7 @@
 #define silk_ADD_RSHIFT32(a,b,c) silk_ADD_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_ADD_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a + (b >> shift);
+    ret = silk_ADD32_ovflw(a, (b >> shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a + (((opus_int64)b) >> shift)) )
     {
         fprintf (stderr, "silk_ADD_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -804,8 +798,8 @@
 #define silk_SUB_LSHIFT32(a,b,c) silk_SUB_LSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB_LSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a - (b << shift);
-    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) << shift)) )
+    ret = silk_SUB32_ovflw(a, (opus_int32)((opus_uint32)b << shift));
+    if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (opus_int64)(((opus_uint64)b) << shift)) )
     {
         fprintf (stderr, "silk_SUB_LSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
 #ifdef FIXED_DEBUG_ASSERT
@@ -819,7 +813,7 @@
 #define silk_SUB_RSHIFT32(a,b,c) silk_SUB_RSHIFT32_((a), (b), (c), __FILE__, __LINE__)
 static OPUS_INLINE opus_int32 silk_SUB_RSHIFT32_(opus_int32 a, opus_int32 b, opus_int32 shift, char *file, int line){
     opus_int32 ret;
-    ret = a - (b >> shift);
+    ret = silk_SUB32_ovflw(a, (b >> shift));
     if ( (shift < 0) || (shift>31) || ((opus_int64)ret != (opus_int64)a - (((opus_int64)b) >> shift)) )
     {
         fprintf (stderr, "silk_SUB_RSHIFT32(%d, %d, %d) in %s: line %d\n", a, b, shift, file, line);
@@ -835,7 +829,7 @@
 static OPUS_INLINE opus_int32 silk_RSHIFT_ROUND_(opus_int32 a, opus_int32 shift, char *file, int line){
     opus_int32 ret;
     ret = shift == 1 ? (a >> 1) + (a & 1) : ((a >> (shift - 1)) + 1) >> 1;
-    /* the marco definition can't handle a shift of zero */
+    /* the macro definition can't handle a shift of zero */
     if ( (shift <= 0) || (shift>31) || ((opus_int64)ret != ((opus_int64)a + ((opus_int64)1 << (shift - 1))) >> shift) )
     {
         fprintf (stderr, "silk_RSHIFT_ROUND(%d, %d) in %s: line %d\n", a, shift, file, line);
@@ -850,7 +844,7 @@
 #define silk_RSHIFT_ROUND64(a,b) silk_RSHIFT_ROUND64_((a), (b), __FILE__, __LINE__)
 static OPUS_INLINE opus_int64 silk_RSHIFT_ROUND64_(opus_int64 a, opus_int32 shift, char *file, int line){
     opus_int64 ret;
-    /* the marco definition can't handle a shift of zero */
+    /* the macro definition can't handle a shift of zero */
     if ( (shift <= 0) || (shift>=64) )
     {
         fprintf (stderr, "silk_RSHIFT_ROUND64(%lld, %d) in %s: line %d\n", (long long)a, shift, file, line);
diff --git a/silk/NSQ.c b/silk/NSQ.c
index 1d64d8e..45dd45c 100644
--- a/silk/NSQ.c
+++ b/silk/NSQ.c
@@ -75,21 +75,21 @@
 
 void silk_NSQ_c
 (
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            k, lag, start_idx, LSF_interpolation_flag;
@@ -173,9 +173,9 @@
     RESTORE_STACK;
 }
 
-/***********************************/
-/* silk_noise_shape_quantizer  */
-/***********************************/
+/******************************/
+/* silk_noise_shape_quantizer */
+/******************************/
 
 #if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
 static OPUS_INLINE
@@ -262,7 +262,7 @@
         tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
             shp_lag_ptr++;
diff --git a/silk/NSQ_del_dec.c b/silk/NSQ_del_dec.c
index 00e749c..41f3fc9 100644
--- a/silk/NSQ_del_dec.c
+++ b/silk/NSQ_del_dec.c
@@ -115,21 +115,21 @@
 );
 
 void silk_NSQ_del_dec_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
     const opus_int16            x16[],                                        /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index a906890..65fe6a0 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -381,7 +381,7 @@
     const opus_int              len                 /*    I vector lengths                                              */
 );
 
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
@@ -630,12 +630,14 @@
 /* the following seems faster on x86 */
 #define silk_SMMUL(a32, b32)                (opus_int32)silk_RSHIFT64(silk_SMULL((a32), (b32)), 32)
 
-#if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#if !defined(OVERRIDE_silk_burg_modified)
 #define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
     ((void)(arch), silk_burg_modified_c(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#endif
 
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((void)(arch),silk_inner_prod16_aligned_64_c(inVec1, inVec2, len))
+#if !defined(OVERRIDE_silk_inner_prod16)
+#define silk_inner_prod16(inVec1, inVec2, len, arch) \
+    ((void)(arch),silk_inner_prod16_c(inVec1, inVec2, len))
 #endif
 
 #include "Inlines.h"
diff --git a/silk/VQ_WMat_EC.c b/silk/VQ_WMat_EC.c
index 0f3d545..245a7e4 100644
--- a/silk/VQ_WMat_EC.c
+++ b/silk/VQ_WMat_EC.c
@@ -64,7 +64,7 @@
     *rate_dist_Q8 = silk_int32_MAX;
     *res_nrg_Q15 = silk_int32_MAX;
     cb_row_Q7 = cb_Q7;
-    /* In things go really bad, at least *ind is set to something safe. */
+    /* If things go really bad, at least *ind is set to something safe. */
     *ind = 0;
     for( k = 0; k < L; k++ ) {
         opus_int32 penalty;
@@ -115,7 +115,7 @@
         if( sum1_Q15 >= 0 ) {
             /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
             bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
-            /* In the following line we reduce the codelength component by half ("-1"); seems to slghtly improve quality */
+            /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
             bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
             if( bits_tot_Q8 <= *rate_dist_Q8 ) {
                 *rate_dist_Q8 = bits_tot_Q8;
diff --git a/silk/bwexpander_32.c b/silk/bwexpander_32.c
index d0010f7..0f32b9d 100644
--- a/silk/bwexpander_32.c
+++ b/silk/bwexpander_32.c
@@ -31,7 +31,8 @@
 
 #include "SigProc_FIX.h"
 
-/* Chirp (bandwidth expand) LP AR filter */
+/* Chirp (bandwidth expand) LP AR filter.
+   This logic is reused in _celt_lpc(). Any bug fixes should also be applied there. */
 void silk_bwexpander_32(
     opus_int32                  *ar,                /* I/O  AR filter to be expanded (without leading 1)                */
     const opus_int              d,                  /* I    Length of ar                                                */
diff --git a/silk/control_codec.c b/silk/control_codec.c
index 52aa8fd..784ffe6 100644
--- a/silk/control_codec.c
+++ b/silk/control_codec.c
@@ -415,7 +415,7 @@
             /* Previous packet did not have LBRR, and was therefore coded at a higher bitrate */
             psEncC->LBRR_GainIncreases = 7;
         } else {
-            psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.4, 16 ) ), 2 );
+            psEncC->LBRR_GainIncreases = silk_max_int( 7 - silk_SMULWB( (opus_int32)psEncC->PacketLoss_perc, SILK_FIX_CONST( 0.2, 16 ) ), 3 );
         }
     }
 
diff --git a/silk/enc_API.c b/silk/enc_API.c
index 55a33f3..548e073 100644
--- a/silk/enc_API.c
+++ b/silk/enc_API.c
@@ -270,6 +270,7 @@
                        psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
     ALLOC( buf, nSamplesFromInputMax, opus_int16 );
     while( 1 ) {
+        int curr_nBitsUsedLBRR = 0;
         nSamplesToBuffer  = psEnc->state_Fxx[ 0 ].sCmn.frame_length - psEnc->state_Fxx[ 0 ].sCmn.inputBufIx;
         nSamplesToBuffer  = silk_min( nSamplesToBuffer, nSamplesToBufferMax );
         nSamplesFromInput = silk_DIV32_16( nSamplesToBuffer * psEnc->state_Fxx[ 0 ].sCmn.API_fs_Hz, psEnc->state_Fxx[ 0 ].sCmn.fs_kHz * 1000 );
@@ -342,6 +343,7 @@
                 opus_uint8 iCDF[ 2 ] = { 0, 0 };
                 iCDF[ 0 ] = 256 - silk_RSHIFT( 256, ( psEnc->state_Fxx[ 0 ].sCmn.nFramesPerPacket + 1 ) * encControl->nChannelsInternal );
                 ec_enc_icdf( psRangeEnc, 0, iCDF, 8 );
+                curr_nBitsUsedLBRR = ec_tell( psRangeEnc );
 
                 /* Encode any LBRR data from previous packet */
                 /* Encode LBRR flags */
@@ -386,8 +388,7 @@
                 for( n = 0; n < encControl->nChannelsInternal; n++ ) {
                     silk_memset( psEnc->state_Fxx[ n ].sCmn.LBRR_flags, 0, sizeof( psEnc->state_Fxx[ n ].sCmn.LBRR_flags ) );
                 }
-
-                psEnc->nBitsUsedLBRR = ec_tell( psRangeEnc );
+                curr_nBitsUsedLBRR = ec_tell( psRangeEnc ) - curr_nBitsUsedLBRR;
             }
 
             silk_HP_variable_cutoff( psEnc->state_Fxx );
@@ -396,6 +397,16 @@
             nBits = silk_DIV32_16( silk_MUL( encControl->bitRate, encControl->payloadSize_ms ), 1000 );
             /* Subtract bits used for LBRR */
             if( !prefillFlag ) {
+                /* psEnc->nBitsUsedLBRR is an exponential moving average of the LBRR usage,
+                   except that for the first LBRR frame it does no averaging and for the first
+                   frame after LBRR, it goes back to zero immediately. */
+                if ( curr_nBitsUsedLBRR < 10 ) {
+                    psEnc->nBitsUsedLBRR = 0;
+                } else if ( psEnc->nBitsUsedLBRR < 10) {
+                    psEnc->nBitsUsedLBRR = curr_nBitsUsedLBRR;
+                } else {
+                    psEnc->nBitsUsedLBRR = ( psEnc->nBitsUsedLBRR + curr_nBitsUsedLBRR ) / 2;
+                }
                 nBits -= psEnc->nBitsUsedLBRR;
             }
             /* Divide by number of uncoded frames left in packet */
diff --git a/silk/fixed/LTP_scale_ctrl_FIX.c b/silk/fixed/LTP_scale_ctrl_FIX.c
index 3dcedef..db1016e 100644
--- a/silk/fixed/LTP_scale_ctrl_FIX.c
+++ b/silk/fixed/LTP_scale_ctrl_FIX.c
@@ -42,9 +42,14 @@
 
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
-        round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
-        psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT(
-            silk_SMULWB( silk_SMULBB( round_loss, psEncCtrl->LTPredCodGain_Q7 ), SILK_FIX_CONST( 0.1, 9 ) ), 0, 2 );
+        round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
+        if ( psEnc->sCmn.LBRR_flag ) {
+            /* LBRR reduces the effective loss. In practice, it does not square the loss because
+               losses aren't independent, but that still seems to work best. We also never go below 2%. */
+            round_loss = 2 + silk_SMULBB( round_loss, round_loss ) / 100;
+        }
+        psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 2900-psEnc->sCmn.SNR_dB_Q7 );
+        psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain_Q7, round_loss ) > silk_log2lin( 128*7 + 3900-psEnc->sCmn.SNR_dB_Q7 );
     } else {
         /* Default is minimum scaling */
         psEnc->sCmn.indices.LTP_scaleIndex = 0;
diff --git a/silk/fixed/burg_modified_FIX.c b/silk/fixed/burg_modified_FIX.c
index 274d4b2..185a12b 100644
--- a/silk/fixed/burg_modified_FIX.c
+++ b/silk/fixed/burg_modified_FIX.c
@@ -68,7 +68,7 @@
     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
 
     /* Compute autocorrelations, added over subframes */
-    C0_64 = silk_inner_prod16_aligned_64( x, x, subfr_length*nb_subfr, arch );
+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
     lz = silk_CLZ64(C0_64);
     rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
     if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
@@ -87,7 +87,7 @@
             x_ptr = x + s * subfr_length;
             for( n = 1; n < D + 1; n++ ) {
                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
             }
         }
     } else {
@@ -150,7 +150,7 @@
                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
-                    /* We sometimes have get overflows in the multiplications (even beyond +/- 2^32),
+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
                        but they cancel each other and the real result seems to always fit in a 32-bit
                        signed integer. This was determined experimentally, not theoretically (unfortunately). */
                     tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
@@ -253,7 +253,7 @@
         if( rshifts > 0 ) {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
             }
         } else {
             for( s = 0; s < nb_subfr; s++ ) {
diff --git a/silk/fixed/find_pred_coefs_FIX.c b/silk/fixed/find_pred_coefs_FIX.c
index 606d863..ad363fb 100644
--- a/silk/fixed/find_pred_coefs_FIX.c
+++ b/silk/fixed/find_pred_coefs_FIX.c
@@ -42,7 +42,8 @@
 {
     opus_int         i;
     opus_int32       invGains_Q16[ MAX_NB_SUBFR ], local_gains[ MAX_NB_SUBFR ];
-    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ];
+    /* Set NLSF_Q15 to zero so we don't copy junk to the state. */
+    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ]={0};
     const opus_int16 *x_ptr;
     opus_int16       *x_pre_ptr;
     VARDECL( opus_int16, LPC_in_pre );
diff --git a/silk/fixed/vector_ops_FIX.c b/silk/fixed/vector_ops_FIX.c
index d949800..dcf8407 100644
--- a/silk/fixed/vector_ops_FIX.c
+++ b/silk/fixed/vector_ops_FIX.c
@@ -87,7 +87,7 @@
 #endif
 }
 
-opus_int64 silk_inner_prod16_aligned_64_c(
+opus_int64 silk_inner_prod16_c(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
diff --git a/silk/fixed/x86/burg_modified_FIX_sse4_1.c b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
index bbb1ce0..e58bf07 100644
--- a/silk/fixed/x86/burg_modified_FIX_sse4_1.c
+++ b/silk/fixed/x86/burg_modified_FIX_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -42,7 +42,7 @@
 #define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
 
 #define QA                          25
-#define N_BITS_HEAD_ROOM            2
+#define N_BITS_HEAD_ROOM            3
 #define MIN_RSHIFTS                 -16
 #define MAX_RSHIFTS                 (32 - QA)
 
@@ -59,7 +59,7 @@
     int                         arch                /* I    Run-time architecture                                       */
 )
 {
-    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
+    opus_int         k, n, s, lz, rshifts, reached_max_gain;
     opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
     const opus_int16 *x_ptr;
     opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
@@ -68,6 +68,7 @@
     opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
     opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
+    opus_int64       C0_64;
 
     __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
     __m128i CONST1 = _mm_set1_epi32(1);
@@ -75,23 +76,18 @@
     celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
 
     /* Compute autocorrelations, added over subframes */
-    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
-    if( rshifts > MAX_RSHIFTS ) {
-        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
-        silk_assert( C0 > 0 );
-        rshifts = MAX_RSHIFTS;
+    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
+    lz = silk_CLZ64(C0_64);
+    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
+    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
+    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
+
+    if (rshifts > 0) {
+        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
     } else {
-        lz = silk_CLZ32( C0 ) - 1;
-        rshifts_extra = N_BITS_HEAD_ROOM - lz;
-        if( rshifts_extra > 0 ) {
-            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
-            C0 = silk_RSHIFT32( C0, rshifts_extra );
-        } else {
-            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
-            C0 = silk_LSHIFT32( C0, -rshifts_extra );
-        }
-        rshifts += rshifts_extra;
+        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
     }
+
     CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
     silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
     if( rshifts > 0 ) {
@@ -99,7 +95,7 @@
             x_ptr = x + s * subfr_length;
             for( n = 1; n < D + 1; n++ ) {
                 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
-                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
+                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
             }
         }
     } else {
@@ -203,8 +199,11 @@
                     C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                     C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                     Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
-                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
-                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
+                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
+                       but they cancel each other and the real result seems to always fit in a 32-bit
+                       signed integer. This was determined experimentally, not theoretically (unfortunately). */
+                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
+                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
                 }
 
                 tmp1 = -tmp1;                /* Q17 */
@@ -350,7 +349,7 @@
         if( rshifts > 0 ) {
             for( s = 0; s < nb_subfr; s++ ) {
                 x_ptr = x + s * subfr_length;
-                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
+                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
             }
         } else {
             for( s = 0; s < nb_subfr; s++ ) {
@@ -374,4 +373,28 @@
         *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
         *res_nrg_Q = -rshifts;
     }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int32 res_nrg_c = 0;
+        opus_int res_nrg_Q_c = 0;
+        opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
+
+        silk_burg_modified_c(
+            &res_nrg_c,
+            &res_nrg_Q_c,
+            A_Q16_c,
+            x,
+            minInvGain_Q30,
+            subfr_length,
+            nb_subfr,
+            D,
+            0
+        );
+
+        silk_assert( *res_nrg == res_nrg_c );
+        silk_assert( *res_nrg_Q == res_nrg_Q_c );
+        silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
+    }
+#endif
 }
diff --git a/silk/fixed/x86/prefilter_FIX_sse.c b/silk/fixed/x86/prefilter_FIX_sse.c
deleted file mode 100644
index 555432c..0000000
--- a/silk/fixed/x86/prefilter_FIX_sse.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions
-   are met:
-
-   - Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
-#include "main.h"
-#include "celt/x86/x86cpu.h"
-
-void silk_warped_LPC_analysis_filter_FIX_sse4_1(
-    opus_int32                  state[],                    /* I/O  State [order + 1]                   */
-    opus_int32                  res_Q2[],                   /* O    Residual signal [length]            */
-    const opus_int16            coef_Q13[],                 /* I    Coefficients [order]                */
-    const opus_int16            input[],                    /* I    Input signal [length]               */
-    const opus_int16            lambda_Q16,                 /* I    Warping factor                      */
-    const opus_int              length,                     /* I    Length of input signal              */
-    const opus_int              order                       /* I    Filter order (even)                 */
-)
-{
-    opus_int     n, i;
-    opus_int32   acc_Q11, tmp1, tmp2;
-
-    /* Order must be even */
-    celt_assert( ( order & 1 ) == 0 );
-
-    if (order == 10)
-    {
-        if (0 == lambda_Q16)
-        {
-            __m128i coef_Q13_3210, coef_Q13_7654;
-            __m128i coef_Q13_0123, coef_Q13_4567;
-            __m128i state_0123, state_4567;
-            __m128i xmm_product1, xmm_product2;
-            __m128i xmm_tempa, xmm_tempb;
-
-            register opus_int32 sum;
-            register opus_int32 state_8, state_9, state_a;
-            register opus_int64 coef_Q13_8, coef_Q13_9;
-
-            celt_assert( length > 0 );
-
-            coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
-            coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
-
-            coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-            coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
-            coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
-            coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
-
-            state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
-            state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
-
-            state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-            state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
-            state_8 = state[ 8 ];
-            state_9 = state[ 9 ];
-            state_a = 0;
-
-            for( n = 0; n < length; n++ )
-            {
-                xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs */
-                xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
-
-                xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-                xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
-
-                xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16, zero extending works */
-                xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
-
-                xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
-                xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
-
-                xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
-                xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
-
-                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
-                xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
-                xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
-
-                sum  = (opus_int32)((coef_Q13_8 * state_8) >> 16);
-                sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
-
-                xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
-                sum += _mm_cvtsi128_si32( xmm_tempa);
-                res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9);
-
-                /* move right */
-                state_a = state_9;
-                state_9 = state_8;
-                state_8 = _mm_cvtsi128_si32( state_4567 );
-                state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
-
-                state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 );
-            }
-
-            _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
-            _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
-            state[ 8 ] = state_8;
-            state[ 9 ] = state_9;
-            state[ 10 ] = state_a;
-
-            return;
-        }
-    }
-
-    for( n = 0; n < length; n++ ) {
-        /* Output of lowpass section */
-        tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
-        state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
-        /* Output of allpass section */
-        tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
-        state[ 1 ] = tmp2;
-        acc_Q11 = silk_RSHIFT( order, 1 );
-        acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
-        /* Loop over allpass sections */
-        for( i = 2; i < order; i += 2 ) {
-            /* Output of allpass section */
-            tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
-            state[ i ] = tmp1;
-            acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
-            /* Output of allpass section */
-            tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
-            state[ i + 1 ] = tmp2;
-            acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
-        }
-        state[ order ] = tmp1;
-        acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
-        res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
-    }
-}
diff --git a/silk/fixed/x86/vector_ops_FIX_sse4_1.c b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
index c1e9056..a46289b 100644
--- a/silk/fixed/x86/vector_ops_FIX_sse4_1.c
+++ b/silk/fixed/x86/vector_ops_FIX_sse4_1.c
@@ -36,40 +36,38 @@
 
 #include "SigProc_FIX.h"
 #include "pitch.h"
+#include "celt/x86/x86cpu.h"
 
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16            *inVec1,            /*    I input vector 1                                              */
     const opus_int16            *inVec2,            /*    I input vector 2                                              */
     const opus_int              len                 /*    I vector lengths                                              */
 )
 {
-    opus_int  i, dataSize8;
+    opus_int  i, dataSize4;
     opus_int64 sum;
 
-    __m128i xmm_tempa;
-    __m128i inVec1_76543210, acc1;
-    __m128i inVec2_76543210, acc2;
+    __m128i xmm_prod_20, xmm_prod_31;
+    __m128i inVec1_3210, acc1;
+    __m128i inVec2_3210, acc2;
 
     sum = 0;
-    dataSize8 = len & ~7;
+    dataSize4 = len & ~3;
 
     acc1 = _mm_setzero_si128();
     acc2 = _mm_setzero_si128();
 
-    for( i = 0; i < dataSize8; i += 8 ) {
-        inVec1_76543210 = _mm_loadu_si128( (__m128i *)(&inVec1[i + 0] ) );
-        inVec2_76543210 = _mm_loadu_si128( (__m128i *)(&inVec2[i + 0] ) );
+    for( i = 0; i < dataSize4; i += 4 ) {
+        inVec1_3210 = OP_CVTEPI16_EPI32_M64( &inVec1[i + 0] );
+        inVec2_3210 = OP_CVTEPI16_EPI32_M64( &inVec2[i + 0] );
+        xmm_prod_20 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
 
-        /* only when all 4 operands are -32768 (0x8000), this results in wrap around */
-        inVec1_76543210 = _mm_madd_epi16( inVec1_76543210, inVec2_76543210 );
+        inVec1_3210 = _mm_shuffle_epi32( inVec1_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        inVec2_3210 = _mm_shuffle_epi32( inVec2_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_prod_31 = _mm_mul_epi32( inVec1_3210, inVec2_3210 );
 
-        xmm_tempa       = _mm_cvtepi32_epi64( inVec1_76543210 );
-        /* equal shift right 8 bytes */
-        inVec1_76543210 = _mm_shuffle_epi32( inVec1_76543210, _MM_SHUFFLE( 0, 0, 3, 2 ) );
-        inVec1_76543210 = _mm_cvtepi32_epi64( inVec1_76543210 );
-
-        acc1 = _mm_add_epi64( acc1, xmm_tempa );
-        acc2 = _mm_add_epi64( acc2, inVec1_76543210 );
+        acc1 = _mm_add_epi64( acc1, xmm_prod_20 );
+        acc2 = _mm_add_epi64( acc2, xmm_prod_31 );
     }
 
     acc1 = _mm_add_epi64( acc1, acc2 );
@@ -81,8 +79,15 @@
     _mm_storel_epi64( (__m128i *)&sum, acc1 );
 
     for( ; i < len; i++ ) {
-        sum = silk_SMLABB( sum, inVec1[ i ], inVec2[ i ] );
+        sum = silk_SMLALBB( sum, inVec1[ i ], inVec2[ i ] );
     }
 
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int64 sum_c = silk_inner_prod16_c( inVec1, inVec2, len );
+        silk_assert( sum == sum_c );
+    }
+#endif
+
     return sum;
 }
diff --git a/silk/float/LTP_scale_ctrl_FLP.c b/silk/float/LTP_scale_ctrl_FLP.c
index 8dbe29d..6f30ff0 100644
--- a/silk/float/LTP_scale_ctrl_FLP.c
+++ b/silk/float/LTP_scale_ctrl_FLP.c
@@ -41,8 +41,14 @@
 
     if( condCoding == CODE_INDEPENDENTLY ) {
         /* Only scale if first frame in packet */
-        round_loss = psEnc->sCmn.PacketLoss_perc + psEnc->sCmn.nFramesPerPacket;
-        psEnc->sCmn.indices.LTP_scaleIndex = (opus_int8)silk_LIMIT( round_loss * psEncCtrl->LTPredCodGain * 0.1f, 0.0f, 2.0f );
+        round_loss = psEnc->sCmn.PacketLoss_perc * psEnc->sCmn.nFramesPerPacket;
+        if ( psEnc->sCmn.LBRR_flag ) {
+            /* LBRR reduces the effective loss. In practice, it does not square the loss because
+               losses aren't independent, but that still seems to work best. We also never go below 2%. */
+            round_loss = 2 + silk_SMULBB( round_loss, round_loss) / 100;
+        }
+        psEnc->sCmn.indices.LTP_scaleIndex = silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 2900 - psEnc->sCmn.SNR_dB_Q7 );
+        psEnc->sCmn.indices.LTP_scaleIndex += silk_SMULBB( psEncCtrl->LTPredCodGain, round_loss ) > silk_log2lin( 3900 - psEnc->sCmn.SNR_dB_Q7 );
     } else {
         /* Default is minimum scaling */
         psEnc->sCmn.indices.LTP_scaleIndex = 0;
diff --git a/silk/float/find_pred_coefs_FLP.c b/silk/float/find_pred_coefs_FLP.c
index dcf7c52..6f79078 100644
--- a/silk/float/find_pred_coefs_FLP.c
+++ b/silk/float/find_pred_coefs_FLP.c
@@ -44,7 +44,8 @@
     silk_float       XXLTP[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
     silk_float       xXLTP[ MAX_NB_SUBFR * LTP_ORDER ];
     silk_float       invGains[ MAX_NB_SUBFR ];
-    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ];
+    /* Set NLSF_Q15 to zero so we don't copy junk to the state. */
+    opus_int16       NLSF_Q15[ MAX_LPC_ORDER ]={0};
     const silk_float *x_ptr;
     silk_float       *x_pre_ptr, LPC_in_pre[ MAX_NB_SUBFR * MAX_LPC_ORDER + MAX_FRAME_LENGTH ];
     silk_float       minInvGain;
diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c
index ad90b87..c0c183e 100644
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -190,12 +190,14 @@
     opus_int32 XX_Q17[ MAX_NB_SUBFR * LTP_ORDER * LTP_ORDER ];
     opus_int32 xX_Q17[ MAX_NB_SUBFR * LTP_ORDER ];
 
-    for( i = 0; i < nb_subfr * LTP_ORDER * LTP_ORDER; i++ ) {
+    i = 0;
+    do {
         XX_Q17[ i ] = (opus_int32)silk_float2int( XX[ i ] * 131072.0f );
-    }
-    for( i = 0; i < nb_subfr * LTP_ORDER; i++ ) {
+    } while ( ++i < nb_subfr * LTP_ORDER * LTP_ORDER );
+    i = 0;
+    do {
         xX_Q17[ i ] = (opus_int32)silk_float2int( xX[ i ] * 131072.0f );
-    }
+    } while ( ++i < nb_subfr * LTP_ORDER );
 
     silk_quant_LTP_gains( B_Q14, cbk_index, periodicity_index, sum_log_gain_Q7, &pred_gain_dB_Q7, XX_Q17, xX_Q17, subfr_len, nb_subfr, arch );
 
diff --git a/silk/main.h b/silk/main.h
index 1a33eed..a5f5687 100644
--- a/silk/main.h
+++ b/silk/main.h
@@ -247,21 +247,21 @@
 /************************************/
 
 void silk_NSQ_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int16            x16[],                                      /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if !defined(OVERRIDE_silk_NSQ)
@@ -273,21 +273,21 @@
 
 /* Noise shaping using delayed decision */
 void silk_NSQ_del_dec_c(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int16            x16[],                                      /* I    Input                           */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I  Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
 #if !defined(OVERRIDE_silk_NSQ_del_dec)
diff --git a/silk/meson.build b/silk/meson.build
index 7069237..917048b 100644
--- a/silk/meson.build
+++ b/silk/meson.build
@@ -21,6 +21,16 @@
 silk_includes = [opus_includes, include_directories('float', 'fixed')]
 silk_static_libs = []
 
+if host_cpu_family in ['x86', 'x86_64'] and opus_conf.has('OPUS_HAVE_RTCD')
+  silk_sources +=  sources['SILK_SOURCES_X86_RTCD']
+endif
+
+if host_cpu_family in ['arm', 'aarch64'] and have_arm_intrinsics_or_asm
+  if opus_conf.has('OPUS_HAVE_RTCD')
+    silk_sources +=  sources['SILK_SOURCES_ARM_RTCD']
+  endif
+endif
+
 foreach intr_name : ['sse4_1', 'neon_intr']
   have_intr = get_variable('have_' + intr_name)
   if not have_intr
diff --git a/silk/stereo_LR_to_MS.c b/silk/stereo_LR_to_MS.c
index c822666..751452c 100644
--- a/silk/stereo_LR_to_MS.c
+++ b/silk/stereo_LR_to_MS.c
@@ -77,7 +77,7 @@
     ALLOC( LP_mid, frame_length, opus_int16 );
     ALLOC( HP_mid, frame_length, opus_int16 );
     for( n = 0; n < frame_length; n++ ) {
-        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 2 );
         LP_mid[ n ] = sum;
         HP_mid[ n ] = mid[ n + 1 ] - sum;
     }
@@ -86,7 +86,7 @@
     ALLOC( LP_side, frame_length, opus_int16 );
     ALLOC( HP_side, frame_length, opus_int16 );
     for( n = 0; n < frame_length; n++ ) {
-        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
+        sum = silk_RSHIFT_ROUND( silk_ADD_LSHIFT32( side[ n ] + (opus_int32)side[ n + 2 ], side[ n + 1 ], 1 ), 2 );
         LP_side[ n ] = sum;
         HP_side[ n ] = side[ n + 1 ] - sum;
     }
@@ -207,7 +207,7 @@
         pred0_Q13 += delta0_Q13;
         pred1_Q13 += delta1_Q13;
         w_Q24   += deltaw_Q24;
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
         sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 );               /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 );       /* Q8  */
         x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -217,7 +217,7 @@
     pred1_Q13 = -pred_Q13[ 1 ];
     w_Q24     =  silk_LSHIFT( width_Q14, 10 );
     for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( mid[ n ] + (opus_int32)mid[ n + 2 ], mid[ n + 1 ], 1 ), 9 );    /* Q11 */
         sum = silk_SMLAWB( silk_SMULWB( w_Q24, side[ n + 1 ] ), sum, pred0_Q13 );               /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)mid[ n + 1 ], 11 ), pred1_Q13 );       /* Q8  */
         x2[ n - 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
diff --git a/silk/stereo_MS_to_LR.c b/silk/stereo_MS_to_LR.c
index 62521a4..1e01bb6 100644
--- a/silk/stereo_MS_to_LR.c
+++ b/silk/stereo_MS_to_LR.c
@@ -59,7 +59,7 @@
     for( n = 0; n < STEREO_INTERP_LEN_MS * fs_kHz; n++ ) {
         pred0_Q13 += delta0_Q13;
         pred1_Q13 += delta1_Q13;
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
         sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 );         /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 );        /* Q8  */
         x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
@@ -67,7 +67,7 @@
     pred0_Q13 = pred_Q13[ 0 ];
     pred1_Q13 = pred_Q13[ 1 ];
     for( n = STEREO_INTERP_LEN_MS * fs_kHz; n < frame_length; n++ ) {
-        sum = silk_LSHIFT( silk_ADD_LSHIFT( x1[ n ] + x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
+        sum = silk_LSHIFT( silk_ADD_LSHIFT32( x1[ n ] + (opus_int32)x1[ n + 2 ], x1[ n + 1 ], 1 ), 9 );       /* Q11 */
         sum = silk_SMLAWB( silk_LSHIFT( (opus_int32)x2[ n + 1 ], 8 ), sum, pred0_Q13 );         /* Q8  */
         sum = silk_SMLAWB( sum, silk_LSHIFT( (opus_int32)x1[ n + 1 ], 11 ), pred1_Q13 );        /* Q8  */
         x2[ n + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sum, 8 ) );
diff --git a/silk/tests/test_unit_LPC_inv_pred_gain.c b/silk/tests/test_unit_LPC_inv_pred_gain.c
index 67067ce..7ca902a 100644
--- a/silk/tests/test_unit_LPC_inv_pred_gain.c
+++ b/silk/tests/test_unit_LPC_inv_pred_gain.c
@@ -43,6 +43,7 @@
     int i;
     int j;
     int sum_a, sum_abs_a;
+    double y[SILK_MAX_ORDER_LPC] = {0};
     sum_a = sum_abs_a = 0;
     for( j = 0; j < order; j++ ) {
         sum_a += A_Q12[ j ];
@@ -57,7 +58,6 @@
     if( sum_abs_a < 4096 ) {
         return 1;
     }
-    double y[SILK_MAX_ORDER_LPC] = {0};
     y[0] = 1;
     for( i = 0; i < 10000; i++ ) {
         double sum = 0;
diff --git a/silk/x86/NSQ_del_dec_sse4_1.c b/silk/x86/NSQ_del_dec_sse4_1.c
index 2c75ede..a58a76c 100644
--- a/silk/x86/NSQ_del_dec_sse4_1.c
+++ b/silk/x86/NSQ_del_dec_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -46,6 +46,7 @@
     opus_int32 Shape_Q14[ DECISION_DELAY ];
     opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
     opus_int32 LF_AR_Q14;
+    opus_int32 Diff_Q14;
     opus_int32 Seed;
     opus_int32 SeedInit;
     opus_int32 RD_Q10;
@@ -56,6 +57,7 @@
     opus_int32 RD_Q10;
     opus_int32 xq_Q14;
     opus_int32 LF_AR_Q14;
+    opus_int32 Diff_Q14;
     opus_int32 sLTP_shp_Q14;
     opus_int32 LPC_exc_Q14;
 } NSQ_sample_struct;
@@ -66,7 +68,7 @@
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    const opus_int16    x16[],                      /* I    Input                               */
     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
@@ -112,21 +114,21 @@
 );
 
 void silk_NSQ_del_dec_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
@@ -142,8 +144,39 @@
     VARDECL( opus_int32, delayedGain_Q10 );
     VARDECL( NSQ_del_dec_struct, psDelDec );
     NSQ_del_dec_struct  *psDD;
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+    const opus_int8 *const pulses_a = pulses;
+#endif
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    ( void )pulses_a;
+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+    silk_NSQ_del_dec_c(
+        psEncC,
+        &NSQ_c,
+        &psIndices_c,
+        x16,
+        pulses_c,
+        PredCoef_Q12,
+        LTPCoef_Q14,
+        AR_Q13,
+        HarmShapeGain_Q14,
+        Tilt_Q14,
+        LF_shp_Q14,
+        Gains_Q16,
+        pitchL,
+        Lambda_Q10,
+        LTP_scale_Q14
+    );
+#endif
+
     /* Set unvoiced lag to the previous one, overwrite later for voiced */
     lag = NSQ->lagPrev;
 
@@ -158,6 +191,7 @@
         psDD->SeedInit       = psDD->Seed;
         psDD->RD_Q10         = 0;
         psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
+        psDD->Diff_Q14       = NSQ->sDiff_shp_Q14;
         psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
         silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
         silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
@@ -185,8 +219,7 @@
         LSF_interpolation_flag = 1;
     }
 
-    ALLOC( sLTP_Q15,
-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
     ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
@@ -198,7 +231,7 @@
     for( k = 0; k < psEncC->nb_subfr; k++ ) {
         A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+        AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];
 
         /* Noise shape parameters */
         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -257,7 +290,7 @@
             }
         }
 
-        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
+        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
             psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
 
         silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
@@ -265,7 +298,7 @@
             Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
             psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
 
-        x_Q3   += psEncC->subfr_length;
+        x16    += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
         pxq    += psEncC->subfr_length;
     }
@@ -288,6 +321,7 @@
     for( i = 0; i < decisionDelay; i++ ) {
         last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
         if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
+
         pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
         pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
             silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
@@ -298,11 +332,19 @@
 
     /* Update states */
     NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
+    NSQ->sDiff_shp_Q14  = psDD->Diff_Q14;
     NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
 
     /* Save quantized speech signal */
     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
     RESTORE_STACK;
 }
 
@@ -345,6 +387,8 @@
     opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
+    int rdo_offset;
+
     VARDECL( NSQ_sample_pair, psSampleState );
     NSQ_del_dec_struct *psDD;
     NSQ_sample_struct  *psSS;
@@ -356,6 +400,8 @@
     celt_assert( nStatesDelayedDecision > 0 );
     ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
 
+    rdo_offset = (Lambda_Q10 >> 1) - 512;
+
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
@@ -407,8 +453,8 @@
         /* Long-term shaping */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
-            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
             n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
             shp_lag_ptr++;
         } else {
@@ -478,7 +524,7 @@
                     psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
                     tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
 
-                    /* setp 4 */
+                    /* step 4 */
                     psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
                     psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
                     tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
@@ -511,9 +557,9 @@
                 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
 
                 /* Noise shape feedback */
-                silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
+                celt_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
                 /* Output of lowpass section */
-                tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
+                tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
                 /* Output of allpass section */
                 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
                 psDD->sAR2_Q14[ 0 ] = tmp2;
@@ -543,9 +589,9 @@
 
                 /* Input minus prediction plus noise feedback                       */
                 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
-                tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );                                    /* Q14 */
+                tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 );                                /* Q14 */
                 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );                               /* Q13 */
-                tmp1 = silk_SUB32( tmp2, tmp1 );                                            /* Q13 */
+                tmp1 = silk_SUB_SAT32( tmp2, tmp1 );                                        /* Q13 */
                 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
 
                 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
@@ -559,6 +605,18 @@
                 /* Find two quantization level candidates and measure their rate-distortion */
                 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
                 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+                if (Lambda_Q10 > 2048) {
+                    /* For aggressive RDO, the bias becomes more than one pulse. */
+                    if (q1_Q10 > rdo_offset) {
+                        q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+                    } else if (q1_Q10 < -rdo_offset) {
+                        q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+                    } else if (q1_Q10 < 0) {
+                        q1_Q0 = -1;
+                    } else {
+                        q1_Q0 = 0;
+                    }
+                }
                 if( q1_Q0 > 0 ) {
                     q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
                     q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
@@ -612,8 +670,9 @@
                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
 
                 /* Update states */
-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
-                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 0 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+                sLF_AR_shp_Q14         = silk_SUB32( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
+                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
                 psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                 psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
                 psSS[ 0 ].xq_Q14       = xq_Q14;
@@ -626,14 +685,14 @@
                     exc_Q14 = -exc_Q14;
                 }
 
-
                 /* Add predictions */
                 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
                 xq_Q14      = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
 
                 /* Update states */
-                sLF_AR_shp_Q14         = silk_SUB32( xq_Q14, n_AR_Q14 );
-                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
+                psSS[ 1 ].Diff_Q14     = silk_SUB_LSHIFT32( xq_Q14, x_Q10[ i ], 4 );
+                sLF_AR_shp_Q14         = silk_SUB32( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
+                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
                 psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
                 psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
                 psSS[ 1 ].xq_Q14       = xq_Q14;
@@ -705,6 +764,7 @@
             psDD                                     = &psDelDec[ k ];
             psSS                                     = &psSampleState[ k ][ 0 ];
             psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
+            psDD->Diff_Q14                           = psSS->Diff_Q14;
             psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
             psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
             psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
@@ -728,7 +788,7 @@
     const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
-    const opus_int32    x_Q3[],                     /* I    Input in Q3                         */
+    const opus_int16    x16[],                      /* I    Input                               */
     opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
@@ -742,51 +802,41 @@
 )
 {
     opus_int            i, k, lag;
-    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
+    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
     NSQ_del_dec_struct  *psDD;
-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
 
     lag          = pitchL[ subfr ];
     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
-
     silk_assert( inv_gain_Q31 != 0 );
 
-    /* Calculate gain adjustment factor */
-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
-    } else {
-        gain_adj_Q16 = (opus_int32)1 << 16;
-    }
-
     /* Scale input */
-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
 
-    /* prepare inv_gain_Q23 in packed 4 32-bits */
-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+    /* Broadcast inv_gain_Q26 into all four packed 32-bit lanes */
+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
 
     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
+
         /* equal shift right 4 bytes*/
-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
 
-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
 
-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
     }
 
-    /* Save inverse gain */
-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
     if( NSQ->rewhite_flag ) {
         if( subfr == 0 ) {
@@ -800,7 +850,9 @@
     }
 
     /* Adjust for changing gain */
-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
         /* Scale long-term shaping state */
         {
             __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
@@ -841,6 +893,7 @@
 
                 /* Scale scalar states */
                 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
+                psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
 
                 /* Scale short-term prediction and shaping states */
                 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -855,5 +908,8 @@
                 }
             }
         }
+
+        /* Save the current gain; it is the reference for the next gain adjustment */
+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
     }
 }
diff --git a/silk/x86/NSQ_sse4_1.c b/silk/x86/NSQ_sse4_1.c
index b0315e3..d5ae1d3 100644
--- a/silk/x86/NSQ_sse4_1.c
+++ b/silk/x86/NSQ_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -37,17 +37,17 @@
 #include "stack_alloc.h"
 
 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
-    opus_int            subfr,                  /* I    subframe number                 */
-    const opus_int      LTP_scale_Q14,          /* I                                    */
-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
-    const opus_int      signal_type             /* I    Signal type                     */
+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
+    const opus_int16    x16[],                     /* I    input                           */
+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
+    opus_int            subfr,                     /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,             /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
+    const opus_int      signal_type                /* I    Signal type                     */
 );
 
 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
@@ -65,27 +65,28 @@
     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
     opus_int32          LF_shp_Q14,             /* I                                    */
     opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            Lambda_Q10,             /* I    Rate/distortion tradeoff        */
     opus_int            offset_Q10,             /* I                                    */
     opus_int            length,                 /* I    Input length                    */
     opus_int32          table[][4]              /* I                                    */
 );
 
 void silk_NSQ_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 )
 {
     opus_int            k, lag, start_idx, LSF_interpolation_flag;
@@ -101,8 +102,41 @@
     opus_int32   tmp1;
     opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
 
+#ifdef OPUS_CHECK_ASM
+    silk_nsq_state NSQ_c;
+    SideInfoIndices psIndices_c;
+    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
+    const opus_int8 *const pulses_a = pulses;
+#endif
+
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    ( void )pulses_a;
+    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
+    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
+    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
+    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
+
+    silk_NSQ_c(
+        psEncC,
+        &NSQ_c,
+        &psIndices_c,
+        x16,
+        pulses_c,
+        PredCoef_Q12,
+        LTPCoef_Q14,
+        AR_Q13,
+        HarmShapeGain_Q14,
+        Tilt_Q14,
+        LF_shp_Q14,
+        Gains_Q16,
+        pitchL,
+        Lambda_Q10,
+        LTP_scale_Q14
+    );
+#endif
+
     NSQ->rand_seed = psIndices->Seed;
 
     /* Set unvoiced lag to the previous one, overwrite later for voiced */
@@ -172,8 +206,7 @@
         LSF_interpolation_flag = 1;
     }
 
-    ALLOC( sLTP_Q15,
-           psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
+    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
     ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
     ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
     /* Set up pointers to start of sub frame */
@@ -183,7 +216,7 @@
     for( k = 0; k < psEncC->nb_subfr; k++ ) {
         A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
         B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
-        AR_shp_Q13 = &AR2_Q13[     k * MAX_SHAPE_LPC_ORDER ];
+        AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];
 
         /* Noise shape parameters */
         silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
@@ -209,12 +242,12 @@
             }
         }
 
-        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
+        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
 
         if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
         {
             silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
-                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
+                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                 offset_Q10, psEncC->subfr_length, &(table[32]) );
         }
         else
@@ -224,7 +257,7 @@
                 offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
         }
 
-        x_Q3   += psEncC->subfr_length;
+        x16    += psEncC->subfr_length;
         pulses += psEncC->subfr_length;
         pxq    += psEncC->subfr_length;
     }
@@ -235,12 +268,19 @@
     /* Save quantized speech and noise shaping signals */
     silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
     silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
+
+#ifdef OPUS_CHECK_ASM
+    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
+    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
+    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
+#endif
+
     RESTORE_STACK;
 }
 
-/***********************************/
-/* silk_noise_shape_quantizer_10_16  */
-/***********************************/
+/************************************/
+/* silk_noise_shape_quantizer_10_16 */
+/************************************/
 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
     opus_int            signalType,             /* I    Signal type                     */
@@ -256,6 +296,7 @@
     opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
     opus_int32          LF_shp_Q14,             /* I                                    */
     opus_int32          Gain_Q16,               /* I                                    */
+    opus_int            Lambda_Q10,             /* I                                    */
     opus_int            offset_Q10,             /* I                                    */
     opus_int            length,                 /* I    Input length                    */
     opus_int32          table[][4]              /* I                                    */
@@ -264,7 +305,7 @@
     opus_int     i;
     opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
     opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
-    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
+    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
     opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
     opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
 
@@ -279,6 +320,8 @@
     __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
     __m128i AR_shp_Q13_76543210;
 
+    int rdo_offset = (Lambda_Q10 >> 1) - 512;
+
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
     Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
@@ -288,6 +331,7 @@
 
     sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
     xq_Q14         = psLPC_Q14[ 0 ];
+    sDiff_shp_Q14  = NSQ->sDiff_shp_Q14;
     LTP_pred_Q13   = 0;
 
     /* load a_Q12 */
@@ -430,8 +474,8 @@
         sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
         sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
 
-        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
-        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14),       0 );
+        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
+        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14),       0 );
 
         /* high part, use pmaddwd, results in 4 32-bit */
         xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
@@ -462,14 +506,14 @@
         n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
         n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
 
-        silk_assert( lag > 0 || signalType != TYPE_VOICED );
+        celt_assert( lag > 0 || signalType != TYPE_VOICED );
 
         /* Combine prediction and noise shaping signals */
         tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
         tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
         if( lag > 0 ) {
             /* Symmetric, packed FIR coefficients */
-            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
+            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
             n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
             shp_lag_ptr++;
@@ -495,6 +539,18 @@
         /* Find two quantization level candidates and measure their rate-distortion */
         q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
         q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
+        if (Lambda_Q10 > 2048) {
+            /* For aggressive RDO, the bias becomes more than one pulse. */
+            if (q1_Q10 > rdo_offset) {
+                q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
+            } else if (q1_Q10 < -rdo_offset) {
+                q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
+            } else if (q1_Q10 < 0) {
+                q1_Q0 = -1;
+            } else {
+                q1_Q0 = 0;
+            }
+        }
 
         q1_Q10 = table[q1_Q0][0];
         q2_Q10 = table[q1_Q0][1];
@@ -519,7 +575,8 @@
         /* Update states */
         psLPC_Q14++;
         *psLPC_Q14 = xq_Q14;
-        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
+        NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
+        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
 
         NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
         sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
@@ -600,64 +657,54 @@
 }
 
 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
-    const silk_encoder_state *psEncC,           /* I    Encoder State                   */
-    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
-    const opus_int32    x_Q3[],                 /* I    input in Q3                     */
-    opus_int32          x_sc_Q10[],             /* O    input scaled with 1/Gain        */
-    const opus_int16    sLTP[],                 /* I    re-whitened LTP state in Q0     */
-    opus_int32          sLTP_Q15[],             /* O    LTP state matching scaled input */
-    opus_int            subfr,                  /* I    subframe number                 */
-    const opus_int      LTP_scale_Q14,          /* I                                    */
-    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                 */
-    const opus_int      pitchL[ MAX_NB_SUBFR ], /* I    Pitch lag                       */
-    const opus_int      signal_type             /* I    Signal type                     */
+    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
+    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
+    const opus_int16    x16[],                     /* I    input                           */
+    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
+    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
+    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
+    opus_int            subfr,                     /* I    subframe number                 */
+    const opus_int      LTP_scale_Q14,             /* I                                    */
+    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
+    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
+    const opus_int      signal_type                /* I    Signal type                     */
 )
 {
     opus_int   i, lag;
-    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
-    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
+    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
+    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
 
     lag          = pitchL[ subfr ];
     inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
     silk_assert( inv_gain_Q31 != 0 );
 
-    /* Calculate gain adjustment factor */
-    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
-        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
-    } else {
-        gain_adj_Q16 = (opus_int32)1 << 16;
-    }
-
     /* Scale input */
-    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
+    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
 
-    /* prepare inv_gain_Q23 in packed 4 32-bits */
-    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
+    /* prepare inv_gain_Q26 in packed 4 32-bits */
+    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
 
     for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
-        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
+        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
 
         /* equal shift right 4 bytes*/
-        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
 
-        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
-        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
+        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
+        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
 
-        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
-        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
+        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
+        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
 
-        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
+        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
 
-        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
+        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
     }
 
     for( ; i < psEncC->subfr_length; i++ ) {
-        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
+        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
     }
 
-    /* Save inverse gain */
-    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
-
     /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
     if( NSQ->rewhite_flag ) {
         if( subfr == 0 ) {
@@ -671,9 +718,11 @@
     }
 
     /* Adjust for changing gain */
-    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
-        /* Scale long-term shaping state */
+    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
         __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
+        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
+
+        /* Scale long-term shaping state */
 
         /* prepare gain_adj_Q16 in packed 4 32-bits */
         xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
@@ -707,6 +756,7 @@
         }
 
         NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
+        NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );
 
         /* Scale short-term prediction and shaping states */
         for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
@@ -715,5 +765,8 @@
         for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
             NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
         }
+
+        /* Remember this subframe's gain (not its inverse) for the next gain-change check */
+        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
     }
 }
diff --git a/silk/x86/SigProc_FIX_sse.h b/silk/x86/SigProc_FIX_sse.h
index 61efa8d..89a5ec8 100644
--- a/silk/x86/SigProc_FIX_sse.h
+++ b/silk/x86/SigProc_FIX_sse.h
@@ -26,13 +26,13 @@
 */
 
 #ifndef SIGPROC_FIX_SSE_H
-#define SIGPROC_FIX_SSE_H
+# define SIGPROC_FIX_SSE_H
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+#  include "config.h"
+# endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 void silk_burg_modified_sse4_1(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
@@ -45,11 +45,13 @@
     int                         arch                /* I    Run-time architecture                                       */
 );
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
-    ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
 
-#else
+#   define OVERRIDE_silk_burg_modified
+#   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+       ((void)(arch), silk_burg_modified_sse4_1(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_BURG_MODIFIED_IMPL[OPUS_ARCHMASK + 1])(
     opus_int32                  *res_nrg,           /* O    Residual energy                                             */
@@ -62,33 +64,36 @@
     const opus_int              D,                  /* I    Order                                                       */
     int                         arch                /* I    Run-time architecture                                       */);
 
-#  define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
-    ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
+#   define OVERRIDE_silk_burg_modified
+#   define silk_burg_modified(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch) \
+     ((*SILK_BURG_MODIFIED_IMPL[(arch) & OPUS_ARCHMASK])(res_nrg, res_nrg_Q, A_Q16, x, minInvGain_Q30, subfr_length, nb_subfr, D, arch))
 
-#endif
+#  endif
 
-opus_int64 silk_inner_prod16_aligned_64_sse4_1(
+opus_int64 silk_inner_prod16_sse4_1(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
     const opus_int   len
 );
 
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
+#  if defined(OPUS_X86_PRESUME_SSE4_1)
 
-#define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((void)(arch),silk_inner_prod16_aligned_64_sse4_1(inVec1, inVec2, len))
+#   define OVERRIDE_silk_inner_prod16
+#   define silk_inner_prod16(inVec1, inVec2, len, arch) \
+       ((void)(arch),silk_inner_prod16_sse4_1(inVec1, inVec2, len))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
-extern opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[OPUS_ARCHMASK + 1])(
+extern opus_int64 (*const SILK_INNER_PROD16_IMPL[OPUS_ARCHMASK + 1])(
                     const opus_int16 *inVec1,
                     const opus_int16 *inVec2,
                     const opus_int   len);
 
-#  define silk_inner_prod16_aligned_64(inVec1, inVec2, len, arch) \
-    ((*SILK_INNER_PROD16_ALIGNED_64_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
+#   define OVERRIDE_silk_inner_prod16
+#   define silk_inner_prod16(inVec1, inVec2, len, arch) \
+     ((*SILK_INNER_PROD16_IMPL[(arch) & OPUS_ARCHMASK])(inVec1, inVec2, len))
 
-#endif
-#endif
+#  endif
+# endif
 #endif
diff --git a/silk/x86/VAD_sse4_1.c b/silk/x86/VAD_sse4_1.c
index d02ddf4..e7eaf97 100644
--- a/silk/x86/VAD_sse4_1.c
+++ b/silk/x86/VAD_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -63,6 +63,14 @@
 
     SAVE_STACK;
 
+#ifdef OPUS_CHECK_ASM
+    silk_encoder_state psEncC_c;
+    opus_int ret_c;
+
+    silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
+    ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
+#endif
+
     /* Safety checks */
     silk_assert( VAD_N_BANDS == 4 );
     celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
@@ -233,15 +241,14 @@
         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
     }
 
+    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
+        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
+    }
     /* Power scaling */
     if( speech_nrg <= 0 ) {
         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
-    } else if( speech_nrg < 32768 ) {
-        if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
-        } else {
-            speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
-        }
+    } else if( speech_nrg < 16384 ) {
+        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
 
         /* square-root */
         speech_nrg = silk_SQRT_APPROX( speech_nrg );
@@ -272,6 +279,11 @@
         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
     }
 
+#ifdef OPUS_CHECK_ASM
+    silk_assert( ret == ret_c );
+    silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
+#endif
+
     RESTORE_STACK;
     return( ret );
 }
diff --git a/silk/x86/VQ_WMat_EC_sse4_1.c b/silk/x86/VQ_WMat_EC_sse4_1.c
index 74d6c6d..2c7d18d 100644
--- a/silk/x86/VQ_WMat_EC_sse4_1.c
+++ b/silk/x86/VQ_WMat_EC_sse4_1.c
@@ -1,5 +1,5 @@
-/* Copyright (c) 2014, Cisco Systems, INC
-   Written by XiangMingZhu WeiZhou MinPeng YanWang
+/* Copyright (c) 2014-2020, Cisco Systems, INC
+   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
@@ -38,105 +38,136 @@
 /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector */
 void silk_VQ_WMat_EC_sse4_1(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 )
 {
     opus_int   k, gain_tmp_Q7;
     const opus_int8 *cb_row_Q7;
-    opus_int16 diff_Q14[ 5 ];
-    opus_int32 sum1_Q14, sum2_Q16;
+    opus_int32 neg_xX_Q24[ 5 ];
+    opus_int32 sum1_Q15, sum2_Q24;
+    opus_int32 bits_res_Q8, bits_tot_Q8;
+    __m128i v_XX_31_Q17, v_XX_42_Q17, v_cb_row_31_Q7, v_cb_row_42_Q7, v_acc1_Q24, v_acc2_Q24;
 
-    __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
+    /* Negate and convert to new Q domain */
+    neg_xX_Q24[ 0 ] = -silk_LSHIFT32( xX_Q17[ 0 ], 7 );
+    neg_xX_Q24[ 1 ] = -silk_LSHIFT32( xX_Q17[ 1 ], 7 );
+    neg_xX_Q24[ 2 ] = -silk_LSHIFT32( xX_Q17[ 2 ], 7 );
+    neg_xX_Q24[ 3 ] = -silk_LSHIFT32( xX_Q17[ 3 ], 7 );
+    neg_xX_Q24[ 4 ] = -silk_LSHIFT32( xX_Q17[ 4 ], 7 );
+
+    v_XX_31_Q17 = _mm_loadu_si128( (__m128i *)(&XX_Q17[ 1 ] ) );
+    v_XX_42_Q17 = _mm_shuffle_epi32( v_XX_31_Q17, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+
     /* Loop over codebook */
-    *rate_dist_Q14 = silk_int32_MAX;
+    *rate_dist_Q8 = silk_int32_MAX;
+    *res_nrg_Q15 = silk_int32_MAX;
     cb_row_Q7 = cb_Q7;
+    /* If things go really bad, at least *ind is set to something safe. */
+    *ind = 0;
     for( k = 0; k < L; k++ ) {
+        opus_int32 penalty;
         gain_tmp_Q7 = cb_gain_Q7[k];
-
-        diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
-
-        C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
-        C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
-        C_tmp2 = _mm_slli_epi32( C_tmp2, 7 );
-        C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 );
-
-        diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
-        diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
-        diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
-        diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
-
         /* Weighted rate */
-        sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
+        /* Quantization error: 1 - 2 * xX * cb + cb' * XX * cb */
+        sum1_Q15 = SILK_FIX_CONST( 1.001, 15 );
 
         /* Penalty for too large gain */
-        sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
+        penalty = silk_LSHIFT32( silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 11 );
 
-        silk_assert( sum1_Q14 >= 0 );
+        /* first row of XX_Q17 */
+        v_cb_row_31_Q7 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
+        v_cb_row_42_Q7 = _mm_shuffle_epi32( v_cb_row_31_Q7, _MM_SHUFFLE( 0, 3, 2, 1 ) );
+        v_cb_row_31_Q7 = _mm_mul_epi32( v_XX_31_Q17, v_cb_row_31_Q7 );
+        v_cb_row_42_Q7 = _mm_mul_epi32( v_XX_42_Q17, v_cb_row_42_Q7 );
+        v_acc1_Q24 = _mm_add_epi64( v_cb_row_31_Q7, v_cb_row_42_Q7);
+        v_acc2_Q24 = _mm_shuffle_epi32( v_acc1_Q24, _MM_SHUFFLE( 1, 0, 3, 2 ) );
+        v_acc1_Q24 = _mm_add_epi64( v_acc1_Q24, v_acc2_Q24);
+        sum2_Q24 = _mm_cvtsi128_si32( v_acc1_Q24 );
+        sum2_Q24 = silk_ADD32( neg_xX_Q24[ 0 ], sum2_Q24 );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  0 ], cb_row_Q7[ 0 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 0 ] );
 
-        /* first row of W_Q18 */
-        C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) );
-        C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 );
-        C_tmp4 = _mm_srli_si128( C_tmp4, 2 );
+        /* second row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 1 ], XX_Q17[  7 ], cb_row_Q7[ 2 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  8 ], cb_row_Q7[ 3 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  9 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[  6 ], cb_row_Q7[ 1 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 1 ] );
 
-        C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
-        C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* shift right 4 bytes */
+        /* third row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 2 ], XX_Q17[ 13 ], cb_row_Q7[ 3 ] );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 14 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 12 ], cb_row_Q7[ 2 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 2 ] );
 
-        C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 );
-        C_tmp5 = _mm_srli_si128( C_tmp5, 2 );
+        /* fourth row of XX_Q17 */
+        sum2_Q24 = silk_MLA( neg_xX_Q24[ 3 ], XX_Q17[ 19 ], cb_row_Q7[ 4 ] );
+        sum2_Q24 = silk_LSHIFT32( sum2_Q24, 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 18 ], cb_row_Q7[ 3 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 3 ] );
 
-        C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 );
-        C_tmp5 = _mm_slli_epi32( C_tmp5, 1 );
-
-        C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
-        sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 );
-
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  0 ], diff_Q14[ 0 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 0 ] );
-
-        /* second row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[  7 ], diff_Q14[ 2 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  8 ], diff_Q14[ 3 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  9 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[  6 ], diff_Q14[ 1 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 1 ] );
-
-        /* third row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 13 ], diff_Q14[ 3 ] );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 2 ] );
-
-        /* fourth row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 19 ], diff_Q14[ 4 ] );
-        sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
-        sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 3 ] );
-
-        /* last row of W_Q18 */
-        sum2_Q16 = silk_SMULWB(           W_Q18[ 24 ], diff_Q14[ 4 ] );
-        sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16,    diff_Q14[ 4 ] );
-
-        silk_assert( sum1_Q14 >= 0 );
+        /* last row of XX_Q17 */
+        sum2_Q24 = silk_LSHIFT32( neg_xX_Q24[ 4 ], 1 );
+        sum2_Q24 = silk_MLA( sum2_Q24,        XX_Q17[ 24 ], cb_row_Q7[ 4 ] );
+        sum1_Q15 = silk_SMLAWB( sum1_Q15,        sum2_Q24,  cb_row_Q7[ 4 ] );
 
         /* find best */
-        if( sum1_Q14 < *rate_dist_Q14 ) {
-            *rate_dist_Q14 = sum1_Q14;
-            *ind = (opus_int8)k;
-            *gain_Q7 = gain_tmp_Q7;
+        if( sum1_Q15 >= 0 ) {
+            /* Translate residual energy to bits using high-rate assumption (6 dB ==> 1 bit/sample) */
+            bits_res_Q8 = silk_SMULBB( subfr_len, silk_lin2log( sum1_Q15 + penalty) - (15 << 7) );
+            /* In the following line we reduce the codelength component by half ("-1"); seems to slightly improve quality */
+            bits_tot_Q8 = silk_ADD_LSHIFT32( bits_res_Q8, cl_Q5[ k ], 3-1 );
+            if( bits_tot_Q8 <= *rate_dist_Q8 ) {
+                *rate_dist_Q8 = bits_tot_Q8;
+                *res_nrg_Q15 = sum1_Q15 + penalty;
+                *ind = (opus_int8)k;
+                *gain_Q7 = gain_tmp_Q7;
+            }
         }
 
         /* Go to next cbk vector */
         cb_row_Q7 += LTP_ORDER;
     }
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_int8  ind_c = 0;
+        opus_int32 res_nrg_Q15_c = 0;
+        opus_int32 rate_dist_Q8_c = 0;
+        opus_int   gain_Q7_c = 0;
+
+        silk_VQ_WMat_EC_c(
+            &ind_c,
+            &res_nrg_Q15_c,
+            &rate_dist_Q8_c,
+            &gain_Q7_c,
+            XX_Q17,
+            xX_Q17,
+            cb_Q7,
+            cb_gain_Q7,
+            cl_Q5,
+            subfr_len,
+            max_gain_Q7,
+            L
+        );
+
+        silk_assert( *ind == ind_c );
+        silk_assert( *res_nrg_Q15 == res_nrg_Q15_c );
+        silk_assert( *rate_dist_Q8 == rate_dist_Q8_c );
+        silk_assert( *gain_Q7 == gain_Q7_c );
+    }
+#endif
 }
diff --git a/silk/x86/main_sse.h b/silk/x86/main_sse.h
index 2f15d44..a01d7f6 100644
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -26,171 +26,169 @@
 */
 
 #ifndef MAIN_SSE_H
-#define MAIN_SSE_H
+# define MAIN_SSE_H
 
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
+# ifdef HAVE_CONFIG_H
+#  include "config.h"
+# endif
 
 # if defined(OPUS_X86_MAY_HAVE_SSE4_1)
 
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
-#  define OVERRIDE_silk_VQ_WMat_EC
-
 void silk_VQ_WMat_EC_sse4_1(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1 /* arch is ignored: always call silk_VQ_WMat_EC_sse4_1() */
 
-#define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L, arch) \
-    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L))
+#   define OVERRIDE_silk_VQ_WMat_EC
+#   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                           subfr_len, max_gain_Q7, L, arch) \
+    ((void)(arch),silk_VQ_WMat_EC_sse4_1(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_VQ_WMAT_EC_IMPL[OPUS_ARCHMASK + 1])(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 );
 
-#  define silk_VQ_WMat_EC(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L, arch) \
-    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, rate_dist_Q14, gain_Q7, in_Q14, W_Q18, cb_Q7, cb_gain_Q7, cl_Q5, \
-                          mu_Q9, max_gain_Q7, L))
+#   define OVERRIDE_silk_VQ_WMat_EC /* dispatch through SILK_VQ_WMAT_EC_IMPL[arch & OPUS_ARCHMASK] */
+#   define silk_VQ_WMat_EC(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                           subfr_len, max_gain_Q7, L, arch) \
+    ((*SILK_VQ_WMAT_EC_IMPL[(arch) & OPUS_ARCHMASK])(ind, res_nrg_Q15, rate_dist_Q8, gain_Q7, XX_Q17, xX_Q17, cb_Q7, cb_gain_Q7, cl_Q5, \
+                          subfr_len, max_gain_Q7, L))
 
-#endif
-#endif
-
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
-#  define OVERRIDE_silk_NSQ
+#  endif
 
 void silk_NSQ_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1 /* arch is ignored: always call silk_NSQ_sse4_1() */
 
-#define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#   define OVERRIDE_silk_NSQ
+#   define silk_NSQ(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_NSQ_IMPL[OPUS_ARCHMASK + 1])(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  define silk_NSQ(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                   HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#   define OVERRIDE_silk_NSQ /* dispatch through SILK_NSQ_IMPL[arch & OPUS_ARCHMASK] */
+#   define silk_NSQ(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((*SILK_NSQ_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                    HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#endif
-
-#  define OVERRIDE_silk_NSQ_del_dec
+#  endif
 
 void silk_NSQ_del_dec_sse4_1(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#if defined OPUS_X86_PRESUME_SSE4_1
+#  if defined OPUS_X86_PRESUME_SSE4_1 /* arch is ignored: always call silk_NSQ_del_dec_sse4_1() */
 
-#define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#   define OVERRIDE_silk_NSQ_del_dec
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((void)(arch),silk_NSQ_del_dec_sse4_1(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#else
+#  elif defined(OPUS_HAVE_RTCD)
 
 extern void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 );
 
-#  define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
-                           HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
-    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x_Q3, pulses, PredCoef_Q12, LTPCoef_Q14, AR2_Q13, \
+#   define OVERRIDE_silk_NSQ_del_dec /* dispatch through SILK_NSQ_DEL_DEC_IMPL[arch & OPUS_ARCHMASK] */
+#   define silk_NSQ_del_dec(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
+                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14, arch) \
+    ((*SILK_NSQ_DEL_DEC_IMPL[(arch) & OPUS_ARCHMASK])(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, \
                            HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14))
 
-#endif
-#endif
+#  endif
 
 void silk_noise_shape_quantizer(
     silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
@@ -223,26 +221,27 @@
     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
 );
 
-#  define OVERRIDE_silk_VAD_GetSA_Q8
-
 opus_int silk_VAD_GetSA_Q8_sse4_1(
     silk_encoder_state *psEnC,
     const opus_int16   pIn[]
 );
 
-#if defined(OPUS_X86_PRESUME_SSE4_1)
-#define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
+#  if defined(OPUS_X86_PRESUME_SSE4_1) /* arch is ignored: always call silk_VAD_GetSA_Q8_sse4_1() */
 
-#else
+#   define OVERRIDE_silk_VAD_GetSA_Q8
+#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) ((void)(arch),silk_VAD_GetSA_Q8_sse4_1(psEnC, pIn))
 
-#  define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
-     ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+#  elif defined(OPUS_HAVE_RTCD) /* dispatch through SILK_VAD_GETSA_Q8_IMPL[arch & OPUS_ARCHMASK] */
 
 extern opus_int (*const SILK_VAD_GETSA_Q8_IMPL[OPUS_ARCHMASK + 1])(
      silk_encoder_state *psEnC,
      const opus_int16   pIn[]);
 
-#endif
+#   define OVERRIDE_silk_VAD_GetSA_Q8
+#   define silk_VAD_GetSA_Q8(psEnC, pIn, arch) \
+      ((*SILK_VAD_GETSA_Q8_IMPL[(arch) & OPUS_ARCHMASK])(psEnC, pIn))
+
+#  endif /* OPUS_X86_PRESUME_SSE4_1 / OPUS_HAVE_RTCD */
 
 # endif
 #endif
diff --git a/silk/x86/x86_silk_map.c b/silk/x86/x86_silk_map.c
index 32dcc3c..70f6007 100644
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -35,22 +35,22 @@
 #include "pitch.h"
 #include "main.h"
 
-#if !defined(OPUS_X86_PRESUME_SSE4_1)
+#if defined(OPUS_HAVE_RTCD) && !defined(OPUS_X86_PRESUME_SSE4_1) /* tables below serve the RTCD dispatch macros (see e.g. main_sse.h) */
 
 #if defined(FIXED_POINT)
 
 #include "fixed/main_FIX.h"
 
-opus_int64 (*const SILK_INNER_PROD16_ALIGNED_64_IMPL[ OPUS_ARCHMASK + 1 ] )(
+opus_int64 (*const SILK_INNER_PROD16_IMPL[ OPUS_ARCHMASK + 1 ] )(
     const opus_int16 *inVec1,
     const opus_int16 *inVec2,
     const opus_int   len
 ) = {
-  silk_inner_prod16_aligned_64_c,                  /* non-sse */
-  silk_inner_prod16_aligned_64_c,
-  silk_inner_prod16_aligned_64_c,
-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 ), /* sse4.1 */
-  MAY_HAVE_SSE4_1( silk_inner_prod16_aligned_64 )  /* avx */
+  silk_inner_prod16_c,                  /* non-sse */
+  silk_inner_prod16_c,
+  silk_inner_prod16_c,
+  MAY_HAVE_SSE4_1( silk_inner_prod16 ), /* sse4.1 */
+  MAY_HAVE_SSE4_1( silk_inner_prod16 )  /* avx */
 };
 
 #endif
@@ -66,23 +66,22 @@
   MAY_HAVE_SSE4_1( silk_VAD_GetSA_Q8 )  /* avx */
 };
 
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
 void (*const SILK_NSQ_IMPL[ OPUS_ARCHMASK + 1 ] )(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 ) = {
   silk_NSQ_c,                  /* non-sse */
   silk_NSQ_c,
@@ -90,21 +89,20 @@
   MAY_HAVE_SSE4_1( silk_NSQ ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_NSQ )  /* avx */
 };
-#endif
 
-#if 0 /* FIXME: SSE disabled until silk_VQ_WMat_EC_sse4_1() gets updated. */
 void (*const SILK_VQ_WMAT_EC_IMPL[ OPUS_ARCHMASK + 1 ] )(
     opus_int8                   *ind,                           /* O    index of best codebook vector               */
-    opus_int32                  *rate_dist_Q14,                 /* O    best weighted quant error + mu * rate       */
+    opus_int32                  *res_nrg_Q15,                   /* O    best residual energy                        */
+    opus_int32                  *rate_dist_Q8,                  /* O    best total bitrate                          */
     opus_int                    *gain_Q7,                       /* O    sum of absolute LTP coefficients            */
-    const opus_int16            *in_Q14,                        /* I    input vector to be quantized                */
-    const opus_int32            *W_Q18,                         /* I    weighting matrix                            */
+    const opus_int32            *XX_Q17,                        /* I    correlation matrix                          */
+    const opus_int32            *xX_Q17,                        /* I    correlation vector                          */
     const opus_int8             *cb_Q7,                         /* I    codebook                                    */
     const opus_uint8            *cb_gain_Q7,                    /* I    codebook effective gain                     */
     const opus_uint8            *cl_Q5,                         /* I    code length for each codebook vector        */
-    const opus_int              mu_Q9,                          /* I    tradeoff betw. weighted error and rate      */
+    const opus_int              subfr_len,                      /* I    number of samples per subframe              */
     const opus_int32            max_gain_Q7,                    /* I    maximum sum of absolute LTP coefficients    */
-    opus_int                    L                               /* I    number of vectors in codebook               */
+    const opus_int              L                               /* I    number of vectors in codebook               */
 ) = {
   silk_VQ_WMat_EC_c,                  /* non-sse */
   silk_VQ_WMat_EC_c,
@@ -112,25 +110,23 @@
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_VQ_WMat_EC )  /* avx */
 };
-#endif
 
-#if 0 /* FIXME: SSE disabled until the NSQ code gets updated. */
 void (*const SILK_NSQ_DEL_DEC_IMPL[ OPUS_ARCHMASK + 1 ] )(
-    const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
-    silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
-    SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
-    const opus_int32            x_Q3[],                                     /* I    Prefiltered input signal        */
-    opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
-    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],          /* I    Short term prediction coefs     */
-    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
-    const opus_int16            AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs             */
-    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
-    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
-    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
-    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
-    const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
-    const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
-    const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
+    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
+    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
+    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
+    const opus_int16            x16[],                                        /* I    Input                           */
+    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
+    const opus_int16            PredCoef_Q12[ 2 * MAX_LPC_ORDER ],            /* I    Short term prediction coefs     */
+    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
+    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
+    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
+    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
+    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
+    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
+    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
+    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
+    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
 ) = {
   silk_NSQ_del_dec_c,                  /* non-sse */
   silk_NSQ_del_dec_c,
@@ -138,7 +134,6 @@
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec ), /* sse4.1 */
   MAY_HAVE_SSE4_1( silk_NSQ_del_dec )  /* avx */
 };
-#endif
 
 #if defined(FIXED_POINT)
 
diff --git a/silk_sources.mk b/silk_sources.mk
index d2666e6..3df2481 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -77,15 +77,19 @@
 silk/stereo_quant_pred.c \
 silk/LPC_fit.c
 
-SILK_SOURCES_SSE4_1 =  \
+SILK_SOURCES_X86_RTCD = \
+silk/x86/x86_silk_map.c
+
+SILK_SOURCES_SSE4_1 = \
 silk/x86/NSQ_sse4_1.c \
 silk/x86/NSQ_del_dec_sse4_1.c \
-silk/x86/x86_silk_map.c \
 silk/x86/VAD_sse4_1.c \
 silk/x86/VQ_WMat_EC_sse4_1.c
 
+SILK_SOURCES_ARM_RTCD = \
+silk/arm/arm_silk_map.c
+
 SILK_SOURCES_ARM_NEON_INTR = \
-silk/arm/arm_silk_map.c \
 silk/arm/biquad_alt_neon_intr.c \
 silk/arm/LPC_inv_pred_gain_neon_intr.c \
 silk/arm/NSQ_del_dec_neon_intr.c \
diff --git a/src/opus_decoder.c b/src/opus_decoder.c
index 9113638..6520e74 100644
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -278,7 +278,8 @@
       ec_dec_init(&dec,(unsigned char*)data,len);
    } else {
       audiosize = frame_size;
-      mode = st->prev_mode;
+      /* Run PLC using last used mode (CELT if we ended with CELT redundancy) */
+      mode = st->prev_redundancy ? MODE_CELT_ONLY : st->prev_mode;
       bandwidth = 0;
 
       if (mode == 0)
@@ -419,7 +420,7 @@
 
    start_band = 0;
    if (!decode_fec && mode != MODE_CELT_ONLY && data != NULL
-    && ec_tell(&dec)+17+20*(st->mode == MODE_HYBRID) <= 8*len)
+    && ec_tell(&dec)+17+20*(mode == MODE_HYBRID) <= 8*len)
    {
       /* Check if we have a redundant 0-8 kHz band */
       if (mode == MODE_HYBRID)
@@ -499,6 +500,11 @@
    /* 5 ms redundant frame for CELT->SILK*/
    if (redundancy && celt_to_silk)
    {
+      /* If the previous frame did not use CELT (the first redundancy frame in
+         a transition from SILK may have been lost) then the CELT decoder is
+         stale at this point and the redundancy audio is not useful, however
+         the final range is still needed (for testing), so the redundancy is
+         always decoded but the decoded audio may not be used */
       MUST_SUCCEED(celt_decoder_ctl(celt_dec, CELT_SET_START_BAND(0)));
       celt_decode_with_ec(celt_dec, data+len, redundancy_bytes,
                           redundant_audio, F5, NULL, 0);
@@ -561,7 +567,10 @@
       smooth_fade(pcm+st->channels*(frame_size-F2_5), redundant_audio+st->channels*F2_5,
                   pcm+st->channels*(frame_size-F2_5), F2_5, st->channels, window, st->Fs);
    }
-   if (redundancy && celt_to_silk)
+   /* 5ms redundant frame for CELT->SILK; ignore if the previous frame did not
+      use CELT (the first redundancy frame in a transition from SILK may have
+      been lost) */
+   if (redundancy && celt_to_silk && (st->prev_mode != MODE_SILK_ONLY || st->prev_redundancy))
    {
       for (c=0;c<st->channels;c++)
       {
diff --git a/src/opus_encoder.c b/src/opus_encoder.c
index 7b5f0ab..8c8db5a 100644
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -87,6 +87,7 @@
     int          lfe;
     int          arch;
     int          use_dtx;                 /* general DTX for both SILK and CELT */
+    int          fec_config;
 #ifndef DISABLE_FLOAT_API
     TonalityAnalysisState analysis;
 #endif
@@ -112,7 +113,7 @@
     opus_val16   delay_buffer[MAX_ENCODER_BUFFER*2];
 #ifndef DISABLE_FLOAT_API
     int          detected_bandwidth;
-    int          nb_no_activity_frames;
+    int          nb_no_activity_ms_Q1;
     opus_val32   peak_signal_energy;
 #endif
     int          nonfinal_frame; /* current frame is not the final in a packet */
@@ -893,24 +894,28 @@
 
 /* Decides if DTX should be turned on (=1) or off (=0) */
 static int decide_dtx_mode(opus_int activity,            /* indicates if this frame contains speech/music */
-                           int *nb_no_activity_frames    /* number of consecutive frames with no activity */
+                           int *nb_no_activity_ms_Q1,    /* number of consecutive milliseconds with no activity, in Q1 */
+                           int frame_size_ms_Q1          /* number of milliseconds in this update, in Q1 */
                            )
 
 {
    if (!activity)
    {
-      /* The number of consecutive DTX frames should be within the allowed bounds */
-      (*nb_no_activity_frames)++;
-      if (*nb_no_activity_frames > NB_SPEECH_FRAMES_BEFORE_DTX)
+      /* The number of consecutive DTX frames should be within the allowed bounds.
+         Note that the allowed bound is defined in the SILK headers and assumes 20 ms
+         frames. As this function can be called with any frame length, a conversion to
+         milliseconds is done before the comparisons. */
+      (*nb_no_activity_ms_Q1) += frame_size_ms_Q1;
+      if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2)
       {
-         if (*nb_no_activity_frames <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX))
+         if (*nb_no_activity_ms_Q1 <= (NB_SPEECH_FRAMES_BEFORE_DTX + MAX_CONSECUTIVE_DTX)*20*2)
             /* Valid frame for DTX! */
             return 1;
          else
-            (*nb_no_activity_frames) = NB_SPEECH_FRAMES_BEFORE_DTX;
+            (*nb_no_activity_ms_Q1) = NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
       }
    } else
-      (*nb_no_activity_frames) = 0;
+      (*nb_no_activity_ms_Q1) = 0;
 
    return 0;
 }
@@ -1310,6 +1315,8 @@
         st->stream_channels = st->force_channels;
     } else {
 #ifdef FUZZING
+        (void)stereo_music_threshold;
+        (void)stereo_voice_threshold;
        /* Random mono/stereo decision */
        if (st->channels == 2 && (rand()&0x1F)==0)
           st->stream_channels = 3-st->stream_channels;
@@ -1348,6 +1355,8 @@
     } else if (st->user_forced_mode == OPUS_AUTO)
     {
 #ifdef FUZZING
+        (void)stereo_width;
+        (void)mode_thresholds;
        /* Random mode switching */
        if ((rand()&0xF)==0)
        {
@@ -1385,8 +1394,9 @@
 
        st->mode = (equiv_rate >= threshold) ? MODE_CELT_ONLY: MODE_SILK_ONLY;
 
-       /* When FEC is enabled and there's enough packet loss, use SILK */
-       if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4)
+       /* When FEC is enabled and there's enough packet loss, use SILK.
+          Unless the FEC is set to 2, in which case we don't switch to SILK if we're confident we have music. */
+       if (st->silk_mode.useInBandFEC && st->silk_mode.packetLossPercentage > (128-voice_est)>>4 && (st->fec_config != 2 || voice_est > 25))
           st->mode = MODE_SILK_ONLY;
        /* When encoding voice and DTX is enabled but the generalized DTX cannot be used,
           use SILK in order to make use of its DTX. */
@@ -2132,7 +2142,7 @@
 #ifndef DISABLE_FLOAT_API
     if (st->use_dtx && (analysis_info.valid || is_silence))
     {
-       if (decide_dtx_mode(activity, &st->nb_no_activity_frames))
+       if (decide_dtx_mode(activity, &st->nb_no_activity_ms_Q1, 2*1000*frame_size/st->Fs))
        {
           st->rangeFinal = 0;
           data[0] = gen_toc(st->mode, st->Fs/frame_size, curr_bandwidth, st->stream_channels);
@@ -2140,7 +2150,7 @@
           return 1;
        }
     } else {
-       st->nb_no_activity_frames = 0;
+       st->nb_no_activity_ms_Q1 = 0;
     }
 #endif
 
@@ -2435,11 +2445,12 @@
         case OPUS_SET_INBAND_FEC_REQUEST:
         {
             opus_int32 value = va_arg(ap, opus_int32);
-            if(value<0 || value>1)
+            if(value<0 || value>2)
             {
                goto bad_arg;
             }
-            st->silk_mode.useInBandFEC = value;
+            st->fec_config = value;
+            st->silk_mode.useInBandFEC = (value != 0);
         }
         break;
         case OPUS_GET_INBAND_FEC_REQUEST:
@@ -2449,7 +2460,7 @@
             {
                goto bad_arg;
             }
-            *value = st->silk_mode.useInBandFEC;
+            *value = st->fec_config;
         }
         break;
         case OPUS_SET_PACKET_LOSS_PERC_REQUEST:
@@ -2733,7 +2744,7 @@
 #ifndef DISABLE_FLOAT_API
             else if (st->use_dtx) {
                 /* DTX determined by Opus. */
-                *value = st->nb_no_activity_frames >= NB_SPEECH_FRAMES_BEFORE_DTX;
+                *value = st->nb_no_activity_ms_Q1 >= NB_SPEECH_FRAMES_BEFORE_DTX*20*2;
             }
 #endif
             else {
diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
index 93204a1..213e3eb 100644
--- a/src/opus_multistream_encoder.c
+++ b/src/opus_multistream_encoder.c
@@ -443,7 +443,8 @@
    char *ptr;
 
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+       (streams+coupled_streams>channels))
       return OPUS_BAD_ARG;
 
    st->arch = opus_select_arch();
@@ -459,8 +460,7 @@
       st->layout.mapping[i] = mapping[i];
    if (!validate_layout(&st->layout))
       return OPUS_BAD_ARG;
-   if (mapping_type == MAPPING_TYPE_SURROUND &&
-       !validate_encoder_layout(&st->layout))
+   if (!validate_encoder_layout(&st->layout))
       return OPUS_BAD_ARG;
    if (mapping_type == MAPPING_TYPE_AMBISONICS &&
        !validate_ambisonics(st->layout.nb_channels, NULL, NULL))
@@ -595,7 +595,8 @@
    int ret;
    OpusMSEncoder *st;
    if ((channels>255) || (channels<1) || (coupled_streams>streams) ||
-       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams))
+       (streams<1) || (coupled_streams<0) || (streams>255-coupled_streams) ||
+       (streams+coupled_streams>channels))
    {
       if (error)
          *error = OPUS_BAD_ARG;
diff --git a/tests/opus_build_test.sh b/tests/opus_build_test.sh
new file mode 100755
index 0000000..573f447
--- /dev/null
+++ b/tests/opus_build_test.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+# Build-and-test driver: unpacks a release tarball and runs random_config.sh
+# once per test, in parallel, each run in its own build_tests/run_<n> directory.
+#
+# Usage: opus_build_test.sh <tarball.tar.gz> <nb_tests> <oldvectors> <newvectors>
+# Needs GNU parallel, and random_config.sh in the directory this script is
+# started from (the tarball is unpacked there).
+
+if [ $# -lt 4 ]
+then
+        echo "usage: $0 <tarball.tar.gz> <nb_tests> <oldvectors> <newvectors>" >&2
+        exit 1
+fi
+
+# Resolve to absolute paths now: the working directory changes below.
+tarball=`realpath "$1"`
+nb_tests="$2"
+oldvectors=`realpath "$3"`
+newvectors=`realpath "$4"`
+base=`basename "$tarball" .tar.gz`
+
+tar xvf "$tarball" > /dev/null 2>&1
+
+if ! cd "$base"
+then
+        echo cannot go to "$base"
+        exit 1
+fi
+
+# -p: a build_tests directory left over from an earlier run is not an error.
+if ! mkdir -p build_tests
+then
+        echo cannot create build_tests
+        exit 1
+fi
+
+configure_dir=`pwd`
+# Run the jobs with two more slots than CPU cores; give up once 10 have failed.
+if ! seq -w "$nb_tests" | parallel --halt now,fail=10 -j +2 -q ../random_config.sh "build_tests/run_{}" "$configure_dir" "$oldvectors" "$newvectors"
+then
+        echo Check found errors
+        exit 1
+else
+        echo No error found
+fi
diff --git a/tests/opus_encode_regressions.c b/tests/opus_encode_regressions.c
index 2923473..4d506eb 100644
--- a/tests/opus_encode_regressions.c
+++ b/tests/opus_encode_regressions.c
@@ -35,7 +35,6 @@
 #include <stdint.h>
 #include <math.h>
 #include <string.h>
-#include <assert.h>
 #include "opus_multistream.h"
 #include "opus.h"
 #include "test_opus_common.h"
@@ -106,7 +105,7 @@
               1799,  1799,  1799,  1799, -9721
         };
         err = opus_multistream_encode(enc, pcm, 320, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -144,7 +143,7 @@
              -9510, -9510, -9510, -9510, -9510, -9510, -9510
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -182,7 +181,7 @@
              -9510, -9510, -9510, -9510, -9510, -9510, -9510
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -220,7 +219,7 @@
              -9510, -9510, -9510, -9510, -9510, -9510, -9510
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -256,7 +255,7 @@
               5632
         };
         err = opus_multistream_encode(enc, pcm, 160, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(0));
@@ -281,7 +280,7 @@
                  0,     0,  -256,   226
         };
         err = opus_multistream_encode(enc, pcm, 40, data, 2460);
-        assert(err > 0);
+        opus_test_assert(err > 0);
         /* returns -3 */
     }
     opus_multistream_encoder_destroy(enc);
@@ -334,7 +333,7 @@
                  0
         };
         err = opus_multistream_encode(enc, pcm, 20, data, 627300);
-        assert(err > 0);
+        opus_test_assert(err > 0);
         /* returns -1 */
     }
     opus_multistream_encoder_destroy(enc);
@@ -384,7 +383,7 @@
                  0
         };
         err = opus_multistream_encode(enc, pcm, 20, data, 472320);
-        assert(err > 0);
+        opus_test_assert(err > 0);
         /* returns -1 */
     }
     opus_multistream_encoder_destroy(enc);
@@ -740,7 +739,7 @@
             -20992, 25859,  5372, 12040, 13307, -4355,-30213,    -9, -6019
         };
         err = opus_multistream_encode(enc, pcm, 960, data, 7380);
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_multistream_encoder_ctl(enc, OPUS_SET_VBR(1));
@@ -885,7 +884,7 @@
         };
         err = opus_multistream_encode(enc, pcm, 1440, data, 7380);
         /* reads uninitialized data at src/opus_multistream_encoder.c:293 */
-        assert(err > 0);
+        opus_test_assert(err > 0);
     }
     opus_multistream_encoder_destroy(enc);
     return 0;
@@ -935,7 +934,7 @@
     opus_encoder_ctl(enc, OPUS_SET_PACKET_LOSS_PERC(6));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
     data_len = opus_encode(enc, pcm1, 960, data, 2000);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_VOICE));
     opus_encoder_ctl(enc, OPUS_SET_PREDICTION_DISABLED(1));
@@ -943,12 +942,12 @@
     opus_encoder_ctl(enc, OPUS_SET_INBAND_FEC(1));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(15600));
     data_len = opus_encode(enc, pcm2, 2880, data, 122);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(27000));
     data_len = opus_encode(enc, pcm3, 2880, data, 122); /* assertion failure */
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_destroy(enc);
     return 0;
@@ -970,7 +969,7 @@
     {
         static const short pcm[960] = { 0 };
         data_len = opus_encode(enc, pcm, 960, data, 2000);
-        assert(data_len > 0);
+        opus_test_assert(data_len > 0);
     }
     opus_encoder_ctl(enc, OPUS_SET_SIGNAL(OPUS_SIGNAL_MUSIC));
     {
@@ -980,7 +979,7 @@
             -32768, -32768, 0, 0, -32768, -32768, 0, 0, -32768, -32768
         };
         data_len = opus_encode(enc, pcm, 480, data, 19);
-        assert(data_len > 0);
+        opus_test_assert(data_len > 0);
     }
     opus_encoder_destroy(enc);
     return 0;
@@ -1009,14 +1008,14 @@
     opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_NARROWBAND));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(6000));
     data_len = opus_encode(enc, pcm1, 160, data, 1000);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_ctl(enc, OPUS_SET_VBR(0));
     opus_encoder_ctl(enc, OPUS_SET_COMPLEXITY(0));
     opus_encoder_ctl(enc, OPUS_SET_MAX_BANDWIDTH(OPUS_BANDWIDTH_MEDIUMBAND));
     opus_encoder_ctl(enc, OPUS_SET_BITRATE(2867));
     data_len = opus_encode(enc, pcm2, 960, data, 1000);
-    assert(data_len > 0);
+    opus_test_assert(data_len > 0);
 
     opus_encoder_destroy(enc);
     return 0;
diff --git a/tests/random_config.sh b/tests/random_config.sh
new file mode 100755
index 0000000..0cdd855
--- /dev/null
+++ b/tests/random_config.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+
+# Configure, build and test Opus once, using a randomly chosen combination of
+# compiler flags and configure options.
+#
+# Usage: random_config.sh <run_dir> <configure_dir> <oldvectors> <newvectors>
+#   run_dir        directory to create and build in (must not exist yet)
+#   configure_dir  directory that contains the configure script
+#   oldvectors     test vectors used when --disable-rfc8251 is picked
+#   newvectors     test vectors used otherwise
+#
+# The chosen settings and the outcome are written to random_config.txt in
+# run_dir. The script exits with status 1 at the first step that fails.
+
+dir="$1"
+mkdir "$dir" || exit 1
+cd "$dir" || exit 1
+
+configure_path="$2"
+config="random_config.txt"
+
+# Start from empty values so that nothing leaks in from the environment.
+approx=
+math=
+
+# Equal odds for: float approximations plus -ffast-math, float approximations
+# alone, or neither.
+case `seq 3 | shuf -n1` in
+1)
+approx=--enable-float-approx
+math=-ffast-math
+;;
+2)
+approx=--enable-float-approx
+;;
+esac
+
+CFLAGS='-g'
+
+# In the lists below a leading "\n" adds an empty entry, i.e. "flag left out".
+opt=`echo -e "-O1\n-O2\n-O3" | shuf -n1`
+
+#arch=-march=`echo -e "core2\nsandybridge\nbroadwell\nskylake" | shuf -n1`
+arch=`echo -e "\n-march=core2\n-march=sandybridge\n-march=broadwell\n-march=skylake\n-march=native" | shuf -n1`
+
+footprint=`echo -e "\n-DSMALL_FOOTPRINT" | shuf -n1`
+std=`echo -e "\n-std=c90\n-std=c99\n-std=c11\n-std=c17" | shuf -n1`
+sanitize=`echo -e "\n-fsanitize=address -fno-sanitize-recover=all\n-fsanitize=undefined -fno-sanitize-recover=all -fsanitize-recover=signed-integer-overflow" | shuf -n1`
+
+CFLAGS="$CFLAGS $std $opt $arch $footprint $math $sanitize"
+
+echo "CFLAGS=$CFLAGS" > "$config"
+
+lib=`echo -e "\n--disable-static\n--disable-shared" | shuf -n1`
+
+arithmetic=`echo -e "\n--enable-fixed-point\n--enable-fixed-point --enable-fixed-point-debug\n--enable-fixed-point --disable-float-api\n--enable-fixed-point --enable-fixed-point-debug --disable-float-api" | shuf -n1`
+
+custom=`echo -e "\n--enable-custom-modes" | shuf -n1`
+
+asm=`echo -e "\n--disable-asm\n--disable-rtcd\n--disable-intrinsics" | shuf -n1`
+#asm=`echo -e "\n--disable-asm\n--disable-intrinsics" | shuf -n1`
+
+assert=`echo -e "\n--enable-assertions" | shuf -n1`
+harden=`echo -e "\n--enable-hardening" | shuf -n1`
+fuzz=`echo -e "\n--enable-fuzzing" | shuf -n1`
+checkasm=`echo -e "\n--enable-check-asm" | shuf -n1`
+rfc8251=`echo -e "\n--disable-rfc8251" | shuf -n1`
+
+# Builds without the RFC 8251 changes are checked against the old vectors.
+if [ "$rfc8251" = --disable-rfc8251 ]
+then
+        vectors="$3"
+else
+        vectors="$4"
+fi
+echo using testvectors at "$vectors" >> "$config"
+
+config_opt="$lib $arithmetic $custom $asm $assert $harden $fuzz $checkasm $rfc8251 $approx"
+
+echo configure $config_opt >> "$config"
+
+export CFLAGS
+# $config_opt is left unquoted on purpose so it splits into separate arguments.
+if ! "$configure_path/configure" $config_opt > configure_output.txt 2>&1
+then
+        echo configure FAIL >> "$config"
+        exit 1
+fi
+
+if ! make > make_output.txt 2>&1
+then
+        echo make FAIL >> "$config"
+        exit 1
+fi
+
+#Run valgrind 5% of the time (minus the asan cases)
+if [ "`seq 20 | shuf -n1`" -ne 1 -o "$sanitize" = "-fsanitize=address -fno-sanitize-recover=all" ]
+then
+        make check > makecheck_output.txt 2>&1
+else
+        echo valgrind enabled >> "$config"
+        valgrind --trace-children=yes --error-exitcode=128 make check > makecheck_output.txt 2>&1
+fi
+
+# $? is the status of whichever "make check" variant ran in the if/else above.
+if [ $? -ne 0 ]
+then
+        echo check FAIL >> "$config"
+        exit 1
+fi
+
+rate=`echo -e "8000\n12000\n16000\n24000\n48000" | shuf -n1`
+echo testvectors for "$rate" Hz > testvectors_output.txt
+# run_vectors.sh is looked up three directory levels above the run directory.
+if ! ../../../run_vectors.sh . "$vectors" "$rate" >> testvectors_output.txt 2>&1
+then
+        echo testvectors FAIL >> "$config"
+        exit 1
+fi
+
+echo all tests PASS >> "$config"
+
+#When everything's good, do some cleaning up to save space
+make distclean > /dev/null 2>&1
+rm -f tmp.out
+gzip make_output.txt
diff --git a/tests/test_opus_api.c b/tests/test_opus_api.c
index fb385c6..0e7ed2c 100644
--- a/tests/test_opus_api.c
+++ b/tests/test_opus_api.c
@@ -1298,7 +1298,7 @@
    err=opus_encoder_ctl(enc,OPUS_GET_INBAND_FEC(null_int_ptr));
    if(err!=OPUS_BAD_ARG)test_failed();
    cfgs++;
-   CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,2,
+   CHECK_SETGET(OPUS_SET_INBAND_FEC(i),OPUS_GET_INBAND_FEC(&i),-1,3,
      1,0,
      "    OPUS_SET_INBAND_FEC .......................... OK.\n",
      "    OPUS_GET_INBAND_FEC .......................... OK.\n")
diff --git a/tests/test_opus_common.h b/tests/test_opus_common.h
index d96c7d8..5fb924f 100644
--- a/tests/test_opus_common.h
+++ b/tests/test_opus_common.h
@@ -81,5 +81,7 @@
   abort();
 }
 #define test_failed() _test_failed(__FILE__, __LINE__);
-
+/* Test-suite assertion: calls test_failed() when cond is false. Wrapped in
+   do/while(0) so that it behaves as a single statement, e.g. before an else. */
+#define opus_test_assert(cond) do {if (!(cond)) {test_failed();}} while (0)
 void regression_test(void);
diff --git a/tests/test_opus_encode.c b/tests/test_opus_encode.c
index 00795a1..d6e8e2d 100644
--- a/tests/test_opus_encode.c
+++ b/tests/test_opus_encode.c
@@ -297,6 +297,7 @@
   /*FIXME: encoder api tests, fs!=48k, mono, VBR*/
 
    fprintf(stdout,"  Encode+Decode tests.\n");
+   fflush(stdout);
 
    enc = opus_encoder_create(48000, 2, OPUS_APPLICATION_VOIP, &err);
    if(err != OPUS_OK || enc==NULL)test_failed();
@@ -466,6 +467,7 @@
             count++;
          }while(i<(SSAMPLES-MAX_FRAME_SAMP));
          fprintf(stdout,"    Mode %s FB encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate);
+         fflush(stdout);
       }
    }
 
@@ -543,6 +545,7 @@
             count++;
          }while(i<(SSAMPLES/12-MAX_FRAME_SAMP));
          fprintf(stdout,"    Mode %s NB dual-mono MS encode %s, %6d bps OK.\n",mstrings[modes[j]],rc==0?" VBR":rc==1?"CVBR":" CBR",rate);
+         fflush(stdout);
       }
    }
 
@@ -612,6 +615,7 @@
       i+=frame_size;
    }while(i<SAMPLES*4);
    fprintf(stdout,"    All framesize pairs switching encode, %d frames OK.\n",count);
+   fflush(stdout);
 
    if(opus_encoder_ctl(enc, OPUS_RESET_STATE)!=OPUS_OK)test_failed();
    opus_encoder_destroy(enc);
diff --git a/tests/test_opus_padding.c b/tests/test_opus_padding.c
index c22e8f0..c9ef737 100644
--- a/tests/test_opus_padding.c
+++ b/tests/test_opus_padding.c
@@ -39,7 +39,7 @@
 #define CHANNELS 2
 #define FRAMESIZE 5760
 
-int test_overflow(void)
+void test_overflow(void)
 {
   OpusDecoder *decoder;
   int result;
@@ -51,7 +51,7 @@
   fprintf(stderr, "  Checking for padding overflow... ");
   if (!in || !out) {
     fprintf(stderr, "FAIL (out of memory)\n");
-    return -1;
+    test_failed();
   }
   in[0] = 0xff;
   in[1] = 0x41;
@@ -71,21 +71,18 @@
   }
 
   fprintf(stderr, "OK.\n");
-
-  return 1;
 }
 
 int main(void)
 {
   const char *oversion;
-  int tests = 0;;
 
   iseed = 0;
   oversion = opus_get_version_string();
   if (!oversion) test_failed();
   fprintf(stderr, "Testing %s padding.\n", oversion);
 
-  tests += test_overflow();
+  test_overflow();
 
   fprintf(stderr, "All padding tests passed.\n");
 
diff --git a/tests/test_opus_projection.c b/tests/test_opus_projection.c
index 5f0d672..4e06613 100644
--- a/tests/test_opus_projection.c
+++ b/tests/test_opus_projection.c
@@ -29,7 +29,6 @@
 #include "config.h"
 #endif
 
-#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>