From e00426c512a7905f5f925d382c443bab7a0ca693 Mon Sep 17 00:00:00 2001
From: unknown <sneakyfish5.sneaky@gmail.com>
Date: Sat, 2 Nov 2019 11:59:07 -0500
Subject: Update opus to 1.3.1 and opusfile to 0.11

---
 thirdparty/opus/celt/arm/arm2gnu.pl                |   4 +-
 thirdparty/opus/celt/arm/arm_celt_map.c            |  21 +-
 thirdparty/opus/celt/arm/celt_fft_ne10.c           | 173 +++++++
 thirdparty/opus/celt/arm/celt_mdct_ne10.c          | 258 ++++++++++
 thirdparty/opus/celt/arm/celt_ne10_fft.c           | 174 -------
 thirdparty/opus/celt/arm/celt_ne10_mdct.c          | 258 ----------
 thirdparty/opus/celt/arm/celt_neon_intr.c          | 110 +---
 .../opus/celt/arm/celt_pitch_xcorr_arm-gnu.S       | 551 ---------------------
 thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s    |   6 +-
 thirdparty/opus/celt/arm/fft_arm.h                 |   1 -
 thirdparty/opus/celt/arm/fixed_armv4.h             |   6 +-
 thirdparty/opus/celt/arm/fixed_armv5e.h            |   4 +-
 thirdparty/opus/celt/arm/mdct_arm.h                |   1 -
 thirdparty/opus/celt/arm/pitch_arm.h               |  56 ++-
 thirdparty/opus/celt/arm/pitch_neon_intr.c         | 290 +++++++++++
 15 files changed, 802 insertions(+), 1111 deletions(-)
 create mode 100644 thirdparty/opus/celt/arm/celt_fft_ne10.c
 create mode 100644 thirdparty/opus/celt/arm/celt_mdct_ne10.c
 delete mode 100644 thirdparty/opus/celt/arm/celt_ne10_fft.c
 delete mode 100644 thirdparty/opus/celt/arm/celt_ne10_mdct.c
 delete mode 100644 thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
 create mode 100644 thirdparty/opus/celt/arm/pitch_neon_intr.c

(limited to 'thirdparty/opus/celt/arm')

diff --git a/thirdparty/opus/celt/arm/arm2gnu.pl b/thirdparty/opus/celt/arm/arm2gnu.pl
index 6c922ac819..a2895f7445 100755
--- a/thirdparty/opus/celt/arm/arm2gnu.pl
+++ b/thirdparty/opus/celt/arm/arm2gnu.pl
@@ -164,11 +164,11 @@ while (<>) {
         $prefix = "";
         if ($proc)
         {
-            $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc) unless ($apple);
+            $prefix = $prefix.sprintf("\t.type\t%s, %%function", $proc) unless ($apple);
             # Make sure we $prefix isn't empty here (for the $apple case).
             # We handle mangling the label here, make sure it doesn't match
             # the label handling below (if $prefix would be empty).
-            $prefix = "; ";
+            $prefix = $prefix."; ";
             push(@proc_stack, $proc);
             s/^[A-Za-z_\.]\w+/$symprefix$&:/;
         }
diff --git a/thirdparty/opus/celt/arm/arm_celt_map.c b/thirdparty/opus/celt/arm/arm_celt_map.c
index 4d4d069a86..ca988b66f5 100644
--- a/thirdparty/opus/celt/arm/arm_celt_map.c
+++ b/thirdparty/opus/celt/arm/arm_celt_map.c
@@ -35,12 +35,29 @@
 
 #if defined(OPUS_HAVE_RTCD)
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+  celt_inner_prod_c,   /* ARMv4 */
+  celt_inner_prod_c,   /* EDSP */
+  celt_inner_prod_c,   /* Media */
+  celt_inner_prod_neon /* NEON */
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2) = {
+  dual_inner_prod_c,   /* ARMv4 */
+  dual_inner_prod_c,   /* EDSP */
+  dual_inner_prod_c,   /* Media */
+  dual_inner_prod_neon /* NEON */
+};
+# endif
+
 # if defined(FIXED_POINT)
 #  if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
     (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
     (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
 opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-    const opus_val16 *, opus_val32 *, int , int) = {
+    const opus_val16 *, opus_val32 *, int, int, int) = {
   celt_pitch_xcorr_c,               /* ARMv4 */
   MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
@@ -51,7 +68,7 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
 # else /* !FIXED_POINT */
 #  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
 void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-    const opus_val16 *, opus_val32 *, int, int) = {
+    const opus_val16 *, opus_val32 *, int, int, int) = {
   celt_pitch_xcorr_c,              /* ARMv4 */
   celt_pitch_xcorr_c,              /* EDSP */
   celt_pitch_xcorr_c,              /* Media */
diff --git a/thirdparty/opus/celt/arm/celt_fft_ne10.c b/thirdparty/opus/celt/arm/celt_fft_ne10.c
new file mode 100644
index 0000000000..ea5fd7808b
--- /dev/null
+++ b/thirdparty/opus/celt/arm/celt_fft_ne10.c
@@ -0,0 +1,173 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_fft_ne10.c
+   @brief ARM Neon optimizations for fft using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include <NE10_dsp.h>
+#include "os_support.h"
+#include "kiss_fft.h"
+#include "stack_alloc.h"
+
+#if !defined(FIXED_POINT)
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
+#else
+# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
+#endif
+
+#if defined(CUSTOM_MODES)
+
+/* nfft lengths in NE10 that support scaled fft */
+# define NE10_FFTSCALED_SUPPORT_MAX 4
+static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
+   480, 240, 120, 60
+};
+
+int opus_fft_alloc_arm_neon(kiss_fft_state *st)
+{
+   int i;
+   size_t memneeded = sizeof(struct arch_fft_state);
+
+   st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);
+   if (!st->arch_fft)
+      return -1;
+
+   for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {
+      if(st->nfft == ne10_fft_scaled_support[i])
+         break;
+   }
+   if (i == NE10_FFTSCALED_SUPPORT_MAX) {
+      /* This nfft length (scaled fft) is not supported in NE10 */
+      st->arch_fft->is_supported = 0;
+      st->arch_fft->priv = NULL;
+   }
+   else {
+      st->arch_fft->is_supported = 1;
+      st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
+      if (st->arch_fft->priv == NULL) {
+         return -1;
+      }
+   }
+   return 0;
+}
+
+void opus_fft_free_arm_neon(kiss_fft_state *st)
+{
+   NE10_FFT_CFG_TYPE_T cfg;
+
+   if (!st->arch_fft)
+      return;
+
+   cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
+   if (cfg)
+      NE10_FFT_DESTROY_C2C_TYPE(cfg);
+   opus_free(st->arch_fft);
+}
+#endif
+
+void opus_fft_neon(const kiss_fft_state *st,
+                   const kiss_fft_cpx *fin,
+                   kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T state;
+   NE10_FFT_CFG_TYPE_T cfg = &state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
+   SAVE_STACK;
+   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (!st->arch_fft->is_supported) {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_fft_c(st, fin, fout);
+   }
+   else {
+      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
+#if !defined(FIXED_POINT)
+      state.is_forward_scaled = 1;
+
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 0);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 0, 1);
+#endif
+   }
+   RESTORE_STACK;
+}
+
+void opus_ifft_neon(const kiss_fft_state *st,
+                    const kiss_fft_cpx *fin,
+                    kiss_fft_cpx *fout)
+{
+   NE10_FFT_STATE_TYPE_T state;
+   NE10_FFT_CFG_TYPE_T cfg = &state;
+   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
+   SAVE_STACK;
+   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
+
+   if (!st->arch_fft->is_supported) {
+      /* This nfft length (scaled fft) not supported in NE10 */
+      opus_ifft_c(st, fin, fout);
+   }
+   else {
+      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
+      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
+#if !defined(FIXED_POINT)
+      state.is_backward_scaled = 0;
+
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 1);
+#else
+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
+                                (NE10_FFT_CPX_TYPE_T *)fin,
+                                cfg, 1, 0);
+#endif
+   }
+   RESTORE_STACK;
+}
diff --git a/thirdparty/opus/celt/arm/celt_mdct_ne10.c b/thirdparty/opus/celt/arm/celt_mdct_ne10.c
new file mode 100644
index 0000000000..3531d02d10
--- /dev/null
+++ b/thirdparty/opus/celt/arm/celt_mdct_ne10.c
@@ -0,0 +1,258 @@
+/* Copyright (c) 2015 Xiph.Org Foundation
+   Written by Viswanath Puttagunta */
+/**
+   @file celt_mdct_ne10.c
+   @brief ARM Neon optimizations for mdct using NE10 library
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef SKIP_CONFIG_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#endif
+
+#include "kiss_fft.h"
+#include "_kiss_fft_guts.h"
+#include "mdct.h"
+#include "stack_alloc.h"
+
+void clt_mdct_forward_neon(const mdct_lookup *l,
+                           kiss_fft_scalar *in,
+                           kiss_fft_scalar * OPUS_RESTRICT out,
+                           const opus_val16 *window,
+                           int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   VARDECL(kiss_fft_cpx, f2);
+   const kiss_fft_state *st = l->kfft[shift];
+   const kiss_twiddle_scalar *trig;
+
+   SAVE_STACK;
+
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+   ALLOC(f2, N4, kiss_fft_cpx);
+
+   /* Consider the input to be composed of four blocks: [a, b, c, d] */
+   /* Window, shuffle, fold */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
+      for(i=0;i<((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
+         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
+         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+      wp1 = window;
+      wp2 = window+overlap-1;
+      for(;i<N4-((overlap+3)>>2);i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ = *xp2;
+         *yp++ = *xp1;
+         xp1+=2;
+         xp2-=2;
+      }
+      for(;i<N4;i++)
+      {
+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
+         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
+         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
+         xp1+=2;
+         xp2-=2;
+         wp1+=2;
+         wp2-=2;
+      }
+   }
+   /* Pre-rotation */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar *t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_cpx yc;
+         kiss_twiddle_scalar t0, t1;
+         kiss_fft_scalar re, im, yr, yi;
+         t0 = t[i];
+         t1 = t[N4+i];
+         re = *yp++;
+         im = *yp++;
+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
+         yc.r = yr;
+         yc.i = yi;
+         f2[i] = yc;
+      }
+   }
+
+   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
+
+   /* Post-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
+         *yp1 = yr;
+         *yp2 = yi;
+         fp++;
+         yp1 += 2*stride;
+         yp2 -= 2*stride;
+      }
+   }
+   RESTORE_STACK;
+}
+
+void clt_mdct_backward_neon(const mdct_lookup *l,
+                            kiss_fft_scalar *in,
+                            kiss_fft_scalar * OPUS_RESTRICT out,
+                            const opus_val16 * OPUS_RESTRICT window,
+                            int overlap, int shift, int stride, int arch)
+{
+   int i;
+   int N, N2, N4;
+   VARDECL(kiss_fft_scalar, f);
+   const kiss_twiddle_scalar *trig;
+   const kiss_fft_state *st = l->kfft[shift];
+
+   N = l->n;
+   trig = l->trig;
+   for (i=0;i<shift;i++)
+   {
+      N >>= 1;
+      trig += N;
+   }
+   N2 = N>>1;
+   N4 = N>>2;
+
+   ALLOC(f, N2, kiss_fft_scalar);
+
+   /* Pre-rotate */
+   {
+      /* Temp pointers to make it really clear to the compiler what we're doing */
+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
+      kiss_fft_scalar * OPUS_RESTRICT yp = f;
+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
+      for(i=0;i<N4;i++)
+      {
+         kiss_fft_scalar yr, yi;
+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
+         yp[2*i] = yr;
+         yp[2*i+1] = yi;
+         xp1+=2*stride;
+         xp2-=2*stride;
+      }
+   }
+
+   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
+
+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
+      it in-place. */
+   {
+      kiss_fft_scalar * yp0 = out+(overlap>>1);
+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
+      const kiss_twiddle_scalar *t = &trig[0];
+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
+         middle pair will be computed twice. */
+      for(i=0;i<(N4+1)>>1;i++)
+      {
+         kiss_fft_scalar re, im, yr, yi;
+         kiss_twiddle_scalar t0, t1;
+         re = yp0[0];
+         im = yp0[1];
+         t0 = t[i];
+         t1 = t[N4+i];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         re = yp1[0];
+         im = yp1[1];
+         yp0[0] = yr;
+         yp1[1] = yi;
+
+         t0 = t[(N4-i-1)];
+         t1 = t[(N2-i-1)];
+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
+         yr = S_MUL(re,t0) + S_MUL(im,t1);
+         yi = S_MUL(re,t1) - S_MUL(im,t0);
+         yp1[0] = yr;
+         yp0[1] = yi;
+         yp0 += 2;
+         yp1 -= 2;
+      }
+   }
+
+   /* Mirror on both sides for TDAC */
+   {
+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
+      const opus_val16 * OPUS_RESTRICT wp1 = window;
+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
+
+      for(i = 0; i < overlap/2; i++)
+      {
+         kiss_fft_scalar x1, x2;
+         x1 = *xp1;
+         x2 = *yp1;
+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
+         wp1++;
+         wp2--;
+      }
+   }
+   RESTORE_STACK;
+}
diff --git a/thirdparty/opus/celt/arm/celt_ne10_fft.c b/thirdparty/opus/celt/arm/celt_ne10_fft.c
deleted file mode 100644
index 42d96a7117..0000000000
--- a/thirdparty/opus/celt/arm/celt_ne10_fft.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2015 Xiph.Org Foundation
-   Written by Viswanath Puttagunta */
-/**
-   @file celt_ne10_fft.c
-   @brief ARM Neon optimizations for fft using NE10 library
- */
-
-/*
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions
-   are met:
-
-   - Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef SKIP_CONFIG_H
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-#endif
-
-#include <NE10_init.h>
-#include <NE10_dsp.h>
-#include "os_support.h"
-#include "kiss_fft.h"
-#include "stack_alloc.h"
-
-#if !defined(FIXED_POINT)
-# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon
-# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t
-# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t
-# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32
-# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t
-# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon
-#else
-# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)
-# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t
-# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t
-# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
-# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32
-# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t
-# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon
-#endif
-
-#if defined(CUSTOM_MODES)
-
-/* nfft lengths in NE10 that support scaled fft */
-# define NE10_FFTSCALED_SUPPORT_MAX 4
-static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {
-   480, 240, 120, 60
-};
-
-int opus_fft_alloc_arm_neon(kiss_fft_state *st)
-{
-   int i;
-   size_t memneeded = sizeof(struct arch_fft_state);
-
-   st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);
-   if (!st->arch_fft)
-      return -1;
-
-   for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {
-      if(st->nfft == ne10_fft_scaled_support[i])
-         break;
-   }
-   if (i == NE10_FFTSCALED_SUPPORT_MAX) {
-      /* This nfft length (scaled fft) is not supported in NE10 */
-      st->arch_fft->is_supported = 0;
-      st->arch_fft->priv = NULL;
-   }
-   else {
-      st->arch_fft->is_supported = 1;
-      st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);
-      if (st->arch_fft->priv == NULL) {
-         return -1;
-      }
-   }
-   return 0;
-}
-
-void opus_fft_free_arm_neon(kiss_fft_state *st)
-{
-   NE10_FFT_CFG_TYPE_T cfg;
-
-   if (!st->arch_fft)
-      return;
-
-   cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;
-   if (cfg)
-      NE10_FFT_DESTROY_C2C_TYPE(cfg);
-   opus_free(st->arch_fft);
-}
-#endif
-
-void opus_fft_neon(const kiss_fft_state *st,
-                   const kiss_fft_cpx *fin,
-                   kiss_fft_cpx *fout)
-{
-   NE10_FFT_STATE_TYPE_T state;
-   NE10_FFT_CFG_TYPE_T cfg = &state;
-   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
-   SAVE_STACK;
-   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
-
-   if (!st->arch_fft->is_supported) {
-      /* This nfft length (scaled fft) not supported in NE10 */
-      opus_fft_c(st, fin, fout);
-   }
-   else {
-      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
-      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
-#if !defined(FIXED_POINT)
-      state.is_forward_scaled = 1;
-
-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
-                                (NE10_FFT_CPX_TYPE_T *)fin,
-                                cfg, 0);
-#else
-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
-                                (NE10_FFT_CPX_TYPE_T *)fin,
-                                cfg, 0, 1);
-#endif
-   }
-   RESTORE_STACK;
-}
-
-void opus_ifft_neon(const kiss_fft_state *st,
-                    const kiss_fft_cpx *fin,
-                    kiss_fft_cpx *fout)
-{
-   NE10_FFT_STATE_TYPE_T state;
-   NE10_FFT_CFG_TYPE_T cfg = &state;
-   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);
-   SAVE_STACK;
-   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);
-
-   if (!st->arch_fft->is_supported) {
-      /* This nfft length (scaled fft) not supported in NE10 */
-      opus_ifft_c(st, fin, fout);
-   }
-   else {
-      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));
-      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];
-#if !defined(FIXED_POINT)
-      state.is_backward_scaled = 0;
-
-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
-                                (NE10_FFT_CPX_TYPE_T *)fin,
-                                cfg, 1);
-#else
-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,
-                                (NE10_FFT_CPX_TYPE_T *)fin,
-                                cfg, 1, 0);
-#endif
-   }
-   RESTORE_STACK;
-}
diff --git a/thirdparty/opus/celt/arm/celt_ne10_mdct.c b/thirdparty/opus/celt/arm/celt_ne10_mdct.c
deleted file mode 100644
index 293c3efd7a..0000000000
--- a/thirdparty/opus/celt/arm/celt_ne10_mdct.c
+++ /dev/null
@@ -1,258 +0,0 @@
-/* Copyright (c) 2015 Xiph.Org Foundation
-   Written by Viswanath Puttagunta */
-/**
-   @file celt_ne10_mdct.c
-   @brief ARM Neon optimizations for mdct using NE10 library
- */
-
-/*
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions
-   are met:
-
-   - Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
-   - Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#ifndef SKIP_CONFIG_H
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-#endif
-
-#include "kiss_fft.h"
-#include "_kiss_fft_guts.h"
-#include "mdct.h"
-#include "stack_alloc.h"
-
-void clt_mdct_forward_neon(const mdct_lookup *l,
-                           kiss_fft_scalar *in,
-                           kiss_fft_scalar * OPUS_RESTRICT out,
-                           const opus_val16 *window,
-                           int overlap, int shift, int stride, int arch)
-{
-   int i;
-   int N, N2, N4;
-   VARDECL(kiss_fft_scalar, f);
-   VARDECL(kiss_fft_cpx, f2);
-   const kiss_fft_state *st = l->kfft[shift];
-   const kiss_twiddle_scalar *trig;
-
-   SAVE_STACK;
-
-   N = l->n;
-   trig = l->trig;
-   for (i=0;i<shift;i++)
-   {
-      N >>= 1;
-      trig += N;
-   }
-   N2 = N>>1;
-   N4 = N>>2;
-
-   ALLOC(f, N2, kiss_fft_scalar);
-   ALLOC(f2, N4, kiss_fft_cpx);
-
-   /* Consider the input to be composed of four blocks: [a, b, c, d] */
-   /* Window, shuffle, fold */
-   {
-      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
-      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
-      kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
-      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
-      for(i=0;i<((overlap+3)>>2);i++)
-      {
-         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
-         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
-         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
-         xp1+=2;
-         xp2-=2;
-         wp1+=2;
-         wp2-=2;
-      }
-      wp1 = window;
-      wp2 = window+overlap-1;
-      for(;i<N4-((overlap+3)>>2);i++)
-      {
-         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-         *yp++ = *xp2;
-         *yp++ = *xp1;
-         xp1+=2;
-         xp2-=2;
-      }
-      for(;i<N4;i++)
-      {
-         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
-         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
-         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
-         xp1+=2;
-         xp2-=2;
-         wp1+=2;
-         wp2-=2;
-      }
-   }
-   /* Pre-rotation */
-   {
-      kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const kiss_twiddle_scalar *t = &trig[0];
-      for(i=0;i<N4;i++)
-      {
-         kiss_fft_cpx yc;
-         kiss_twiddle_scalar t0, t1;
-         kiss_fft_scalar re, im, yr, yi;
-         t0 = t[i];
-         t1 = t[N4+i];
-         re = *yp++;
-         im = *yp++;
-         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
-         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
-         yc.r = yr;
-         yc.i = yi;
-         f2[i] = yc;
-      }
-   }
-
-   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
-
-   /* Post-rotate */
-   {
-      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
-      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
-      const kiss_twiddle_scalar *t = &trig[0];
-      /* Temp pointers to make it really clear to the compiler what we're doing */
-      for(i=0;i<N4;i++)
-      {
-         kiss_fft_scalar yr, yi;
-         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
-         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
-         *yp1 = yr;
-         *yp2 = yi;
-         fp++;
-         yp1 += 2*stride;
-         yp2 -= 2*stride;
-      }
-   }
-   RESTORE_STACK;
-}
-
-void clt_mdct_backward_neon(const mdct_lookup *l,
-                            kiss_fft_scalar *in,
-                            kiss_fft_scalar * OPUS_RESTRICT out,
-                            const opus_val16 * OPUS_RESTRICT window,
-                            int overlap, int shift, int stride, int arch)
-{
-   int i;
-   int N, N2, N4;
-   VARDECL(kiss_fft_scalar, f);
-   const kiss_twiddle_scalar *trig;
-   const kiss_fft_state *st = l->kfft[shift];
-
-   N = l->n;
-   trig = l->trig;
-   for (i=0;i<shift;i++)
-   {
-      N >>= 1;
-      trig += N;
-   }
-   N2 = N>>1;
-   N4 = N>>2;
-
-   ALLOC(f, N2, kiss_fft_scalar);
-
-   /* Pre-rotate */
-   {
-      /* Temp pointers to make it really clear to the compiler what we're doing */
-      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
-      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
-      kiss_fft_scalar * OPUS_RESTRICT yp = f;
-      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
-      for(i=0;i<N4;i++)
-      {
-         kiss_fft_scalar yr, yi;
-         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
-         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
-         yp[2*i] = yr;
-         yp[2*i+1] = yi;
-         xp1+=2*stride;
-         xp2-=2*stride;
-      }
-   }
-
-   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
-
-   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
-      it in-place. */
-   {
-      kiss_fft_scalar * yp0 = out+(overlap>>1);
-      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
-      const kiss_twiddle_scalar *t = &trig[0];
-      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
-         middle pair will be computed twice. */
-      for(i=0;i<(N4+1)>>1;i++)
-      {
-         kiss_fft_scalar re, im, yr, yi;
-         kiss_twiddle_scalar t0, t1;
-         re = yp0[0];
-         im = yp0[1];
-         t0 = t[i];
-         t1 = t[N4+i];
-         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) + S_MUL(im,t1);
-         yi = S_MUL(re,t1) - S_MUL(im,t0);
-         re = yp1[0];
-         im = yp1[1];
-         yp0[0] = yr;
-         yp1[1] = yi;
-
-         t0 = t[(N4-i-1)];
-         t1 = t[(N2-i-1)];
-         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = S_MUL(re,t0) + S_MUL(im,t1);
-         yi = S_MUL(re,t1) - S_MUL(im,t0);
-         yp1[0] = yr;
-         yp0[1] = yi;
-         yp0 += 2;
-         yp1 -= 2;
-      }
-   }
-
-   /* Mirror on both sides for TDAC */
-   {
-      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
-      const opus_val16 * OPUS_RESTRICT wp1 = window;
-      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
-
-      for(i = 0; i < overlap/2; i++)
-      {
-         kiss_fft_scalar x1, x2;
-         x1 = *xp1;
-         x2 = *yp1;
-         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
-         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
-         wp1++;
-         wp2--;
-      }
-   }
-   RESTORE_STACK;
-}
diff --git a/thirdparty/opus/celt/arm/celt_neon_intr.c b/thirdparty/opus/celt/arm/celt_neon_intr.c
index 47bbe3dc22..effda769d0 100644
--- a/thirdparty/opus/celt/arm/celt_neon_intr.c
+++ b/thirdparty/opus/celt/arm/celt_neon_intr.c
@@ -191,121 +191,21 @@ static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
    vst1q_f32(sum, SUMM);
 }
 
-/*
- * Function: xcorr_kernel_neon_float_process1
- * ---------------------------------
- * Computes single correlation values and stores in *sum
- */
-static void xcorr_kernel_neon_float_process1(const float32_t *x,
-      const float32_t *y, float32_t *sum, int len) {
-   float32x4_t XX[4];
-   float32x4_t YY[4];
-   float32x2_t XX_2;
-   float32x2_t YY_2;
-   float32x4_t SUMM;
-   float32x2_t SUMM_2[2];
-   const float32_t *xi = x;
-   const float32_t *yi = y;
-
-   SUMM = vdupq_n_f32(0);
-
-   /* Work on 16 values per iteration */
-   while (len >= 16) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-      XX[2] = vld1q_f32(xi);
-      xi += 4;
-      XX[3] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-      YY[2] = vld1q_f32(yi);
-      yi += 4;
-      YY[3] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
-      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
-      len -= 16;
-   }
-
-   /* Work on 8 values */
-   if (len >= 8) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      len -= 8;
-   }
-
-   /* Work on 4 values */
-   if (len >= 4) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      len -= 4;
-   }
-
-   /* Start accumulating results */
-   SUMM_2[0] = vget_low_f32(SUMM);
-   if (len >= 2) {
-      /* While at it, consume 2 more values if available */
-      XX_2 = vld1_f32(xi);
-      xi += 2;
-      YY_2 = vld1_f32(yi);
-      yi += 2;
-      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
-      len -= 2;
-   }
-   SUMM_2[1] = vget_high_f32(SUMM);
-   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
-   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
-   /* Ok, now we have result accumulated in SUMM_2[0].0 */
-
-   if (len > 0) {
-      /* Case when you have one value left */
-      XX_2 = vld1_dup_f32(xi);
-      YY_2 = vld1_dup_f32(yi);
-      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
-   }
-
-   vst1_lane_f32(sum, SUMM_2[0], 0);
-}
-
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
-                        opus_val32 *xcorr, int len, int max_pitch) {
+                        opus_val32 *xcorr, int len, int max_pitch, int arch) {
    int i;
+   (void)arch;
    celt_assert(max_pitch > 0);
-   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
 
    for (i = 0; i < (max_pitch-3); i += 4) {
       xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
             (float32_t *)xcorr+i, len);
    }
 
-   /* In case max_pitch isn't multiple of 4
-    * compute single correlation value per iteration
-    */
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    for (; i < max_pitch; i++) {
-      xcorr_kernel_neon_float_process1((const float32_t *)_x,
-            (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
+      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
    }
 }
 #endif
diff --git a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
deleted file mode 100644
index 5b2ee55a10..0000000000
--- a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
+++ /dev/null
@@ -1,551 +0,0 @@
-    .syntax unified
-@ Copyright (c) 2007-2008 CSIRO
-@ Copyright (c) 2007-2009 Xiph.Org Foundation
-@ Copyright (c) 2013      Parrot
-@ Written by Aurélien Zanelli
-@
-@ Redistribution and use in source and binary forms, with or without
-@ modification, are permitted provided that the following conditions
-@ are met:
-@
-@ - Redistributions of source code must retain the above copyright
-@ notice, this list of conditions and the following disclaimer.
-@
-@ - Redistributions in binary form must reproduce the above copyright
-@ notice, this list of conditions and the following disclaimer in the
-@ documentation and/or other materials provided with the distribution.
-@
-@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    .text;   .p2align 2;   .arch armv7-a
-   .fpu neon
-   .object_arch armv4t
-
-  .include "celt/arm/armopts-gnu.S"
-
- .if OPUS_ARM_MAY_HAVE_EDSP
-  .global celt_pitch_xcorr_edsp
- .endif
-
- .if OPUS_ARM_MAY_HAVE_NEON
-  .global celt_pitch_xcorr_neon
- .endif
-
- .if OPUS_ARM_MAY_HAVE_NEON
-
-@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
-; xcorr_kernel_neon: @ PROC
-xcorr_kernel_neon_start:
-  @ input:
-  @   r3     = int         len
-  @   r4     = opus_val16 *x
-  @   r5     = opus_val16 *y
-  @   q0     = opus_val32  sum[4]
-  @ output:
-  @   q0     = opus_val32  sum[4]
-  @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
-  @ internal usage:
-  @   r12 = int j
-  @   d3  = y_3|y_2|y_1|y_0
-  @   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
-  @   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
-  @   q8  = scratch
-  @
-  @ Load y[0...3]
-  @ This requires len>0 to always be valid (which we assert in the C code).
-  VLD1.16      {d5}, [r5]!
-  SUBS         r12, r3, #8
-  BLE xcorr_kernel_neon_process4
-@ Process 8 samples at a time.
-@ This loop loads one y value more than we actually need. Therefore we have to
-@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
-@ reading past the end of the array.
-xcorr_kernel_neon_process8:
-  @ This loop has 19 total instructions (10 cycles to issue, minimum), with
-  @ - 2 cycles of ARM insrtuctions,
-  @ - 10 cycles of load/store/byte permute instructions, and
-  @ - 9 cycles of data processing instructions.
-  @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
-  @ latter two categories, meaning the whole loop should run in 10 cycles per
-  @ iteration, barring cache misses.
-  @
-  @ Load x[0...7]
-  VLD1.16      {d6, d7}, [r4]!
-  @ Unlike VMOV, VAND is a data processsing instruction (and doesn't get
-  @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
-  VAND         d3, d5, d5
-  SUBS         r12, r12, #8
-  @ Load y[4...11]
-  VLD1.16      {d4, d5}, [r5]!
-  VMLAL.S16    q0, d3, d6[0]
-  VEXT.16      d16, d3, d4, #1
-  VMLAL.S16    q0, d4, d7[0]
-  VEXT.16      d17, d4, d5, #1
-  VMLAL.S16    q0, d16, d6[1]
-  VEXT.16      d16, d3, d4, #2
-  VMLAL.S16    q0, d17, d7[1]
-  VEXT.16      d17, d4, d5, #2
-  VMLAL.S16    q0, d16, d6[2]
-  VEXT.16      d16, d3, d4, #3
-  VMLAL.S16    q0, d17, d7[2]
-  VEXT.16      d17, d4, d5, #3
-  VMLAL.S16    q0, d16, d6[3]
-  VMLAL.S16    q0, d17, d7[3]
-  BGT xcorr_kernel_neon_process8
-@ Process 4 samples here if we have > 4 left (still reading one extra y value).
-xcorr_kernel_neon_process4:
-  ADDS         r12, r12, #4
-  BLE xcorr_kernel_neon_process2
-  @ Load x[0...3]
-  VLD1.16      d6, [r4]!
-  @ Use VAND since it's a data processing instruction again.
-  VAND         d4, d5, d5
-  SUB          r12, r12, #4
-  @ Load y[4...7]
-  VLD1.16      d5, [r5]!
-  VMLAL.S16    q0, d4, d6[0]
-  VEXT.16      d16, d4, d5, #1
-  VMLAL.S16    q0, d16, d6[1]
-  VEXT.16      d16, d4, d5, #2
-  VMLAL.S16    q0, d16, d6[2]
-  VEXT.16      d16, d4, d5, #3
-  VMLAL.S16    q0, d16, d6[3]
-@ Process 2 samples here if we have > 2 left (still reading one extra y value).
-xcorr_kernel_neon_process2:
-  ADDS         r12, r12, #2
-  BLE xcorr_kernel_neon_process1
-  @ Load x[0...1]
-  VLD2.16      {d6[],d7[]}, [r4]!
-  @ Use VAND since it's a data processing instruction again.
-  VAND         d4, d5, d5
-  SUB          r12, r12, #2
-  @ Load y[4...5]
-  VLD1.32      {d5[]}, [r5]!
-  VMLAL.S16    q0, d4, d6
-  VEXT.16      d16, d4, d5, #1
-  @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
-  @ instead of VEXT, since it's a data-processing instruction.
-  VSRI.64      d5, d4, #32
-  VMLAL.S16    q0, d16, d7
-@ Process 1 sample using the extra y value we loaded above.
-xcorr_kernel_neon_process1:
-  @ Load next *x
-  VLD1.16      {d6[]}, [r4]!
-  ADDS         r12, r12, #1
-  @ y[0...3] are left in d5 from prior iteration(s) (if any)
-  VMLAL.S16    q0, d5, d6
-  MOVLE        pc, lr
-@ Now process 1 last sample, not reading ahead.
-  @ Load last *y
-  VLD1.16      {d4[]}, [r5]!
-  VSRI.64      d4, d5, #16
-  @ Load last *x
-  VLD1.16      {d6[]}, [r4]!
-  VMLAL.S16    q0, d4, d6
-  MOV          pc, lr
-	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
-
-@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-@  opus_val32 *xcorr, int len, int max_pitch)
-; celt_pitch_xcorr_neon: @ PROC
-  @ input:
-  @   r0  = opus_val16 *_x
-  @   r1  = opus_val16 *_y
-  @   r2  = opus_val32 *xcorr
-  @   r3  = int         len
-  @ output:
-  @   r0  = int         maxcorr
-  @ internal usage:
-  @   r4  = opus_val16 *x (for xcorr_kernel_neon())
-  @   r5  = opus_val16 *y (for xcorr_kernel_neon())
-  @   r6  = int         max_pitch
-  @   r12 = int         j
-  @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
-  STMFD        sp!, {r4-r6, lr}
-  LDR          r6, [sp, #16]
-  VMOV.S32     q15, #1
-  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-  SUBS         r6, r6, #4
-  BLT celt_pitch_xcorr_neon_process4_done
-celt_pitch_xcorr_neon_process4:
-  @ xcorr_kernel_neon parameters:
-  @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
-  MOV          r4, r0
-  MOV          r5, r1
-  VEOR         q0, q0, q0
-  @ xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3.
-  @ So we don't save/restore any other registers.
-  BL xcorr_kernel_neon_start
-  SUBS         r6, r6, #4
-  VST1.32      {q0}, [r2]!
-  @ _y += 4
-  ADD          r1, r1, #8
-  VMAX.S32     q15, q15, q0
-  @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
-  BGE celt_pitch_xcorr_neon_process4
-@ We have less than 4 sums left to compute.
-celt_pitch_xcorr_neon_process4_done:
-  ADDS         r6, r6, #4
-  @ Reduce maxcorr to a single value
-  VMAX.S32     d30, d30, d31
-  VPMAX.S32    d30, d30, d30
-  @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
-  BLE celt_pitch_xcorr_neon_done
-@ Now compute each remaining sum one at a time.
-celt_pitch_xcorr_neon_process_remaining:
-  MOV          r4, r0
-  MOV          r5, r1
-  VMOV.I32     q0, #0
-  SUBS         r12, r3, #8
-  BLT celt_pitch_xcorr_neon_process_remaining4
-@ Sum terms 8 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop8:
-  @ Load x[0...7]
-  VLD1.16      {q1}, [r4]!
-  @ Load y[0...7]
-  VLD1.16      {q2}, [r5]!
-  SUBS         r12, r12, #8
-  VMLAL.S16    q0, d4, d2
-  VMLAL.S16    q0, d5, d3
-  BGE celt_pitch_xcorr_neon_process_remaining_loop8
-@ Sum terms 4 at a time.
-celt_pitch_xcorr_neon_process_remaining4:
-  ADDS         r12, r12, #4
-  BLT celt_pitch_xcorr_neon_process_remaining4_done
-  @ Load x[0...3]
-  VLD1.16      {d2}, [r4]!
-  @ Load y[0...3]
-  VLD1.16      {d3}, [r5]!
-  SUB          r12, r12, #4
-  VMLAL.S16    q0, d3, d2
-celt_pitch_xcorr_neon_process_remaining4_done:
-  @ Reduce the sum to a single value.
-  VADD.S32     d0, d0, d1
-  VPADDL.S32   d0, d0
-  ADDS         r12, r12, #4
-  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
-@ Sum terms 1 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop1:
-  VLD1.16      {d2[]}, [r4]!
-  VLD1.16      {d3[]}, [r5]!
-  SUBS         r12, r12, #1
-  VMLAL.S16    q0, d2, d3
-  BGT celt_pitch_xcorr_neon_process_remaining_loop1
-celt_pitch_xcorr_neon_process_remaining_loop_done:
-  VST1.32      {d0[0]}, [r2]!
-  VMAX.S32     d30, d30, d0
-  SUBS         r6, r6, #1
-  @ _y++
-  ADD          r1, r1, #2
-  @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
-  BGT celt_pitch_xcorr_neon_process_remaining
-celt_pitch_xcorr_neon_done:
-  VMOV.32      r0, d30[0]
-  LDMFD        sp!, {r4-r6, pc}
-	.size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon  @ ENDP
-
- .endif
-
- .if OPUS_ARM_MAY_HAVE_EDSP
-
-@ This will get used on ARMv7 devices without NEON, so it has been optimized
-@ to take advantage of dual-issuing where possible.
-; xcorr_kernel_edsp: @ PROC
-xcorr_kernel_edsp_start:
-  @ input:
-  @   r3      = int         len
-  @   r4      = opus_val16 *_x (must be 32-bit aligned)
-  @   r5      = opus_val16 *_y (must be 32-bit aligned)
-  @   r6...r9 = opus_val32  sum[4]
-  @ output:
-  @   r6...r9 = opus_val32  sum[4]
-  @ preserved: r0-r5
-  @ internal usage
-  @   r2      = int         j
-  @   r12,r14 = opus_val16  x[4]
-  @   r10,r11 = opus_val16  y[4]
-  STMFD        sp!, {r2,r4,r5,lr}
-  LDR          r10, [r5], #4      @ Load y[0...1]
-  SUBS         r2, r3, #4         @ j = len-4
-  LDR          r11, [r5], #4      @ Load y[2...3]
-  BLE xcorr_kernel_edsp_process4_done
-  LDR          r12, [r4], #4      @ Load x[0...1]
-  @ Stall
-xcorr_kernel_edsp_process4:
-  @ The multiplies must issue from pipeline 0, and can't dual-issue with each
-  @ other. Every other instruction here dual-issues with a multiply, and is
-  @ thus "free". There should be no stalls in the body of the loop.
-  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_0,y_0)
-  LDR          r14, [r4], #4      @ Load x[2...3]
-  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x_0,y_1)
-  SUBS         r2, r2, #4         @ j-=4
-  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_0,y_2)
-  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x_0,y_3)
-  SMLATT       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x_1,y_1)
-  LDR          r10, [r5], #4      @ Load y[4...5]
-  SMLATB       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],x_1,y_2)
-  SMLATT       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x_1,y_3)
-  SMLATB       r9, r12, r10, r9   @ sum[3] = MAC16_16(sum[3],x_1,y_4)
-  LDRGT        r12, [r4], #4      @ Load x[0...1]
-  SMLABB       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_2,y_2)
-  SMLABT       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x_2,y_3)
-  SMLABB       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_2,y_4)
-  SMLABT       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x_2,y_5)
-  SMLATT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],x_3,y_3)
-  LDR          r11, [r5], #4      @ Load y[6...7]
-  SMLATB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],x_3,y_4)
-  SMLATT       r8, r14, r10, r8   @ sum[2] = MAC16_16(sum[2],x_3,y_5)
-  SMLATB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],x_3,y_6)
-  BGT xcorr_kernel_edsp_process4
-xcorr_kernel_edsp_process4_done:
-  ADDS         r2, r2, #4
-  BLE xcorr_kernel_edsp_done
-  LDRH         r12, [r4], #2      @ r12 = *x++
-  SUBS         r2, r2, #1         @ j--
-  @ Stall
-  SMLABB       r6, r12, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_0)
-  LDRHGT       r14, [r4], #2      @ r14 = *x++
-  SMLABT       r7, r12, r10, r7   @ sum[1] = MAC16_16(sum[1],x,y_1)
-  SMLABB       r8, r12, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_2)
-  SMLABT       r9, r12, r11, r9   @ sum[3] = MAC16_16(sum[3],x,y_3)
-  BLE xcorr_kernel_edsp_done
-  SMLABT       r6, r14, r10, r6   @ sum[0] = MAC16_16(sum[0],x,y_1)
-  SUBS         r2, r2, #1         @ j--
-  SMLABB       r7, r14, r11, r7   @ sum[1] = MAC16_16(sum[1],x,y_2)
-  LDRH         r10, [r5], #2      @ r10 = y_4 = *y++
-  SMLABT       r8, r14, r11, r8   @ sum[2] = MAC16_16(sum[2],x,y_3)
-  LDRHGT       r12, [r4], #2      @ r12 = *x++
-  SMLABB       r9, r14, r10, r9   @ sum[3] = MAC16_16(sum[3],x,y_4)
-  BLE xcorr_kernel_edsp_done
-  SMLABB       r6, r12, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_2)
-  CMP          r2, #1             @ j--
-  SMLABT       r7, r12, r11, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_3)
-  LDRH         r2, [r5], #2       @ r2 = y_5 = *y++
-  SMLABB       r8, r12, r10, r8   @ sum[2] = MAC16_16(sum[2],tmp,y_4)
-  LDRHGT       r14, [r4]          @ r14 = *x
-  SMLABB       r9, r12, r2, r9    @ sum[3] = MAC16_16(sum[3],tmp,y_5)
-  BLE xcorr_kernel_edsp_done
-  SMLABT       r6, r14, r11, r6   @ sum[0] = MAC16_16(sum[0],tmp,y_3)
-  LDRH         r11, [r5]          @ r11 = y_6 = *y
-  SMLABB       r7, r14, r10, r7   @ sum[1] = MAC16_16(sum[1],tmp,y_4)
-  SMLABB       r8, r14, r2, r8    @ sum[2] = MAC16_16(sum[2],tmp,y_5)
-  SMLABB       r9, r14, r11, r9   @ sum[3] = MAC16_16(sum[3],tmp,y_6)
-xcorr_kernel_edsp_done:
-  LDMFD        sp!, {r2,r4,r5,pc}
-	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  @ ENDP
-
-; celt_pitch_xcorr_edsp: @ PROC
-  @ input:
-  @   r0  = opus_val16 *_x (must be 32-bit aligned)
-  @   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
-  @   r2  = opus_val32 *xcorr
-  @   r3  = int         len
-  @ output:
-  @   r0  = maxcorr
-  @ internal usage
-  @   r4  = opus_val16 *x
-  @   r5  = opus_val16 *y
-  @   r6  = opus_val32  sum0
-  @   r7  = opus_val32  sum1
-  @   r8  = opus_val32  sum2
-  @   r9  = opus_val32  sum3
-  @   r1  = int         max_pitch
-  @   r12 = int         j
-  STMFD        sp!, {r4-r11, lr}
-  MOV          r5, r1
-  LDR          r1, [sp, #36]
-  MOV          r4, r0
-  TST          r5, #3
-  @ maxcorr = 1
-  MOV          r0, #1
-  BEQ          celt_pitch_xcorr_edsp_process1u_done
-@ Compute one sum at the start to make y 32-bit aligned.
-  SUBS         r12, r3, #4
-  @ r14 = sum = 0
-  MOV          r14, #0
-  LDRH         r8, [r5], #2
-  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
-  LDR          r6, [r4], #4
-  MOV          r8, r8, LSL #16
-celt_pitch_xcorr_edsp_process1u_loop4:
-  LDR          r9, [r5], #4
-  SMLABT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
-  LDR          r7, [r4], #4
-  SMLATB       r14, r6, r9, r14     @ sum = MAC16_16(sum, x_1, y_1)
-  LDR          r8, [r5], #4
-  SMLABT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
-  SUBS         r12, r12, #4         @ j-=4
-  SMLATB       r14, r7, r8, r14     @ sum = MAC16_16(sum, x_3, y_3)
-  LDRGT        r6, [r4], #4
-  BGT celt_pitch_xcorr_edsp_process1u_loop4
-  MOV          r8, r8, LSR #16
-celt_pitch_xcorr_edsp_process1u_loop4_done:
-  ADDS         r12, r12, #4
-celt_pitch_xcorr_edsp_process1u_loop1:
-  LDRHGE       r6, [r4], #2
-  @ Stall
-  SMLABBGE     r14, r6, r8, r14    @ sum = MAC16_16(sum, *x, *y)
-  SUBSGE       r12, r12, #1
-  LDRHGT       r8, [r5], #2
-  BGT celt_pitch_xcorr_edsp_process1u_loop1
-  @ Restore _x
-  SUB          r4, r4, r3, LSL #1
-  @ Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
-  @ maxcorr = max(maxcorr, sum)
-  CMP          r0, r14
-  ADD          r5, r5, #2
-  MOVLT        r0, r14
-  SUBS         r1, r1, #1
-  @ xcorr[i] = sum
-  STR          r14, [r2], #4
-  BLE celt_pitch_xcorr_edsp_done
-celt_pitch_xcorr_edsp_process1u_done:
-  @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
-  SUBS         r1, r1, #4
-  BLT celt_pitch_xcorr_edsp_process2
-celt_pitch_xcorr_edsp_process4:
-  @ xcorr_kernel_edsp parameters:
-  @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
-  MOV          r6, #0
-  MOV          r7, #0
-  MOV          r8, #0
-  MOV          r9, #0
-  BL xcorr_kernel_edsp_start  @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
-  @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
-  CMP          r0, r6
-  @ _y+=4
-  ADD          r5, r5, #8
-  MOVLT        r0, r6
-  CMP          r0, r7
-  MOVLT        r0, r7
-  CMP          r0, r8
-  MOVLT        r0, r8
-  CMP          r0, r9
-  MOVLT        r0, r9
-  STMIA        r2!, {r6-r9}
-  SUBS         r1, r1, #4
-  BGE celt_pitch_xcorr_edsp_process4
-celt_pitch_xcorr_edsp_process2:
-  ADDS         r1, r1, #2
-  BLT celt_pitch_xcorr_edsp_process1a
-  SUBS         r12, r3, #4
-  @ {r10, r11} = {sum0, sum1} = {0, 0}
-  MOV          r10, #0
-  MOV          r11, #0
-  LDR          r8, [r5], #4
-  BLE celt_pitch_xcorr_edsp_process2_loop_done
-  LDR          r6, [r4], #4
-  LDR          r9, [r5], #4
-celt_pitch_xcorr_edsp_process2_loop4:
-  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
-  LDR          r7, [r4], #4
-  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
-  SUBS         r12, r12, #4         @ j-=4
-  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
-  LDR          r8, [r5], #4
-  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
-  LDRGT        r6, [r4], #4
-  SMLABB       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_2, y_2)
-  SMLABT       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_2, y_3)
-  SMLATT       r10, r7, r9, r10     @ sum0 = MAC16_16(sum0, x_3, y_3)
-  LDRGT        r9, [r5], #4
-  SMLATB       r11, r7, r8, r11     @ sum1 = MAC16_16(sum1, x_3, y_4)
-  BGT celt_pitch_xcorr_edsp_process2_loop4
-celt_pitch_xcorr_edsp_process2_loop_done:
-  ADDS         r12, r12, #2
-  BLE  celt_pitch_xcorr_edsp_process2_1
-  LDR          r6, [r4], #4
-  @ Stall
-  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
-  LDR          r9, [r5], #4
-  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
-  SUB          r12, r12, #2
-  SMLATT       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_1, y_1)
-  MOV          r8, r9
-  SMLATB       r11, r6, r9, r11     @ sum1 = MAC16_16(sum1, x_1, y_2)
-celt_pitch_xcorr_edsp_process2_1:
-  LDRH         r6, [r4], #2
-  ADDS         r12, r12, #1
-  @ Stall
-  SMLABB       r10, r6, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_0)
-  LDRHGT       r7, [r4], #2
-  SMLABT       r11, r6, r8, r11     @ sum1 = MAC16_16(sum1, x_0, y_1)
-  BLE celt_pitch_xcorr_edsp_process2_done
-  LDRH         r9, [r5], #2
-  SMLABT       r10, r7, r8, r10     @ sum0 = MAC16_16(sum0, x_0, y_1)
-  SMLABB       r11, r7, r9, r11     @ sum1 = MAC16_16(sum1, x_0, y_2)
-celt_pitch_xcorr_edsp_process2_done:
-  @ Restore _x
-  SUB          r4, r4, r3, LSL #1
-  @ Restore and advance _y
-  SUB          r5, r5, r3, LSL #1
-  @ maxcorr = max(maxcorr, sum0)
-  CMP          r0, r10
-  ADD          r5, r5, #2
-  MOVLT        r0, r10
-  SUB          r1, r1, #2
-  @ maxcorr = max(maxcorr, sum1)
-  CMP          r0, r11
-  @ xcorr[i] = sum
-  STR          r10, [r2], #4
-  MOVLT        r0, r11
-  STR          r11, [r2], #4
-celt_pitch_xcorr_edsp_process1a:
-  ADDS         r1, r1, #1
-  BLT celt_pitch_xcorr_edsp_done
-  SUBS         r12, r3, #4
-  @ r14 = sum = 0
-  MOV          r14, #0
-  BLT celt_pitch_xcorr_edsp_process1a_loop_done
-  LDR          r6, [r4], #4
-  LDR          r8, [r5], #4
-  LDR          r7, [r4], #4
-  LDR          r9, [r5], #4
-celt_pitch_xcorr_edsp_process1a_loop4:
-  SMLABB       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
-  SUBS         r12, r12, #4         @ j-=4
-  SMLATT       r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
-  LDRGE        r6, [r4], #4
-  SMLABB       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_2, y_2)
-  LDRGE        r8, [r5], #4
-  SMLATT       r14, r7, r9, r14     @ sum = MAC16_16(sum, x_3, y_3)
-  LDRGE        r7, [r4], #4
-  LDRGE        r9, [r5], #4
-  BGE celt_pitch_xcorr_edsp_process1a_loop4
-celt_pitch_xcorr_edsp_process1a_loop_done:
-  ADDS         r12, r12, #2
-  LDRGE        r6, [r4], #4
-  LDRGE        r8, [r5], #4
-  @ Stall
-  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_0, y_0)
-  SUBGE        r12, r12, #2
-  SMLATTGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, x_1, y_1)
-  ADDS         r12, r12, #1
-  LDRHGE       r6, [r4], #2
-  LDRHGE       r8, [r5], #2
-  @ Stall
-  SMLABBGE     r14, r6, r8, r14     @ sum = MAC16_16(sum, *x, *y)
-  @ maxcorr = max(maxcorr, sum)
-  CMP          r0, r14
-  @ xcorr[i] = sum
-  STR          r14, [r2], #4
-  MOVLT        r0, r14
-celt_pitch_xcorr_edsp_done:
-  LDMFD        sp!, {r4-r11, pc}
-	.size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp  @ ENDP
-
- .endif
-
-@ END:
-    .section	.note.GNU-stack,"",%progbits
diff --git a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s
index f96e0a88bb..6e873afc37 100644
--- a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s
+++ b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s
@@ -153,7 +153,7 @@ xcorr_kernel_neon_process1
   ENDP
 
 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-;  opus_val32 *xcorr, int len, int max_pitch)
+;  opus_val32 *xcorr, int len, int max_pitch, int arch)
 celt_pitch_xcorr_neon PROC
   ; input:
   ;   r0  = opus_val16 *_x
@@ -168,6 +168,8 @@ celt_pitch_xcorr_neon PROC
   ;   r6  = int         max_pitch
   ;   r12 = int         j
   ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  ; ignored:
+  ;         int         arch
   STMFD        sp!, {r4-r6, lr}
   LDR          r6, [sp, #16]
   VMOV.S32     q15, #1
@@ -358,6 +360,8 @@ celt_pitch_xcorr_edsp PROC
   ;   r9  = opus_val32  sum3
   ;   r1  = int         max_pitch
   ;   r12 = int         j
+  ; ignored:
+  ;         int         arch
   STMFD        sp!, {r4-r11, lr}
   MOV          r5, r1
   LDR          r1, [sp, #36]
diff --git a/thirdparty/opus/celt/arm/fft_arm.h b/thirdparty/opus/celt/arm/fft_arm.h
index 0cb55d8e22..0b78175f3a 100644
--- a/thirdparty/opus/celt/arm/fft_arm.h
+++ b/thirdparty/opus/celt/arm/fft_arm.h
@@ -34,7 +34,6 @@
 #if !defined(FFT_ARM_H)
 #define FFT_ARM_H
 
-#include "config.h"
 #include "kiss_fft.h"
 
 #if defined(HAVE_ARM_NE10)
diff --git a/thirdparty/opus/celt/arm/fixed_armv4.h b/thirdparty/opus/celt/arm/fixed_armv4.h
index efb3b1896a..d84888a772 100644
--- a/thirdparty/opus/celt/arm/fixed_armv4.h
+++ b/thirdparty/opus/celt/arm/fixed_armv4.h
@@ -37,7 +37,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
       "#MULT16_32_Q16\n\t"
       "smull %0, %1, %2, %3\n\t"
       : "=&r"(rd_lo), "=&r"(rd_hi)
-      : "%r"(b),"r"(a<<16)
+      : "%r"(b),"r"(SHL32(a,16))
   );
   return rd_hi;
 }
@@ -54,10 +54,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
       "#MULT16_32_Q15\n\t"
       "smull %0, %1, %2, %3\n\t"
       : "=&r"(rd_lo), "=&r"(rd_hi)
-      : "%r"(b), "r"(a<<16)
+      : "%r"(b), "r"(SHL32(a,16))
   );
   /*We intentionally don't OR in the high bit of rd_lo for speed.*/
-  return rd_hi<<1;
+  return SHL32(rd_hi,1);
 }
 #define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
 
diff --git a/thirdparty/opus/celt/arm/fixed_armv5e.h b/thirdparty/opus/celt/arm/fixed_armv5e.h
index 36a6321101..6bf73cbace 100644
--- a/thirdparty/opus/celt/arm/fixed_armv5e.h
+++ b/thirdparty/opus/celt/arm/fixed_armv5e.h
@@ -59,7 +59,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
       : "=r"(res)
       : "r"(b), "r"(a)
   );
-  return res<<1;
+  return SHL32(res,1);
 }
 #define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
 
@@ -76,7 +76,7 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
       "#MAC16_32_Q15\n\t"
       "smlawb %0, %1, %2, %3;\n"
       : "=r"(res)
-      : "r"(b<<1), "r"(a), "r"(c)
+      : "r"(SHL32(b,1)), "r"(a), "r"(c)
   );
   return res;
 }
diff --git a/thirdparty/opus/celt/arm/mdct_arm.h b/thirdparty/opus/celt/arm/mdct_arm.h
index 49cbb44576..14200bac4b 100644
--- a/thirdparty/opus/celt/arm/mdct_arm.h
+++ b/thirdparty/opus/celt/arm/mdct_arm.h
@@ -33,7 +33,6 @@
 #if !defined(MDCT_ARM_H)
 #define MDCT_ARM_H
 
-#include "config.h"
 #include "mdct.h"
 
 #if defined(HAVE_ARM_NE10)
diff --git a/thirdparty/opus/celt/arm/pitch_arm.h b/thirdparty/opus/celt/arm/pitch_arm.h
index 14331169ee..bed8b04eac 100644
--- a/thirdparty/opus/celt/arm/pitch_arm.h
+++ b/thirdparty/opus/celt/arm/pitch_arm.h
@@ -30,11 +30,47 @@
 
 # include "armcpu.h"
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
+        const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+
+#  if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_CELT_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_DUAL_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
+        const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif
+
 # if defined(FIXED_POINT)
 
 #  if defined(OPUS_ARM_MAY_HAVE_NEON)
 opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
-    opus_val32 *xcorr, int len, int max_pitch);
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
 #  endif
 
 #  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
@@ -43,7 +79,7 @@ opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
 
 #  if defined(OPUS_ARM_MAY_HAVE_EDSP)
 opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
-    opus_val32 *xcorr, int len, int max_pitch);
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
 #  endif
 
 #  if defined(OPUS_HAVE_RTCD) && \
@@ -52,18 +88,17 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
      (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
 extern opus_val32
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int, int);
 #   define OVERRIDE_PITCH_XCORR (1)
 #   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+        xcorr, len, max_pitch, arch))
 
 #  elif defined(OPUS_ARM_PRESUME_EDSP) || \
     defined(OPUS_ARM_PRESUME_MEDIA) || \
     defined(OPUS_ARM_PRESUME_NEON)
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
+#   define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))
 
 #  endif
 
@@ -99,25 +134,24 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
 /* Float case */
 #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
-                                 opus_val32 *xcorr, int len, int max_pitch);
+                                 opus_val32 *xcorr, int len, int max_pitch, int arch);
 #endif
 
 #  if defined(OPUS_HAVE_RTCD) && \
     (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
 extern void
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int, int);
 
 #  define OVERRIDE_PITCH_XCORR (1)
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+        xcorr, len, max_pitch, arch))
 
 #  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
 
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-   ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
+#   define celt_pitch_xcorr celt_pitch_xcorr_float_neon
 
 #  endif
 
diff --git a/thirdparty/opus/celt/arm/pitch_neon_intr.c b/thirdparty/opus/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000000..1ac38c433a
--- /dev/null
+++ b/thirdparty/opus/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,290 @@
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "pitch.h"
+
+#ifdef FIXED_POINT
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    int16x8_t x_s16x8, y_s16x8;
+    int32x4_t xy_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy_s64x2;
+    int64x1_t xy_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8  = vld1q_s16(&x[i]);
+        y_s16x8  = vld1q_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
+        i += 4;
+    }
+
+    xy_s64x2 = vpaddlq_s32(xy_s32x4);
+    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
+    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(celt_inner_prod_c(x, y, N) == xy);
+#endif
+
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
+    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
+    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy01_s64x2, xy02_s64x2;
+    int64x1_t xy01_s64x1, xy02_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8    = vld1q_s16(&x[i]);
+        y01_s16x8  = vld1q_s16(&y01[i]);
+        y02_s16x8  = vld1q_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
+        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
+        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
+        i += 4;
+    }
+
+    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
+    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
+    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
+    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
+    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
+    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
+
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(xy1_c == *xy1);
+        celt_assert(xy2_c == *xy2);
+    }
+#endif
+}
+
+#else /* !FIXED_POINT */
+
+/* ========================================================================== */
+
+#ifdef OPUS_CHECK_ASM
+
+/* This part of code simulates floating-point NEON operations. */
+
+/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of celt_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+{
+   int i;
+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+   }
+   xy0 += xy2;
+   xy1 += xy3;
+   xy = xy0 + xy1;
+   for (; i < N; i++) {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
+/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of dual_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
+      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
+      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
+      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
+      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
+      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
+      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
+      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
+   }
+   xy01_0 += xy01_2;
+   xy02_0 += xy02_2;
+   xy01_1 += xy01_3;
+   xy02_1 += xy02_3;
+   xy01 = xy01_0 + xy01_1;
+   xy02 = xy02_0 + xy02_1;
+   for (; i < N; i++) {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+
+#endif /* OPUS_CHECK_ASM */
+
+/* ========================================================================== */
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    float32x4_t xy_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y_f32x4;
+        x_f32x4  = vld1q_f32(&x[i]);
+        y_f32x4  = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        x_f32x4  = vld1q_f32(&x[i + 4]);
+        y_f32x4  = vld1q_f32(&y[i + 4]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        i += 4;
+    }
+
+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+    xy       = vget_lane_f32(xy_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+#endif
+
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy01_f32x2, xy02_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+        x_f32x4    = vld1q_f32(&x[i]);
+        y01_f32x4  = vld1q_f32(&y01[i]);
+        y02_f32x4  = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        x_f32x4    = vld1q_f32(&x[i + 4]);
+        y01_f32x4  = vld1q_f32(&y01[i + 4]);
+        y02_f32x4  = vld1q_f32(&y02[i + 4]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        i += 4;
+    }
+
+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+    xy01       = vget_lane_f32(xy01_f32x2, 0);
+    xy02       = vget_lane_f32(xy02_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
+        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+    }
+#endif
+}
+
+#endif /* FIXED_POINT */
-- 
cgit v1.2.3