14 files changed, 419 insertions, 136 deletions
diff --git a/thirdparty/opus/celt/arm/arm2gnu.pl b/thirdparty/opus/celt/arm/arm2gnu.pl
index 6c922ac819..a2895f7445 100755
--- a/thirdparty/opus/celt/arm/arm2gnu.pl
+++ b/thirdparty/opus/celt/arm/arm2gnu.pl
@@ -164,11 +164,11 @@ while (<>) {
         $prefix = "";
         if ($proc)
         {
-            $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc) unless ($apple);
+            $prefix = $prefix.sprintf("\t.type\t%s, %%function", $proc) unless ($apple);
             # Make sure we $prefix isn't empty here (for the $apple case).
             # We handle mangling the label here, make sure it doesn't match
             # the label handling below (if $prefix would be empty).
-            $prefix = "; ";
+            $prefix = $prefix."; ";
             push(@proc_stack, $proc);
             s/^[A-Za-z_\.]\w+/$symprefix$&:/;
         }
diff --git a/thirdparty/opus/celt/arm/arm_celt_map.c b/thirdparty/opus/celt/arm/arm_celt_map.c
index 4d4d069a86..ca988b66f5 100644
--- a/thirdparty/opus/celt/arm/arm_celt_map.c
+++ b/thirdparty/opus/celt/arm/arm_celt_map.c
@@ -35,12 +35,29 @@
 
 #if defined(OPUS_HAVE_RTCD)
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+  celt_inner_prod_c,   /* ARMv4 */
+  celt_inner_prod_c,   /* EDSP */
+  celt_inner_prod_c,   /* Media */
+  celt_inner_prod_neon /* NEON */
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2) = {
+  dual_inner_prod_c,   /* ARMv4 */
+  dual_inner_prod_c,   /* EDSP */
+  dual_inner_prod_c,   /* Media */
+  dual_inner_prod_neon /* NEON */
+};
+# endif
+
 # if defined(FIXED_POINT)
 #  if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
     (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
     (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
 opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-    const opus_val16 *, opus_val32 *, int , int) = {
+    const opus_val16 *, opus_val32 *, int, int, int) = {
   celt_pitch_xcorr_c,               /* ARMv4 */
   MAY_HAVE_EDSP(celt_pitch_xcorr),  /* EDSP */
   MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */
@@ -51,7 +68,7 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
 # else /* !FIXED_POINT */
 #  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
 void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-    const opus_val16 *, opus_val32 *, int, int) = {
+    const opus_val16 *, opus_val32 *, int, int, int) = {
   celt_pitch_xcorr_c,              /* ARMv4 */
   celt_pitch_xcorr_c,              /* EDSP */
   celt_pitch_xcorr_c,              /* Media */
diff --git a/thirdparty/opus/celt/arm/armopts.s b/thirdparty/opus/celt/arm/armopts.s
new file mode 100644
index 0000000000..fb9196072a
--- /dev/null
+++ b/thirdparty/opus/celt/arm/armopts.s
@@ -0,0 +1,37 @@
+/* Copyright (C) 2013 Mozilla Corporation */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+; Set the following to 1 if we have EDSP instructions
+;  (LDRD/STRD, etc., ARMv5E and later).
+OPUS_ARM_MAY_HAVE_EDSP  * 
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OPUS_ARM_MAY_HAVE_MEDIA * 
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OPUS_ARM_MAY_HAVE_NEON  * 
+
+END
diff --git a/thirdparty/opus/celt/arm/celt_ne10_fft.c b/thirdparty/opus/celt/arm/celt_fft_ne10.c
index 42d96a7117..ea5fd7808b 100644
--- a/thirdparty/opus/celt/arm/celt_ne10_fft.c
+++ b/thirdparty/opus/celt/arm/celt_fft_ne10.c
@@ -1,7 +1,7 @@
 /* Copyright (c) 2015 Xiph.Org Foundation
    Written by Viswanath Puttagunta */
 /**
-   @file celt_ne10_fft.c
+   @file celt_fft_ne10.c
    @brief ARM Neon optimizations for fft using NE10 library
  */
 
@@ -36,7 +36,6 @@
 #endif
 #endif
 
-#include <NE10_init.h>
 #include <NE10_dsp.h>
 #include "os_support.h"
 #include "kiss_fft.h"
diff --git a/thirdparty/opus/celt/arm/celt_ne10_mdct.c b/thirdparty/opus/celt/arm/celt_mdct_ne10.c
index 293c3efd7a..3531d02d10 100644
--- a/thirdparty/opus/celt/arm/celt_ne10_mdct.c
+++ b/thirdparty/opus/celt/arm/celt_mdct_ne10.c
@@ -1,7 +1,7 @@
 /* Copyright (c) 2015 Xiph.Org Foundation
    Written by Viswanath Puttagunta */
 /**
-   @file celt_ne10_mdct.c
+   @file celt_mdct_ne10.c
    @brief ARM Neon optimizations for mdct using NE10 library
  */
 
diff --git a/thirdparty/opus/celt/arm/celt_neon_intr.c b/thirdparty/opus/celt/arm/celt_neon_intr.c
index 47bbe3dc22..effda769d0 100644
--- a/thirdparty/opus/celt/arm/celt_neon_intr.c
+++ b/thirdparty/opus/celt/arm/celt_neon_intr.c
@@ -191,121 +191,21 @@ static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
    vst1q_f32(sum, SUMM);
 }
 
-/*
- * Function: xcorr_kernel_neon_float_process1
- * ---------------------------------
- * Computes single correlation values and stores in *sum
- */
-static void xcorr_kernel_neon_float_process1(const float32_t *x,
-      const float32_t *y, float32_t *sum, int len) {
-   float32x4_t XX[4];
-   float32x4_t YY[4];
-   float32x2_t XX_2;
-   float32x2_t YY_2;
-   float32x4_t SUMM;
-   float32x2_t SUMM_2[2];
-   const float32_t *xi = x;
-   const float32_t *yi = y;
-
-   SUMM = vdupq_n_f32(0);
-
-   /* Work on 16 values per iteration */
-   while (len >= 16) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-      XX[2] = vld1q_f32(xi);
-      xi += 4;
-      XX[3] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-      YY[2] = vld1q_f32(yi);
-      yi += 4;
-      YY[3] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
-      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
-      len -= 16;
-   }
-
-   /* Work on 8 values */
-   if (len >= 8) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      len -= 8;
-   }
-
-   /* Work on 4 values */
-   if (len >= 4) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      len -= 4;
-   }
-
-   /* Start accumulating results */
-   SUMM_2[0] = vget_low_f32(SUMM);
-   if (len >= 2) {
-      /* While at it, consume 2 more values if available */
-      XX_2 = vld1_f32(xi);
-      xi += 2;
-      YY_2 = vld1_f32(yi);
-      yi += 2;
-      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
-      len -= 2;
-   }
-   SUMM_2[1] = vget_high_f32(SUMM);
-   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
-   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
-   /* Ok, now we have result accumulated in SUMM_2[0].0 */
-
-   if (len > 0) {
-      /* Case when you have one value left */
-      XX_2 = vld1_dup_f32(xi);
-      YY_2 = vld1_dup_f32(yi);
-      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
-   }
-
-   vst1_lane_f32(sum, SUMM_2[0], 0);
-}
-
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
-                        opus_val32 *xcorr, int len, int max_pitch) {
+                        opus_val32 *xcorr, int len, int max_pitch, int arch) {
    int i;
+   (void)arch;
    celt_assert(max_pitch > 0);
-   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+   celt_sig_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
 
    for (i = 0; i < (max_pitch-3); i += 4) {
       xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
             (float32_t *)xcorr+i, len);
    }
 
-   /* In case max_pitch isn't multiple of 4
-    * compute single correlation value per iteration
-    */
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    for (; i < max_pitch; i++) {
-      xcorr_kernel_neon_float_process1((const float32_t *)_x,
-            (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
+      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
    }
 }
 #endif
diff --git a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
index 5b2ee55a10..10668e54a5 100644
--- a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
+++ b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
@@ -44,7 +44,7 @@
  .if OPUS_ARM_MAY_HAVE_NEON
 
 @ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
-; xcorr_kernel_neon: @ PROC
+	.type	xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
 xcorr_kernel_neon_start:
   @ input:
   @   r3     = int         len
@@ -156,8 +156,8 @@ xcorr_kernel_neon_process1:
 	.size xcorr_kernel_neon, .-xcorr_kernel_neon  @ ENDP
 
 @ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-@  opus_val32 *xcorr, int len, int max_pitch)
-; celt_pitch_xcorr_neon: @ PROC
+@  opus_val32 *xcorr, int len, int max_pitch, int arch)
+	.type	celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
   @ input:
   @   r0  = opus_val16 *_x
   @   r1  = opus_val16 *_y
@@ -171,6 +171,8 @@ xcorr_kernel_neon_process1:
   @   r6  = int         max_pitch
   @   r12 = int         j
   @   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  @ ignored:
+  @         int         arch
   STMFD        sp!, {r4-r6, lr}
   LDR          r6, [sp, #16]
   VMOV.S32     q15, #1
@@ -260,7 +262,7 @@ celt_pitch_xcorr_neon_done:
 
 @ This will get used on ARMv7 devices without NEON, so it has been optimized
 @ to take advantage of dual-issuing where possible.
-; xcorr_kernel_edsp: @ PROC
+	.type	xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
 xcorr_kernel_edsp_start:
   @ input:
   @   r3      = int         len
@@ -344,7 +346,7 @@ xcorr_kernel_edsp_done:
   LDMFD        sp!, {r2,r4,r5,pc}
 	.size xcorr_kernel_edsp, .-xcorr_kernel_edsp  @ ENDP
 
-; celt_pitch_xcorr_edsp: @ PROC
+	.type	celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
   @ input:
   @   r0  = opus_val16 *_x (must be 32-bit aligned)
   @   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
@@ -361,6 +363,8 @@ xcorr_kernel_edsp_done:
   @   r9  = opus_val32  sum3
   @   r1  = int         max_pitch
   @   r12 = int         j
+  @ ignored:
+  @         int         arch
   STMFD        sp!, {r4-r11, lr}
   MOV          r5, r1
   LDR          r1, [sp, #36]
diff --git a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s
index f96e0a88bb..6e873afc37 100644
--- a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s
+++ b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm.s
@@ -153,7 +153,7 @@ xcorr_kernel_neon_process1
   ENDP
 
 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-;  opus_val32 *xcorr, int len, int max_pitch)
+;  opus_val32 *xcorr, int len, int max_pitch, int arch)
 celt_pitch_xcorr_neon PROC
   ; input:
   ;   r0  = opus_val16 *_x
@@ -168,6 +168,8 @@ celt_pitch_xcorr_neon PROC
   ;   r6  = int         max_pitch
   ;   r12 = int         j
   ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
+  ; ignored:
+  ;         int         arch
   STMFD        sp!, {r4-r6, lr}
   LDR          r6, [sp, #16]
   VMOV.S32     q15, #1
@@ -358,6 +360,8 @@ celt_pitch_xcorr_edsp PROC
   ;   r9  = opus_val32  sum3
   ;   r1  = int         max_pitch
   ;   r12 = int         j
+  ; ignored:
+  ;         int         arch
   STMFD        sp!, {r4-r11, lr}
   MOV          r5, r1
   LDR          r1, [sp, #36]
diff --git a/thirdparty/opus/celt/arm/fft_arm.h b/thirdparty/opus/celt/arm/fft_arm.h
index 0cb55d8e22..0b78175f3a 100644
--- a/thirdparty/opus/celt/arm/fft_arm.h
+++ b/thirdparty/opus/celt/arm/fft_arm.h
@@ -34,7 +34,6 @@
 #if !defined(FFT_ARM_H)
 #define FFT_ARM_H
 
-#include "config.h"
 #include "kiss_fft.h"
 
 #if defined(HAVE_ARM_NE10)
diff --git a/thirdparty/opus/celt/arm/fixed_armv4.h b/thirdparty/opus/celt/arm/fixed_armv4.h
index efb3b1896a..d84888a772 100644
--- a/thirdparty/opus/celt/arm/fixed_armv4.h
+++ b/thirdparty/opus/celt/arm/fixed_armv4.h
@@ -37,7 +37,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q16_armv4(opus_val16 a, opus_val32 b)
       "#MULT16_32_Q16\n\t"
       "smull %0, %1, %2, %3\n\t"
       : "=&r"(rd_lo), "=&r"(rd_hi)
-      : "%r"(b),"r"(a<<16)
+      : "%r"(b),"r"(SHL32(a,16))
   );
   return rd_hi;
 }
@@ -54,10 +54,10 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv4(opus_val16 a, opus_val32 b)
       "#MULT16_32_Q15\n\t"
       "smull %0, %1, %2, %3\n\t"
       : "=&r"(rd_lo), "=&r"(rd_hi)
-      : "%r"(b), "r"(a<<16)
+      : "%r"(b), "r"(SHL32(a,16))
   );
   /*We intentionally don't OR in the high bit of rd_lo for speed.*/
-  return rd_hi<<1;
+  return SHL32(rd_hi,1);
 }
 #define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv4(a, b))
 
diff --git a/thirdparty/opus/celt/arm/fixed_armv5e.h b/thirdparty/opus/celt/arm/fixed_armv5e.h
index 36a6321101..6bf73cbace 100644
--- a/thirdparty/opus/celt/arm/fixed_armv5e.h
+++ b/thirdparty/opus/celt/arm/fixed_armv5e.h
@@ -59,7 +59,7 @@ static OPUS_INLINE opus_val32 MULT16_32_Q15_armv5e(opus_val16 a, opus_val32 b)
       : "=r"(res)
       : "r"(b), "r"(a)
   );
-  return res<<1;
+  return SHL32(res,1);
 }
 #define MULT16_32_Q15(a, b) (MULT16_32_Q15_armv5e(a, b))
 
@@ -76,7 +76,7 @@ static OPUS_INLINE opus_val32 MAC16_32_Q15_armv5e(opus_val32 c, opus_val16 a,
       "#MAC16_32_Q15\n\t"
       "smlawb %0, %1, %2, %3;\n"
       : "=r"(res)
-      : "r"(b<<1), "r"(a), "r"(c)
+      : "r"(SHL32(b,1)), "r"(a), "r"(c)
   );
   return res;
 }
diff --git a/thirdparty/opus/celt/arm/mdct_arm.h b/thirdparty/opus/celt/arm/mdct_arm.h
index 49cbb44576..14200bac4b 100644
--- a/thirdparty/opus/celt/arm/mdct_arm.h
+++ b/thirdparty/opus/celt/arm/mdct_arm.h
@@ -33,7 +33,6 @@
 #if !defined(MDCT_ARM_H)
 #define MDCT_ARM_H
 
-#include "config.h"
 #include "mdct.h"
 
 #if defined(HAVE_ARM_NE10)
diff --git a/thirdparty/opus/celt/arm/pitch_arm.h b/thirdparty/opus/celt/arm/pitch_arm.h
index 14331169ee..bed8b04eac 100644
--- a/thirdparty/opus/celt/arm/pitch_arm.h
+++ b/thirdparty/opus/celt/arm/pitch_arm.h
@@ -30,11 +30,47 @@
 
 # include "armcpu.h"
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
+        const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+
+#  if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON)
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_CELT_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
+#  endif
+# endif
+
+# if !defined(OVERRIDE_DUAL_INNER_PROD)
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
+        const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif
+
 # if defined(FIXED_POINT)
 
 #  if defined(OPUS_ARM_MAY_HAVE_NEON)
 opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
-    opus_val32 *xcorr, int len, int max_pitch);
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
 #  endif
 
 #  if defined(OPUS_ARM_MAY_HAVE_MEDIA)
@@ -43,7 +79,7 @@ opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
 
 #  if defined(OPUS_ARM_MAY_HAVE_EDSP)
 opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
-    opus_val32 *xcorr, int len, int max_pitch);
+    opus_val32 *xcorr, int len, int max_pitch, int arch);
 #  endif
 
 #  if defined(OPUS_HAVE_RTCD) && \
@@ -52,18 +88,17 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
      (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
 extern opus_val32
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int, int);
 #   define OVERRIDE_PITCH_XCORR (1)
 #   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+        xcorr, len, max_pitch, arch))
 
 #  elif defined(OPUS_ARM_PRESUME_EDSP) || \
     defined(OPUS_ARM_PRESUME_MEDIA) || \
     defined(OPUS_ARM_PRESUME_NEON)
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-  ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
+#   define celt_pitch_xcorr (PRESUME_NEON(celt_pitch_xcorr))
 
 #  endif
 
@@ -99,25 +134,24 @@ extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
 /* Float case */
 #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
-                                 opus_val32 *xcorr, int len, int max_pitch);
+                                 opus_val32 *xcorr, int len, int max_pitch, int arch);
 #endif
 
 #  if defined(OPUS_HAVE_RTCD) && \
     (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
 extern void
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int, int);
 
 #  define OVERRIDE_PITCH_XCORR (1)
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+        xcorr, len, max_pitch, arch))
 
 #  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
 
 #   define OVERRIDE_PITCH_XCORR (1)
-#   define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
-   ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, len, max_pitch))
+#   define celt_pitch_xcorr celt_pitch_xcorr_float_neon
 
 #  endif
 
diff --git a/thirdparty/opus/celt/arm/pitch_neon_intr.c b/thirdparty/opus/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000000..1ac38c433a
--- /dev/null
+++ b/thirdparty/opus/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,290 @@
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "pitch.h"
+
+#ifdef FIXED_POINT
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    int16x8_t x_s16x8, y_s16x8;
+    int32x4_t xy_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy_s64x2;
+    int64x1_t xy_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8  = vld1q_s16(&x[i]);
+        y_s16x8  = vld1q_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
+        i += 4;
+    }
+
+    xy_s64x2 = vpaddlq_s32(xy_s32x4);
+    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
+    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0);
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(celt_inner_prod_c(x, y, N) == xy);
+#endif
+
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
+    int32x4_t xy01_s32x4 = vdupq_n_s32(0);
+    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy01_s64x2, xy02_s64x2;
+    int64x1_t xy01_s64x1, xy02_s64x1;
+
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8    = vld1q_s16(&x[i]);
+        y01_s16x8  = vld1q_s16(&y01[i]);
+        y02_s16x8  = vld1q_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
+    }
+
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
+        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
+        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
+        i += 4;
+    }
+
+    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
+    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
+    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
+    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
+    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0);
+    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
+
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_c(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(xy1_c == *xy1);
+        celt_assert(xy2_c == *xy2);
+    }
+#endif
+}
+
+#else /* !FIXED_POINT */
+
+/* ========================================================================== */
+
+#ifdef OPUS_CHECK_ASM
+
+/* This part of code simulates floating-point NEON operations. */
+
+/* celt_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of celt_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static opus_val32 celt_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y, int N)
+{
+   int i;
+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+   }
+   xy0 += xy2;
+   xy1 += xy3;
+   xy = xy0 + xy1;
+   for (; i < N; i++) {
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
+/* dual_inner_prod_neon_float_c_simulation() simulates the floating-point   */
+/* operations of dual_inner_prod_neon(), and both functions should have bit */
+/* exact output.                                                            */
+static void dual_inner_prod_neon_float_c_simulation(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
+   for (i = 0; i < N - 3; i += 4) {
+      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
+      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
+      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
+      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
+      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
+      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
+      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
+      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
+   }
+   xy01_0 += xy01_2;
+   xy02_0 += xy02_2;
+   xy01_1 += xy01_3;
+   xy02_1 += xy02_3;
+   xy01 = xy01_0 + xy01_1;
+   xy02 = xy02_0 + xy02_1;
+   for (; i < N; i++) {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+
+#endif /* OPUS_CHECK_ASM */
+
+/* ========================================================================== */
+
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+    float32x4_t xy_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y_f32x4;
+        x_f32x4  = vld1q_f32(&x[i]);
+        y_f32x4  = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        x_f32x4  = vld1q_f32(&x[i + 4]);
+        y_f32x4  = vld1q_f32(&y[i + 4]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        i += 4;
+    }
+
+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+    xy       = vget_lane_f32(xy_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+
+#ifdef OPUS_CHECK_ASM
+    celt_assert(ABS32(celt_inner_prod_neon_float_c_simulation(x, y, N) - xy) <= VERY_SMALL);
+#endif
+
+    return xy;
+}
+
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+    float32x4_t xy01_f32x4 = vdupq_n_f32(0);
+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy01_f32x2, xy02_f32x2;
+
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+        x_f32x4    = vld1q_f32(&x[i]);
+        y01_f32x4  = vld1q_f32(&y01[i]);
+        y02_f32x4  = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        x_f32x4    = vld1q_f32(&x[i + 4]);
+        y01_f32x4  = vld1q_f32(&y01[i + 4]);
+        y02_f32x4  = vld1q_f32(&y02[i + 4]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+    }
+
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        i += 4;
+    }
+
+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+    xy01       = vget_lane_f32(xy01_f32x2, 0);
+    xy02       = vget_lane_f32(xy02_f32x2, 0);
+
+    for (; i < N; i++) {
+        xy01 = MAC16_16(xy01, x[i], y01[i]);
+        xy02 = MAC16_16(xy02, x[i], y02[i]);
+    }
+    *xy1 = xy01;
+    *xy2 = xy02;
+
+#ifdef OPUS_CHECK_ASM
+    {
+        opus_val32 xy1_c, xy2_c;
+        dual_inner_prod_neon_float_c_simulation(x, y01, y02, N, &xy1_c, &xy2_c);
+        celt_assert(ABS32(xy1_c - *xy1) <= VERY_SMALL);
+        celt_assert(ABS32(xy2_c - *xy2) <= VERY_SMALL);
+    }
+#endif
+}
+
+#endif /* FIXED_POINT */