summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S555
-rw-r--r--thirdparty/opus/celt/fixed_c5x.h79
-rw-r--r--thirdparty/opus/celt/fixed_c6x.h (renamed from thirdparty/opus/celt/arm/armopts.s)51
-rw-r--r--thirdparty/opus/config.h17
-rw-r--r--thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h184
-rw-r--r--thirdparty/opus/silk/fixed/x86/prefilter_FIX_sse.c160
6 files changed, 478 insertions, 568 deletions
diff --git a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S b/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
deleted file mode 100644
index 10668e54a5..0000000000
--- a/thirdparty/opus/celt/arm/celt_pitch_xcorr_arm-gnu.S
+++ /dev/null
@@ -1,555 +0,0 @@
- .syntax unified
-@ Copyright (c) 2007-2008 CSIRO
-@ Copyright (c) 2007-2009 Xiph.Org Foundation
-@ Copyright (c) 2013 Parrot
-@ Written by Aurélien Zanelli
-@
-@ Redistribution and use in source and binary forms, with or without
-@ modification, are permitted provided that the following conditions
-@ are met:
-@
-@ - Redistributions of source code must retain the above copyright
-@ notice, this list of conditions and the following disclaimer.
-@
-@ - Redistributions in binary form must reproduce the above copyright
-@ notice, this list of conditions and the following disclaimer in the
-@ documentation and/or other materials provided with the distribution.
-@
-@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-@ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
-@ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-@ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-@ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- .text; .p2align 2; .arch armv7-a
- .fpu neon
- .object_arch armv4t
-
- .include "celt/arm/armopts-gnu.S"
-
- .if OPUS_ARM_MAY_HAVE_EDSP
- .global celt_pitch_xcorr_edsp
- .endif
-
- .if OPUS_ARM_MAY_HAVE_NEON
- .global celt_pitch_xcorr_neon
- .endif
-
- .if OPUS_ARM_MAY_HAVE_NEON
-
-@ Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
- .type xcorr_kernel_neon, %function; xcorr_kernel_neon: @ PROC
-xcorr_kernel_neon_start:
- @ input:
- @ r3 = int len
- @ r4 = opus_val16 *x
- @ r5 = opus_val16 *y
- @ q0 = opus_val32 sum[4]
- @ output:
- @ q0 = opus_val32 sum[4]
- @ preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
- @ internal usage:
- @ r12 = int j
- @ d3 = y_3|y_2|y_1|y_0
- @ q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
- @ q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
- @ q8 = scratch
- @
- @ Load y[0...3]
- @ This requires len>0 to always be valid (which we assert in the C code).
- VLD1.16 {d5}, [r5]!
- SUBS r12, r3, #8
- BLE xcorr_kernel_neon_process4
-@ Process 8 samples at a time.
-@ This loop loads one y value more than we actually need. Therefore we have to
-@ stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
-@ reading past the end of the array.
-xcorr_kernel_neon_process8:
- @ This loop has 19 total instructions (10 cycles to issue, minimum), with
- @ - 2 cycles of ARM instructions,
- @ - 10 cycles of load/store/byte permute instructions, and
- @ - 9 cycles of data processing instructions.
- @ On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
- @ latter two categories, meaning the whole loop should run in 10 cycles per
- @ iteration, barring cache misses.
- @
- @ Load x[0...7]
- VLD1.16 {d6, d7}, [r4]!
- @ Unlike VMOV, VAND is a data processing instruction (and doesn't get
- @ assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
- VAND d3, d5, d5
- SUBS r12, r12, #8
- @ Load y[4...11]
- VLD1.16 {d4, d5}, [r5]!
- VMLAL.S16 q0, d3, d6[0]
- VEXT.16 d16, d3, d4, #1
- VMLAL.S16 q0, d4, d7[0]
- VEXT.16 d17, d4, d5, #1
- VMLAL.S16 q0, d16, d6[1]
- VEXT.16 d16, d3, d4, #2
- VMLAL.S16 q0, d17, d7[1]
- VEXT.16 d17, d4, d5, #2
- VMLAL.S16 q0, d16, d6[2]
- VEXT.16 d16, d3, d4, #3
- VMLAL.S16 q0, d17, d7[2]
- VEXT.16 d17, d4, d5, #3
- VMLAL.S16 q0, d16, d6[3]
- VMLAL.S16 q0, d17, d7[3]
- BGT xcorr_kernel_neon_process8
-@ Process 4 samples here if we have > 4 left (still reading one extra y value).
-xcorr_kernel_neon_process4:
- ADDS r12, r12, #4
- BLE xcorr_kernel_neon_process2
- @ Load x[0...3]
- VLD1.16 d6, [r4]!
- @ Use VAND since it's a data processing instruction again.
- VAND d4, d5, d5
- SUB r12, r12, #4
- @ Load y[4...7]
- VLD1.16 d5, [r5]!
- VMLAL.S16 q0, d4, d6[0]
- VEXT.16 d16, d4, d5, #1
- VMLAL.S16 q0, d16, d6[1]
- VEXT.16 d16, d4, d5, #2
- VMLAL.S16 q0, d16, d6[2]
- VEXT.16 d16, d4, d5, #3
- VMLAL.S16 q0, d16, d6[3]
-@ Process 2 samples here if we have > 2 left (still reading one extra y value).
-xcorr_kernel_neon_process2:
- ADDS r12, r12, #2
- BLE xcorr_kernel_neon_process1
- @ Load x[0...1]
- VLD2.16 {d6[],d7[]}, [r4]!
- @ Use VAND since it's a data processing instruction again.
- VAND d4, d5, d5
- SUB r12, r12, #2
- @ Load y[4...5]
- VLD1.32 {d5[]}, [r5]!
- VMLAL.S16 q0, d4, d6
- VEXT.16 d16, d4, d5, #1
- @ Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
- @ instead of VEXT, since it's a data-processing instruction.
- VSRI.64 d5, d4, #32
- VMLAL.S16 q0, d16, d7
-@ Process 1 sample using the extra y value we loaded above.
-xcorr_kernel_neon_process1:
- @ Load next *x
- VLD1.16 {d6[]}, [r4]!
- ADDS r12, r12, #1
- @ y[0...3] are left in d5 from prior iteration(s) (if any)
- VMLAL.S16 q0, d5, d6
- MOVLE pc, lr
-@ Now process 1 last sample, not reading ahead.
- @ Load last *y
- VLD1.16 {d4[]}, [r5]!
- VSRI.64 d4, d5, #16
- @ Load last *x
- VLD1.16 {d6[]}, [r4]!
- VMLAL.S16 q0, d4, d6
- MOV pc, lr
- .size xcorr_kernel_neon, .-xcorr_kernel_neon @ ENDP
-
-@ opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
-@ opus_val32 *xcorr, int len, int max_pitch, int arch)
- .type celt_pitch_xcorr_neon, %function; celt_pitch_xcorr_neon: @ PROC
- @ input:
- @ r0 = opus_val16 *_x
- @ r1 = opus_val16 *_y
- @ r2 = opus_val32 *xcorr
- @ r3 = int len
- @ output:
- @ r0 = int maxcorr
- @ internal usage:
- @ r4 = opus_val16 *x (for xcorr_kernel_neon())
- @ r5 = opus_val16 *y (for xcorr_kernel_neon())
- @ r6 = int max_pitch
- @ r12 = int j
- @ q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
- @ ignored:
- @ int arch
- STMFD sp!, {r4-r6, lr}
- LDR r6, [sp, #16]
- VMOV.S32 q15, #1
- @ if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
- SUBS r6, r6, #4
- BLT celt_pitch_xcorr_neon_process4_done
-celt_pitch_xcorr_neon_process4:
- @ xcorr_kernel_neon parameters:
- @ r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
- MOV r4, r0
- MOV r5, r1
- VEOR q0, q0, q0
- @ xcorr_kernel_neon only modifies r4, r5, r12, q0, d3, q2, q3, and q8.
- @ So we don't save/restore any other registers.
- BL xcorr_kernel_neon_start
- SUBS r6, r6, #4
- VST1.32 {q0}, [r2]!
- @ _y += 4
- ADD r1, r1, #8
- VMAX.S32 q15, q15, q0
- @ if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4
- BGE celt_pitch_xcorr_neon_process4
-@ We have less than 4 sums left to compute.
-celt_pitch_xcorr_neon_process4_done:
- ADDS r6, r6, #4
- @ Reduce maxcorr to a single value
- VMAX.S32 d30, d30, d31
- VPMAX.S32 d30, d30, d30
- @ if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
- BLE celt_pitch_xcorr_neon_done
-@ Now compute each remaining sum one at a time.
-celt_pitch_xcorr_neon_process_remaining:
- MOV r4, r0
- MOV r5, r1
- VMOV.I32 q0, #0
- SUBS r12, r3, #8
- BLT celt_pitch_xcorr_neon_process_remaining4
-@ Sum terms 8 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop8:
- @ Load x[0...7]
- VLD1.16 {q1}, [r4]!
- @ Load y[0...7]
- VLD1.16 {q2}, [r5]!
- SUBS r12, r12, #8
- VMLAL.S16 q0, d4, d2
- VMLAL.S16 q0, d5, d3
- BGE celt_pitch_xcorr_neon_process_remaining_loop8
-@ Sum terms 4 at a time.
-celt_pitch_xcorr_neon_process_remaining4:
- ADDS r12, r12, #4
- BLT celt_pitch_xcorr_neon_process_remaining4_done
- @ Load x[0...3]
- VLD1.16 {d2}, [r4]!
- @ Load y[0...3]
- VLD1.16 {d3}, [r5]!
- SUB r12, r12, #4
- VMLAL.S16 q0, d3, d2
-celt_pitch_xcorr_neon_process_remaining4_done:
- @ Reduce the sum to a single value.
- VADD.S32 d0, d0, d1
- VPADDL.S32 d0, d0
- ADDS r12, r12, #4
- BLE celt_pitch_xcorr_neon_process_remaining_loop_done
-@ Sum terms 1 at a time.
-celt_pitch_xcorr_neon_process_remaining_loop1:
- VLD1.16 {d2[]}, [r4]!
- VLD1.16 {d3[]}, [r5]!
- SUBS r12, r12, #1
- VMLAL.S16 q0, d2, d3
- BGT celt_pitch_xcorr_neon_process_remaining_loop1
-celt_pitch_xcorr_neon_process_remaining_loop_done:
- VST1.32 {d0[0]}, [r2]!
- VMAX.S32 d30, d30, d0
- SUBS r6, r6, #1
- @ _y++
- ADD r1, r1, #2
- @ if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
- BGT celt_pitch_xcorr_neon_process_remaining
-celt_pitch_xcorr_neon_done:
- VMOV.32 r0, d30[0]
- LDMFD sp!, {r4-r6, pc}
- .size celt_pitch_xcorr_neon, .-celt_pitch_xcorr_neon @ ENDP
-
- .endif
-
- .if OPUS_ARM_MAY_HAVE_EDSP
-
-@ This will get used on ARMv7 devices without NEON, so it has been optimized
-@ to take advantage of dual-issuing where possible.
- .type xcorr_kernel_edsp, %function; xcorr_kernel_edsp: @ PROC
-xcorr_kernel_edsp_start:
- @ input:
- @ r3 = int len
- @ r4 = opus_val16 *_x (must be 32-bit aligned)
- @ r5 = opus_val16 *_y (must be 32-bit aligned)
- @ r6...r9 = opus_val32 sum[4]
- @ output:
- @ r6...r9 = opus_val32 sum[4]
- @ preserved: r0-r5
- @ internal usage
- @ r2 = int j
- @ r12,r14 = opus_val16 x[4]
- @ r10,r11 = opus_val16 y[4]
- STMFD sp!, {r2,r4,r5,lr}
- LDR r10, [r5], #4 @ Load y[0...1]
- SUBS r2, r3, #4 @ j = len-4
- LDR r11, [r5], #4 @ Load y[2...3]
- BLE xcorr_kernel_edsp_process4_done
- LDR r12, [r4], #4 @ Load x[0...1]
- @ Stall
-xcorr_kernel_edsp_process4:
- @ The multiplies must issue from pipeline 0, and can't dual-issue with each
- @ other. Every other instruction here dual-issues with a multiply, and is
- @ thus "free". There should be no stalls in the body of the loop.
- SMLABB r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x_0,y_0)
- LDR r14, [r4], #4 @ Load x[2...3]
- SMLABT r7, r12, r10, r7 @ sum[1] = MAC16_16(sum[1],x_0,y_1)
- SUBS r2, r2, #4 @ j-=4
- SMLABB r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x_0,y_2)
- SMLABT r9, r12, r11, r9 @ sum[3] = MAC16_16(sum[3],x_0,y_3)
- SMLATT r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x_1,y_1)
- LDR r10, [r5], #4 @ Load y[4...5]
- SMLATB r7, r12, r11, r7 @ sum[1] = MAC16_16(sum[1],x_1,y_2)
- SMLATT r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x_1,y_3)
- SMLATB r9, r12, r10, r9 @ sum[3] = MAC16_16(sum[3],x_1,y_4)
- LDRGT r12, [r4], #4 @ Load x[0...1]
- SMLABB r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],x_2,y_2)
- SMLABT r7, r14, r11, r7 @ sum[1] = MAC16_16(sum[1],x_2,y_3)
- SMLABB r8, r14, r10, r8 @ sum[2] = MAC16_16(sum[2],x_2,y_4)
- SMLABT r9, r14, r10, r9 @ sum[3] = MAC16_16(sum[3],x_2,y_5)
- SMLATT r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],x_3,y_3)
- LDR r11, [r5], #4 @ Load y[6...7]
- SMLATB r7, r14, r10, r7 @ sum[1] = MAC16_16(sum[1],x_3,y_4)
- SMLATT r8, r14, r10, r8 @ sum[2] = MAC16_16(sum[2],x_3,y_5)
- SMLATB r9, r14, r11, r9 @ sum[3] = MAC16_16(sum[3],x_3,y_6)
- BGT xcorr_kernel_edsp_process4
-xcorr_kernel_edsp_process4_done:
- ADDS r2, r2, #4
- BLE xcorr_kernel_edsp_done
- LDRH r12, [r4], #2 @ r12 = *x++
- SUBS r2, r2, #1 @ j--
- @ Stall
- SMLABB r6, r12, r10, r6 @ sum[0] = MAC16_16(sum[0],x,y_0)
- LDRHGT r14, [r4], #2 @ r14 = *x++
- SMLABT r7, r12, r10, r7 @ sum[1] = MAC16_16(sum[1],x,y_1)
- SMLABB r8, r12, r11, r8 @ sum[2] = MAC16_16(sum[2],x,y_2)
- SMLABT r9, r12, r11, r9 @ sum[3] = MAC16_16(sum[3],x,y_3)
- BLE xcorr_kernel_edsp_done
- SMLABT r6, r14, r10, r6 @ sum[0] = MAC16_16(sum[0],x,y_1)
- SUBS r2, r2, #1 @ j--
- SMLABB r7, r14, r11, r7 @ sum[1] = MAC16_16(sum[1],x,y_2)
- LDRH r10, [r5], #2 @ r10 = y_4 = *y++
- SMLABT r8, r14, r11, r8 @ sum[2] = MAC16_16(sum[2],x,y_3)
- LDRHGT r12, [r4], #2 @ r12 = *x++
- SMLABB r9, r14, r10, r9 @ sum[3] = MAC16_16(sum[3],x,y_4)
- BLE xcorr_kernel_edsp_done
- SMLABB r6, r12, r11, r6 @ sum[0] = MAC16_16(sum[0],tmp,y_2)
- CMP r2, #1 @ j--
- SMLABT r7, r12, r11, r7 @ sum[1] = MAC16_16(sum[1],tmp,y_3)
- LDRH r2, [r5], #2 @ r2 = y_5 = *y++
- SMLABB r8, r12, r10, r8 @ sum[2] = MAC16_16(sum[2],tmp,y_4)
- LDRHGT r14, [r4] @ r14 = *x
- SMLABB r9, r12, r2, r9 @ sum[3] = MAC16_16(sum[3],tmp,y_5)
- BLE xcorr_kernel_edsp_done
- SMLABT r6, r14, r11, r6 @ sum[0] = MAC16_16(sum[0],tmp,y_3)
- LDRH r11, [r5] @ r11 = y_6 = *y
- SMLABB r7, r14, r10, r7 @ sum[1] = MAC16_16(sum[1],tmp,y_4)
- SMLABB r8, r14, r2, r8 @ sum[2] = MAC16_16(sum[2],tmp,y_5)
- SMLABB r9, r14, r11, r9 @ sum[3] = MAC16_16(sum[3],tmp,y_6)
-xcorr_kernel_edsp_done:
- LDMFD sp!, {r2,r4,r5,pc}
- .size xcorr_kernel_edsp, .-xcorr_kernel_edsp @ ENDP
-
- .type celt_pitch_xcorr_edsp, %function; celt_pitch_xcorr_edsp: @ PROC
- @ input:
- @ r0 = opus_val16 *_x (must be 32-bit aligned)
- @ r1 = opus_val16 *_y (only needs to be 16-bit aligned)
- @ r2 = opus_val32 *xcorr
- @ r3 = int len
- @ output:
- @ r0 = maxcorr
- @ internal usage
- @ r4 = opus_val16 *x
- @ r5 = opus_val16 *y
- @ r6 = opus_val32 sum0
- @ r7 = opus_val32 sum1
- @ r8 = opus_val32 sum2
- @ r9 = opus_val32 sum3
- @ r1 = int max_pitch
- @ r12 = int j
- @ ignored:
- @ int arch
- STMFD sp!, {r4-r11, lr}
- MOV r5, r1
- LDR r1, [sp, #36]
- MOV r4, r0
- TST r5, #3
- @ maxcorr = 1
- MOV r0, #1
- BEQ celt_pitch_xcorr_edsp_process1u_done
-@ Compute one sum at the start to make y 32-bit aligned.
- SUBS r12, r3, #4
- @ r14 = sum = 0
- MOV r14, #0
- LDRH r8, [r5], #2
- BLE celt_pitch_xcorr_edsp_process1u_loop4_done
- LDR r6, [r4], #4
- MOV r8, r8, LSL #16
-celt_pitch_xcorr_edsp_process1u_loop4:
- LDR r9, [r5], #4
- SMLABT r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0)
- LDR r7, [r4], #4
- SMLATB r14, r6, r9, r14 @ sum = MAC16_16(sum, x_1, y_1)
- LDR r8, [r5], #4
- SMLABT r14, r7, r9, r14 @ sum = MAC16_16(sum, x_2, y_2)
- SUBS r12, r12, #4 @ j-=4
- SMLATB r14, r7, r8, r14 @ sum = MAC16_16(sum, x_3, y_3)
- LDRGT r6, [r4], #4
- BGT celt_pitch_xcorr_edsp_process1u_loop4
- MOV r8, r8, LSR #16
-celt_pitch_xcorr_edsp_process1u_loop4_done:
- ADDS r12, r12, #4
-celt_pitch_xcorr_edsp_process1u_loop1:
- LDRHGE r6, [r4], #2
- @ Stall
- SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, *x, *y)
- SUBSGE r12, r12, #1
- LDRHGT r8, [r5], #2
- BGT celt_pitch_xcorr_edsp_process1u_loop1
- @ Restore _x
- SUB r4, r4, r3, LSL #1
- @ Restore and advance _y
- SUB r5, r5, r3, LSL #1
- @ maxcorr = max(maxcorr, sum)
- CMP r0, r14
- ADD r5, r5, #2
- MOVLT r0, r14
- SUBS r1, r1, #1
- @ xcorr[i] = sum
- STR r14, [r2], #4
- BLE celt_pitch_xcorr_edsp_done
-celt_pitch_xcorr_edsp_process1u_done:
- @ if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
- SUBS r1, r1, #4
- BLT celt_pitch_xcorr_edsp_process2
-celt_pitch_xcorr_edsp_process4:
- @ xcorr_kernel_edsp parameters:
- @ r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
- MOV r6, #0
- MOV r7, #0
- MOV r8, #0
- MOV r9, #0
- BL xcorr_kernel_edsp_start @ xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
- @ maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
- CMP r0, r6
- @ _y+=4
- ADD r5, r5, #8
- MOVLT r0, r6
- CMP r0, r7
- MOVLT r0, r7
- CMP r0, r8
- MOVLT r0, r8
- CMP r0, r9
- MOVLT r0, r9
- STMIA r2!, {r6-r9}
- SUBS r1, r1, #4
- BGE celt_pitch_xcorr_edsp_process4
-celt_pitch_xcorr_edsp_process2:
- ADDS r1, r1, #2
- BLT celt_pitch_xcorr_edsp_process1a
- SUBS r12, r3, #4
- @ {r10, r11} = {sum0, sum1} = {0, 0}
- MOV r10, #0
- MOV r11, #0
- LDR r8, [r5], #4
- BLE celt_pitch_xcorr_edsp_process2_loop_done
- LDR r6, [r4], #4
- LDR r9, [r5], #4
-celt_pitch_xcorr_edsp_process2_loop4:
- SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0)
- LDR r7, [r4], #4
- SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1)
- SUBS r12, r12, #4 @ j-=4
- SMLATT r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_1, y_1)
- LDR r8, [r5], #4
- SMLATB r11, r6, r9, r11 @ sum1 = MAC16_16(sum1, x_1, y_2)
- LDRGT r6, [r4], #4
- SMLABB r10, r7, r9, r10 @ sum0 = MAC16_16(sum0, x_2, y_2)
- SMLABT r11, r7, r9, r11 @ sum1 = MAC16_16(sum1, x_2, y_3)
- SMLATT r10, r7, r9, r10 @ sum0 = MAC16_16(sum0, x_3, y_3)
- LDRGT r9, [r5], #4
- SMLATB r11, r7, r8, r11 @ sum1 = MAC16_16(sum1, x_3, y_4)
- BGT celt_pitch_xcorr_edsp_process2_loop4
-celt_pitch_xcorr_edsp_process2_loop_done:
- ADDS r12, r12, #2
- BLE celt_pitch_xcorr_edsp_process2_1
- LDR r6, [r4], #4
- @ Stall
- SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0)
- LDR r9, [r5], #4
- SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1)
- SUB r12, r12, #2
- SMLATT r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_1, y_1)
- MOV r8, r9
- SMLATB r11, r6, r9, r11 @ sum1 = MAC16_16(sum1, x_1, y_2)
-celt_pitch_xcorr_edsp_process2_1:
- LDRH r6, [r4], #2
- ADDS r12, r12, #1
- @ Stall
- SMLABB r10, r6, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_0)
- LDRHGT r7, [r4], #2
- SMLABT r11, r6, r8, r11 @ sum1 = MAC16_16(sum1, x_0, y_1)
- BLE celt_pitch_xcorr_edsp_process2_done
- LDRH r9, [r5], #2
- SMLABT r10, r7, r8, r10 @ sum0 = MAC16_16(sum0, x_0, y_1)
- SMLABB r11, r7, r9, r11 @ sum1 = MAC16_16(sum1, x_0, y_2)
-celt_pitch_xcorr_edsp_process2_done:
- @ Restore _x
- SUB r4, r4, r3, LSL #1
- @ Restore and advance _y
- SUB r5, r5, r3, LSL #1
- @ maxcorr = max(maxcorr, sum0)
- CMP r0, r10
- ADD r5, r5, #2
- MOVLT r0, r10
- SUB r1, r1, #2
- @ maxcorr = max(maxcorr, sum1)
- CMP r0, r11
- @ xcorr[i] = sum
- STR r10, [r2], #4
- MOVLT r0, r11
- STR r11, [r2], #4
-celt_pitch_xcorr_edsp_process1a:
- ADDS r1, r1, #1
- BLT celt_pitch_xcorr_edsp_done
- SUBS r12, r3, #4
- @ r14 = sum = 0
- MOV r14, #0
- BLT celt_pitch_xcorr_edsp_process1a_loop_done
- LDR r6, [r4], #4
- LDR r8, [r5], #4
- LDR r7, [r4], #4
- LDR r9, [r5], #4
-celt_pitch_xcorr_edsp_process1a_loop4:
- SMLABB r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0)
- SUBS r12, r12, #4 @ j-=4
- SMLATT r14, r6, r8, r14 @ sum = MAC16_16(sum, x_1, y_1)
- LDRGE r6, [r4], #4
- SMLABB r14, r7, r9, r14 @ sum = MAC16_16(sum, x_2, y_2)
- LDRGE r8, [r5], #4
- SMLATT r14, r7, r9, r14 @ sum = MAC16_16(sum, x_3, y_3)
- LDRGE r7, [r4], #4
- LDRGE r9, [r5], #4
- BGE celt_pitch_xcorr_edsp_process1a_loop4
-celt_pitch_xcorr_edsp_process1a_loop_done:
- ADDS r12, r12, #2
- LDRGE r6, [r4], #4
- LDRGE r8, [r5], #4
- @ Stall
- SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, x_0, y_0)
- SUBGE r12, r12, #2
- SMLATTGE r14, r6, r8, r14 @ sum = MAC16_16(sum, x_1, y_1)
- ADDS r12, r12, #1
- LDRHGE r6, [r4], #2
- LDRHGE r8, [r5], #2
- @ Stall
- SMLABBGE r14, r6, r8, r14 @ sum = MAC16_16(sum, *x, *y)
- @ maxcorr = max(maxcorr, sum)
- CMP r0, r14
- @ xcorr[i] = sum
- STR r14, [r2], #4
- MOVLT r0, r14
-celt_pitch_xcorr_edsp_done:
- LDMFD sp!, {r4-r11, pc}
- .size celt_pitch_xcorr_edsp, .-celt_pitch_xcorr_edsp @ ENDP
-
- .endif
-
-@ END:
- .section .note.GNU-stack,"",%progbits
diff --git a/thirdparty/opus/celt/fixed_c5x.h b/thirdparty/opus/celt/fixed_c5x.h
new file mode 100644
index 0000000000..ea95a998c3
--- /dev/null
+++ b/thirdparty/opus/celt/fixed_c5x.h
@@ -0,0 +1,79 @@
+/* Copyright (C) 2003 Jean-Marc Valin */
+/**
+ @file fixed_c5x.h
+ @brief Fixed-point operations for the TI C5x DSP family
+*/
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_C5X_H
+#define FIXED_C5X_H
+
+#include "dsplib.h"
+
+/* Replacement IMUL32 for this target: sums the two cross products
+   _lmpy(i>>16,j) and _lmpy(i,j>>16), shifts that sum left by 16 and adds
+   _lmpyu(i,j).
+   NOTE(review): presumably the _lmpy* intrinsics multiply only the low
+   16 bits of each operand, making this a 32x32 multiply that keeps the
+   low 32 bits -- confirm against the TI C5x compiler documentation. */
+#undef IMUL32
+static OPUS_INLINE long IMUL32(long i, long j)
+{
+ long ac0, ac1;
+ ac0 = _lmpy(i>>16,j);
+ ac1 = ac0 + _lmpy(i,j>>16);
+ return _lmpyu(i,j) + (ac1<<16);
+}
+
+#undef MAX16
+#define MAX16(a,b) _max(a,b)
+
+#undef MIN16
+#define MIN16(a,b) _min(a,b)
+
+#undef MAX32
+#define MAX32(a,b) _lmax(a,b)
+
+#undef MIN32
+#define MIN32(a,b) _lmin(a,b)
+
+#undef VSHR32
+#define VSHR32(a, shift) _lshl(a,-(shift))
+
+#undef MULT16_16_Q15
+#define MULT16_16_Q15(a,b) (_smpy(a,b))
+
+#undef MULT16_16SU
+#define MULT16_16SU(a,b) _lmpysu(a,b)
+
+#undef MULT_16_16
+#define MULT_16_16(a,b) _lmpy(a,b)
+
+/* FIXME: This is technically incorrect and is bound to cause problems. Is there any cleaner solution? */
+#undef MULT16_32_Q15
+#define MULT16_32_Q15(a,b) ADD32(SHL(MULT16_16((a),SHR((b),16)),1), SHR(MULT16_16SU((a),(b)),15))
+
+#define celt_ilog2(x) (30 - _lnorm(x))
+#define OVERRIDE_CELT_ILOG2
+
+#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
+#define OVERRIDE_CELT_MAXABS16
+
+#endif /* FIXED_C5X_H */
diff --git a/thirdparty/opus/celt/arm/armopts.s b/thirdparty/opus/celt/fixed_c6x.h
index fb9196072a..bb6ad92780 100644
--- a/thirdparty/opus/celt/arm/armopts.s
+++ b/thirdparty/opus/celt/fixed_c6x.h
@@ -1,4 +1,8 @@
-/* Copyright (C) 2013 Mozilla Corporation */
+/* Copyright (C) 2008 CSIRO */
+/**
+ @file fixed_c6x.h
+ @brief Fixed-point operations for the TI C6x DSP family
+*/
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -24,14 +28,43 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-; Set the following to 1 if we have EDSP instructions
-; (LDRD/STRD, etc., ARMv5E and later).
-OPUS_ARM_MAY_HAVE_EDSP *
+#ifndef FIXED_C6X_H
+#define FIXED_C6X_H
+
+#undef MULT16_16SU
+#define MULT16_16SU(a,b) _mpysu(a,b)
+
+#undef MULT_16_16
+#define MULT_16_16(a,b) _mpy(a,b)
+
+#define celt_ilog2(x) (30 - _norm(x))
+#define OVERRIDE_CELT_ILOG2
+
+#undef MULT16_32_Q15
+#define MULT16_32_Q15(a,b) (_mpylill(a, b) >> 15)
+
+#if 0 /* dsplib-based overrides below are compiled out */
+#include "dsplib.h"
+
+#undef MAX16
+#define MAX16(a,b) _max(a,b)
+
+#undef MIN16
+#define MIN16(a,b) _min(a,b)
+
+#undef MAX32
+#define MAX32(a,b) _lmax(a,b)
+
+#undef MIN32
+#define MIN32(a,b) _lmin(a,b)
+
+#undef VSHR32
+#define VSHR32(a, shift) _lshl(a,-(shift))
-; Set the following to 1 if we have ARMv6 media instructions.
-OPUS_ARM_MAY_HAVE_MEDIA *
+#undef MULT16_16_Q15
+#define MULT16_16_Q15(a,b) (_smpy(a,b))
-; Set the following to 1 if we have NEON (some ARMv7)
-OPUS_ARM_MAY_HAVE_NEON *
+#define celt_maxabs16(x, len) MAX32(EXTEND32(maxval((DATA *)x, len)),-EXTEND32(minval((DATA *)x, len)))
+#define OVERRIDE_CELT_MAXABS16
-END
+#endif /* 0 -- closes the disabled dsplib block; without it the include guard is unterminated */
+#endif /* FIXED_C6X_H */
diff --git a/thirdparty/opus/config.h b/thirdparty/opus/config.h
index 3ed0874d4b..bb935619eb 100644
--- a/thirdparty/opus/config.h
+++ b/thirdparty/opus/config.h
@@ -35,7 +35,7 @@
/* #undef FUZZING */
/* Define to 1 if you have the <alloca.h> header file. */
-/* #undef HAVE_ALLOCA_H */
+/* #undef HAVE_ALLOCA_H */
/* NE10 library is installed on host. Make sure it is on target! */
/* #undef HAVE_ARM_NE10 */
@@ -46,12 +46,16 @@
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
+#if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 ))
+
/* Define to 1 if you have the `lrint' function. */
#define HAVE_LRINT 1
/* Define to 1 if you have the `lrintf' function. */
#define HAVE_LRINTF 1
+#endif
+
/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1
@@ -79,7 +83,8 @@
/* Define to 1 if you have the `__malloc_hook' function. */
#define HAVE___MALLOC_HOOK 1
-/* Define to the sub-directory where libtool stores uninstalled libraries. */
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+ */
#define LT_OBJDIR ".libs/"
#ifdef OPUS_ARM_OPT
@@ -186,7 +191,7 @@
#define PACKAGE_NAME "opus"
/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "opus 1.3.1"
+#define PACKAGE_STRING "opus unknown"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "opus"
@@ -195,7 +200,7 @@
#define PACKAGE_URL ""
/* Define to the version of this package. */
-#define PACKAGE_VERSION "1.3.1"
+#define PACKAGE_VERSION "unknown"
/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1
@@ -227,7 +232,11 @@
/* Define to the equivalent of the C99 'restrict' keyword, or to
nothing if this is not supported. Do not define if restrict is
supported directly. */
+#if (!defined( _MSC_VER ) || ( _MSC_VER >= 1800 ))
#define restrict __restrict
+#else
+#undef restrict
+#endif
/* Work around a bug in Sun C++: it does not support _Restrict or
__restrict__, even though the corresponding Sun C compiler ends up with
"#define restrict _Restrict" or "#define restrict __restrict__" in the
diff --git a/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h b/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h
new file mode 100644
index 0000000000..21b256885f
--- /dev/null
+++ b/thirdparty/opus/silk/fixed/mips/prefilter_FIX_mipsr1.h
@@ -0,0 +1,184 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+#ifndef __PREFILTER_FIX_MIPSR1_H__
+#define __PREFILTER_FIX_MIPSR1_H__
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main_FIX.h"
+#include "stack_alloc.h"
+#include "tuning_parameters.h"
+
+/* Override of silk_warped_LPC_analysis_filter_FIX from the MIPS header.
+   Each outer-loop iteration filters two input samples (n and n+1). */
+#define OVERRIDE_silk_warped_LPC_analysis_filter_FIX
+void silk_warped_LPC_analysis_filter_FIX(
+ opus_int32 state[], /* I/O State [order + 1] */
+ opus_int32 res_Q2[], /* O Residual signal [length] */
+ const opus_int16 coef_Q13[], /* I Coefficients [order] */
+ const opus_int16 input[], /* I Input signal [length] */
+ const opus_int16 lambda_Q16, /* I Warping factor */
+ const opus_int length, /* I Length of input signal (even) */
+ const opus_int order, /* I Filter order (even) */
+ int arch /* I Unused in this implementation */
+)
+{
+ opus_int n, i;
+ opus_int32 acc_Q11, acc_Q22, tmp1, tmp2, tmp3, tmp4;
+ opus_int32 state_cur, state_next;
+
+ (void)arch;
+
+ /* Order must be even: the allpass sections are handled two at a time */
+ /* Length must be even: samples are handled in pairs; acc_Q11 accumulates for sample n, acc_Q22 for sample n+1 */
+
+ silk_assert( ( order & 1 ) == 0 );
+ silk_assert( ( length & 1 ) == 0 );
+
+ for( n = 0; n < length; n+=2 ) {
+ /* Output of lowpass section (sample n) */
+ tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
+ state_cur = silk_LSHIFT( input[ n ], 14 );
+ /* Output of allpass section (sample n) */
+ tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
+ state_next = tmp2;
+ acc_Q11 = silk_RSHIFT( order, 1 );
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
+
+
+ /* Output of lowpass section (sample n+1), fed from state_cur/state_next instead of state[] */
+ tmp4 = silk_SMLAWB( state_cur, state_next, lambda_Q16 );
+ state[ 0 ] = silk_LSHIFT( input[ n+1 ], 14 );
+ /* Output of allpass section (sample n+1) */
+ tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
+ state[ 1 ] = tmp4;
+ acc_Q22 = silk_RSHIFT( order, 1 );
+ acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ 0 ] );
+
+ /* Loop over allpass sections, two sections per iteration, for both samples */
+ for( i = 2; i < order; i += 2 ) {
+ /* Output of allpass section (sample n) */
+ tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
+ state_cur = tmp1;
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
+ /* Output of allpass section (sample n) */
+ tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
+ state_next = tmp2;
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
+
+
+ /* Output of allpass section (sample n+1) */
+ tmp4 = silk_SMLAWB( state_cur, state_next - tmp3, lambda_Q16 );
+ state[ i ] = tmp3;
+ acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ i - 1 ] );
+ /* Output of allpass section (sample n+1) */
+ tmp3 = silk_SMLAWB( state_next, tmp1 - tmp4, lambda_Q16 );
+ state[ i + 1 ] = tmp4;
+ acc_Q22 = silk_SMLAWB( acc_Q22, tmp4, coef_Q13[ i ] );
+ }
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
+ res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
+
+ state[ order ] = tmp3;
+ acc_Q22 = silk_SMLAWB( acc_Q22, tmp3, coef_Q13[ order - 1 ] );
+ res_Q2[ n+1 ] = silk_LSHIFT( (opus_int32)input[ n+1 ], 2 ) - silk_RSHIFT_ROUND( acc_Q22, 9 );
+ }
+}
+
+
+
+/* Prefilter for finding Quantizer input signal: writes length samples to xw_Q3[] and updates the shaping state in P */
+#define OVERRIDE_silk_prefilt_FIX
+static inline void silk_prefilt_FIX(
+ silk_prefilter_state_FIX *P, /* I/O state */
+ opus_int32 st_res_Q12[], /* I short term residual signal */
+ opus_int32 xw_Q3[], /* O prefiltered signal */
+ opus_int32 HarmShapeFIRPacked_Q12, /* I Harmonic shaping coefficients */
+ opus_int Tilt_Q14, /* I Tilt shaping coefficient */
+ opus_int32 LF_shp_Q14, /* I Low-frequency shaping coefficients */
+ opus_int lag, /* I Lag for harmonic shaping */
+ opus_int length /* I Length of signals */
+)
+{
+ opus_int i, idx, LTP_shp_buf_idx;
+ opus_int32 n_LTP_Q12, n_Tilt_Q10, n_LF_Q10;
+ opus_int32 sLF_MA_shp_Q12, sLF_AR_shp_Q12;
+ opus_int16 *LTP_shp_buf;
+
+ /* To speed up, use temp variables instead of accessing the struct in the loops */
+ LTP_shp_buf = P->sLTP_shp;
+ LTP_shp_buf_idx = P->sLTP_shp_buf_idx;
+ sLF_AR_shp_Q12 = P->sLF_AR_shp_Q12;
+ sLF_MA_shp_Q12 = P->sLF_MA_shp_Q12;
+
+ if( lag > 0 ) {
+ for( i = 0; i < length; i++ ) {
+ /* unrolled loop over the three harmonic shaping taps around idx */
+ silk_assert( HARM_SHAPE_FIR_TAPS == 3 );
+ idx = lag + LTP_shp_buf_idx;
+ n_LTP_Q12 = silk_SMULBB( LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 - 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+ n_LTP_Q12 = silk_SMLABT( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 ) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+ n_LTP_Q12 = silk_SMLABB( n_LTP_Q12, LTP_shp_buf[ ( idx - HARM_SHAPE_FIR_TAPS / 2 + 1) & LTP_MASK ], HarmShapeFIRPacked_Q12 );
+
+ n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 );
+ n_LF_Q10 = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 );
+
+ sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) );
+ sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12, silk_LSHIFT( n_LF_Q10, 2 ) );
+
+ LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK;
+ LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) );
+
+ xw_Q3[i] = silk_RSHIFT_ROUND( silk_SUB32( sLF_MA_shp_Q12, n_LTP_Q12 ), 9 );
+ }
+ }
+ else
+ {
+ for( i = 0; i < length; i++ ) {
+
+ n_LTP_Q12 = 0;
+
+ n_Tilt_Q10 = silk_SMULWB( sLF_AR_shp_Q12, Tilt_Q14 );
+ n_LF_Q10 = silk_SMLAWB( silk_SMULWT( sLF_AR_shp_Q12, LF_shp_Q14 ), sLF_MA_shp_Q12, LF_shp_Q14 );
+
+ sLF_AR_shp_Q12 = silk_SUB32( st_res_Q12[ i ], silk_LSHIFT( n_Tilt_Q10, 2 ) );
+ sLF_MA_shp_Q12 = silk_SUB32( sLF_AR_shp_Q12, silk_LSHIFT( n_LF_Q10, 2 ) );
+
+ LTP_shp_buf_idx = ( LTP_shp_buf_idx - 1 ) & LTP_MASK;
+ LTP_shp_buf[ LTP_shp_buf_idx ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 12 ) );
+
+ xw_Q3[i] = silk_RSHIFT_ROUND( sLF_MA_shp_Q12, 9 );
+ }
+ }
+
+ /* Copy temp variables back to state */
+ P->sLF_AR_shp_Q12 = sLF_AR_shp_Q12;
+ P->sLF_MA_shp_Q12 = sLF_MA_shp_Q12;
+ P->sLTP_shp_buf_idx = LTP_shp_buf_idx;
+}
+
+#endif /* __PREFILTER_FIX_MIPSR1_H__ */
diff --git a/thirdparty/opus/silk/fixed/x86/prefilter_FIX_sse.c b/thirdparty/opus/silk/fixed/x86/prefilter_FIX_sse.c
new file mode 100644
index 0000000000..555432cd96
--- /dev/null
+++ b/thirdparty/opus/silk/fixed/x86/prefilter_FIX_sse.c
@@ -0,0 +1,160 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "main.h"
+#include "celt/x86/x86cpu.h"
+
+void silk_warped_LPC_analysis_filter_FIX_sse4_1(
+ opus_int32 state[], /* I/O State [order + 1] */
+ opus_int32 res_Q2[], /* O Residual signal [length] */
+ const opus_int16 coef_Q13[], /* I Coefficients [order] */
+ const opus_int16 input[], /* I Input signal [length] */
+ const opus_int16 lambda_Q16, /* I Warping factor */
+ const opus_int length, /* I Length of input signal */
+ const opus_int order /* I Filter order (even) */
+)
+{ /* Warped LPC analysis filter: writes res_Q2[ 0..length-1 ] and updates state[] in place. */
+ opus_int n, i;
+ opus_int32 acc_Q11, tmp1, tmp2;
+ /* order == 10 with lambda_Q16 == 0 takes the SSE4.1 path below; every other case uses the scalar loop at the end. */
+ /* Order must be even */
+ celt_assert( ( order & 1 ) == 0 );
+
+ if (order == 10)
+ {
+ if (0 == lambda_Q16) /* this path treats state[] as a shift register (see the per-sample shift at the end of the loop) */
+ {
+ __m128i coef_Q13_3210, coef_Q13_7654; /* coefficients 0..3 and 4..7 in load order */
+ __m128i coef_Q13_0123, coef_Q13_4567; /* the same vectors with their 32-bit lanes reversed */
+ __m128i state_0123, state_4567; /* state[ 0..7 ], kept lane-reversed inside the loop */
+ __m128i xmm_product1, xmm_product2;
+ __m128i xmm_tempa, xmm_tempb;
+
+ register opus_int32 sum;
+ register opus_int32 state_8, state_9, state_a; /* state[ 8 ], state[ 9 ], state[ 10 ] */
+ register opus_int64 coef_Q13_8, coef_Q13_9; /* opus_int64 so the scalar products below are formed in that type */
+
+ celt_assert( length > 0 ); /* NOTE(review): if this assert is compiled out and length == 0, state[ 10 ] is overwritten with 0 below - confirm callers never pass 0 */
+ /* coef_Q13[ 0..7 ] go into two vectors; OP_CVTEPI16_EPI32_M64 presumably widens four int16 to 32-bit lanes - confirm in celt/x86/x86cpu.h */
+ coef_Q13_3210 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 0 ] );
+ coef_Q13_7654 = OP_CVTEPI16_EPI32_M64( &coef_Q13[ 4 ] );
+ /* Lane-reversed copies of the coefficient vectors */
+ coef_Q13_0123 = _mm_shuffle_epi32( coef_Q13_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ coef_Q13_4567 = _mm_shuffle_epi32( coef_Q13_7654, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ /* Taps 8 and 9 are handled in scalar code */
+ coef_Q13_8 = (opus_int64) coef_Q13[ 8 ];
+ coef_Q13_9 = (opus_int64) coef_Q13[ 9 ];
+ /* Load state[ 0..7 ] with unaligned loads */
+ state_0123 = _mm_loadu_si128( (__m128i *)(&state[ 0 ] ) );
+ state_4567 = _mm_loadu_si128( (__m128i *)(&state[ 4 ] ) );
+ /* Reverse the lanes for use inside the loop; they are reversed back before the final store */
+ state_0123 = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ state_4567 = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+ state_8 = state[ 8 ];
+ state_9 = state[ 9 ];
+ state_a = 0; /* state[ 10 ] is not read on this path; state_a is reassigned from state_9 on every iteration */
+
+ for( n = 0; n < length; n++ )
+ {
+ xmm_product1 = _mm_mul_epi32( coef_Q13_0123, state_0123 ); /* 64-bit multiply, only 2 pairs (32-bit lanes 0 and 2) */
+ xmm_product2 = _mm_mul_epi32( coef_Q13_4567, state_4567 );
+ /* Undo the lane reversal so the taps skipped by the multiplies above land in lanes 0 and 2 */
+ xmm_tempa = _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+ xmm_tempb = _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) );
+
+ xmm_product1 = _mm_srli_epi64( xmm_product1, 16 ); /* >> 16; a logical shift suffices because only the low 32 bits of each 64-bit lane are used below */
+ xmm_product2 = _mm_srli_epi64( xmm_product2, 16 );
+
+ xmm_tempa = _mm_mul_epi32( coef_Q13_3210, xmm_tempa );
+ xmm_tempb = _mm_mul_epi32( coef_Q13_7654, xmm_tempb );
+
+ xmm_tempa = _mm_srli_epi64( xmm_tempa, 16 );
+ xmm_tempb = _mm_srli_epi64( xmm_tempb, 16 );
+ /* Accumulate the shifted products as 32-bit lanes */
+ xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_product1 );
+ xmm_tempb = _mm_add_epi32( xmm_tempb, xmm_product2 );
+ xmm_tempa = _mm_add_epi32( xmm_tempa, xmm_tempb );
+ /* Taps 8 and 9: scalar products, each shifted right by 16 before being added */
+ sum = (opus_int32)((coef_Q13_8 * state_8) >> 16);
+ sum += (opus_int32)((coef_Q13_9 * state_9) >> 16);
+ /* Horizontal add: fold lane 2 into lane 0, then extract lane 0 */
+ xmm_tempa = _mm_add_epi32( xmm_tempa, _mm_shuffle_epi32( xmm_tempa, _MM_SHUFFLE( 0, 0, 0, 2 ) ) );
+ sum += _mm_cvtsi128_si32( xmm_tempa);
+ res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( ( 5 + sum ), 9); /* the 5 corresponds to the silk_RSHIFT( order, 1 ) seed of the scalar loop, with order == 10 here */
+
+ /* Shift every state element up by one index: state[ k + 1 ] takes the old state[ k ] */
+ state_a = state_9;
+ state_9 = state_8;
+ state_8 = _mm_cvtsi128_si32( state_4567 );
+ state_4567 = _mm_alignr_epi8( state_0123, state_4567, 4 );
+
+ state_0123 = _mm_alignr_epi8( _mm_cvtsi32_si128( silk_LSHIFT( input[ n ], 14 ) ), state_0123, 4 ); /* new state[ 0 ] is silk_LSHIFT( input[ n ], 14 ), as in the scalar loop */
+ }
+ /* Write the state back in memory order (lanes reversed again) */
+ _mm_storeu_si128( (__m128i *)( &state[ 0 ] ), _mm_shuffle_epi32( state_0123, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+ _mm_storeu_si128( (__m128i *)( &state[ 4 ] ), _mm_shuffle_epi32( state_4567, _MM_SHUFFLE( 0, 1, 2, 3 ) ) );
+ state[ 8 ] = state_8;
+ state[ 9 ] = state_9;
+ state[ 10 ] = state_a;
+
+ return;
+ }
+ }
+ /* Scalar path: used whenever order != 10 or lambda_Q16 != 0 */
+ for( n = 0; n < length; n++ ) {
+ /* Output of lowpass section */
+ tmp2 = silk_SMLAWB( state[ 0 ], state[ 1 ], lambda_Q16 );
+ state[ 0 ] = silk_LSHIFT( input[ n ], 14 );
+ /* Output of allpass section */
+ tmp1 = silk_SMLAWB( state[ 1 ], state[ 2 ] - tmp2, lambda_Q16 );
+ state[ 1 ] = tmp2;
+ acc_Q11 = silk_RSHIFT( order, 1 ); /* accumulator starts from silk_RSHIFT( order, 1 ) rather than zero */
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ 0 ] );
+ /* Loop over allpass sections */
+ for( i = 2; i < order; i += 2 ) {
+ /* Output of allpass section */
+ tmp2 = silk_SMLAWB( state[ i ], state[ i + 1 ] - tmp1, lambda_Q16 );
+ state[ i ] = tmp1;
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ i - 1 ] );
+ /* Output of allpass section */
+ tmp1 = silk_SMLAWB( state[ i + 1 ], state[ i + 2 ] - tmp2, lambda_Q16 );
+ state[ i + 1 ] = tmp2;
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp2, coef_Q13[ i ] );
+ }
+ state[ order ] = tmp1; /* index order is the last element of the order + 1 entry state array */
+ acc_Q11 = silk_SMLAWB( acc_Q11, tmp1, coef_Q13[ order - 1 ] );
+ res_Q2[ n ] = silk_LSHIFT( (opus_int32)input[ n ], 2 ) - silk_RSHIFT_ROUND( acc_Q11, 9 );
+ }
+}