path: root/drivers/opus/celt/x86
Diffstat (limited to 'drivers/opus/celt/x86')
-rw-r--r--  drivers/opus/celt/x86/celt_lpc_sse.c    129
-rw-r--r--  drivers/opus/celt/x86/celt_lpc_sse.h     65
-rw-r--r--  drivers/opus/celt/x86/pitch_sse.c       182
-rw-r--r--  drivers/opus/celt/x86/pitch_sse.h       257
-rw-r--r--  drivers/opus/celt/x86/pitch_sse2.c       92
-rw-r--r--  drivers/opus/celt/x86/pitch_sse4_1.c    192
-rw-r--r--  drivers/opus/celt/x86/x86_celt_map.c    152
-rw-r--r--  drivers/opus/celt/x86/x86cpu.c          154
-rw-r--r--  drivers/opus/celt/x86/x86cpu.h           93
9 files changed, 1204 insertions, 112 deletions
diff --git a/drivers/opus/celt/x86/celt_lpc_sse.c b/drivers/opus/celt/x86/celt_lpc_sse.c
new file mode 100644
index 0000000000..cfc86dc66c
--- /dev/null
+++ b/drivers/opus/celt/x86/celt_lpc_sse.c
@@ -0,0 +1,129 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "opus/opus_config.h"
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include "opus/celt/celt_lpc.h"
+#include "opus/celt/stack_alloc.h"
+#include "opus/celt/mathops.h"
+#include "opus/celt/pitch.h"
+#include "opus/celt/x86/x86cpu.h"
+
+#if defined(FIXED_POINT)
+
+void celt_fir_sse4_1(const opus_val16 *_x,
+ const opus_val16 *num,
+ opus_val16 *_y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch)
+{
+ int i,j;
+ VARDECL(opus_val16, rnum);
+ VARDECL(opus_val16, x);
+
+ __m128i vecNoA;
+ opus_int32 noA ;
+ SAVE_STACK;
+
+ ALLOC(rnum, ord, opus_val16);
+ ALLOC(x, N+ord, opus_val16);
+ for(i=0;i<ord;i++)
+ rnum[i] = num[ord-i-1];
+ for(i=0;i<ord;i++)
+ x[i] = mem[ord-i-1];
+
+ for (i=0;i<N-7;i+=8)
+ {
+ x[i+ord ]=_x[i ];
+ x[i+ord+1]=_x[i+1];
+ x[i+ord+2]=_x[i+2];
+ x[i+ord+3]=_x[i+3];
+ x[i+ord+4]=_x[i+4];
+ x[i+ord+5]=_x[i+5];
+ x[i+ord+6]=_x[i+6];
+ x[i+ord+7]=_x[i+7];
+ }
+
+ for (;i<N-3;i+=4)
+ {
+ x[i+ord ]=_x[i ];
+ x[i+ord+1]=_x[i+1];
+ x[i+ord+2]=_x[i+2];
+ x[i+ord+3]=_x[i+3];
+ }
+
+ for (;i<N;i++)
+ x[i+ord]=_x[i];
+
+ for(i=0;i<ord;i++)
+ mem[i] = _x[N-i-1];
+#ifdef SMALL_FOOTPRINT
+ for (i=0;i<N;i++)
+ {
+ opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT);
+ for (j=0;j<ord;j++)
+ {
+ sum = MAC16_16(sum,rnum[j],x[i+j]);
+ }
+ _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT));
+ }
+#else
+ noA = EXTEND32(1) << SIG_SHIFT >> 1;
+ vecNoA = _mm_set_epi32(noA, noA, noA, noA);
+
+ for (i=0;i<N-3;i+=4)
+ {
+ opus_val32 sums[4] = {0};
+ __m128i vecSum, vecX;
+
+ xcorr_kernel(rnum, x+i, sums, ord, arch);
+
+ vecSum = _mm_loadu_si128((__m128i *)sums);
+ vecSum = _mm_add_epi32(vecSum, vecNoA);
+ vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT);
+ vecX = OP_CVTEPI16_EPI32_M64(_x + i);
+ vecSum = _mm_add_epi32(vecSum, vecX);
+ vecSum = _mm_packs_epi32(vecSum, vecSum);
+ _mm_storel_epi64((__m128i *)(_y + i), vecSum);
+ }
+ for (;i<N;i++)
+ {
+ opus_val32 sum = 0;
+ for (j=0;j<ord;j++)
+ sum = MAC16_16(sum, rnum[j], x[i + j]);
+ _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT)));
+ }
+
+#endif
+ RESTORE_STACK;
+}
+
+#endif
diff --git a/drivers/opus/celt/x86/celt_lpc_sse.h b/drivers/opus/celt/x86/celt_lpc_sse.h
new file mode 100644
index 0000000000..3787afd3ff
--- /dev/null
+++ b/drivers/opus/celt/x86/celt_lpc_sse.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_SSE_H
+#define CELT_LPC_SSE_H
+#include "opus/opus_config.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_FIR
+
+void celt_fir_sse4_1(
+ const opus_val16 *x,
+ const opus_val16 *num,
+ opus_val16 *y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch);
+
+#if defined(OPUS_X86_PRESUME_SSE4_1)
+#define celt_fir(x, num, y, N, ord, mem, arch) \
+ ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch))
+
+#else
+
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *num,
+ opus_val16 *y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch);
+
+# define celt_fir(x, num, y, N, ord, mem, arch) \
+ ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch))
+
+#endif
+#endif
+
+#endif
diff --git a/drivers/opus/celt/x86/pitch_sse.c b/drivers/opus/celt/x86/pitch_sse.c
new file mode 100644
index 0000000000..9f22ffd9ab
--- /dev/null
+++ b/drivers/opus/celt/x86/pitch_sse.c
@@ -0,0 +1,182 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "opus/opus_config.h"
+
+#include "opus/silk/macros.h"
+#include "opus/celt/celt_lpc.h"
+#include "opus/celt/stack_alloc.h"
+#include "opus/celt/mathops.h"
+#include "opus/celt/pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+
+#include <xmmintrin.h>
+#include "opus/celt/arch.h"
+
+void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
+{
+ int j;
+ __m128 xsum1, xsum2;
+ xsum1 = _mm_loadu_ps(sum);
+ xsum2 = _mm_setzero_ps();
+
+ for (j = 0; j < len-3; j += 4)
+ {
+ __m128 x0 = _mm_loadu_ps(x+j);
+ __m128 yj = _mm_loadu_ps(y+j);
+ __m128 y3 = _mm_loadu_ps(y+j+3);
+
+ xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
+ xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
+ _mm_shuffle_ps(yj,y3,0x49)));
+ xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
+ _mm_shuffle_ps(yj,y3,0x9e)));
+ xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
+ }
+ if (j < len)
+ {
+ xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+ if (++j < len)
+ {
+ xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+ if (++j < len)
+ {
+ xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
+ }
+ }
+ }
+ _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
+}
+
+
+void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+ int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+ int i;
+ __m128 xsum1, xsum2;
+ xsum1 = _mm_setzero_ps();
+ xsum2 = _mm_setzero_ps();
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 xi = _mm_loadu_ps(x+i);
+ __m128 y1i = _mm_loadu_ps(y01+i);
+ __m128 y2i = _mm_loadu_ps(y02+i);
+ xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
+ xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
+ }
+ /* Horizontal sum */
+ xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
+ xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
+ _mm_store_ss(xy1, xsum1);
+ xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
+ xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
+ _mm_store_ss(xy2, xsum2);
+ for (;i<N;i++)
+ {
+ *xy1 = MAC16_16(*xy1, x[i], y01[i]);
+ *xy2 = MAC16_16(*xy2, x[i], y02[i]);
+ }
+}
+
+opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
+ int N)
+{
+ int i;
+ float xy;
+ __m128 sum;
+ sum = _mm_setzero_ps();
+ /* FIXME: We should probably go 8-way and use 2 sums. */
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 xi = _mm_loadu_ps(x+i);
+ __m128 yi = _mm_loadu_ps(y+i);
+ sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
+ }
+ /* Horizontal sum */
+ sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
+ sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
+ _mm_store_ss(&xy, sum);
+ for (;i<N;i++)
+ {
+ xy = MAC16_16(xy, x[i], y[i]);
+ }
+ return xy;
+}
+
+void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
+ opus_val16 g10, opus_val16 g11, opus_val16 g12)
+{
+ int i;
+ __m128 x0v;
+ __m128 g10v, g11v, g12v;
+ g10v = _mm_load1_ps(&g10);
+ g11v = _mm_load1_ps(&g11);
+ g12v = _mm_load1_ps(&g12);
+ x0v = _mm_loadu_ps(&x[-T-2]);
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 yi, yi2, x1v, x2v, x3v, x4v;
+ const opus_val32 *xp = &x[i-T-2];
+ yi = _mm_loadu_ps(x+i);
+ x4v = _mm_loadu_ps(xp+4);
+#if 0
+ /* Slower version with all loads */
+ x1v = _mm_loadu_ps(xp+1);
+ x2v = _mm_loadu_ps(xp+2);
+ x3v = _mm_loadu_ps(xp+3);
+#else
+ x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
+ x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
+ x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
+#endif
+
+ yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
+#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
+ yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
+ yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#else
+ /* Use partial sums */
+ yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
+ _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+ yi = _mm_add_ps(yi, yi2);
+#endif
+ x0v=x4v;
+ _mm_storeu_ps(y+i, yi);
+ }
+#ifdef CUSTOM_MODES
+ for (;i<N;i++)
+ {
+ y[i] = x[i]
+ + MULT16_32_Q15(g10,x[i-T])
+ + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
+ + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
+ }
+#endif
+}
+
+
+#endif
diff --git a/drivers/opus/celt/x86/pitch_sse.h b/drivers/opus/celt/x86/pitch_sse.h
index 1542b87232..2b4b54f61f 100644
--- a/drivers/opus/celt/x86/pitch_sse.h
+++ b/drivers/opus/celt/x86/pitch_sse.h
@@ -1,4 +1,5 @@
-/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */
+/* Copyright (c) 2013 Jean-Marc Valin and John Ridges
+ Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/
/**
@file pitch_sse.h
@brief Pitch analysis
@@ -31,126 +32,158 @@
#ifndef PITCH_SSE_H
#define PITCH_SSE_H
+#include "opus/opus_config.h"
-#include <xmmintrin.h>
-#include "opus/celt/arch.h"
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+void xcorr_kernel_sse4_1(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ opus_val32 sum[4],
+ int len);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+void xcorr_kernel_sse(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ opus_val32 sum[4],
+ int len);
+#endif
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
#define OVERRIDE_XCORR_KERNEL
-static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
-{
- int j;
- __m128 xsum1, xsum2;
- xsum1 = _mm_loadu_ps(sum);
- xsum2 = _mm_setzero_ps();
-
- for (j = 0; j < len-3; j += 4)
- {
- __m128 x0 = _mm_loadu_ps(x+j);
- __m128 yj = _mm_loadu_ps(y+j);
- __m128 y3 = _mm_loadu_ps(y+j+3);
-
- xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
- xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
- _mm_shuffle_ps(yj,y3,0x49)));
- xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
- _mm_shuffle_ps(yj,y3,0x9e)));
- xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
- }
- if (j < len)
- {
- xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
- if (++j < len)
- {
- xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
- if (++j < len)
- {
- xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
- }
- }
- }
- _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
-}
+#define xcorr_kernel(x, y, sum, len, arch) \
+ ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len))
-#define OVERRIDE_DUAL_INNER_PROD
-static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
- int N, opus_val32 *xy1, opus_val32 *xy2)
-{
- int i;
- __m128 xsum1, xsum2;
- xsum1 = _mm_setzero_ps();
- xsum2 = _mm_setzero_ps();
- for (i=0;i<N-3;i+=4)
- {
- __m128 xi = _mm_loadu_ps(x+i);
- __m128 y1i = _mm_loadu_ps(y01+i);
- __m128 y2i = _mm_loadu_ps(y02+i);
- xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
- xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
- }
- /* Horizontal sum */
- xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
- xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
- _mm_store_ss(xy1, xsum1);
- xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
- xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
- _mm_store_ss(xy2, xsum2);
- for (;i<N;i++)
- {
- *xy1 = MAC16_16(*xy1, x[i], y01[i]);
- *xy2 = MAC16_16(*xy2, x[i], y02[i]);
- }
-}
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+ ((void)arch, xcorr_kernel_sse(x, y, sum, len))
+
+#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ opus_val32 sum[4],
+ int len);
+
+#define OVERRIDE_XCORR_KERNEL
+#define xcorr_kernel(x, y, sum, len, arch) \
+ ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))
+
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse4_1(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ int N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse2(
+ const opus_int16 *x,
+ const opus_int16 *y,
+ int N);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ int N);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+ ((void)arch, celt_inner_prod_sse4_1(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+ ((void)arch, celt_inner_prod_sse2(x, y, N))
+
+#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT)
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+ ((void)arch, celt_inner_prod_sse(x, y, N))
+
+
+#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
+
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ int N);
+
+#define OVERRIDE_CELT_INNER_PROD
+#define celt_inner_prod(x, y, N, arch) \
+ ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
-#define OVERRIDE_COMB_FILTER_CONST
-static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N,
- opus_val16 g10, opus_val16 g11, opus_val16 g12)
-{
- int i;
- __m128 x0v;
- __m128 g10v, g11v, g12v;
- g10v = _mm_load1_ps(&g10);
- g11v = _mm_load1_ps(&g11);
- g12v = _mm_load1_ps(&g12);
- x0v = _mm_loadu_ps(&x[-T-2]);
- for (i=0;i<N-3;i+=4)
- {
- __m128 yi, yi2, x1v, x2v, x3v, x4v;
- const opus_val32 *xp = &x[i-T-2];
- yi = _mm_loadu_ps(x+i);
- x4v = _mm_loadu_ps(xp+4);
-#if 0
- /* Slower version with all loads */
- x1v = _mm_loadu_ps(xp+1);
- x2v = _mm_loadu_ps(xp+2);
- x3v = _mm_loadu_ps(xp+3);
-#else
- x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
- x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
- x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif
- yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
-#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
- yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
- yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
+
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_COMB_FILTER_CONST
+
+#undef dual_inner_prod
+#undef comb_filter_const
+
+void dual_inner_prod_sse(const opus_val16 *x,
+ const opus_val16 *y01,
+ const opus_val16 *y02,
+ int N,
+ opus_val32 *xy1,
+ opus_val32 *xy2);
+
+void comb_filter_const_sse(opus_val32 *y,
+ opus_val32 *x,
+ int T,
+ int N,
+ opus_val16 g10,
+ opus_val16 g11,
+ opus_val16 g12);
+
+
+#if defined(OPUS_X86_PRESUME_SSE)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+ ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2))
+
+# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+ ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12))
#else
- /* Use partial sums */
- yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
- _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
- yi = _mm_add_ps(yi, yi2);
+
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y01,
+ const opus_val16 *y02,
+ int N,
+ opus_val32 *xy1,
+ opus_val32 *xy2);
+
+#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \
+ ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+
+extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+ opus_val32 *y,
+ opus_val32 *x,
+ int T,
+ int N,
+ opus_val16 g10,
+ opus_val16 g11,
+ opus_val16 g12);
+
+#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \
+ ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12))
+
+#define NON_STATIC_COMB_FILTER_CONST_C
+
#endif
- x0v=x4v;
- _mm_storeu_ps(y+i, yi);
- }
-#ifdef CUSTOM_MODES
- for (;i<N;i++)
- {
- y[i] = x[i]
- + MULT16_32_Q15(g10,x[i-T])
- + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
- + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
- }
#endif
-}
#endif
diff --git a/drivers/opus/celt/x86/pitch_sse2.c b/drivers/opus/celt/x86/pitch_sse2.c
new file mode 100644
index 0000000000..b5a78b4acd
--- /dev/null
+++ b/drivers/opus/celt/x86/pitch_sse2.c
@@ -0,0 +1,92 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "opus/opus_config.h"
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "opus/silk/macros.h"
+#include "opus/celt/celt_lpc.h"
+#include "opus/celt/stack_alloc.h"
+#include "opus/celt/mathops.h"
+#include "opus/celt/pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT)
+opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
+ int N)
+{
+ opus_int i, dataSize16;
+ opus_int32 sum;
+
+ __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+ __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+
+ sum = 0;
+ dataSize16 = N & ~15;
+
+ acc1 = _mm_setzero_si128();
+ acc2 = _mm_setzero_si128();
+
+ for (i=0;i<dataSize16;i+=16)
+ {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+ inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+ inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+ }
+
+ acc1 = _mm_add_epi32( acc1, acc2 );
+
+ if (N - i >= 8)
+ {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ i += 8;
+ }
+
+ acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
+ acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
+ sum += _mm_cvtsi128_si32(acc1);
+
+ for (;i<N;i++) {
+ sum = silk_SMLABB(sum, x[i], y[i]);
+ }
+
+ return sum;
+}
+#endif
diff --git a/drivers/opus/celt/x86/pitch_sse4_1.c b/drivers/opus/celt/x86/pitch_sse4_1.c
new file mode 100644
index 0000000000..57fb332210
--- /dev/null
+++ b/drivers/opus/celt/x86/pitch_sse4_1.c
@@ -0,0 +1,192 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "opus/opus_config.h"
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "opus/silk/macros.h"
+#include "opus/celt/celt_lpc.h"
+#include "opus/celt/stack_alloc.h"
+#include "opus/celt/mathops.h"
+#include "opus/celt/pitch.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)
+#include <smmintrin.h>
+#include "opus/celt/x86/x86cpu.h"
+
+opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y,
+ int N)
+{
+ opus_int i, dataSize16;
+ opus_int32 sum;
+ __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
+ __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;
+ __m128i inVec1_3210, inVec2_3210;
+
+ sum = 0;
+ dataSize16 = N & ~15;
+
+ acc1 = _mm_setzero_si128();
+ acc2 = _mm_setzero_si128();
+
+ for (i=0;i<dataSize16;i+=16) {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
+ inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+ inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
+ }
+
+ acc1 = _mm_add_epi32(acc1, acc2);
+
+ if (N - i >= 8)
+ {
+ inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
+ inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
+
+ inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_76543210);
+ i += 8;
+ }
+
+ if (N - i >= 4)
+ {
+ inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]);
+ inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]);
+
+ inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210);
+
+ acc1 = _mm_add_epi32(acc1, inVec1_3210);
+ i += 4;
+ }
+
+ acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
+ acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
+
+ sum += _mm_cvtsi128_si32(acc1);
+
+ for (;i<N;i++)
+ {
+ sum = silk_SMLABB(sum, x[i], y[i]);
+ }
+
+ return sum;
+}
+
+void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len)
+{
+ int j;
+
+ __m128i vecX, vecX0, vecX1, vecX2, vecX3;
+ __m128i vecY0, vecY1, vecY2, vecY3;
+ __m128i sum0, sum1, sum2, sum3, vecSum;
+ __m128i initSum;
+
+ celt_assert(len >= 3);
+
+ sum0 = _mm_setzero_si128();
+ sum1 = _mm_setzero_si128();
+ sum2 = _mm_setzero_si128();
+ sum3 = _mm_setzero_si128();
+
+ for (j=0;j<(len-7);j+=8)
+ {
+ vecX = _mm_loadu_si128((__m128i *)(&x[j + 0]));
+ vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0]));
+ vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1]));
+ vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2]));
+ vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3]));
+
+ sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0));
+ sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1));
+ sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2));
+ sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3));
+ }
+
+ sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0));
+ sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E));
+
+ sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1));
+ sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E));
+
+ sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2));
+ sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E));
+
+ sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3));
+ sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E));
+
+ vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1),
+ _mm_unpacklo_epi32(sum2, sum3));
+
+ for (;j<(len-3);j+=4)
+ {
+ vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+ vecX1 = _mm_shuffle_epi32(vecX, 0x55);
+ vecX2 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX3 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+ vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+ vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+ sum2 = _mm_mullo_epi32(vecX2, vecY2);
+ sum3 = _mm_mullo_epi32(vecX3, vecY3);
+
+ sum0 = _mm_add_epi32(sum0, sum1);
+ sum2 = _mm_add_epi32(sum2, sum3);
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum2);
+ }
+
+ for (;j<len;j++)
+ {
+ vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ }
+
+ initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
+ initSum = _mm_add_epi32(initSum, vecSum);
+ _mm_storeu_si128((__m128i *)sum, initSum);
+}
+#endif
diff --git a/drivers/opus/celt/x86/x86_celt_map.c b/drivers/opus/celt/x86/x86_celt_map.c
new file mode 100644
index 0000000000..5146ea2b38
--- /dev/null
+++ b/drivers/opus/celt/x86/x86_celt_map.c
@@ -0,0 +1,152 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "opus/opus_config.h"
+
+#include "opus/celt/x86/x86cpu.h"
+#include "opus/celt/celt_lpc.h"
+#include "opus/celt/pitch.h"
+#include "opus/celt/x86/pitch_sse.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+# if defined(FIXED_POINT)
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *num,
+ opus_val16 *y,
+ int N,
+ int ord,
+ opus_val16 *mem,
+ int arch
+) = {
+ celt_fir_c, /* non-sse */
+ celt_fir_c,
+ celt_fir_c,
+ MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */
+ MAY_HAVE_SSE4_1(celt_fir) /* avx */
+};
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ opus_val32 sum[4],
+ int len
+) = {
+ xcorr_kernel_c, /* non-sse */
+ xcorr_kernel_c,
+ xcorr_kernel_c,
+ MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */
+ MAY_HAVE_SSE4_1(xcorr_kernel) /* avx */
+};
+
+#endif
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+ (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ int N
+) = {
+ celt_inner_prod_c, /* non-sse */
+ celt_inner_prod_c,
+ MAY_HAVE_SSE2(celt_inner_prod),
+ MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */
+ MAY_HAVE_SSE4_1(celt_inner_prod) /* avx */
+};
+
+#endif
+
+# else
+
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
+
+void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ opus_val32 sum[4],
+ int len
+) = {
+ xcorr_kernel_c, /* non-sse */
+ MAY_HAVE_SSE(xcorr_kernel),
+ MAY_HAVE_SSE(xcorr_kernel),
+ MAY_HAVE_SSE(xcorr_kernel),
+ MAY_HAVE_SSE(xcorr_kernel)
+};
+
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y,
+ int N
+) = {
+ celt_inner_prod_c, /* non-sse */
+ MAY_HAVE_SSE(celt_inner_prod),
+ MAY_HAVE_SSE(celt_inner_prod),
+ MAY_HAVE_SSE(celt_inner_prod),
+ MAY_HAVE_SSE(celt_inner_prod)
+};
+
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
+ const opus_val16 *x,
+ const opus_val16 *y01,
+ const opus_val16 *y02,
+ int N,
+ opus_val32 *xy1,
+ opus_val32 *xy2
+) = {
+ dual_inner_prod_c, /* non-sse */
+ MAY_HAVE_SSE(dual_inner_prod),
+ MAY_HAVE_SSE(dual_inner_prod),
+ MAY_HAVE_SSE(dual_inner_prod),
+ MAY_HAVE_SSE(dual_inner_prod)
+};
+
+void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
+ opus_val32 *y,
+ opus_val32 *x,
+ int T,
+ int N,
+ opus_val16 g10,
+ opus_val16 g11,
+ opus_val16 g12
+) = {
+ comb_filter_const_c, /* non-sse */
+ MAY_HAVE_SSE(comb_filter_const),
+ MAY_HAVE_SSE(comb_filter_const),
+ MAY_HAVE_SSE(comb_filter_const),
+ MAY_HAVE_SSE(comb_filter_const)
+};
+
+
+#endif
+
+#endif
+#endif
diff --git a/drivers/opus/celt/x86/x86cpu.c b/drivers/opus/celt/x86/x86cpu.c
new file mode 100644
index 0000000000..91b9b185c2
--- /dev/null
+++ b/drivers/opus/celt/x86/x86cpu.c
@@ -0,0 +1,154 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "opus/opus_config.h"
+
+#include "opus/celt/cpu_support.h"
+#include "opus/silk/macros.h"
+#include "opus/silk/main.h"
+#include "opus/celt/pitch.h"
+#include "opus/celt/x86/x86cpu.h"
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
+ (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
+ (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX))
+
+
+#if defined(_MSC_VER)
+
+#include <intrin.h>
+static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+ __cpuid((int*)CPUInfo, InfoType);
+}
+
+#else
+
+#if defined(CPU_INFO_BY_C)
+#include <cpuid.h>
+#endif
+
+static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType)
+{
+#if defined(CPU_INFO_BY_ASM)
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx is PIC register in 32-bit, so mustn't clobber it. */
+ __asm__ __volatile__ (
+ "xchg %%ebx, %1\n"
+ "cpuid\n"
+ "xchg %%ebx, %1\n":
+ "=a" (CPUInfo[0]),
+ "=r" (CPUInfo[1]),
+ "=c" (CPUInfo[2]),
+ "=d" (CPUInfo[3]) :
+ "0" (InfoType)
+ );
+#else
+ __asm__ __volatile__ (
+ "cpuid":
+ "=a" (CPUInfo[0]),
+ "=b" (CPUInfo[1]),
+ "=c" (CPUInfo[2]),
+ "=d" (CPUInfo[3]) :
+ "0" (InfoType)
+ );
+#endif
+#elif defined(CPU_INFO_BY_C)
+ __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3]));
+#endif
+}
+
+#endif
+
+typedef struct CPU_Feature{
+ /* SIMD: 128-bit */
+ int HW_SSE;
+ int HW_SSE2;
+ int HW_SSE41;
+ /* SIMD: 256-bit */
+ int HW_AVX;
+} CPU_Feature;
+
+static void opus_cpu_feature_check(CPU_Feature *cpu_feature)
+{
+ unsigned int info[4] = {0};
+ unsigned int nIds = 0;
+
+ cpuid(info, 0);
+ nIds = info[0];
+
+ if (nIds >= 1){
+ cpuid(info, 1);
+ cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0;
+ cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0;
+ cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0;
+ cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0;
+ }
+ else {
+ cpu_feature->HW_SSE = 0;
+ cpu_feature->HW_SSE2 = 0;
+ cpu_feature->HW_SSE41 = 0;
+ cpu_feature->HW_AVX = 0;
+ }
+}
+
+int opus_select_arch(void)
+{
+ CPU_Feature cpu_feature;
+ int arch;
+
+ opus_cpu_feature_check(&cpu_feature);
+
+ arch = 0;
+ if (!cpu_feature.HW_SSE)
+ {
+ return arch;
+ }
+ arch++;
+
+ if (!cpu_feature.HW_SSE2)
+ {
+ return arch;
+ }
+ arch++;
+
+ if (!cpu_feature.HW_SSE41)
+ {
+ return arch;
+ }
+ arch++;
+
+ if (!cpu_feature.HW_AVX)
+ {
+ return arch;
+ }
+ arch++;
+
+ return arch;
+}
+
+#endif
diff --git a/drivers/opus/celt/x86/x86cpu.h b/drivers/opus/celt/x86/x86cpu.h
new file mode 100644
index 0000000000..04fd48aac4
--- /dev/null
+++ b/drivers/opus/celt/x86/x86cpu.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2014, Cisco Systems, INC
+ Written by XiangMingZhu WeiZhou MinPeng YanWang
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(X86CPU_H)
+# define X86CPU_H
+
+# if defined(OPUS_X86_MAY_HAVE_SSE)
+# define MAY_HAVE_SSE(name) name ## _sse
+# else
+# define MAY_HAVE_SSE(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE2)
+# define MAY_HAVE_SSE2(name) name ## _sse2
+# else
+# define MAY_HAVE_SSE2(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# define MAY_HAVE_SSE4_1(name) name ## _sse4_1
+# else
+# define MAY_HAVE_SSE4_1(name) name ## _c
+# endif
+
+# if defined(OPUS_X86_MAY_HAVE_AVX)
+# define MAY_HAVE_AVX(name) name ## _avx
+# else
+# define MAY_HAVE_AVX(name) name ## _c
+# endif
+
+# if defined(OPUS_HAVE_RTCD)
+int opus_select_arch(void);
+# endif
+
+/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32()
+ or _mm_cvtepi16_epi32() when optimizations are disabled, even though the
+ actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory
+ reference, these require 16-byte alignment and load a full 16 bytes (instead
+ of 4 or 8), possibly reading out of bounds.
+
+ We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or
+ _mm_loadl_epi64(), which should have the same semantics as an m32 or m64
+ reference in the PMOVSXWD instruction itself, but gcc is not smart enough to
+ optimize this out when optimizations ARE enabled.
+
+ Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32
+ (which is fair, since technically the compiler is always allowed to do the
+ dereference before invoking the function implementing the intrinsic).
+ However, it is smart enough to eliminate the extra MOVD instruction.
+ For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out
+ the extra MOVQ if it's specified explicitly */
+
+# if defined(__clang__) || !defined(__OPTIMIZE__)
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x))))
+# else
+# define OP_CVTEPI8_EPI32_M32(x) \
+ (_mm_cvtepi8_epi32(*(__m128i *)(x)))
+#endif
+
+# if !defined(__OPTIMIZE__)
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x))))
+# else
+# define OP_CVTEPI16_EPI32_M64(x) \
+ (_mm_cvtepi16_epi32(*(__m128i *)(x)))
+# endif
+
+#endif
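
Note: the OP_CVTEPI16_EPI32_M64 macro above is the safety wrapper that the preceding
comment describes: _mm_loadl_epi64 issues a plain 8-byte MOVQ, so the source needs only
8 valid bytes and no 16-byte alignment, and _mm_cvtepi16_epi32 then sign-extends the four
16-bit lanes. The following is a minimal standalone sketch of that behaviour, assuming a
compiler with SSE4.1 enabled (e.g. -msse4.1); the macro name, test array, and main()
harness are illustrative only and are not part of this patch.

#include <stdio.h>
#include <smmintrin.h>          /* SSE4.1: _mm_cvtepi16_epi32 */

/* Same shape as the wrapper in x86cpu.h: an explicit 8-byte load (MOVQ)
   followed by PMOVSXWD, instead of dereferencing a __m128i pointer, which
   would imply a 16-byte, 16-byte-aligned read. */
#define CVTEPI16_EPI32_M64(x) \
    (_mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(x))))

int main(void)
{
    short src[4] = { -1, 2, -32768, 32767 };   /* only 8 bytes; unaligned is fine */
    int   dst[4];

    __m128i v = CVTEPI16_EPI32_M64(src);       /* four sign-extended 32-bit lanes */
    _mm_storeu_si128((__m128i *)dst, v);

    printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);  /* prints: -1 2 -32768 32767 */
    return 0;
}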