diff options
Diffstat (limited to 'drivers/opus/celt/x86')
-rw-r--r-- | drivers/opus/celt/x86/celt_lpc_sse.c | 129 | ||||
-rw-r--r-- | drivers/opus/celt/x86/celt_lpc_sse.h | 65 | ||||
-rw-r--r-- | drivers/opus/celt/x86/pitch_sse.c | 182 | ||||
-rw-r--r-- | drivers/opus/celt/x86/pitch_sse.h | 257 | ||||
-rw-r--r-- | drivers/opus/celt/x86/pitch_sse2.c | 92 | ||||
-rw-r--r-- | drivers/opus/celt/x86/pitch_sse4_1.c | 192 | ||||
-rw-r--r-- | drivers/opus/celt/x86/x86_celt_map.c | 152 | ||||
-rw-r--r-- | drivers/opus/celt/x86/x86cpu.c | 154 | ||||
-rw-r--r-- | drivers/opus/celt/x86/x86cpu.h | 93 |
9 files changed, 1204 insertions, 112 deletions
diff --git a/drivers/opus/celt/x86/celt_lpc_sse.c b/drivers/opus/celt/x86/celt_lpc_sse.c new file mode 100644 index 0000000000..cfc86dc66c --- /dev/null +++ b/drivers/opus/celt/x86/celt_lpc_sse.c @@ -0,0 +1,129 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include "opus/opus_config.h" + +#include <xmmintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include "opus/celt/celt_lpc.h" +#include "opus/celt/stack_alloc.h" +#include "opus/celt/mathops.h" +#include "opus/celt/pitch.h" +#include "opus/celt/x86/x86cpu.h" + +#if defined(FIXED_POINT) + +void celt_fir_sse4_1(const opus_val16 *_x, + const opus_val16 *num, + opus_val16 *_y, + int N, + int ord, + opus_val16 *mem, + int arch) +{ + int i,j; + VARDECL(opus_val16, rnum); + VARDECL(opus_val16, x); + + __m128i vecNoA; + opus_int32 noA ; + SAVE_STACK; + + ALLOC(rnum, ord, opus_val16); + ALLOC(x, N+ord, opus_val16); + for(i=0;i<ord;i++) + rnum[i] = num[ord-i-1]; + for(i=0;i<ord;i++) + x[i] = mem[ord-i-1]; + + for (i=0;i<N-7;i+=8) + { + x[i+ord ]=_x[i ]; + x[i+ord+1]=_x[i+1]; + x[i+ord+2]=_x[i+2]; + x[i+ord+3]=_x[i+3]; + x[i+ord+4]=_x[i+4]; + x[i+ord+5]=_x[i+5]; + x[i+ord+6]=_x[i+6]; + x[i+ord+7]=_x[i+7]; + } + + for (;i<N-3;i+=4) + { + x[i+ord ]=_x[i ]; + x[i+ord+1]=_x[i+1]; + x[i+ord+2]=_x[i+2]; + x[i+ord+3]=_x[i+3]; + } + + for (;i<N;i++) + x[i+ord]=_x[i]; + + for(i=0;i<ord;i++) + mem[i] = _x[N-i-1]; +#ifdef SMALL_FOOTPRINT + for (i=0;i<N;i++) + { + opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); + for (j=0;j<ord;j++) + { + sum = MAC16_16(sum,rnum[j],x[i+j]); + } + _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); + } +#else + noA = EXTEND32(1) << SIG_SHIFT >> 1; + vecNoA = _mm_set_epi32(noA, noA, noA, noA); + + for (i=0;i<N-3;i+=4) + { + opus_val32 sums[4] = {0}; + __m128i vecSum, vecX; + + xcorr_kernel(rnum, x+i, sums, ord, arch); + + vecSum = _mm_loadu_si128((__m128i *)sums); + vecSum = _mm_add_epi32(vecSum, vecNoA); + vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); + vecX = OP_CVTEPI16_EPI32_M64(_x + i); + vecSum = _mm_add_epi32(vecSum, vecX); + vecSum = _mm_packs_epi32(vecSum, vecSum); + _mm_storel_epi64((__m128i *)(_y + i), vecSum); + } + for (;i<N;i++) + { + opus_val32 sum = 0; + for (j=0;j<ord;j++) + sum = MAC16_16(sum, rnum[j], x[i + j]); + _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); + } + +#endif + RESTORE_STACK; +} + +#endif diff --git a/drivers/opus/celt/x86/celt_lpc_sse.h b/drivers/opus/celt/x86/celt_lpc_sse.h new file mode 100644 index 0000000000..3787afd3ff --- /dev/null +++ b/drivers/opus/celt/x86/celt_lpc_sse.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CELT_LPC_SSE_H +#define CELT_LPC_SSE_H +#include "opus/opus_config.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_CELT_FIR + +void celt_fir_sse4_1( + const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + opus_val16 *mem, + int arch); + +#if defined(OPUS_X86_PRESUME_SSE4_1) +#define celt_fir(x, num, y, N, ord, mem, arch) \ + ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) + +#else + +extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + opus_val16 *mem, + int arch); + +# define celt_fir(x, num, y, N, ord, mem, arch) \ + ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch)) + +#endif +#endif + +#endif diff --git a/drivers/opus/celt/x86/pitch_sse.c b/drivers/opus/celt/x86/pitch_sse.c new file mode 100644 index 0000000000..9f22ffd9ab --- /dev/null +++ b/drivers/opus/celt/x86/pitch_sse.c @@ -0,0 +1,182 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include "opus/opus_config.h" + +#include "opus/silk/macros.h" +#include "opus/celt/celt_lpc.h" +#include "opus/celt/stack_alloc.h" +#include "opus/celt/mathops.h" +#include "opus/celt/pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) + +#include <xmmintrin.h> +#include "opus/celt/arch.h" + +void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) +{ + int j; + __m128 xsum1, xsum2; + xsum1 = _mm_loadu_ps(sum); + xsum2 = _mm_setzero_ps(); + + for (j = 0; j < len-3; j += 4) + { + __m128 x0 = _mm_loadu_ps(x+j); + __m128 yj = _mm_loadu_ps(y+j); + __m128 y3 = _mm_loadu_ps(y+j+3); + + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), + _mm_shuffle_ps(yj,y3,0x49))); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), + _mm_shuffle_ps(yj,y3,0x9e))); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); + } + if (j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + if (++j < len) + { + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); + } + } + } + _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); +} + + +void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, + int N, opus_val32 *xy1, opus_val32 *xy2) +{ + int i; + __m128 xsum1, xsum2; + xsum1 = _mm_setzero_ps(); + xsum2 = _mm_setzero_ps(); + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 y1i = _mm_loadu_ps(y01+i); + __m128 y2i = _mm_loadu_ps(y02+i); + xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); + xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); + } + /* Horizontal sum */ + xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); + xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); + _mm_store_ss(xy1, xsum1); + xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); + xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); + _mm_store_ss(xy2, xsum2); + for (;i<N;i++) + { + *xy1 = MAC16_16(*xy1, x[i], y01[i]); + *xy2 = MAC16_16(*xy2, x[i], y02[i]); + } +} + +opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, + int N) +{ + int i; + float xy; + __m128 sum; + sum = _mm_setzero_ps(); + /* FIXME: We should probably go 8-way and use 2 sums. */ + for (i=0;i<N-3;i+=4) + { + __m128 xi = _mm_loadu_ps(x+i); + __m128 yi = _mm_loadu_ps(y+i); + sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); + } + /* Horizontal sum */ + sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); + _mm_store_ss(&xy, sum); + for (;i<N;i++) + { + xy = MAC16_16(xy, x[i], y[i]); + } + return xy; +} + +void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, + opus_val16 g10, opus_val16 g11, opus_val16 g12) +{ + int i; + __m128 x0v; + __m128 g10v, g11v, g12v; + g10v = _mm_load1_ps(&g10); + g11v = _mm_load1_ps(&g11); + g12v = _mm_load1_ps(&g12); + x0v = _mm_loadu_ps(&x[-T-2]); + for (i=0;i<N-3;i+=4) + { + __m128 yi, yi2, x1v, x2v, x3v, x4v; + const opus_val32 *xp = &x[i-T-2]; + yi = _mm_loadu_ps(x+i); + x4v = _mm_loadu_ps(xp+4); +#if 0 + /* Slower version with all loads */ + x1v = _mm_loadu_ps(xp+1); + x2v = _mm_loadu_ps(xp+2); + x3v = _mm_loadu_ps(xp+3); +#else + x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); + x1v = _mm_shuffle_ps(x0v, x2v, 0x99); + x3v = _mm_shuffle_ps(x2v, x4v, 0x99); +#endif + + yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); +#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ + yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); + yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); +#else + /* Use partial sums */ + yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), + _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); + yi = _mm_add_ps(yi, yi2); +#endif + x0v=x4v; + _mm_storeu_ps(y+i, yi); + } +#ifdef CUSTOM_MODES + for (;i<N;i++) + { + y[i] = x[i] + + MULT16_32_Q15(g10,x[i-T]) + + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) + + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); + } +#endif +} + + +#endif diff --git a/drivers/opus/celt/x86/pitch_sse.h b/drivers/opus/celt/x86/pitch_sse.h index 1542b87232..2b4b54f61f 100644 --- a/drivers/opus/celt/x86/pitch_sse.h +++ b/drivers/opus/celt/x86/pitch_sse.h @@ -1,4 +1,5 @@ -/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */ +/* Copyright (c) 2013 Jean-Marc Valin and John Ridges + Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/ /** @file pitch_sse.h @brief Pitch analysis @@ -31,126 +32,158 @@ #ifndef PITCH_SSE_H #define PITCH_SSE_H +#include "opus/opus_config.h" -#include <xmmintrin.h> -#include "opus/celt/arch.h" +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +void xcorr_kernel_sse4_1( + const opus_int16 *x, + const opus_int16 *y, + opus_val32 sum[4], + int len); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) +void xcorr_kernel_sse( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); +#endif +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) #define OVERRIDE_XCORR_KERNEL -static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) -{ - int j; - __m128 xsum1, xsum2; - xsum1 = _mm_loadu_ps(sum); - xsum2 = _mm_setzero_ps(); - - for (j = 0; j < len-3; j += 4) - { - __m128 x0 = _mm_loadu_ps(x+j); - __m128 yj = _mm_loadu_ps(y+j); - __m128 y3 = _mm_loadu_ps(y+j+3); - - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), - _mm_shuffle_ps(yj,y3,0x49))); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), - _mm_shuffle_ps(yj,y3,0x9e))); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); - } - if (j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - } - } - } - _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); -} +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len)) -#define OVERRIDE_DUAL_INNER_PROD -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, - int N, opus_val32 *xy1, opus_val32 *xy2) -{ - int i; - __m128 xsum1, xsum2; - xsum1 = _mm_setzero_ps(); - xsum2 = _mm_setzero_ps(); - for (i=0;i<N-3;i+=4) - { - __m128 xi = _mm_loadu_ps(x+i); - __m128 y1i = _mm_loadu_ps(y01+i); - __m128 y2i = _mm_loadu_ps(y02+i); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); - } - /* Horizontal sum */ - xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); - xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); - _mm_store_ss(xy1, xsum1); - xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); - xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); - _mm_store_ss(xy2, xsum2); - for (;i<N;i++) - { - *xy1 = MAC16_16(*xy1, x[i], y01[i]); - *xy2 = MAC16_16(*xy2, x[i], y02[i]); - } -} +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse(x, y, sum, len)) + +#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) + +extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); + +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) + +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse4_1( + const opus_int16 *x, + const opus_int16 *y, + int N); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse2( + const opus_int16 *x, + const opus_int16 *y, + int N); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse( + const opus_val16 *x, + const opus_val16 *y, + int N); +#endif + + +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse4_1(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse2(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse(x, y, N)) + + +#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) + +extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + int N); + +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) -#define OVERRIDE_COMB_FILTER_CONST -static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12) -{ - int i; - __m128 x0v; - __m128 g10v, g11v, g12v; - g10v = _mm_load1_ps(&g10); - g11v = _mm_load1_ps(&g11); - g12v = _mm_load1_ps(&g12); - x0v = _mm_loadu_ps(&x[-T-2]); - for (i=0;i<N-3;i+=4) - { - __m128 yi, yi2, x1v, x2v, x3v, x4v; - const opus_val32 *xp = &x[i-T-2]; - yi = _mm_loadu_ps(x+i); - x4v = _mm_loadu_ps(xp+4); -#if 0 - /* Slower version with all loads */ - x1v = _mm_loadu_ps(xp+1); - x2v = _mm_loadu_ps(xp+2); - x3v = _mm_loadu_ps(xp+3); -#else - x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); - x1v = _mm_shuffle_ps(x0v, x2v, 0x99); - x3v = _mm_shuffle_ps(x2v, x4v, 0x99); #endif - yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); -#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ - yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); - yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) + +#define OVERRIDE_DUAL_INNER_PROD +#define OVERRIDE_COMB_FILTER_CONST + +#undef dual_inner_prod +#undef comb_filter_const + +void dual_inner_prod_sse(const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +void comb_filter_const_sse(opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); + + +#if defined(OPUS_X86_PRESUME_SSE) +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2)) + +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12)) #else - /* Use partial sums */ - yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), - _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); - yi = _mm_add_ps(yi, yi2); + +extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) + +extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( + opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); + +#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12)) + +#define NON_STATIC_COMB_FILTER_CONST_C + #endif - x0v=x4v; - _mm_storeu_ps(y+i, yi); - } -#ifdef CUSTOM_MODES - for (;i<N;i++) - { - y[i] = x[i] - + MULT16_32_Q15(g10,x[i-T]) - + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) - + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); - } #endif -} #endif diff --git a/drivers/opus/celt/x86/pitch_sse2.c b/drivers/opus/celt/x86/pitch_sse2.c new file mode 100644 index 0000000000..b5a78b4acd --- /dev/null +++ b/drivers/opus/celt/x86/pitch_sse2.c @@ -0,0 +1,92 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include "opus/opus_config.h" + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "opus/silk/macros.h" +#include "opus/celt/celt_lpc.h" +#include "opus/celt/stack_alloc.h" +#include "opus/celt/mathops.h" +#include "opus/celt/pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32( acc1, acc2 ); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) { + sum = silk_SMLABB(sum, x[i], y[i]); + } + + return sum; +} +#endif diff --git a/drivers/opus/celt/x86/pitch_sse4_1.c b/drivers/opus/celt/x86/pitch_sse4_1.c new file mode 100644 index 0000000000..57fb332210 --- /dev/null +++ b/drivers/opus/celt/x86/pitch_sse4_1.c @@ -0,0 +1,192 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include "opus/opus_config.h" + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "opus/silk/macros.h" +#include "opus/celt/celt_lpc.h" +#include "opus/celt/stack_alloc.h" +#include "opus/celt/mathops.h" +#include "opus/celt/pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +#include <smmintrin.h> +#include "opus/celt/x86/x86cpu.h" + +opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + __m128i inVec1_3210, inVec2_3210; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32(acc1, acc2); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + if (N - i >= 4) + { + inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); + inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); + + inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); + + acc1 = _mm_add_epi32(acc1, inVec1_3210); + i += 4; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); + + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) + { + sum = silk_SMLABB(sum, x[i], y[i]); + } + + return sum; +} + +void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) +{ + int j; + + __m128i vecX, vecX0, vecX1, vecX2, vecX3; + __m128i vecY0, vecY1, vecY2, vecY3; + __m128i sum0, sum1, sum2, sum3, vecSum; + __m128i initSum; + + celt_assert(len >= 3); + + sum0 = _mm_setzero_si128(); + sum1 = _mm_setzero_si128(); + sum2 = _mm_setzero_si128(); + sum3 = _mm_setzero_si128(); + + for (j=0;j<(len-7);j+=8) + { + vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); + vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); + vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); + vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); + vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); + + sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); + sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); + sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); + sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); + } + + sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); + sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); + + sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); + sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); + + sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); + sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); + + sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); + sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); + + vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), + _mm_unpacklo_epi32(sum2, sum3)); + + for (;j<(len-3);j+=4) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + vecX1 = _mm_shuffle_epi32(vecX, 0x55); + vecX2 = _mm_shuffle_epi32(vecX, 0xaa); + vecX3 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); + vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + sum2 = _mm_mullo_epi32(vecX2, vecY2); + sum3 = _mm_mullo_epi32(vecX3, vecY3); + + sum0 = _mm_add_epi32(sum0, sum1); + sum2 = _mm_add_epi32(sum2, sum3); + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum2); + } + + for (;j<len;j++) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + vecSum = _mm_add_epi32(vecSum, sum0); + } + + initSum = _mm_loadu_si128((__m128i *)(&sum[0])); + initSum = _mm_add_epi32(initSum, vecSum); + _mm_storeu_si128((__m128i *)sum, initSum); +} +#endif diff --git a/drivers/opus/celt/x86/x86_celt_map.c b/drivers/opus/celt/x86/x86_celt_map.c new file mode 100644 index 0000000000..5146ea2b38 --- /dev/null +++ b/drivers/opus/celt/x86/x86_celt_map.c @@ -0,0 +1,152 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include "opus/opus_config.h" + +#include "opus/celt/x86/x86cpu.h" +#include "opus/celt/celt_lpc.h" +#include "opus/celt/pitch.h" +#include "opus/celt/x86/pitch_sse.h" + +#if defined(OPUS_HAVE_RTCD) + +# if defined(FIXED_POINT) + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1) + +void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *num, + opus_val16 *y, + int N, + int ord, + opus_val16 *mem, + int arch +) = { + celt_fir_c, /* non-sse */ + celt_fir_c, + celt_fir_c, + MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */ + MAY_HAVE_SSE4_1(celt_fir) /* avx */ +}; + +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len +) = { + xcorr_kernel_c, /* non-sse */ + xcorr_kernel_c, + xcorr_kernel_c, + MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */ + MAY_HAVE_SSE4_1(xcorr_kernel) /* avx */ +}; + +#endif + +#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) + +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + int N +) = { + celt_inner_prod_c, /* non-sse */ + celt_inner_prod_c, + MAY_HAVE_SSE2(celt_inner_prod), + MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */ + MAY_HAVE_SSE4_1(celt_inner_prod) /* avx */ +}; + +#endif + +# else + +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE) + +void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len +) = { + xcorr_kernel_c, /* non-sse */ + MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel), + MAY_HAVE_SSE(xcorr_kernel) +}; + +opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + int N +) = { + celt_inner_prod_c, /* non-sse */ + MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod), + MAY_HAVE_SSE(celt_inner_prod) +}; + +void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2 +) = { + dual_inner_prod_c, /* non-sse */ + MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod), + MAY_HAVE_SSE(dual_inner_prod) +}; + +void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( + opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12 +) = { + comb_filter_const_c, /* non-sse */ + MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const), + MAY_HAVE_SSE(comb_filter_const) +}; + + +#endif + +#endif +#endif diff --git a/drivers/opus/celt/x86/x86cpu.c b/drivers/opus/celt/x86/x86cpu.c new file mode 100644 index 0000000000..91b9b185c2 --- /dev/null +++ b/drivers/opus/celt/x86/x86cpu.c @@ -0,0 +1,154 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include "opus/opus_config.h" + +#include "opus/celt/cpu_support.h" +#include "opus/silk/macros.h" +#include "opus/silk/main.h" +#include "opus/celt/pitch.h" +#include "opus/celt/x86/x86cpu.h" + +#if (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \ + (defined(OPUS_X86_MAY_HAVE_AVX) && !defined(OPUS_X86_PRESUME_AVX)) + + +#if defined(_MSC_VER) + +#include <intrin.h> +static _inline void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) +{ + __cpuid((int*)CPUInfo, InfoType); +} + +#else + +#if defined(CPU_INFO_BY_C) +#include <cpuid.h> +#endif + +static void cpuid(unsigned int CPUInfo[4], unsigned int InfoType) +{ +#if defined(CPU_INFO_BY_ASM) +#if defined(__i386__) && defined(__PIC__) +/* %ebx is PIC register in 32-bit, so mustn't clobber it. */ + __asm__ __volatile__ ( + "xchg %%ebx, %1\n" + "cpuid\n" + "xchg %%ebx, %1\n": + "=a" (CPUInfo[0]), + "=r" (CPUInfo[1]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "0" (InfoType) + ); +#else + __asm__ __volatile__ ( + "cpuid": + "=a" (CPUInfo[0]), + "=b" (CPUInfo[1]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "0" (InfoType) + ); +#endif +#elif defined(CPU_INFO_BY_C) + __get_cpuid(InfoType, &(CPUInfo[0]), &(CPUInfo[1]), &(CPUInfo[2]), &(CPUInfo[3])); +#endif +} + +#endif + +typedef struct CPU_Feature{ + /* SIMD: 128-bit */ + int HW_SSE; + int HW_SSE2; + int HW_SSE41; + /* SIMD: 256-bit */ + int HW_AVX; +} CPU_Feature; + +static void opus_cpu_feature_check(CPU_Feature *cpu_feature) +{ + unsigned int info[4] = {0}; + unsigned int nIds = 0; + + cpuid(info, 0); + nIds = info[0]; + + if (nIds >= 1){ + cpuid(info, 1); + cpu_feature->HW_SSE = (info[3] & (1 << 25)) != 0; + cpu_feature->HW_SSE2 = (info[3] & (1 << 26)) != 0; + cpu_feature->HW_SSE41 = (info[2] & (1 << 19)) != 0; + cpu_feature->HW_AVX = (info[2] & (1 << 28)) != 0; + } + else { + cpu_feature->HW_SSE = 0; + cpu_feature->HW_SSE2 = 0; + cpu_feature->HW_SSE41 = 0; + cpu_feature->HW_AVX = 0; + } +} + +int opus_select_arch(void) +{ + CPU_Feature cpu_feature; + int arch; + + opus_cpu_feature_check(&cpu_feature); + + arch = 0; + if (!cpu_feature.HW_SSE) + { + return arch; + } + arch++; + + if (!cpu_feature.HW_SSE2) + { + return arch; + } + arch++; + + if (!cpu_feature.HW_SSE41) + { + return arch; + } + arch++; + + if (!cpu_feature.HW_AVX) + { + return arch; + } + arch++; + + return arch; +} + +#endif diff --git a/drivers/opus/celt/x86/x86cpu.h b/drivers/opus/celt/x86/x86cpu.h new file mode 100644 index 0000000000..04fd48aac4 --- /dev/null +++ b/drivers/opus/celt/x86/x86cpu.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(X86CPU_H) +# define X86CPU_H + +# if defined(OPUS_X86_MAY_HAVE_SSE) +# define MAY_HAVE_SSE(name) name ## _sse +# else +# define MAY_HAVE_SSE(name) name ## _c +# endif + +# if defined(OPUS_X86_MAY_HAVE_SSE2) +# define MAY_HAVE_SSE2(name) name ## _sse2 +# else +# define MAY_HAVE_SSE2(name) name ## _c +# endif + +# if defined(OPUS_X86_MAY_HAVE_SSE4_1) +# define MAY_HAVE_SSE4_1(name) name ## _sse4_1 +# else +# define MAY_HAVE_SSE4_1(name) name ## _c +# endif + +# if defined(OPUS_X86_MAY_HAVE_AVX) +# define MAY_HAVE_AVX(name) name ## _avx +# else +# define MAY_HAVE_AVX(name) name ## _c +# endif + +# if defined(OPUS_HAVE_RTCD) +int opus_select_arch(void); +# endif + +/*gcc appears to emit MOVDQA's to load the argument of an _mm_cvtepi8_epi32() + or _mm_cvtepi16_epi32() when optimizations are disabled, even though the + actual PMOVSXWD instruction takes an m32 or m64. Unlike a normal memory + reference, these require 16-byte alignment and load a full 16 bytes (instead + of 4 or 8), possibly reading out of bounds. + + We can insert an explicit MOVD or MOVQ using _mm_cvtsi32_si128() or + _mm_loadl_epi64(), which should have the same semantics as an m32 or m64 + reference in the PMOVSXWD instruction itself, but gcc is not smart enough to + optimize this out when optimizations ARE enabled. + + Clang, in contrast, requires us to do this always for _mm_cvtepi8_epi32 + (which is fair, since technically the compiler is always allowed to do the + dereference before invoking the function implementing the intrinsic). + However, it is smart enough to eliminate the extra MOVD instruction. + For _mm_cvtepi16_epi32, it does the right thing, though does *not* optimize out + the extra MOVQ if it's specified explicitly */ + +# if defined(__clang__) || !defined(__OPTIMIZE__) +# define OP_CVTEPI8_EPI32_M32(x) \ + (_mm_cvtepi8_epi32(_mm_cvtsi32_si128(*(int *)(x)))) +# else +# define OP_CVTEPI8_EPI32_M32(x) \ + (_mm_cvtepi8_epi32(*(__m128i *)(x))) +#endif + +# if !defined(__OPTIMIZE__) +# define OP_CVTEPI16_EPI32_M64(x) \ + (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) +# else +# define OP_CVTEPI16_EPI32_M64(x) \ + (_mm_cvtepi16_epi32(*(__m128i *)(x))) +# endif + +#endif |