diff options
Diffstat (limited to 'drivers/opus/celt/x86/pitch_sse.h')
-rw-r--r-- | drivers/opus/celt/x86/pitch_sse.h | 257 |
1 files changed, 145 insertions, 112 deletions
diff --git a/drivers/opus/celt/x86/pitch_sse.h b/drivers/opus/celt/x86/pitch_sse.h index 1542b87232..2b4b54f61f 100644 --- a/drivers/opus/celt/x86/pitch_sse.h +++ b/drivers/opus/celt/x86/pitch_sse.h @@ -1,4 +1,5 @@ -/* Copyright (c) 2013 Jean-Marc Valin and John Ridges */ +/* Copyright (c) 2013 Jean-Marc Valin and John Ridges + Copyright (c) 2014, Cisco Systems, INC MingXiang WeiZhou MinPeng YanWang*/ /** @file pitch_sse.h @brief Pitch analysis @@ -31,126 +32,158 @@ #ifndef PITCH_SSE_H #define PITCH_SSE_H +#include "opus/opus_config.h" -#include <xmmintrin.h> -#include "opus/celt/arch.h" +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +void xcorr_kernel_sse4_1( + const opus_int16 *x, + const opus_int16 *y, + opus_val32 sum[4], + int len); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) +void xcorr_kernel_sse( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); +#endif +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) #define OVERRIDE_XCORR_KERNEL -static OPUS_INLINE void xcorr_kernel(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len) -{ - int j; - __m128 xsum1, xsum2; - xsum1 = _mm_loadu_ps(sum); - xsum2 = _mm_setzero_ps(); - - for (j = 0; j < len-3; j += 4) - { - __m128 x0 = _mm_loadu_ps(x+j); - __m128 yj = _mm_loadu_ps(y+j); - __m128 y3 = _mm_loadu_ps(y+j+3); - - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), - _mm_shuffle_ps(yj,y3,0x49))); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), - _mm_shuffle_ps(yj,y3,0x9e))); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); - } - if (j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - if (++j < len) - { - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); - } - } - } - _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); -} +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse4_1(x, y, sum, len)) -#define OVERRIDE_DUAL_INNER_PROD -static OPUS_INLINE void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, - int N, opus_val32 *xy1, opus_val32 *xy2) -{ - int i; - __m128 xsum1, xsum2; - xsum1 = _mm_setzero_ps(); - xsum2 = _mm_setzero_ps(); - for (i=0;i<N-3;i+=4) - { - __m128 xi = _mm_loadu_ps(x+i); - __m128 y1i = _mm_loadu_ps(y01+i); - __m128 y2i = _mm_loadu_ps(y02+i); - xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); - xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); - } - /* Horizontal sum */ - xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); - xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); - _mm_store_ss(xy1, xsum1); - xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); - xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); - _mm_store_ss(xy2, xsum2); - for (;i<N;i++) - { - *xy1 = MAC16_16(*xy1, x[i], y01[i]); - *xy2 = MAC16_16(*xy2, x[i], y02[i]); - } -} +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((void)arch, xcorr_kernel_sse(x, y, sum, len)) + +#elif (defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT)) || (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) + +extern void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + opus_val32 sum[4], + int len); + +#define OVERRIDE_XCORR_KERNEL +#define xcorr_kernel(x, y, sum, len, arch) \ + ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len)) + +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse4_1( + const opus_int16 *x, + const opus_int16 *y, + int N); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse2( + const opus_int16 *x, + const opus_int16 *y, + int N); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse( + const opus_val16 *x, + const opus_val16 *y, + int N); +#endif + + +#if defined(OPUS_X86_PRESUME_SSE4_1) && defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse4_1(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse2(x, y, N)) + +#elif defined(OPUS_X86_PRESUME_SSE) && !defined(FIXED_POINT) +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((void)arch, celt_inner_prod_sse(x, y, N)) + + +#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \ + (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)) + +extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y, + int N); + +#define OVERRIDE_CELT_INNER_PROD +#define celt_inner_prod(x, y, N, arch) \ + ((*CELT_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N)) -#define OVERRIDE_COMB_FILTER_CONST -static OPUS_INLINE void comb_filter_const(opus_val32 *y, opus_val32 *x, int T, int N, - opus_val16 g10, opus_val16 g11, opus_val16 g12) -{ - int i; - __m128 x0v; - __m128 g10v, g11v, g12v; - g10v = _mm_load1_ps(&g10); - g11v = _mm_load1_ps(&g11); - g12v = _mm_load1_ps(&g12); - x0v = _mm_loadu_ps(&x[-T-2]); - for (i=0;i<N-3;i+=4) - { - __m128 yi, yi2, x1v, x2v, x3v, x4v; - const opus_val32 *xp = &x[i-T-2]; - yi = _mm_loadu_ps(x+i); - x4v = _mm_loadu_ps(xp+4); -#if 0 - /* Slower version with all loads */ - x1v = _mm_loadu_ps(xp+1); - x2v = _mm_loadu_ps(xp+2); - x3v = _mm_loadu_ps(xp+3); -#else - x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); - x1v = _mm_shuffle_ps(x0v, x2v, 0x99); - x3v = _mm_shuffle_ps(x2v, x4v, 0x99); #endif - yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); -#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ - yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); - yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); +#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) + +#define OVERRIDE_DUAL_INNER_PROD +#define OVERRIDE_COMB_FILTER_CONST + +#undef dual_inner_prod +#undef comb_filter_const + +void dual_inner_prod_sse(const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +void comb_filter_const_sse(opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); + + +#if defined(OPUS_X86_PRESUME_SSE) +# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((void)(arch),dual_inner_prod_sse(x, y01, y02, N, xy1, xy2)) + +# define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((void)(arch),comb_filter_const_sse(y, x, T, N, g10, g11, g12)) #else - /* Use partial sums */ - yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), - _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); - yi = _mm_add_ps(yi, yi2); + +extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])( + const opus_val16 *x, + const opus_val16 *y01, + const opus_val16 *y02, + int N, + opus_val32 *xy1, + opus_val32 *xy2); + +#define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) \ + ((*DUAL_INNER_PROD_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2)) + +extern void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( + opus_val32 *y, + opus_val32 *x, + int T, + int N, + opus_val16 g10, + opus_val16 g11, + opus_val16 g12); + +#define comb_filter_const(y, x, T, N, g10, g11, g12, arch) \ + ((*COMB_FILTER_CONST_IMPL[(arch) & OPUS_ARCHMASK])(y, x, T, N, g10, g11, g12)) + +#define NON_STATIC_COMB_FILTER_CONST_C + #endif - x0v=x4v; - _mm_storeu_ps(y+i, yi); - } -#ifdef CUSTOM_MODES - for (;i<N;i++) - { - y[i] = x[i] - + MULT16_32_Q15(g10,x[i-T]) - + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) - + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); - } #endif -} #endif |