diff options
Diffstat (limited to 'thirdparty/opus/celt/x86')
-rw-r--r-- | thirdparty/opus/celt/x86/celt_lpc_sse.c (renamed from thirdparty/opus/celt/x86/celt_lpc_sse4_1.c) | 57 | ||||
-rw-r--r-- | thirdparty/opus/celt/x86/celt_lpc_sse.h | 10 | ||||
-rw-r--r-- | thirdparty/opus/celt/x86/vq_sse.h | 50 | ||||
-rw-r--r-- | thirdparty/opus/celt/x86/vq_sse2.c | 217 | ||||
-rw-r--r-- | thirdparty/opus/celt/x86/x86_celt_map.c | 14 | ||||
-rw-r--r-- | thirdparty/opus/celt/x86/x86cpu.h | 4 |
6 files changed, 58 insertions, 294 deletions
diff --git a/thirdparty/opus/celt/x86/celt_lpc_sse4_1.c b/thirdparty/opus/celt/x86/celt_lpc_sse.c index 5478568849..67e5592acf 100644 --- a/thirdparty/opus/celt/x86/celt_lpc_sse4_1.c +++ b/thirdparty/opus/celt/x86/celt_lpc_sse.c @@ -40,23 +40,65 @@ #if defined(FIXED_POINT) -void celt_fir_sse4_1(const opus_val16 *x, +void celt_fir_sse4_1(const opus_val16 *_x, const opus_val16 *num, - opus_val16 *y, + opus_val16 *_y, int N, int ord, + opus_val16 *mem, int arch) { int i,j; VARDECL(opus_val16, rnum); + VARDECL(opus_val16, x); __m128i vecNoA; opus_int32 noA ; SAVE_STACK; ALLOC(rnum, ord, opus_val16); + ALLOC(x, N+ord, opus_val16); for(i=0;i<ord;i++) rnum[i] = num[ord-i-1]; + for(i=0;i<ord;i++) + x[i] = mem[ord-i-1]; + + for (i=0;i<N-7;i+=8) + { + x[i+ord ]=_x[i ]; + x[i+ord+1]=_x[i+1]; + x[i+ord+2]=_x[i+2]; + x[i+ord+3]=_x[i+3]; + x[i+ord+4]=_x[i+4]; + x[i+ord+5]=_x[i+5]; + x[i+ord+6]=_x[i+6]; + x[i+ord+7]=_x[i+7]; + } + + for (;i<N-3;i+=4) + { + x[i+ord ]=_x[i ]; + x[i+ord+1]=_x[i+1]; + x[i+ord+2]=_x[i+2]; + x[i+ord+3]=_x[i+3]; + } + + for (;i<N;i++) + x[i+ord]=_x[i]; + + for(i=0;i<ord;i++) + mem[i] = _x[N-i-1]; +#ifdef SMALL_FOOTPRINT + for (i=0;i<N;i++) + { + opus_val32 sum = SHL32(EXTEND32(_x[i]), SIG_SHIFT); + for (j=0;j<ord;j++) + { + sum = MAC16_16(sum,rnum[j],x[i+j]); + } + _y[i] = SATURATE16(PSHR32(sum, SIG_SHIFT)); + } +#else noA = EXTEND32(1) << SIG_SHIFT >> 1; vecNoA = _mm_set_epi32(noA, noA, noA, noA); @@ -65,24 +107,25 @@ void celt_fir_sse4_1(const opus_val16 *x, opus_val32 sums[4] = {0}; __m128i vecSum, vecX; - xcorr_kernel(rnum, x+i-ord, sums, ord, arch); + xcorr_kernel(rnum, x+i, sums, ord, arch); vecSum = _mm_loadu_si128((__m128i *)sums); vecSum = _mm_add_epi32(vecSum, vecNoA); vecSum = _mm_srai_epi32(vecSum, SIG_SHIFT); - vecX = OP_CVTEPI16_EPI32_M64(x + i); + vecX = OP_CVTEPI16_EPI32_M64(_x + i); vecSum = _mm_add_epi32(vecSum, vecX); vecSum = _mm_packs_epi32(vecSum, vecSum); - _mm_storel_epi64((__m128i *)(y + i), vecSum); + _mm_storel_epi64((__m128i *)(_y + i), vecSum); } for (;i<N;i++) { opus_val32 sum = 0; for (j=0;j<ord;j++) - sum = MAC16_16(sum, rnum[j], x[i+j-ord]); - y[i] = SATURATE16(ADD32(EXTEND32(x[i]), PSHR32(sum, SIG_SHIFT))); + sum = MAC16_16(sum, rnum[j], x[i + j]); + _y[i] = SATURATE16(ADD32(EXTEND32(_x[i]), PSHR32(sum, SIG_SHIFT))); } +#endif RESTORE_STACK; } diff --git a/thirdparty/opus/celt/x86/celt_lpc_sse.h b/thirdparty/opus/celt/x86/celt_lpc_sse.h index 7d1ecf7533..c5ec796ed5 100644 --- a/thirdparty/opus/celt/x86/celt_lpc_sse.h +++ b/thirdparty/opus/celt/x86/celt_lpc_sse.h @@ -41,11 +41,12 @@ void celt_fir_sse4_1( opus_val16 *y, int N, int ord, + opus_val16 *mem, int arch); #if defined(OPUS_X86_PRESUME_SSE4_1) -#define celt_fir(x, num, y, N, ord, arch) \ - ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, arch)) +#define celt_fir(x, num, y, N, ord, mem, arch) \ + ((void)arch, celt_fir_sse4_1(x, num, y, N, ord, mem, arch)) #else @@ -55,10 +56,11 @@ extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( opus_val16 *y, int N, int ord, + opus_val16 *mem, int arch); -# define celt_fir(x, num, y, N, ord, arch) \ - ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, arch)) +# define celt_fir(x, num, y, N, ord, mem, arch) \ + ((*CELT_FIR_IMPL[(arch) & OPUS_ARCHMASK])(x, num, y, N, ord, mem, arch)) #endif #endif diff --git a/thirdparty/opus/celt/x86/vq_sse.h b/thirdparty/opus/celt/x86/vq_sse.h deleted file mode 100644 index b4efe8f249..0000000000 --- a/thirdparty/opus/celt/x86/vq_sse.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2016 Jean-Marc Valin */ -/* - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifndef VQ_SSE_H -#define VQ_SSE_H - -#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT) -#define OVERRIDE_OP_PVQ_SEARCH - -opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch); - -#if defined(OPUS_X86_PRESUME_SSE2) -#define op_pvq_search(x, iy, K, N, arch) \ - (op_pvq_search_sse2(x, iy, K, N, arch)) - -#else - -extern opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( - celt_norm *_X, int *iy, int K, int N, int arch); - -# define op_pvq_search(X, iy, K, N, arch) \ - ((*OP_PVQ_SEARCH_IMPL[(arch) & OPUS_ARCHMASK])(X, iy, K, N, arch)) - -#endif -#endif - -#endif diff --git a/thirdparty/opus/celt/x86/vq_sse2.c b/thirdparty/opus/celt/x86/vq_sse2.c deleted file mode 100644 index 775042860d..0000000000 --- a/thirdparty/opus/celt/x86/vq_sse2.c +++ /dev/null @@ -1,217 +0,0 @@ -/* Copyright (c) 2007-2008 CSIRO - Copyright (c) 2007-2009 Xiph.Org Foundation - Copyright (c) 2007-2016 Jean-Marc Valin */ -/* - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include <xmmintrin.h> -#include <emmintrin.h> -#include "celt_lpc.h" -#include "stack_alloc.h" -#include "mathops.h" -#include "vq.h" -#include "x86cpu.h" - - -#ifndef FIXED_POINT - -opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch) -{ - int i, j; - int pulsesLeft; - float xy, yy; - VARDECL(celt_norm, y); - VARDECL(celt_norm, X); - VARDECL(float, signy); - __m128 signmask; - __m128 sums; - __m128i fours; - SAVE_STACK; - - (void)arch; - /* All bits set to zero, except for the sign bit. */ - signmask = _mm_set_ps1(-0.f); - fours = _mm_set_epi32(4, 4, 4, 4); - ALLOC(y, N+3, celt_norm); - ALLOC(X, N+3, celt_norm); - ALLOC(signy, N+3, float); - - OPUS_COPY(X, _X, N); - X[N] = X[N+1] = X[N+2] = 0; - sums = _mm_setzero_ps(); - for (j=0;j<N;j+=4) - { - __m128 x4, s4; - x4 = _mm_loadu_ps(&X[j]); - s4 = _mm_cmplt_ps(x4, _mm_setzero_ps()); - /* Get rid of the sign */ - x4 = _mm_andnot_ps(signmask, x4); - sums = _mm_add_ps(sums, x4); - /* Clear y and iy in case we don't do the projection. */ - _mm_storeu_ps(&y[j], _mm_setzero_ps()); - _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128()); - _mm_storeu_ps(&X[j], x4); - _mm_storeu_ps(&signy[j], s4); - } - sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2))); - sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1))); - - xy = yy = 0; - - pulsesLeft = K; - - /* Do a pre-search by projecting on the pyramid */ - if (K > (N>>1)) - { - __m128i pulses_sum; - __m128 yy4, xy4; - __m128 rcp4; - opus_val32 sum = _mm_cvtss_f32(sums); - /* If X is too small, just replace it with a pulse at 0 */ - /* Prevents infinities and NaNs from causing too many pulses - to be allocated. 64 is an approximation of infinity here. */ - if (!(sum > EPSILON && sum < 64)) - { - X[0] = QCONST16(1.f,14); - j=1; do - X[j]=0; - while (++j<N); - sums = _mm_set_ps1(1.f); - } - /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */ - rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums)); - xy4 = yy4 = _mm_setzero_ps(); - pulses_sum = _mm_setzero_si128(); - for (j=0;j<N;j+=4) - { - __m128 rx4, x4, y4; - __m128i iy4; - x4 = _mm_loadu_ps(&X[j]); - rx4 = _mm_mul_ps(x4, rcp4); - iy4 = _mm_cvttps_epi32(rx4); - pulses_sum = _mm_add_epi32(pulses_sum, iy4); - _mm_storeu_si128((__m128i*)&iy[j], iy4); - y4 = _mm_cvtepi32_ps(iy4); - xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4)); - yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4)); - /* double the y[] vector so we don't have to do it in the search loop. */ - _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4)); - } - pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2))); - pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1))); - pulsesLeft -= _mm_cvtsi128_si32(pulses_sum); - xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2))); - xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1))); - xy = _mm_cvtss_f32(xy4); - yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2))); - yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1))); - yy = _mm_cvtss_f32(yy4); - } - X[N] = X[N+1] = X[N+2] = -100; - y[N] = y[N+1] = y[N+2] = 100; - celt_sig_assert(pulsesLeft>=0); - - /* This should never happen, but just in case it does (e.g. on silence) - we fill the first bin with pulses. */ - if (pulsesLeft > N+3) - { - opus_val16 tmp = (opus_val16)pulsesLeft; - yy = MAC16_16(yy, tmp, tmp); - yy = MAC16_16(yy, tmp, y[0]); - iy[0] += pulsesLeft; - pulsesLeft=0; - } - - for (i=0;i<pulsesLeft;i++) - { - int best_id; - __m128 xy4, yy4; - __m128 max, max2; - __m128i count; - __m128i pos; - /* The squared magnitude term gets added anyway, so we might as well - add it outside the loop */ - yy = ADD16(yy, 1); - xy4 = _mm_load1_ps(&xy); - yy4 = _mm_load1_ps(&yy); - max = _mm_setzero_ps(); - pos = _mm_setzero_si128(); - count = _mm_set_epi32(3, 2, 1, 0); - for (j=0;j<N;j+=4) - { - __m128 x4, y4, r4; - x4 = _mm_loadu_ps(&X[j]); - y4 = _mm_loadu_ps(&y[j]); - x4 = _mm_add_ps(x4, xy4); - y4 = _mm_add_ps(y4, yy4); - y4 = _mm_rsqrt_ps(y4); - r4 = _mm_mul_ps(x4, y4); - /* Update the index of the max. */ - pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max)))); - /* Update the max. */ - max = _mm_max_ps(max, r4); - /* Update the indices (+4) */ - count = _mm_add_epi32(count, fours); - } - /* Horizontal max */ - max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2))); - max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1))); - /* Now that max2 contains the max at all positions, look at which value(s) of the - partial max is equal to the global max. */ - pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2))); - pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos)); - pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2))); - best_id = _mm_cvtsi128_si32(pos); - - /* Updating the sums of the new pulse(s) */ - xy = ADD32(xy, EXTEND32(X[best_id])); - /* We're multiplying y[j] by two so we don't have to do it here */ - yy = ADD16(yy, y[best_id]); - - /* Only now that we've made the final choice, update y/iy */ - /* Multiplying y[j] by 2 so we don't have to do it everywhere else */ - y[best_id] += 2; - iy[best_id]++; - } - - /* Put the original sign back */ - for (j=0;j<N;j+=4) - { - __m128i y4; - __m128i s4; - y4 = _mm_loadu_si128((__m128i*)&iy[j]); - s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j])); - y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4); - _mm_storeu_si128((__m128i*)&iy[j], y4); - } - RESTORE_STACK; - return yy; -} - -#endif diff --git a/thirdparty/opus/celt/x86/x86_celt_map.c b/thirdparty/opus/celt/x86/x86_celt_map.c index d39d88edec..47ba41b9ee 100644 --- a/thirdparty/opus/celt/x86/x86_celt_map.c +++ b/thirdparty/opus/celt/x86/x86_celt_map.c @@ -33,7 +33,6 @@ #include "celt_lpc.h" #include "pitch.h" #include "pitch_sse.h" -#include "vq.h" #if defined(OPUS_HAVE_RTCD) @@ -47,6 +46,7 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])( opus_val16 *y, int N, int ord, + opus_val16 *mem, int arch ) = { celt_fir_c, /* non-sse */ @@ -151,17 +151,5 @@ void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])( #endif -#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2) -opus_val16 (*const OP_PVQ_SEARCH_IMPL[OPUS_ARCHMASK + 1])( - celt_norm *_X, int *iy, int K, int N, int arch -) = { - op_pvq_search_c, /* non-sse */ - op_pvq_search_c, - MAY_HAVE_SSE2(op_pvq_search), - MAY_HAVE_SSE2(op_pvq_search), - MAY_HAVE_SSE2(op_pvq_search) -}; -#endif - #endif #endif diff --git a/thirdparty/opus/celt/x86/x86cpu.h b/thirdparty/opus/celt/x86/x86cpu.h index 1e2bf17b9b..04fd48aac4 100644 --- a/thirdparty/opus/celt/x86/x86cpu.h +++ b/thirdparty/opus/celt/x86/x86cpu.h @@ -82,9 +82,7 @@ int opus_select_arch(void); (_mm_cvtepi8_epi32(*(__m128i *)(x))) #endif -/* similar reasoning about the instruction sequence as in the 32-bit macro above, - */ -# if defined(__clang__) || !defined(__OPTIMIZE__) +# if !defined(__OPTIMIZE__) # define OP_CVTEPI16_EPI32_M64(x) \ (_mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i *)(x)))) # else |