Diffstat (limited to 'thirdparty/libvpx/vpx_dsp/x86')
18 files changed, 0 insertions, 15330 deletions
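The first file removed below, convolve.h, exists to stamp out width-dispatch wrappers around fixed-width SIMD filter kernels. A minimal C sketch of the dispatch pattern inside the FUN_CONV_1D macro shown below (hypothetical names; the real macro pastes together kernels such as vpx_filter_block1d16_h8_sse2):

#include <stddef.h>
#include <stdint.h>

typedef void (*filter_1d_fn)(const uint8_t *src, ptrdiff_t src_pitch,
                             uint8_t *dst, ptrdiff_t dst_pitch,
                             uint32_t height, const int16_t *filter);

/* Tile the block into 16-pixel-wide columns, then finish with one 8- or
 * 4-wide column, handing each column to the matching fixed-width kernel. */
static void convolve_1d_dispatch(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h, const int16_t *filter,
                                 filter_1d_fn k16, filter_1d_fn k8,
                                 filter_1d_fn k4) {
  while (w >= 16) {
    k16(src, src_stride, dst, dst_stride, h, filter);
    src += 16;
    dst += 16;
    w -= 16;
  }
  if (w == 8) {
    k8(src, src_stride, dst, dst_stride, h, filter);
  } else if (w == 4) {
    k4(src, src_stride, dst, dst_stride, h, filter);
  }
}

FUN_CONV_2D then composes two such passes: on the 8-tap path the horizontal pass starts at src - 3 * src_stride and writes h + 7 rows into a 64-byte-stride scratch buffer, so the vertical pass (reading from fdata2 + 3 * 64) has the three rows above and four rows below the block that its taps need.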
diff --git a/thirdparty/libvpx/vpx_dsp/x86/convolve.h b/thirdparty/libvpx/vpx_dsp/x86/convolve.h deleted file mode 100644 index 7e43eb7c72..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/convolve.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#ifndef VPX_DSP_X86_CONVOLVE_H_ -#define VPX_DSP_X86_CONVOLVE_H_ - -#include <assert.h> - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -typedef void filter8_1dfunction ( - const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter -); - -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter[3] != 128); \ - assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } else { \ - while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter); \ - } \ - } \ -} - -#define FUN_CONV_2D(avg, opt) \ -void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ -} - -#if CONFIG_VP9_HIGHBITDEPTH - -typedef void highbd_filter8_1dfunction ( - const uint16_t 
*src_ptr, - const ptrdiff_t src_pitch, - uint16_t *output_ptr, - ptrdiff_t out_pitch, - unsigned int output_height, - const int16_t *filter, - int bd -); - -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \ - ptrdiff_t src_stride, \ - uint8_t *dst8, \ - ptrdiff_t dst_stride, \ - const int16_t *filter_x, \ - int x_step_q4, \ - const int16_t *filter_y, \ - int y_step_q4, \ - int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \ - src_stride, \ - dst, \ - dst_stride, \ - h, \ - filter, \ - bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ -} - -#define HIGH_FUN_CONV_2D(avg, opt) \ -void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \ - 64, dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt(src, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \ - dst, dst_stride, \ - filter_x, x_step_q4, \ - filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h, bd); \ - } \ -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -#endif // VPX_DSP_X86_CONVOLVE_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm 
b/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm deleted file mode 100644 index cd6a6ae982..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ /dev/null @@ -1,860 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pb_1: times 16 db 1 -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -dc_128: times 16 db 128 -pw2_4: times 8 dw 2 -pw2_8: times 8 dw 4 -pw2_16: times 8 dw 8 -pw2_32: times 8 dw 16 - -SECTION .text - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM sse2 -cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - DEFINE_ARGS dst, stride, temp - psrldq m1, m0, 1 - psrldq m2, m0, 2 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - - ; store 4 lines - movd [dstq ], m3 - psrlq m3, 8 - movd [dstq+strideq ], m3 - lea dstq, [dstq+strideq*2] - psrlq m3, 8 - movd [dstq ], m3 - psrlq m3, 8 - movd [dstq+strideq ], m3 - psrlq m0, 56 - movd tempq, m0 - mov [dstq+strideq+3], tempb - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movu m1, [aboveq] - pslldq m0, m1, 1 - psrldq m2, m1, 1 - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - punpckhbw m0, m0 ; 7 7 - punpcklwd m0, m0 ; 7 7 7 7 - punpckldq m0, m0 ; 7 7 7 7 7 7 7 7 - punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7 - - ; store 4 lines - psrldq m3, 1 - movq [dstq ], m3 - psrldq m3, 1 - movq [dstq+strideq ], m3 - psrldq m3, 1 - movq [dstq+strideq*2], m3 - psrldq m3, 1 - movq [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - - ; store next 4 lines - psrldq m3, 1 - movq [dstq ], m3 - psrldq m3, 1 - movq [dstq+strideq ], m3 - psrldq m3, 1 - movq [dstq+strideq*2], m3 - psrldq m3, 1 - movq [dstq+stride3q ], m3 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset - GET_GOT goffsetq - - movd m0, [leftq] ; abcd [byte] - punpcklbw m4, m0, m0 ; aabb ccdd - punpcklwd m4, m4 ; aaaa bbbb cccc dddd - psrldq m4, 12 ; dddd - punpckldq m0, m4 ; abcd dddd - psrldq m1, m0, 1 ; bcdd - psrldq m2, m0, 2 ; cddd - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d - pavgb m1, m0 ; ab, bc, cd, d [byte] - - punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d - movd [dstq ], m1 - psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d - movd [dstq+strideq], m1 - - lea dstq, [dstq+strideq*2] - psrlq m1, 16 ; cd, c3d, d, d - movd [dstq ], m1 - movd [dstq+strideq], m4 ; d, d, d, d - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - movd m2, [leftq] - movd m0, [aboveq] - pxor m1, m1 - punpckldq m0, m2 - psadbw m0, m1 - paddw m0, 
[GLOBAL(pw_4)] - psraw m0, 3 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [leftq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [aboveq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - movq m2, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - paddw m0, [GLOBAL(pw_8)] - psraw m0, 4 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movd m0, [GLOBAL(dc_128)] - movd [dstq ], m0 - movd [dstq+strideq ], m0 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq m0, [GLOBAL(dc_128)] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - 
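The predictors in this file lean on the 3-tap rounding trick documented near the top: (x + 2y + z + 2) >> 2 is computed entirely in byte lanes from two pavgb rounding averages and a 1-bit correction, with no widening to words. A standalone C check of that identity, transcribed from the comment and exhaustive over all byte triples:

#include <assert.h>
#include <stdint.h>

/* pavgb: average of two bytes, rounding up. */
static uint8_t avg_round(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

int main(void) {
  for (int x = 0; x < 256; x++) {
    for (int y = 0; y < 256; y++) {
      for (int z = 0; z < 256; z++) {
        uint8_t r = avg_round((uint8_t)x, (uint8_t)z);
        r = (uint8_t)(r - ((x ^ z) & 1)); /* psubb with the pb_1 mask */
        r = avg_round(r, (uint8_t)y);
        assert(r == ((x + 2 * y + z + 2) >> 2));
      }
    }
  }
  return 0;
}

The correction term works because pavgb rounds up: subtracting (x ^ z) & 1 turns avg(x, z) into the truncating (x + z) >> 1, after which one more rounding average against y lands exactly on (x + 2y + z + 2) >> 2.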
-INIT_XMM sse2 -cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_16)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - - -INIT_XMM sse2 -cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - - -INIT_XMM sse2 -cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - psadbw m3, m1 - psadbw m4, m1 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_32)] - psraw m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], 
m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - mova m2, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above - movd m0, [aboveq] - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m1 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left - movifnidn leftq, leftmp - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 - pshufd m1, m0, 0x1 - movd [dstq ], m0 - movd [dstq+strideq], m1 - pshufd m2, m0, 0x2 - lea dstq, [dstq+strideq*2] - pshufd m3, m0, 0x3 - movd [dstq ], m2 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -2 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] - movq m0, [leftq ] - punpcklbw m0, m0 ; 
l1 l1 l2 l2 ... l8 l8 -.loop: - pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 - pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 - movq [dstq ], m1 - movq [dstq+strideq], m2 - pshuflw m1, m0, 0xaa - pshuflw m2, m0, 0xff - movq [dstq+strideq*2], m1 - movq [dstq+stride3q ], m2 - pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 - inc lineq - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -4 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+strideq ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2], m1 - mova [dstq+stride3q ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -8 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+16 ], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16 ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2 ], m1 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x - punpcklbw m0, m1 - pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] - psrldq m0, 2 - psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] - movd m2, [leftq] - punpcklbw m2, m1 - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - pshuflw m4, m2, 0xaa - pshuflw m3, m2, 0xff - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movq m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] - pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] - DEFINE_ARGS dst, stride, line, left - mov lineq, -4 - punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] - psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] - movq m2, [leftq] - punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] -.loop - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] - punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m3 - movq [dstq ], m4 - movhps [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m2, 4 - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left - pxor m1, m1 - mova m2, [aboveq-16]; - mova m0, [aboveq] ; t1 t2 ... 
t16 [byte] - punpckhbw m2, m1 ; [127:112] tl [word] - punpckhbw m4, m0, m1 - punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] - DEFINE_ARGS dst, stride, line, left, stride8 - mov lineq, -8 - pshufhw m2, m2, 0xff - mova m3, [leftq] ; l1 l2 ... l16 [byte] - punpckhqdq m2, m2 ; tl repeated 8 times [word] - psubw m0, m2 - psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] - punpckhbw m5, m3, m1 - punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word] - lea stride8q, [strideq*8] -.loop: - pshuflw m6, m3, 0x0 - pshuflw m7, m5, 0x0 - punpcklqdq m6, m6 ; l1 repeated 8 times [word] - punpcklqdq m7, m7 ; l8 repeated 8 times [word] - paddw m1, m6, m0 - paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] - psrldq m5, 2 - packuswb m1, m6 - mova [dstq ], m1 - paddw m1, m7, m0 - paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] - psrldq m3, 2 - packuswb m1, m7 - mova [dstq+stride8q], m1 - inc lineq - lea dstq, [dstq+strideq] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - mova m4, [aboveq+16] - punpcklbw m2, m1 - punpckhbw m3, m0, m1 - punpckhbw m5, m4, m1 - punpcklbw m0, m1 - punpcklbw m4, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - punpcklqdq m2, m2 - add leftq, 32 - psubw m0, m2 - psubw m3, m2 - psubw m4, m2 - psubw m5, m2 -.loop: - movd m2, [leftq+lineq*2] - pxor m1, m1 - punpcklbw m2, m1 - pshuflw m7, m2, 0x55 - pshuflw m2, m2, 0x0 - punpcklqdq m2, m2 - punpcklqdq m7, m7 - paddw m6, m2, m3 - paddw m1, m2, m0 - packuswb m1, m6 - mova [dstq ], m1 - paddw m6, m2, m5 - paddw m1, m2, m4 - packuswb m1, m6 - mova [dstq+16 ], m1 - paddw m6, m7, m3 - paddw m1, m7, m0 - packuswb m1, m6 - mova [dstq+strideq ], m1 - paddw m6, m7, m5 - paddw m1, m7, m4 - packuswb m1, m6 - mova [dstq+strideq+16], m1 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm deleted file mode 100644 index 5e0139fa8d..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/intrapred_ssse3.asm +++ /dev/null @@ -1,871 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pb_1: times 16 db 1 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 -sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 -sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 -sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -SECTION .text - -INIT_XMM ssse3 -cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, dst8, line - lea stride3q, [strideq*3] - lea dst8q, [dstq+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m0 - pxor m2, m0 - pshufb m0, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; first 4 lines and first half of 3rd 4 lines - mov lined, 2 -.loop: - mova [dstq ], m0 - movhps [dst8q ], m0 - pshufb m0, m1 - mova [dstq +strideq ], m0 - movhps [dst8q+strideq ], m0 - pshufb m0, m1 - mova [dstq +strideq*2 ], m0 - movhps [dst8q+strideq*2 ], m0 - pshufb m0, m1 - mova [dstq +stride3q ], m0 - movhps [dst8q+stride3q ], m0 - pshufb m0, m1 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - dec lined - jnz .loop - - ; bottom-right 8x8 block - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - lea dstq, [dstq+strideq*4] - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m4, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, dst16, line - lea stride3q, [strideq*3] - lea dst16q, [dstq +strideq*8] - lea dst16q, [dst16q+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m4 - pxor m2, m4 - palignr m5, m4, m0, 1 - palignr m6, m4, m0, 2 - pshufb m4, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m4, m3 - pavgb m3, m0, m6 - pxor m0, m6 - pand m0, [GLOBAL(pb_1)] - psubb m3, m0 - pavgb m5, m3 - - ; write 4x4 lines (and the first half of the second 4x4 lines) - mov lined, 4 -.loop: - mova [dstq ], m5 - mova [dstq +16], m4 - mova [dst16q ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +strideq ], m3 - mova [dstq +strideq +16], m4 - mova [dst16q+strideq ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - mova [dstq +strideq*2 ], m5 - mova [dstq +strideq*2+16], m4 - mova [dst16q+strideq*2 ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +stride3q ], m3 - mova [dstq +stride3q +16], m4 - mova [dst16q+stride3q ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - lea dstq, [dstq +strideq*4] - lea dst16q, 
[dst16q+strideq*4] - dec lined - jnz .loop - - ; write second half of second 4x4 lines - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - - RESTORE_GOT - RET - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM ssse3 -cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - pshufb m1, m3, [GLOBAL(sh_b23456777)] - pshufb m2, m3, [GLOBAL(sh_b12345677)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movd [dstq ], m3 - movd [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m3, 1 - psrldq m4, 1 - movd [dstq ], m3 - movd [dstq+strideq], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - pshufb m3, [GLOBAL(sh_b0123456777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - lea dstq, [dstq+strideq*4] - psrldq m3, 1 - psrldq m4, 1 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, line - lea stride3q, [strideq*3] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m0, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 - pavgb m0, m3 - - mov lined, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m4 - pshufb m0, m1 - pshufb m4, m1 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m4 - pshufb m0, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m7, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, line - mova m1, [GLOBAL(sh_b123456789abcdeff)] - lea stride3q, [strideq*3] - pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m7, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 - palignr m6, m7, m0, 1 - palignr m5, m7, m0, 2 - pavgb m7, m3 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 - pavgb m0, m6 - - mov lined, 8 -.loop: - mova [dstq ], m0 - mova [dstq 
+16], m7 - mova [dstq+strideq ], m2 - mova [dstq+strideq +16], m4 - palignr m3, m7, m0, 1 - palignr m5, m4, m2, 1 - pshufb m7, m1 - pshufb m4, m1 - - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m5 - mova [dstq+stride3q +16], m4 - palignr m0, m7, m3, 1 - palignr m2, m4, m5, 1 - pshufb m7, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; l1, l2, l3, l4 - movd m1, [aboveq-1] ; tl, t1, t2, t3 - punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 - pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 - psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 - psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 - ; comments below are for a predictor like this - ; A1 B1 C1 D1 - ; A2 B2 A1 B1 - ; A3 B3 A2 B2 - ; A4 B4 A3 B3 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 - pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 - - punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+stride3q ], m3 - psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq*2], m3 - psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq ], m3 - psrldq m3, 2 ; A1 B1 C1 D1 .. - movd [dstq ], m3 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - movq m0, [leftq] ; [0- 7] l1-8 [byte] - movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] - pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] - pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] - pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] - pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] - psrldq m4, m0, 1 ; t1-7 [word] - psrldq m5, m0, 2 ; t2-7 [word] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 - ; A2 B2 A1 B1 C1 D1 E1 F1 - ; A3 B3 A2 B2 A1 B1 C1 D1 - ; A4 B4 A3 B3 A2 B2 A1 B1 - ; A5 B5 A4 B4 A3 B3 A2 B2 - ; A6 B6 A5 B5 A4 B4 A3 B3 - ; A7 B7 A6 B6 A5 B5 A4 B4 - ; A8 B8 A7 B7 A6 B6 A5 B5 - pavgb m6, m1, m2 ; 2-tap avg A8-A1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 - - punpcklbw m6, m0 ; A-B8, A-B7 ... 
A-B2, A-B1 - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - - movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 - palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 - movq [dstq+strideq*2], m0 - psrldq m0, 2 ; A-B2, A-B1, C-H1 - movq [dstq+strideq ], m0 - psrldq m0, 2 ; A-H1 - movq [dstq ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 - psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 - movq [dstq+strideq*2], m6 - psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 - movq [dstq+strideq ], m6 - psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 - movq [dstq ], m6 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 - ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 - ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 - ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 - ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 - ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 - ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 - ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 - ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 - ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 - ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 - ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 - ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 - ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 - ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 - ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 - pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m6, 15 - palignr m3, m0, m6, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] - pavgb m5, m0 ; A1 - Ag - - punpcklbw m0, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... 
A-Bg - - pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] - pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 - - pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - palignr m2, m1, m6, 14 - mova [dstq ], m2 - palignr m2, m1, m6, 12 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 10 - mova [dstq+strideq*2], m2 - palignr m2, m1, m6, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m1, m6, 6 - mova [dstq ], m2 - palignr m2, m1, m6, 4 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 2 - mova [dstq+strideq*2], m2 - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - mova [dstq+stride3q ], m6 - lea dstq, [dstq+strideq*4] - - palignr m2, m6, m4, 14 - mova [dstq ], m2 - palignr m2, m6, m4, 12 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 10 - mova [dstq+strideq*2], m2 - palignr m2, m6, m4, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m6, m4, 6 - mova [dstq ], m2 - palignr m2, m6, m4, 4 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 2 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - movu m1, [aboveq+15] - - pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] - - palignr m3, m1, m7, 1 - palignr m5, m1, m7, 2 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] - - pshufb m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m7, 15 - palignr m3, m0, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pavgb m5, m0 ; A1 - Ag - punpcklbw m6, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... 
A-Bg - pshufb m6, [GLOBAL(sh_bfedcba9876543210)] - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - - DEFINE_ARGS dst, stride, stride3, left, line - lea stride3q, [strideq*3] - - palignr m5, m2, m1, 14 - palignr m7, m1, m6, 14 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 12 - palignr m7, m1, m6, 12 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 10 - palignr m7, m1, m6, 10 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - palignr m5, m2, m1, 8 - palignr m7, m1, m6, 8 - mova [dstq+stride3q ], m7 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m2, m1, 6 - palignr m7, m1, m6, 6 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 4 - palignr m7, m1, m6, 4 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 2 - palignr m7, m1, m6, 2 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m6 - mova [dstq+stride3q+16 ], m1 - lea dstq, [dstq+strideq*4] - - palignr m5, m1, m6, 14 - palignr m3, m6, m4, 14 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 12 - palignr m3, m6, m4, 12 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 10 - palignr m3, m6, m4, 10 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - palignr m5, m1, m6, 8 - palignr m3, m6, m4, 8 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m1, m6, 6 - palignr m3, m6, m4, 6 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 4 - palignr m3, m6, m4, 4 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 2 - palignr m3, m6, m4, 2 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m4 - mova [dstq+stride3q+16 ], m6 - lea dstq, [dstq+strideq*4] - - mova m7, [leftq] - mova m3, [leftq+16] - palignr m5, m3, m7, 15 - palignr m0, m3, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - - pavgb m5, m3 ; Ah - - punpcklbw m3, m2, m5 ; A-B8 ... A-B1 - punpckhbw m2, m5 ; A-B9 ... 
A-Bg - pshufb m3, [GLOBAL(sh_bfedcba9876543210)] - pshufb m2, [GLOBAL(sh_bfedcba9876543210)] - - palignr m7, m6, m4, 14 - palignr m0, m4, m3, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 12 - palignr m0, m4, m3, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 10 - palignr m0, m4, m3, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m6, m4, 8 - palignr m0, m4, m3, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m6, m4, 6 - palignr m0, m4, m3, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 4 - palignr m0, m4, m3, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 2 - palignr m0, m4, m3, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m4 - lea dstq, [dstq+strideq*4] - - palignr m7, m4, m3, 14 - palignr m0, m3, m2, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 12 - palignr m0, m3, m2, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 10 - palignr m0, m3, m2, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m4, m3, 8 - palignr m0, m3, m2, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m4, m3, 6 - palignr m0, m3, m2, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 4 - palignr m0, m3, m2, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 2 - palignr m0, m3, m2, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m3 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset - GET_GOT goffsetq - movq m3, [leftq] ; abcdefgh [byte] - lea stride3q, [strideq*3] - - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 - pavgb m0, m2 - punpcklbw m0, m3 ; interleaved output - - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh - psrldq m0, 2 - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m0, [leftq] ; abcdefghijklmnop [byte] - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - pavgb m1, m0 ; ab, bc, cd .. 
no, op, pp [byte] - - punpckhbw m4, m1, m3 ; interleaved input - punpcklbw m1, m3 ; interleaved output - mova [dstq ], m1 - palignr m3, m4, m1, 2 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 4 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 6 - mova [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - palignr m3, m4, m1, 8 - mova [dstq ], m3 - palignr m3, m4, m1, 10 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 12 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 14 - mova [dstq+stride3q ], m3 - DEFINE_ARGS dst, stride, stride3, line - mov lined, 2 - mova m0, [GLOBAL(sh_b23456789abcdefff)] -.loop: - lea dstq, [dstq+strideq*4] - mova [dstq ], m4 - pshufb m4, m0 - mova [dstq+strideq ], m4 - pshufb m4, m0 - mova [dstq+strideq*2], m4 - pshufb m4, m0 - mova [dstq+stride3q ], m4 - pshufb m4, m0 - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m1, [leftq] ; 0-15 [byte] - mova m2, [leftq+16] ; 16-31 [byte] - pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] - pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 - palignr m6, m2, m1, 1 - palignr m5, m2, m1, 2 - pavgb m2, m4 ; high 16px even lines - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 - pavgb m1, m6 ; low 16px even lines - - punpckhbw m6, m1, m0 ; interleaved output 2 - punpcklbw m1, m0 ; interleaved output 1 - - punpckhbw m7, m2, m3 ; interleaved output 4 - punpcklbw m2, m3 ; interleaved output 3 - - ; output 1st 8 lines (and half of 2nd 8 lines) - DEFINE_ARGS dst, stride, stride3, dst8 - lea dst8q, [dstq+strideq*8] - mova [dstq ], m1 - mova [dstq +16], m6 - mova [dst8q ], m6 - palignr m0, m6, m1, 2 - palignr m4, m2, m6, 2 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 4 - palignr m4, m2, m6, 4 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 6 - palignr m4, m2, m6, 6 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m0, m6, m1, 8 - palignr m4, m2, m6, 8 - mova [dstq ], m0 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m0, m6, m1, 10 - palignr m4, m2, m6, 10 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 12 - palignr m4, m2, m6, 12 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 14 - palignr m4, m2, m6, 14 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines - mova [dstq +16], m2 - mova [dst8q ], m2 - palignr m4, m7, m2, 2 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 4 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 6 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m4, m7, m2, 8 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m4, m7, m2, 10 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 12 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 14 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea 
dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 3rd 8 lines and half of 4th 8 lines - mova m0, [GLOBAL(sh_b23456789abcdefff)] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - - ; output last half of 4th 8 lines - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - lea dstq, [dstq+strideq*4] - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - - ; done! - RESTORE_GOT - RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c deleted file mode 100644 index df5068c624..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ /dev/null @@ -1,4046 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -#define RECON_AND_STORE4X4(dest, in_x) \ -{ \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ -} - -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. 
- input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); -} - -void idct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - 
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - - transpose_4x4(in); - // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); - - // stage 2 - in[0] = _mm_add_epi16(u[0], u[1]); - in[1] = _mm_sub_epi16(u[0], u[1]); - in[1] = _mm_shuffle_epi32(in[1], 0x4E); -} - -void iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); -} - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, 
in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ - out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. 
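In scalar terms, each output lane of the macro below is one butterfly rotation; a minimal sketch, reusing the file's DCT_CONST_ROUNDING and DCT_CONST_BITS constants (the name butterfly_lane is hypothetical, not a libvpx symbol):

    /* _mm_madd_epi16 on an interleaved (x, y) pair yields x * c0 + y * c1
       as a 32-bit value; adding DCT_CONST_ROUNDING and shifting right by
       DCT_CONST_BITS returns it to 16-bit scale. The SIMD code saturates
       through _mm_packs_epi32, where this sketch simply casts. */
    static int16_t butterfly_lane(int16_t x, int16_t y, int16_t c0, int16_t c1) {
      const int32_t t = x * c0 + y * c1 + DCT_CONST_ROUNDING;
      return (int16_t)(t >> DCT_CONST_BITS);
    }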
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ - cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_4, \ - stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_0, \ - stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, 
stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } - -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. 
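The load_input_data() calls that follow are defined in inv_txfm_sse2.h rather than in this file; a sketch of what such a loader has to do, assuming tran_low_t widens to 32 bits only under CONFIG_VP9_HIGHBITDEPTH (its convention elsewhere in libvpx):

    static INLINE __m128i load_input_data_sketch(const tran_low_t *data) {
    #if CONFIG_VP9_HIGHBITDEPTH
      /* 32-bit coefficients: pack eight of them into eight 16-bit lanes. */
      return _mm_packs_epi32(_mm_load_si128((const __m128i *)data),
                             _mm_load_si128((const __m128i *)(data + 4)));
    #else
      /* 16-bit coefficients: one aligned 128-bit load covers a whole row. */
      return _mm_load_si128((const __m128i *)data);
    #endif
    }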
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in0, in1, in2, in3, in4, in5, in6, in7); - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); -} - -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], - in0, in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, - in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); -} - -void 
iadst8_sse2(__m128i *in) { - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 
= _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit integers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
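Only rows 0..3 are loaded below, and TRANSPOSE_8X8_10 keeps just their low four lanes, so this _12 path relies on every nonzero coefficient lying in the top-left 4x4 quadrant of the 8x8 block. Stated as a scalar precondition (a sketch of the assumption, not code from the file):

    /* Assumed by vpx_idct8x8_12_add_sse2: with at most 12 nonzero
       coefficients, input[r * 8 + c] == 0 whenever r >= 4 || c >= 4. */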
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, - in0, in1, in2, in3, in4, in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); 
- RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ - stg2_0, stg2_1, stg2_2, stg2_3, \ - stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ - stg2_4, stg2_5, stg2_6, stg2_7, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ - stg3_0, stg3_1, stg3_2, stg3_3, \ - stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ - stg4_0, stg4_1, stg4_2, stg4_3, \ - stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - 
tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ - stg2_0, stg2_1, stg2_6, stg2_7, \ - stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ - stg3_0, stg3_1, \ - stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ - stg4_0, stg4_1, \ - stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ - stg4_4, stg4_5, stg4_6, stg4_7, \ - stp2_9, stp2_14, stp2_10, stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, 
DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, \ - stp2_10, stp2_13, stp2_11, stp2_12) \ - } - -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - curr1 = 
l; - for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { - int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); - dest += stride; - } -} - -static void iadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - 
const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = 
_mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = 
_mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] 
= _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], 
k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], 
s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i 
k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], 
DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] 
= _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - 
t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); -} - -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); -} - -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = 
pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = 
_mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. 
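- // Butterfly: l[0..7] below are sums of mirrored stage-6 terms and l[8..15] are the matching differences.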
- l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } \ - -#define IDCT32_34 \ -/* Stage1 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ - stg1_1, stp1_16, stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ - stg1_7, stp1_19, stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ - stg1_9, stp1_20, stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ - stg1_15, stp1_23, stp1_24); \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ - stg2_1, stp2_8, stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ - stg2_7, stp2_11, stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_4_28 = 
_mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ - stg3_1, stp1_4, stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i zero = _mm_setzero_si128();\ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ - stg4_1, stp2_0, stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = 
_mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - 
const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - - -#define IDCT32 \ -/* Stage1 */ \ -{ \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ - stp1_17, stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ - stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ - stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ -} \ -\ -/* Stage2 */ \ -{ \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - 
const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ - stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ -} \ -\ -/* Stage3 */ \ -{ \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ - stp1_18, stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ - stp1_22, stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ -} \ -\ -/* Stage4 */ \ -{ \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const 
__m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ - stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ - stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ - stp2_10, stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ -} \ -\ -/* Stage5 */ \ -{ \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ - 
stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} \ -\ -/* Stage6 */ \ -{ \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ - stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ - stp2_13, stp2_11, stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ -} \ -\ -/* Stage7 */ \ -{ \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, 
hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ -} - -// Only the upper-left 8x8 block has non-zero coefficients. -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - for (i = 8; i < 32; ++i) { - in[i] = _mm_setzero_si128(); - } - - array_transpose_8x8(in, in); - // TODO(hkuang): The following transposes are unnecessary, but removing them - // leads to a performance drop on some devices.
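- // NOTE: in[8] .. in[31] were zeroed above, so the three transposes below only shuffle zero vectors.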
- array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - const __m128i zero = _mm_setzero_si128(); - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
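- // Final butterfly: in[0..15] are sums and in[16..31] are differences of mirrored stp1 terms.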
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = 
pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, - stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, - stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, - stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, - stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. 
- LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] 
= _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32 - - // 2_D: Calculate the results and store them to destination. 
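(In the first pass above, the zero_idx OR-tree folds all 32 coefficient vectors of an 8x32 strip into a single register so that one compare-and-movemask decides whether the whole strip is zero and its IDCT can be skipped. An equivalent loop form of that test, as a sketch only; the helper name is not from libvpx:)

#include <emmintrin.h>

/* Returns non-zero iff every vector of the 8x32 strip is all zero. */
static int strip_is_all_zero(const __m128i in[32]) {
  __m128i acc = in[0];
  int k;
  for (k = 1; k < 32; ++k)
    acc = _mm_or_si128(acc, in[k]);
  /* All 16 bytes compare equal to zero -> movemask yields 0xFFFF. */
  return _mm_movemask_epi8(_mm_cmpeq_epi32(acc, _mm_setzero_si128())) ==
         0xFFFF;
}
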
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); - - for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = 
_mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 
bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], 
inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - 
min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = 
_mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i ] = _mm_add_epi16(inptr[i ], rounding); - inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8)); - inptr[i ] = _mm_srai_epi16(inptr[i ], 6); - inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} 
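(Each highbd wrapper above follows one pattern: pack the 32-bit coefficients to 16 bits, bound-check every lane (+/-12043 for 4x4, +/-6201 for 8x8, +/-3155 for 16x16), and take the fast 16-bit SSE2 path only when no lane can overflow it, falling back to the C transform otherwise. A loop form of that range test, as a sketch; the helper name is illustrative:)

#include <emmintrin.h>

/* Non-zero when any 16-bit lane lies outside [-bound, +bound]. */
static int rows_exceed_bound(const __m128i *rows, int n, int16_t bound) {
  const __m128i hi_bound = _mm_set1_epi16(bound);
  const __m128i lo_bound = _mm_set1_epi16((int16_t)-bound);
  __m128i hi = rows[0], lo = rows[0];
  int k;
  for (k = 1; k < n; ++k) {
    hi = _mm_max_epi16(hi, rows[k]);
    lo = _mm_min_epi16(lo, rows[k]);
  }
  hi = _mm_cmpgt_epi16(hi, hi_bound);  /* lanes above +bound */
  lo = _mm_cmplt_epi16(lo, lo_bound);  /* lanes below -bound */
  return _mm_movemask_epi8(_mm_or_si128(hi, lo));
}
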
-#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h deleted file mode 100644 index bd520c18e5..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_INV_TXFM_SSE2_H_ - -#include <emmintrin.h> // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/inv_txfm.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - 
array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -// Function to allow 8 bit optimisations to be used when profile 0 is used with -// highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { -#if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); -#else - return _mm_load_si128((const __m128i *)data); -#endif -} - -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); -} - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - 
RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); -} - -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); - -#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm deleted file mode 100644 index 20baf820f6..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,1793 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 - -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 -pw_m8423x2: times 8 dw -8423*2 -pw_m9102x2: times 8 dw -9102*2 -pw_m10394x2: times 8 dw -10394*2 -pw_m11003x2: times 8 dw -11003*2 - -pw_16364x2: times 8 dw 16364*2 -pw_16305x2: times 8 dw 16305*2 -pw_16207x2: times 8 dw 16207*2 -pw_16069x2: times 8 dw 16069*2 -pw_15893x2: times 8 dw 15893*2 -pw_15679x2: times 8 dw 15679*2 -pw_15426x2: times 8 dw 15426*2 -pw_15137x2: times 8 dw 15137*2 -pw_14811x2: times 8 dw 14811*2 -pw_14449x2: times 8 dw 14449*2 -pw_14053x2: times 8 dw 14053*2 -pw_13623x2: times 8 dw 13623*2 -pw_13160x2: times 8 dw 13160*2 -pw_12665x2: times 8 dw 12665*2 -pw_12140x2: times 8 dw 12140*2 -pw__9760x2: times 8 dw 9760*2 -pw__7723x2: times 8 dw 7723*2 -pw__7005x2: times 8 dw 7005*2 -pw__6270x2: times 8 dw 6270*2 -pw__3981x2: times 8 dw 3981*2 -pw__3196x2: times 8 dw 3196*2 -pw__1606x2: times 8 dw 1606*2 -pw___804x2: times 8 dw 804*2 - -pd_8192: times 4 dd 8192 -pw_32: times 8 dw 32 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -; constants for 32x32_34 -TRANSFORM_COEFFS 804, 16364 -TRANSFORM_COEFFS 15426, 5520 -TRANSFORM_COEFFS 3981, 15893 -TRANSFORM_COEFFS 16207, 2404 -TRANSFORM_COEFFS 1606, 16305 -TRANSFORM_COEFFS 15679, 4756 -TRANSFORM_COEFFS 11585, 11585 - -; constants for 32x32_1024 -TRANSFORM_COEFFS 12140, 11003 -TRANSFORM_COEFFS 7005, 14811 -TRANSFORM_COEFFS 14053, 8423 -TRANSFORM_COEFFS 9760, 13160 -TRANSFORM_COEFFS 12665, 10394 -TRANSFORM_COEFFS 7723, 14449 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, 
-%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 -%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] - mova m4, [inputq + 128] - packssdw m4, [inputq + 144] - mova m5, [inputq + 160] - packssdw m5, [inputq + 176] - mova m6, [inputq + 192] - packssdw m6, [inputq + 208] - mova m7, [inputq + 224] - packssdw m7, [inputq + 240] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova 
m7, [inputq + 112] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] -%endif - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 - - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%define idx0 16 * 0 -%define idx1 16 * 1 -%define idx2 16 * 2 -%define idx3 16 * 3 -%define idx4 16 * 4 -%define idx5 16 * 5 -%define idx6 16 * 6 -%define idx7 16 * 7 -%define idx8 16 * 0 -%define idx9 16 * 1 -%define idx10 16 * 2 -%define idx11 16 * 3 -%define idx12 16 * 4 -%define idx13 16 * 5 -%define idx14 16 * 6 -%define idx15 16 * 7 -%define idx16 16 * 0 -%define idx17 16 * 1 -%define idx18 16 * 2 -%define idx19 16 * 3 -%define idx20 16 * 4 -%define idx21 16 * 5 -%define idx22 16 * 6 -%define idx23 16 * 7 -%define idx24 16 * 0 -%define idx25 16 * 1 -%define idx26 16 * 2 -%define idx27 16 * 3 -%define idx28 16 * 4 -%define idx29 16 * 5 -%define idx30 16 * 6 -%define idx31 16 * 7 - -; FROM idct32x32_add_neon.asm -; -; Instead of doing the transforms stage by stage, it is done by loading -; some input values and 
doing as many stages as possible to minimize the -; storing/loading of intermediate results. To fit within registers, the -; final coefficients are cut into four blocks: -; BLOCK A: 16-19,28-31 -; BLOCK B: 20-23,24-27 -; BLOCK C: 8-11,12-15 -; BLOCK D: 0-3,4-7 -; Blocks A and C are straight calculation through the various stages. In -; block B, further calculations are performed using the results from -; block A. In block D, further calculations are performed using the results -; from block C and then the final calculations are done using results from -; block A and B which have been combined at the end of block B. -; - -%macro IDCT32X32_34 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - mova [r4 + 0], m0 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - mova [r4 + 16 * 2], m2 - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - mova [r4 + 16 * 4], m4 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - mova [r4 + 16 * 6], m6 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, m1 ; stp1_16 - mova m0, m11 ; stp1_31 - mova m4, m7 ; stp1_28 - mova m15, m12 ; stp1_19 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m15 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - mova [stp + %4 + idx30], m2 - mova m2, m3 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - mova [stp + %4 + idx31], m11 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m13, m5 ; stp1_20 - mova m14, m6 ; stp1_27 - mova m15, m3 ; stp1_23 - mova m11, m2 ; stp1_24 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 - SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m10, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - 
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 10, 11, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m10 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 11, 15, 9 - pmulhrsw m11, m10 ; stp1_25 - pmulhrsw m15, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_24 - pmulhrsw m3, m10 ; stp1_23 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 11, 15 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m11 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m6, [rsp + transposed_in + 16 * 6] - - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - mova [stp + %3 + idx22], m15 - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - mova [stp + %3 + idx23], m3 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m3, m0 ; stp1_8 - mova m2, m1 ; stp1_15 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - mova m4, m7 ; stp1_11 - mova m5, m6 ; stp1_12 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in 
+ 16 * 0] - mova m10, [pw_11585x2] - pmulhrsw m0, m10 ; stp1_1 - - mova m14, m11 ; stp1_4 - mova m13, m12 ; stp1_7 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m7, m0 ; stp1_0 = stp1_1 - mova m4, m0 ; stp1_1 - mova m2, m7 ; stp1_0 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 7, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 4, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 1, 9 ; stp1_0, stp1_15 - SUM_SUB 7, 3, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 4, 6, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m15, [stp + %4 + idx30] - mova m10, [stp + %4 + idx31] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 7, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m7 - mova [stp + %4 + idx30], m15 - mova [stp + %4 + idx31], m10 - mova m7, [stp + %4 + idx28] - mova m0, [stp + %4 + idx29] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 4, 7, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m4 - mova [stp + %4 + idx28], m7 - mova [stp + %4 + idx29], m0 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m4, [stp + %3 + idx19] - SUM_SUB 1, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 3, 7, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 6, 4, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m4 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m0, [stp + %4 + idx27] - mova m1, [stp + %4 + idx26] - mova m2, [stp + %4 + idx25] - mova m3, [stp + %4 + idx24] - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - mova [stp + %4 + idx27], m0 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx24], m3 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -%macro RECON_AND_STORE 1 - mova m11, [pw_32] - lea stp, [rsp + %1] - mov r6, 32 - pxor m8, m8 -%%recon_and_store: - mova m0, [stp + 16 * 32 * 0] - mova m1, [stp + 16 * 32 * 1] 
- mova m2, [stp + 16 * 32 * 2] - mova m3, [stp + 16 * 32 * 3] - add stp, 16 - - paddw m0, m11 - paddw m1, m11 - paddw m2, m11 - paddw m3, m11 - psraw m0, 6 - psraw m1, 6 - psraw m2, 6 - psraw m3, 6 - movh m4, [outputq + 0] - movh m5, [outputq + 8] - movh m6, [outputq + 16] - movh m7, [outputq + 24] - punpcklbw m4, m8 - punpcklbw m5, m8 - punpcklbw m6, m8 - punpcklbw m7, m8 - paddw m0, m4 - paddw m1, m5 - paddw m2, m6 - paddw m3, m7 - packuswb m0, m1 - packuswb m2, m3 - mova [outputq + 0], m0 - mova [outputq + 16], m2 - lea outputq, [outputq + strideq] - dec r6 - jnz %%recon_and_store -%endmacro - -%define i32x32_size 16*32*5 -%define pass_two_start 16*32*0 -%define transposed_in 16*32*4 -%define pass_one_start 16*32*0 -%define stp r8 - -INIT_XMM ssse3 -cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - lea stp, [rsp + pass_one_start] - -idct32x32_34: - mov r3, inputq - lea r4, [rsp + transposed_in] - -idct32x32_34_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_34_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - -idct32x32_34_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_34_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_135 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, m3 - pmulhrsw m3, [pw__7005x2] ; stp1_18 - pmulhrsw m4, [pw_14811x2] ; stp2_29 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, m0 - pmulhrsw m0, [pw_12140x2] ; stp1_30 - pmulhrsw m2, [pw_m11003x2] ; stp2_17 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 
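Before the macro body continues, it is worth pinning down what the RECON_AND_STORE loop defined above computes per pixel: the inverse-transform output is rounded by the second-pass shift (add 32, arithmetic shift right by 6), added to the predictor already sitting in the destination, and saturated to 8 bits by packuswb. A scalar sketch, with clip_pixel being my name for the clamp:

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Scalar equivalent of RECON_AND_STORE over a 32x32 block. */
    static void recon_and_store(uint8_t *dst, ptrdiff_t stride,
                                const int16_t *resid) {
      for (int r = 0; r < 32; ++r) {
        for (int c = 0; c < 32; ++c)                 /* pw_32, psraw 6 */
          dst[c] = clip_pixel(dst[c] + ((resid[c] + 32) >> 6));
        dst += stride;
        resid += 32;
      }
    }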
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, m2 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - - mova m14, [rsp + transposed_in + 16 * 11] - mova m13, m14 - pmulhrsw m13, [pw_m8423x2] ; stp1_21 - pmulhrsw m14, [pw_14053x2] ; stp2_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, m0 - pmulhrsw m0, [pw__9760x2] ; stp1_22 - pmulhrsw m1, [pw_13160x2] ; stp2_25 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - 
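The stage-1 blocks of this 135-coefficient macro (BLOCK A above and BLOCK B here) exploit the sparsity of the coefficient block: the partner input of each opening butterfly is known to be zero, so the rotation degenerates into two independent multiplies, done with pmulhrsw against tables that store the coefficient doubled. A scalar sketch of the identity, with my own helper name:

    #include <stdint.h>

    /* pmulhrsw x, [pw_cx2]  ==  ROUND(x * c / 2^14), because
     * (x * 2c + 2^14) >> 15  ==  (x * c + 2^13) >> 14.           */
    static int16_t mulhrs_x2(int16_t x, int16_t c_times_2) {
      return (int16_t)(((int32_t)x * c_times_2 + (1 << 14)) >> 15);
    }

Read that way, pmulhrsw m1, [pw___804x2] above is mulhrs_x2(in1, 2 * 804), its companion uses 2 * 16364, and the pw_m*x2 tables are the same trick with negated coefficients.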
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, m4 - pmulhrsw m4, [pw__7723x2] ; stp1_10 - pmulhrsw m5, [pw_14449x2] ; stp2_13 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, m2 - pmulhrsw m3, [pw_m10394x2] ; stp1_9 - pmulhrsw m2, [pw_12665x2] ; stp2_14 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, m13 - pmulhrsw m13, [pw_13623x2] ; stp1_6 - pmulhrsw m14, [pw_m9102x2] ; stp1_5 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m2, [rsp + transposed_in + 16 * 8] - pmulhrsw m0, [pw_11585x2] ; stp1_1 - mova m3, m2 - pmulhrsw m2, [pw__6270x2] ; stp1_2 - pmulhrsw m3, [pw_15137x2] ; stp1_3 - - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D 
STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m1, m0 ; stp1_0 = stp1_1 - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 2 - lea stp, 
[rsp + pass_one_start] - -idct32x32_135: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 2 - -idct32x32_135_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose - - IDCT32X32_135 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_135 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_135_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 2 - -idct32x32_135_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose_2 - - IDCT32X32_135 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_135_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_1024 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, [rsp + transposed_in + 16 * 31] - BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, [rsp + transposed_in + 16 * 17] - BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, [rsp + transposed_in + 16 * 25] - BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, [rsp + transposed_in + 16 * 23] - BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - 
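The driver code in this stretch is the shape shared by all three variants: pass one walks the transposed input in strips of eight rows (two strips here, four in the full 1024-coefficient version below), writes the 1-D result to the stack, then pass two transposes 8x8 tiles of that intermediate and runs the same macro again before RECON_AND_STORE. Under CONFIG_VP9_HIGHBITDEPTH the coefficients arrive as 32-bit values and are narrowed with signed saturation (packssdw) during the load/transpose. Scalar counterparts of those two building blocks, as a sketch:

    #include <stdint.h>

    /* packssdw, one lane: int32 -> int16 with signed saturation. */
    static int16_t narrow_sat(int32_t v) {
      return (int16_t)(v > 32767 ? 32767 : v < -32768 ? -32768 : v);
    }

    /* Scalar counterpart of TRANSPOSE8X8 (in place, square tile). */
    static void transpose_8x8(int16_t m[8][8]) {
      for (int i = 0; i < 8; ++i)
        for (int j = i + 1; j < 8; ++j) {
          int16_t t = m[i][j];
          m[i][j] = m[j][i];
          m[j][i] = t;
        }
    }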
SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, [rsp + transposed_in + 16 * 27] - BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 - - mova m13, [rsp + transposed_in + 16 * 21] - mova m14, [rsp + transposed_in + 16 * 11] - BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, [rsp + transposed_in + 16 * 19] - BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 - - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, [rsp + transposed_in + 16 * 29] - BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, 
stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, [rsp + transposed_in + 16 * 30] - BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, [rsp + transposed_in + 16 * 18] - BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, [rsp + transposed_in + 16 * 22] - BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, [rsp + transposed_in + 16 * 26] - BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, [rsp + transposed_in + 16 * 28] - BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, [rsp + transposed_in + 16 * 20] - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m1, [rsp + transposed_in + 16 * 16] - -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 
0, 1, 9 - pmulhrsw m0, m10 ; stp1_1 - pmulhrsw m1, m10 ; stp1_0 -%else - BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0 - SWAP 0, 1 -%endif - mova m2, [rsp + transposed_in + 16 * 8] - mova m3, [rsp + transposed_in + 16 * 24] - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 - - mova m10, [pw_11585x2] - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - 
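Where two coefficients are live, the workhorse of this 1024-coefficient version is BUTTERFLY_4X r0, r1, c0, c1 with m8 holding pd_8192: a fixed-point planar rotation with round-to-nearest at 14 bits. A scalar model, the argument mapping being my reading of the call sites:

    #include <stdint.h>

    static int16_t dct_round_shift(int32_t x) {
      return (int16_t)((x + 8192) >> 14);       /* the pd_8192 in m8 */
    }

    static void butterfly(int16_t *x, int16_t *y, int c0, int c1) {
      int32_t t0 = (int32_t)*x * c0 - (int32_t)*y * c1;
      int32_t t1 = (int32_t)*x * c1 + (int32_t)*y * c0;
      *x = dct_round_shift(t0);
      *y = dct_round_shift(t1);
    }

Read that way, BUTTERFLY_4X 1, 11, 804, 16364 earlier in this macro is butterfly(&step[16], &step[31], 804, 16364), 804 and 16364 being the usual cospi_31_64 and cospi_1_64 table values; BUTTERFLY_4Xmm is, judging by its call sites, a mirrored variant of the same rotation.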
mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 4 - lea stp, [rsp + pass_one_start] - -idct32x32_1024: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 4 - -idct32x32_1024_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose - - IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 - - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_1024 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_1024_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 4 - -idct32x32_1024_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose_2 - - IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_1024_2 - - RECON_AND_STORE pass_two_start - - RET -%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm deleted file mode 100644 index fbbcd76bd7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/inv_wht_sse2.asm +++ /dev/null @@ -1,109 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
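The file removed next, inv_wht_sse2.asm, is the 4x4 inverse Walsh-Hadamard transform used for lossless coding. Its TRANSFORM_COLS macro is a lifting scheme applied twice (columns, transpose, columns again), bracketed by the psraw ..., 2 pre-scale and the ADD_STORE_4P_2X reconstruction. One column in scalar form, following the reference C routine (iwht4x4_16_add) as I understand it:

    #include <stdint.h>

    /* v = {a, b, c, d}, i.e. after REORDER_INPUTS fixes the order. */
    static void iwht4_col(int16_t v[4]) {
      int a = v[0], b = v[1], c = v[2], d = v[3], e;
      a += c;               /* paddw m0, m2                     */
      d -= b;               /* psubw m3, m1                     */
      e = (a - d) >> 1;     /* the "wide subtract" path         */
      b = e - b;
      c = e - c;
      a -= b;
      d += c;
      v[0] = (int16_t)a; v[1] = (int16_t)b;
      v[2] = (int16_t)c; v[3] = (int16_t)d;
    }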
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro REORDER_INPUTS 0 - ; a c d b to a b c d - SWAP 1, 3, 2 -%endmacro - -%macro TRANSFORM_COLS 0 - ; input: - ; m0 a - ; m1 b - ; m2 c - ; m3 d - paddw m0, m2 - psubw m3, m1 - - ; wide subtract - punpcklwd m4, m0 - punpcklwd m5, m3 - psrad m4, 16 - psrad m5, 16 - psubd m4, m5 - psrad m4, 1 - packssdw m4, m4 ; e - - psubw m5, m4, m1 ; b - psubw m4, m2 ; c - psubw m0, m5 - paddw m3, m4 - ; m0 a - SWAP 1, 5 ; m1 b - SWAP 2, 4 ; m2 c - ; m3 d -%endmacro - -%macro TRANSPOSE_4X4 0 - punpcklwd m0, m2 - punpcklwd m1, m3 - mova m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 -%macro TRANSPOSE_4X4_WIDE 0 - mova m3, m0 - punpcklwd m0, m1 - punpckhwd m3, m1 - mova m2, m0 - punpcklwd m0, m3 - punpckhwd m2, m3 - pshufd m1, m0, 0x0e - pshufd m3, m2, 0x0e -%endmacro - -%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero - movd m%3, [outputq] - movd m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%1, m%3 - paddw m%2, m%4 - packuswb m%1, m%5 - packuswb m%2, m%5 - movd [outputq], m%1 - movd [outputq + strideq], m%2 -%endmacro - -INIT_XMM sse2 -cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif - psraw m0, 2 - psraw m1, 2 - - TRANSPOSE_4X4_WIDE - REORDER_INPUTS - TRANSFORM_COLS - TRANSPOSE_4X4 - REORDER_INPUTS - TRANSFORM_COLS - - pxor m4, m4 - ADD_STORE_4P_2X 0, 1, 5, 6, 4 - lea outputq, [outputq + 2 * strideq] - ADD_STORE_4P_2X 2, 3, 5, 6, 4 - - RET diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c deleted file mode 100644 index be1087c1e9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ /dev/null @@ -1,979 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
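The AVX2 loop-filter file removed below builds its per-edge activity mask entirely out of saturating byte operations: |a - b| is formed as subs_epu8(a, b) | subs_epu8(b, a), and every comparison is folded into a running max that is finally tested against limit. Spelled out as scalar logic (this mirrors the conditions stated in the in-code comments and matches the C reference mask in vpx_dsp, to the best of my knowledge):

    #include <stdint.h>
    #include <stdlib.h>

    static int8_t filter_mask(uint8_t limit, uint8_t blimit,
                              uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
      int8_t mask = 0;
      mask |= (abs(p3 - p2) > limit) * -1;
      mask |= (abs(p2 - p1) > limit) * -1;
      mask |= (abs(p1 - p0) > limit) * -1;
      mask |= (abs(q1 - q0) > limit) * -1;
      mask |= (abs(q2 - q1) > limit) * -1;
      mask |= (abs(q3 - q2) > limit) * -1;
      mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      return (int8_t)~mask;   /* all-ones where the edge may be filtered */
    }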
- */ - -#include <immintrin.h> /* AVX2 */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), - _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), - _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), - _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), - _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), - _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, 
work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128( - _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), - _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), - _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *) (s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *) (s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), - _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), - _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *) (s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), - _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), - _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - 
_mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, - _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, - _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, - _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), - 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), - 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, 
_mm_add_epi16(sum_q7, q4_16)), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), - 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, - 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 -}; - -void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned 
char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, - q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, - p256_0, q256_0; - - const __m128i thresh = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _thresh[0])); - const __m128i limit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _limit[0])); - const __m128i blimit = _mm_broadcastb_epi8( - _mm_cvtsi32_si128((int) _blimit[0])); - - p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 5 * p))); - p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 4 * p))); - p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 3 * p))); - p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 2 * p))); - p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 1 * p))); - q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 0 * p))); - q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 1 * p))); - q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 2 * p))); - q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 3 * p))); - q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); - - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = 
_mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, - flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, - flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s - 8 * p))); - q256_7 = 
_mm256_castpd_si256(_mm256_broadcast_pd( - (__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, - res_q; - - const __m256i filter = _mm256_load_si256( - (__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0, - _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0, - _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16(eight, - _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16(four, - _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), 3); - - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), 3); - - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(p256_7, p256_7); - - sum_q7 = _mm256_add_epi16(q256_7, q256_7); - - sum_p3 = _mm256_add_epi16(p256_3, p256_3); - - sum_q3 = _mm256_add_epi16(q256_3, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - - pixelFilter_p = 
_mm256_sub_epi16(pixelFilter_p, q256_6); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_1)), 4); - - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_1)), 4); - - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), 3); - - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), 3); - - flat_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_2)), 4); - - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_2)), 4); - - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), 3); - - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), 3); - - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_3)), 4); - - flat2_p3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_3)), 4); - - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_4)), 4); - - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, 
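[note] Each successive tap reuses the running sum instead of re-adding fifteen samples: one trailing q value is subtracted per step while sum_p7 keeps growing the p7 weight, so every output costs only a few adds. The first two slides, written out in the same scalar form as above:

flat2_p1 = ROUND_POWER_OF_TWO(2 * p7 + p6 + p5 + p4 + p3 + p2 + 2 * p1 +
                              p0 + q0 + q1 + q2 + q3 + q4 + q5, 4);
flat2_p2 = ROUND_POWER_OF_TWO(3 * p7 + p6 + p5 + p4 + p3 + 2 * p2 + p1 +
                              p0 + q0 + q1 + q2 + q3 + q4, 4);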
q256_4)), 4); - - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_5)), 4); - - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_5)), 4); - - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, - _mm256_add_epi16(sum_p7, p256_6)), 4); - - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), - 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, - _mm256_add_epi16(sum_q7, q256_6)), 4); - - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), - 168)); - } - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); - - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); - - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); - - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); - - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); - - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); - - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *) (s - 7 * p), p6); - - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *) (s - 6 * p), p5); - - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *) (s - 5 * p), p4); - - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *) (s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *) (s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *) (s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *) (s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *) (s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - 
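[note] The andnot/and/or store sequence around this point is the classic pre-SSE4.1 select: with a lane mask of 0x00/0xFF, (mask & a) | (~mask & b) picks the filtered value only where the flat test passed. As a hypothetical standalone helper (select_epi8 is an illustrative name):

#include <emmintrin.h>
/* Byte-wise select: a where mask is 0xff, b where mask is 0x00.
   SSE2 has no blend instruction, so and/andnot/or stands in for it. */
static __m128i select_epi8(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}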
flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *) (s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *) (s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *) (s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *) (s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *) (s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *) (s + 6 * p), q6); - } -} diff --git a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c deleted file mode 100644 index 739adf31d0..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ /dev/null @@ -1,1776 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <emmintrin.h> // SSE2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -static INLINE __m128i abs_diff(__m128i a, __m128i b) { - return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); -} - -// filter_mask and hev_mask -#define FILTER_HEV_MASK do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1, work; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask(*limit, *blimit, */ \ - /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ - abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\ - abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - /* abs(p3 - p2), abs(p2 - p1) */ \ - work = abs_diff(p3p2, p2p1); \ - flat = _mm_max_epu8(work, flat); \ - /* abs(q3 - q2), abs(q2 - q1) */ \ - work = abs_diff(q3q2, q2q1); \ - flat = _mm_max_epu8(work, flat); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ -} while (0) - -#define FILTER4 do { \ - const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \ - 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ - __m128i filter, filter2filter1, work; \ - \ - ps1ps0 
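[note] abs_diff above relies on unsigned saturation: one of the two _mm_subs_epu8 results is always zero, so OR-ing them yields |a - b| without widening. FILTER_HEV_MASK is the vector form of the scalar masks quoted in its own comments; per pixel they amount to roughly the following (a sketch matching those comments):

#include <stdint.h>
#include <stdlib.h>
static int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                          uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                          uint8_t q1, uint8_t q2, uint8_t q3) {
  int8_t mask = 0;
  mask |= (abs(p3 - p2) > limit) * -1;
  mask |= (abs(p2 - p1) > limit) * -1;
  mask |= (abs(p1 - p0) > limit) * -1;
  mask |= (abs(q1 - q0) > limit) * -1;
  mask |= (abs(q2 - q1) > limit) * -1;
  mask |= (abs(q3 - q2) > limit) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  return ~mask;  /* 0xff = filter this pixel */
}
static int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0,
                       uint8_t q1) {
  int8_t hev = 0;
  hev |= (abs(p1 - p0) > thresh) * -1;
  hev |= (abs(q1 - q0) > thresh) * -1;
  return hev;
}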
= _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ - qs1qs0 = _mm_xor_si128(q1q0, t80); \ - \ - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ - work = _mm_subs_epi8(ps1ps0, qs1qs0); \ - filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ - filter = _mm_and_si128(filter, mask); /* & mask */ \ - filter = _mm_unpacklo_epi64(filter, filter); \ - \ - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ - filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ - \ - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ - filter = _mm_unpacklo_epi8(filter, filter); \ - filter = _mm_srai_epi16(filter, 9); /* round */ \ - filter = _mm_packs_epi16(filter, filter); \ - filter = _mm_andnot_si128(hev, filter); \ - \ - hev = _mm_unpackhi_epi64(filter2filter1, filter); \ - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ - \ - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ - qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ - ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ - qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ - ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ -} while (0) - -void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; - - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); - - FILTER_HEV_MASK; - FILTER4; - - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 -} - -void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = 
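[note] FILTER4 carries two rows per register (p1p0 in one half, q1q0 in the other); per pixel it applies the standard 4-tap adjustment its comments describe. A scalar sketch under that reading, using the signed_char_clamp helper the comments reference:

static int8_t signed_char_clamp(int t) {
  return (int8_t)(t < -128 ? -128 : t > 127 ? 127 : t);
}
/* op1/op0/oq0/oq1 are the two pixels on each side of the edge. */
static void filter4(int8_t mask, int8_t hev, uint8_t *op1, uint8_t *op0,
                    uint8_t *oq0, uint8_t *oq1) {
  const int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
  int8_t filter1, filter2;
  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
  filter1 = signed_char_clamp(filter + 4) >> 3;  /* rounds one side up */
  filter2 = signed_char_clamp(filter + 3) >> 3;
  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);  /* outer taps, no hev */
  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
}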
_mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i x0, x1, x2, x3; - __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; - - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); - - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); - - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); - - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); - - // Transpose 8x8 - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - p1p0 = _mm_unpacklo_epi16(q1q0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x0 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - p3p2 = _mm_unpacklo_epi32(p1p0, x0); - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - p1p0 = _mm_unpackhi_epi32(p1p0, x0); - p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high - p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - q1q0 = _mm_unpackhi_epi16(q1q0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x2 = _mm_unpackhi_epi16(x2, x3); - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - q3q2 = _mm_unpackhi_epi32(q1q0, x2); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - q1q0 = _mm_unpacklo_epi32(q1q0, x2); - - q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); - q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); - - FILTER_HEV_MASK; - FILTER4; - - // Transpose 8x4 to 4x8 - // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 - // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 - // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 - ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); - // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 - x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); - // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - 
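[note] The vertical variants transpose with interleaves alone, since SSE2 offers no byte-level gather. The core step, shown here for a single 4x4 byte tile (illustrative fragment, not from the source):

/* Two interleave stages turn four rows (4 bytes each, in the low lanes)
   into columns: each 32-bit lane of the result is one output column. */
static __m128i transpose_4x4_epi8(__m128i r0, __m128i r1, __m128i r2,
                                  __m128i r3) {
  const __m128i t0 = _mm_unpacklo_epi8(r0, r1);  /* a0 b0 a1 b1 ... */
  const __m128i t1 = _mm_unpacklo_epi8(r2, r3);  /* c0 d0 c1 d1 ... */
  return _mm_unpacklo_epi16(t0, t1);             /* a0 b0 c0 d0 | ... */
}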
*(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); -} - -void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), - (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), - (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), - (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), - (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), - (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = 
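[note] Both the hev test and the blimit test here synthesize an unsigned byte greater-than, which SSE2 lacks: _mm_subs_epu8(a, b) is zero exactly when a <= b, and the cmpeq/xor pair inverts that result. As a hypothetical helper (the name cmpgt_epu8 is illustrative):

static __m128i cmpgt_epu8(__m128i a, __m128i b) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ones = _mm_cmpeq_epi8(zero, zero);  /* all 0xff */
  /* subs_epu8 saturates at 0, so (a - b) == 0  <=>  a <= b */
  return _mm_xor_si128(_mm_cmpeq_epi8(_mm_subs_epu8(a, b), zero), ones);
}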
_mm_srai_epi16(filter2, 0xB); - - // Filter1 >> 3 - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), - (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), - (__m64 *)(s + 6 * p))); - flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), - (__m64 *)(s + 7 * p))); - work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero);; - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, - pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16(four, - _mm_add_epi16(pixetFilter_p2p1p0, - pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 
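[note] This path shifts bytes differently from the mask-based variant: _mm_unpacklo_epi8(zero, x) places each byte in the high half of a 16-bit lane (value x << 8), so an arithmetic shift by 11 (0xB) is exactly x >> 3 with the sign preserved, and _mm_packs_epi16 saturates back to bytes. Packing filter2 together with -filter1 then lets a single _mm_adds_epi8 add filter2 on the p half and subtract filter1 on the q half. Scalar view of the shift (fragment):

/* (x << 8) >> 11 == x >> 3: the 16-bit arithmetic shift supplies the
   sign bits an 8-bit logical shift would drop. */
int16_t wide    = (int16_t)((uint16_t)x << 8);
int8_t  shifted = (int8_t)(wide >> 11);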
= _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, - _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, - _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, - 
_mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, - _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -static INLINE __m128i filter_add2_sub2(const __m128i *const total, - const __m128i *const a1, - const __m128i *const a2, - const __m128i *const s1, - const __m128i *const s2) { - __m128i x = _mm_add_epi16(*a1, *total); - x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); - return x; -} - -static INLINE __m128i filter8_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f8_lo, - const __m128i *const f8_hi) { - const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), - _mm_srli_epi16(*f8_hi, 3)); - const __m128i result = _mm_and_si128(*flat, f8); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); -} - -static INLINE __m128i filter16_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f_lo, - const __m128i *const f_hi) { - const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), - _mm_srli_epi16(*f_hi, 4)); - const __m128i result = _mm_and_si128(*flat, f); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); -} - -void vpx_lpf_horizontal_edge_16_sse2(unsigned 
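[note] The three helpers defined above turn the wide filter into a constant-cost slide: filter_add2_sub2 moves the tap window by adding the incoming pair and subtracting the outgoing pair, and filter8_mask / filter16_mask finish the average (>>3 or >>4; the rounding constant is already folded into the sum) and blend it in only where the flat mask is set. Per pixel, treating the mask as a boolean, filter16_mask reduces to:

/* f carries the 15-tap sum plus the rounding constant 8 */
uint8_t out = flat2 ? (uint8_t)(f >> 4) : other_filt;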
char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - - __m128i op2, op1, op0, oq0, oq1, oq2; - - __m128i max_abs_p1p0q1q0; - - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); - - { - const __m128i abs_p1p0 = abs_diff(p1, p0); - const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i abs_p0q0 = abs_diff(p0, q0); - __m128i abs_p1q1 = abs_diff(p1, q1); - __m128i work; - max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - { - __m128i work; - work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); - flat = _mm_max_epu8(work, max_abs_p1p0q1q0); - work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i 
t7f = _mm_set1_epi8(0x7f); - const __m128i ff = _mm_cmpeq_epi8(t4, t4); - - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - op1 = _mm_xor_si128(p1, t80); - op0 = _mm_xor_si128(p0, t80); - oq0 = _mm_xor_si128(q0, t80); - oq1 = _mm_xor_si128(q1, t80); - - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); - - work_a = _mm_subs_epi8(oq0, op0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); - oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); - // loopfilter done - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter8 - { - const __m128i four = _mm_set1_epi16(4); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - __m128i f8_lo, f8_hi; - - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), - _mm_add_epi16(p3_lo, p2_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); - - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), - _mm_add_epi16(p3_hi, p2_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); - - op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); - op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); - op0 = 
filter8_mask(&flat, &op0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); - oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); - oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); - oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); - const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); - const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); - const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); - const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); - const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); - const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); - - const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); - const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); - const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); - const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); - const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); - const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); - const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); - - __m128i f_lo; - __m128i f_hi; - - f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 - f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), - _mm_add_epi16(p4_lo, f_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); - f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); - - f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 - f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), - _mm_add_epi16(p4_hi, f_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); - f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - - p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); - - f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); - - f_lo = filter_add2_sub2(&f_lo, &q2_lo, 
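[note] The wide-filter seed sum above is built without a multiply: (p7 << 3) - p7 is 7 * p7, and after the adds the register holds exactly the first output's window, 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0, plus the rounding 8. The shift-subtract identity in scalar form (illustrative helper name):

/* Multiply by 7 with a shift and a subtract: 7 * x == (x << 3) - x. */
static int16_t times7(int16_t x) { return (int16_t)((x << 3) - x); }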
&p4_lo, &p5_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); - - f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); - - f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); - - f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); - - f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - } -} - -void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, 
hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - // filter_mask and hev_mask - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(fe, fe); - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = abs_diff(q1p1, q0p0); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8(abs_diff(q2p2, q1p1), - abs_diff(q3p3, q2p2)); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // flat_mask4 - - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), - abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - 
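[note] The workp_a/workp_b sequence here computes the six flat (filter8) outputs incrementally, updating a 7-tap sum instead of rebuilding it for each tap. The first two outputs written out, with ROUND_POWER_OF_TWO as defined in the earlier note (names mirror the stack buffers):

flat_op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3);
flat_op1 = ROUND_POWER_OF_TWO(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);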
_mm_storel_epi64((__m128i *)&flat_op0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - } - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 11); - filter1 = _mm_packs_epi16(filter1, filter1); - - // Filter2 >> 3 - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 11); - filter2 = _mm_packs_epi16(filter2, zero); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - filt = _mm_unpacklo_epi8(zero, filt); - filt = _mm_srai_epi16(filt, 9); - filt = _mm_packs_epi16(filt, zero); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_loadl_epi64((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_loadl_epi64((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_loadl_epi64((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_loadl_epi64((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_loadl_epi64((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_loadl_epi64((__m128i *)flat_op2); - work_a = 
_mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - } -} - -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, - const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - - __m128i mask, hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - - // filter_mask and hev_mask - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - - // flat_mask4 - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, 
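[note] The _dual variants filter two adjacent 8-pixel edges in one 16-lane pass: each threshold register is spliced from two per-edge values with _mm_unpacklo_epi64, so the low eight bytes apply the first edge's limits and the high eight the second's. Sketch of the splice, as in the function above:

/* low 8 lanes take limits from edge 0, high 8 lanes from edge 1 */
const __m128i blimit =
    _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                       _mm_load_si128((const __m128i *)_blimit1));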
flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - int i = 0; - - do { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = 
_mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - } -} - -void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - const __m128i zero = _mm_set1_epi16(0); - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i mask, hev, flat; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 
= _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - - // filter_mask and hev_mask - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (vpx_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q1 = 
_mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - } -} - -static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, - int in_p, unsigned char *out, int out_p) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i x8, x9, x10, x11, x12, x13, x14, x15; - - // 2-way interleave w/hoisting of unpacks - x0 = _mm_loadl_epi64((__m128i *)in0); // 1 - x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 - x0 = _mm_unpacklo_epi8(x0, x1); // 1 - - x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 - x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7 - x1 = _mm_unpacklo_epi8(x2, x3); // 2 - - x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9 - x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11 - x2 = _mm_unpacklo_epi8(x4, x5); // 3 - - x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13 - x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15 - x3 = _mm_unpacklo_epi8(x6, x7); // 4 - x4 = _mm_unpacklo_epi16(x0, x1); // 9 - - x8 = _mm_loadl_epi64((__m128i *)in1); // 2 - x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 - x8 = _mm_unpacklo_epi8(x8, x9); // 5 - x5 = _mm_unpacklo_epi16(x2, x3); // 10 - - x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 - x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8 - x9 = _mm_unpacklo_epi8(x10, x11); // 6 - - x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10 - x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12 - x10 = _mm_unpacklo_epi8(x12, x13); // 7 - x12 = _mm_unpacklo_epi16(x8, x9); // 11 - - x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14 - x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16 - x11 = _mm_unpacklo_epi8(x14, x15); // 8 - x13 = _mm_unpacklo_epi16(x10, x11); // 12 - - x6 = _mm_unpacklo_epi32(x4, x5); // 13 - x7 = _mm_unpackhi_epi32(x4, x5); // 14 - x14 = _mm_unpacklo_epi32(x12, x13); // 15 - x15 = _mm_unpackhi_epi32(x12, x13); // 16 - - // Store first 4-line result - _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); - - x4 = _mm_unpackhi_epi16(x0, x1); - x5 = _mm_unpackhi_epi16(x2, x3); - x12 = _mm_unpackhi_epi16(x8, x9); - x13 = _mm_unpackhi_epi16(x10, x11); - - x6 = _mm_unpacklo_epi32(x4, x5); - x7 = _mm_unpackhi_epi32(x4, x5); - x14 = _mm_unpacklo_epi32(x12, x13); - x15 = _mm_unpackhi_epi32(x12, x13); - - // Store second 4-line result - _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); -} - -static INLINE void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int num_8x8_to_transpose) { - int idx8x8 = 0; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - do { - unsigned char *in = src[idx8x8]; - unsigned char *out = dst[idx8x8]; - - x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 
11 12 13 14 15 16 17 - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - x0 = _mm_unpacklo_epi8(x0, x1); - - x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(x2, x3); - - x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(x4, x5); - - x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(x6, x7); - - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - x4 = _mm_unpacklo_epi16(x0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x5 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0*out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1*out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2*out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3*out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi16(x0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi16(x2, x3); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4*out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5*out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi32(x4, x5); - - _mm_storel_pd((double *)(out + 6*out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7*out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} - -void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; - - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -} - -void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); - unsigned char *src[1]; - unsigned char *dst[1]; - - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; - - transpose(src, p, dst, 8, 1); - - // Loop filtering - vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); - - src[0] = t_dst; - dst[0] = s - 4; - - // Transpose back - transpose(src, 8, dst, p, 1); -} - -void 
vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; - - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; - - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -} - -void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); - unsigned char *src[2]; - unsigned char *dst[2]; - - src[0] = s - 8; - src[1] = s; - dst[0] = t_dst; - dst[1] = t_dst + 8 * 8; - - // Transpose 16x8 - transpose(src, p, dst, 8, 2); - - // Loop filtering - vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); - - src[0] = t_dst; - src[1] = t_dst + 8 * 8; - dst[0] = s - 8; - dst[1] = s; - - // Transpose back - transpose(src, 8, dst, p, 2); -} - -void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - DECLARE_ALIGNED(16, unsigned char, t_dst[256]); - - // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - - // Loop filtering - vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); - - // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); -} diff --git a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h deleted file mode 100644 index 536b206876..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_ -#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_ - -#include <emmintrin.h> -#include "vpx/vpx_integer.h" - -#define pair_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) - -#define dual_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ - (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) - -#define octa_set_epi16(a, b, c, d, e, f, g, h) \ - _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ - (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) - -#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 422b0fc422..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_ , sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction 
vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_ , sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm deleted file mode 100644 index abc0270655..0000000000 --- 
a/thirdparty/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ /dev/null @@ -1,228 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro convolve_fn 1-2 -%ifidn %1, avg -%define AUX_XMM_REGS 4 -%else -%define AUX_XMM_REGS 0 -%endif -%ifidn %2, highbd -%define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd -%else -%define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ - dst, dst_stride, \ - fx, fxs, fy, fys, w, h -%endif - mov r4d, dword wm -%ifidn %2, highbd - shl r4d, 1 - shl srcq, 1 - shl src_strideq, 1 - shl dstq, 1 - shl dst_strideq, 1 -%else - cmp r4d, 4 - je .w4 -%endif - cmp r4d, 8 - je .w8 - cmp r4d, 16 - je .w16 - cmp r4d, 32 - je .w32 -%ifidn %2, highbd - cmp r4d, 64 - je .w64 - - mov r4d, dword hm -.loop128: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - movu m0, [srcq+64] - movu m1, [srcq+80] - movu m2, [srcq+96] - movu m3, [srcq+112] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq+64] - pavg m1, [dstq+80] - pavg m2, [dstq+96] - pavg m3, [dstq+112] -%endif - mova [dstq+64], m0 - mova [dstq+80], m1 - mova [dstq+96], m2 - mova [dstq+112], m3 - add dstq, dst_strideq - dec r4d - jnz .loop128 - RET -%endif - -.w64 - mov r4d, dword hm -.loop64: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - add dstq, dst_strideq - dec r4d - jnz .loop64 - RET - -.w32: - mov r4d, dword hm -.loop32: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+src_strideq] - movu m3, [srcq+src_strideq+16] - lea srcq, [srcq+src_strideq*2] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq +16] - pavg m2, [dstq+dst_strideq] - pavg m3, [dstq+dst_strideq+16] -%endif - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+dst_strideq ], m2 - mova [dstq+dst_strideq+16], m3 - lea dstq, [dstq+dst_strideq*2] - sub r4d, 2 - jnz .loop32 - RET - -.w16: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop16: - movu m0, [srcq] - movu m1, [srcq+src_strideq] - movu m2, [srcq+src_strideq*2] - movu m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+dst_strideq] - pavg m2, [dstq+dst_strideq*2] - pavg m3, [dstq+r6q] -%endif - mova [dstq ], m0 - mova [dstq+dst_strideq ], m1 - mova [dstq+dst_strideq*2], m2 - mova [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop16 - RET - -.w8: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop8: - movh m0, [srcq] - movh m1, [srcq+src_strideq] - movh m2, [srcq+src_strideq*2] - movh m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movh m4, [dstq] - movh m5, 
[dstq+dst_strideq] - movh m6, [dstq+dst_strideq*2] - movh m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movh [dstq ], m0 - movh [dstq+dst_strideq ], m1 - movh [dstq+dst_strideq*2], m2 - movh [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop8 - RET - -%ifnidn %2, highbd -.w4: - mov r4d, dword hm - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] -.loop4: - movd m0, [srcq] - movd m1, [srcq+src_strideq] - movd m2, [srcq+src_strideq*2] - movd m3, [srcq+r5q] - lea srcq, [srcq+src_strideq*4] -%ifidn %1, avg - movd m4, [dstq] - movd m5, [dstq+dst_strideq] - movd m6, [dstq+dst_strideq*2] - movd m7, [dstq+r6q] - pavg m0, m4 - pavg m1, m5 - pavg m2, m6 - pavg m3, m7 -%endif - movd [dstq ], m0 - movd [dstq+dst_strideq ], m1 - movd [dstq+dst_strideq*2], m2 - movd [dstq+r6q ], m3 - lea dstq, [dstq+dst_strideq*4] - sub r4d, 4 - jnz .loop4 - RET -%endif -%endmacro - -INIT_XMM sse2 -convolve_fn copy -convolve_fn avg -%if CONFIG_VP9_HIGHBITDEPTH -convolve_fn copy, highbd -convolve_fn avg, highbd -%endif diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c deleted file mode 100644 index d8a92354c9..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ /dev/null @@ -1,606 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Due to a header conflict between math.h and intrinsics includes with ceil() -// in certain configurations under vs9 this include needs to precede -// immintrin.h. - -#include <immintrin.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" -#include "vpx_ports/mem.h" - -// filters for 16_h8 and 16_v8 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -#if defined(__clang__) -// -- GODOT start - -# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (!defined(__MACPORTS__) && defined(__APPLE__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -// -- GODOT end -- -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# else // clang > 3.3, and not 5.0 on macosx. 
-# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // clang <= 3.3 -#elif defined(__GNUC__) -# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -# define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -# else // gcc > 4.7 -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -# endif // gcc <= 4.6 -#else // !(gcc || clang) -# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); - - // multiply the size of the source and destination stride by two - src_stride = src_pixels_per_line << 1; - dst_stride = output_pitch << 1; - for (i = output_height; i > 1; i-=2) { - // load the 2 strides of source - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line-3)), 1); - - // filter the source buffer - srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer -
srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // reading 2 strides of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm_loadu_si128((const __m128i *) - (src_ptr+src_pixels_per_line+5)), 1); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, - _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, - srcRegFilt32b2_1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); - - // save the next 16 bytes - _mm_store_si128((__m128i*)(output_ptr+output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); - output_ptr+=dst_stride; - } - - // if the number of strides is odd.
- // process only 16 bytes - if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); - } -} - -static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t 
*output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, - _mm256_set1_epi16(0x706u)); - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - - // have every two consecutive loads in the same 256 bit register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); - - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 =
_mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); - - - for (i = output_height; i > 1; i-=2) { - // load the last 2 loads of 16 bytes and have every two - // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); - - src_ptr+=src_stride; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, - _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bytes - _mm_store_si128((__m128i*)(output_ptr+out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); - - output_ptr+=dst_stride; - - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; - } - if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; - // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the last 2 results together - srcRegFilt4 = _mm_unpacklo_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); -
srcRegFilt7 = _mm_unpackhi_epi8( - _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, - _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, - _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, - _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); - } -} - -#if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 -filter8_1dfunction vpx_filter_block1d16_v2_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_ssse3; -filter8_1dfunction vpx_filter_block1d4_v2_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define 
vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 -#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 -#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 -#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 -#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 -#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 -#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 -// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); - -// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); -#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c deleted file mode 100644 index 6fd52087c7..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ /dev/null @@ -1,915 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// Due to a header conflict between math.h and intrinsics includes with ceil() -// in certain configurations under vs9 this include needs to precede -// tmmintrin.h. - -#include <tmmintrin.h> - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_dsp/x86/convolve.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -// These are reused by the avx2 intrinsics.
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; - -void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the third 16 bits in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the second 16 bits in the filter into the second lane - // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the fourth 16 bits in the filter into the second lane - // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - // loading the local filters - shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); - srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); - - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); - - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr+=src_pixels_per_line; - - // save only 4 bytes - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64
= _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pixels_per_line; - - // save only 8 bytes - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
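  // (A note on why this is safe, not part of the deleted file: the
  //  signed-saturating pack below is lossless for these kernels, since
  //  every subpel tap fits in int8_t; the one 16-bit value that would
  //  not, a center tap of exactly 128, is excluded up front by the
  //  convolve wrappers.)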
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - - for (i = 0; i < output_height; i++) { - // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pitch; - - // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; - - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=out_pitch; - } -} - -filter8_1dfunction vpx_filter_block1d16_v8_ssse3; -filter8_1dfunction vpx_filter_block1d16_h8_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; - -filter8_1dfunction vpx_filter_block1d16_v2_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_ssse3; 
-filter8_1dfunction vpx_filter_block1d4_v2_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; - -// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ -} - -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = 
_mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); -} - -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; - - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, - A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i*)dst, A); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); -} - -static void 
scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. - y = h + (8 - (h & 0x7)); - - do { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 8) { - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); - } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 8x8 filters values back to dst - transpose8x8_to_dst(temp, 8, dst + x, dst_stride); - } - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} - -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... 
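  // (Annotation: the transpose below rearranges 4 rows x 8 pixels so that
  //  each 16-bit lane holds the adjacent source pair consumed by one
  //  pmaddubsw tap pair; the horizontal filter then proceeds with the
  //  same pairwise multiply-add pattern as the vertical filters:)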
- // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); - // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); -} - -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - for (y = 0; y < h; y += 4) { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 4) { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); - } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 4x4 filters values back to dst - transpose4x4_to_dst(temp, 4, dst + x, dst_stride); - } - - src += src_stride * 4; - dst += dst_stride * 4; - } -} - -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, 
ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - - if (y_q4 & SUBPEL_MASK) { - filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - - y_q4 += y_step_q4; - } -} - -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - 
const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)dst, temp); -} - -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - int i; - - for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent 
elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); - - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; - // save 16 bytes convolve result - _mm_store_si128((__m128i*)&dst[i], temp_hi); - } -} - -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, - w); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, - int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
- // --Require an additional 8 rows for the horiz_w8 transpose tail. - DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); - } - - if (w >= 16) { - scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); - } -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, - filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); -} - -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_ , ssse3); diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm deleted file mode 100644 index 08f3d6a6cf..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ /dev/null @@ -1,987 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. 
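Both the intrinsics file above and the assembly that follows observe the discipline this note describes: the accumulator is a 16-bit lane updated with saturating adds, and the products of the large center taps k3/k4 are the terms most likely to push it past the rails, so they must land after the small outer taps. The sketch below is a scalar model of that ordering, not code from the deleted files (all names are hypothetical; it assumes arithmetic right shift for the psraw-style scale and omits pmaddubsw's own pair saturation, which these kernels cannot trigger):

#include <stdint.h>

/* Scalar model of one 8-tap, Q7 subpel filter evaluation using the
 * saturating accumulation order from the files above. p[] holds eight
 * unsigned source pixels, k[] the eight signed taps. */
static int16_t sat_add16(int32_t a, int32_t b) {
  const int32_t s = a + b;
  return (int16_t)(s > INT16_MAX ? INT16_MAX : s < INT16_MIN ? INT16_MIN : s);
}

static uint8_t filter8_q7_model(const uint8_t *p, const int8_t *k) {
  /* pairwise products, as pmaddubsw forms them */
  const int32_t k0k1 = p[0] * k[0] + p[1] * k[1];
  const int32_t k2k3 = p[2] * k[2] + p[3] * k[3];
  const int32_t k4k5 = p[4] * k[4] + p[5] * k[5];
  const int32_t k6k7 = p[6] * k[6] + p[7] * k[7];
  const int32_t mid_min = k2k3 < k4k5 ? k2k3 : k4k5;
  const int32_t mid_max = k2k3 < k4k5 ? k4k5 : k2k3;
  /* outer taps first, then min before max of the two middle pairs,
   * mirroring the _mm_min_epi16/_mm_max_epi16 steps in the intrinsics */
  int16_t sum = sat_add16(k0k1, k6k7);
  sum = sat_add16(sum, mid_min);
  sum = sat_add16(sum, mid_max);
  sum = sat_add16(sum, 64);   /* Q7 rounding term (the krd / addFilterReg64 constant) */
  sum = (int16_t)(sum >> 7);  /* psraw-style arithmetic shift */
  return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum); /* packuswb clamp */
}

Adding the smaller middle pair before the larger one pulls the running total away from whichever rail the big positive center product would otherwise hit early, so any clamp that still occurs coincides with the clamp the exact sum needs anyway; the SSSE3 assembly's fixed grouping (x = k0k1 + k4k5, y = k2k3 + k6k7, z = signed SAT(x + y)) gives the same guarantee without the min/max.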
- -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -;void vpx_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; 
short *filter -;) -global sym(vpx_filter_block1d4_v8_sse2) PRIVATE -sym(vpx_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_v8_sse2) PRIVATE -sym(vpx_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_v8_sse2) PRIVATE -sym(vpx_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_v8_avg_sse2): - push 
rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 1 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 1, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d4_h8_sse2) PRIVATE -sym(vpx_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define 
zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_h8_sse2) PRIVATE -sym(vpx_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_h8_sse2) PRIVATE -sym(vpx_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - 
psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - 
psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm deleted file mode 100644 index d2cb8ea292..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ /dev/null @@ -1,629 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_64: times 8 dw 64 - -; %define USE_PMULHRSW -; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss -; when using this instruction. -; -; The add order below (based on ffvp9) must be followed to prevent outranges. -; x = k0k1 + k4k5 -; y = k2k3 + k6k7 -; z = signed SAT(x + y) - -SECTION .text -%if ARCH_X86_64 - %define LOCAL_VARS_SIZE 16*4 -%else - %define LOCAL_VARS_SIZE 16*6 -%endif - -%macro SETUP_LOCAL_VARS 0 - ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + - ; pmaddubsw has a higher latency on some platforms, this might be eased by - ; interleaving the instructions. - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - packsswb m4, m4 - ; TODO(slavarnway): multiple pshufb instructions had a higher latency on - ; some platforms. 
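    ; (Annotation, not part of the deleted file: each pshuflw below
    ; broadcasts one 16-bit word of the packed taps, i.e. an int8 pair
    ; k2i/k2i+1, across the low quadword; the punpcklqdq that follows
    ; mirrors it into the high quadword, producing a full register of
    ; identical tap pairs ready for pmaddubsw.)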
- pshuflw m0, m4, 0b ;k0_k1 - pshuflw m1, m4, 01010101b ;k2_k3 - pshuflw m2, m4, 10101010b ;k4_k5 - pshuflw m3, m4, 11111111b ;k6_k7 - punpcklqdq m0, m0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - mova k0k1, m0 - mova k2k3, m1 - mova k4k5, m2 - mova k6k7, m3 -%if ARCH_X86_64 - %define krd m12 - %define tmp m13 - mova krd, [GLOBAL(pw_64)] -%else - %define tmp [rsp + 16*4] - %define krd [rsp + 16*5] -%if CONFIG_PIC=0 - mova m6, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m6, m6 ;all ones - psrlw m6, 15 - psllw m6, 6 ;aka pw_64 -%endif - mova krd, m6 -%endif -%endm - -%macro HORIZx4_ROW 2 - mova %2, %1 - punpcklbw %1, %1 - punpckhbw %2, %2 - - mova m3, %2 - palignr %2, %1, 1 - palignr m3, %1, 5 - - pmaddubsw %2, k0k1k4k5 - pmaddubsw m3, k2k3k6k7 - mova m4, %2 ;k0k1 - mova m5, m3 ;k2k3 - psrldq %2, 8 ;k4k5 - psrldq m3, 8 ;k6k7 - paddsw %2, m4 - paddsw m5, m3 - paddsw %2, m5 - paddsw %2, krd - psraw %2, 7 - packuswb %2, %2 -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER4 1 -cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - packsswb m4, m4 -%if ARCH_X86_64 - %define k0k1k4k5 m8 - %define k2k3k6k7 m9 - %define krd m10 - %define orig_height r7d - mova krd, [GLOBAL(pw_64)] - pshuflw k0k1k4k5, m4, 0b ;k0_k1 - pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 - pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 - pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 -%else - %define k0k1k4k5 [rsp + 16*0] - %define k2k3k6k7 [rsp + 16*1] - %define krd [rsp + 16*2] - %define orig_height [rsp + 16*3] - pshuflw m6, m4, 0b ;k0_k1 - pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 - pshuflw m7, m4, 01010101b ;k2_k3 - pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 -%if CONFIG_PIC=0 - mova m1, [GLOBAL(pw_64)] -%else - ; build constants without accessing global memory - pcmpeqb m1, m1 ;all ones - psrlw m1, 15 - psllw m1, 6 ;aka pw_64 -%endif - mova k0k1k4k5, m6 - mova k2k3k6k7, m7 - mova krd, m1 -%endif - mov orig_height, heightd - shr heightd, 1 -.loop: - ;Do two rows at once - movh m0, [srcq - 3] - movh m1, [srcq + 5] - punpcklqdq m0, m1 - mova m1, m0 - movh m2, [srcq + sstrideq - 3] - movh m3, [srcq + sstrideq + 5] - punpcklqdq m2, m3 - mova m3, m2 - punpcklbw m0, m0 - punpckhbw m1, m1 - punpcklbw m2, m2 - punpckhbw m3, m3 - mova m4, m1 - palignr m4, m0, 1 - pmaddubsw m4, k0k1k4k5 - palignr m1, m0, 5 - pmaddubsw m1, k2k3k6k7 - mova m7, m3 - palignr m7, m2, 1 - pmaddubsw m7, k0k1k4k5 - palignr m3, m2, 5 - pmaddubsw m3, k2k3k6k7 - mova m0, m4 ;k0k1 - mova m5, m1 ;k2k3 - mova m2, m7 ;k0k1 upper - psrldq m4, 8 ;k4k5 - psrldq m1, 8 ;k6k7 - paddsw m4, m0 - paddsw m5, m1 - mova m1, m3 ;k2k3 upper - psrldq m7, 8 ;k4k5 upper - psrldq m3, 8 ;k6k7 upper - paddsw m7, m2 - paddsw m4, m5 - paddsw m1, m3 - paddsw m7, m1 - paddsw m4, krd - psraw m4, 7 - packuswb m4, m4 - paddsw m7, krd - psraw m7, 7 - packuswb m7, m7 - -%ifidn %1, h8_avg - movd m0, [dstq] - pavgb m4, m0 - movd m2, [dstq + dstrideq] - pavgb m7, m2 -%endif - movd [dstq], m4 - movd [dstq + dstrideq], m7 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - - dec heightd - jnz .loop - - ; Do last row if output_height is odd - mov heightd, orig_height - and heightd, 1 - je .done - - movh m0, [srcq - 3] ; load src - movh m1, [srcq + 5] - punpcklqdq m0, m1 - - 
HORIZx4_ROW m0, m1 -%ifidn %1, h8_avg - movd m0, [dstq] - pavgb m1, m0 -%endif - movd [dstq], m1 -.done - RET -%endm - -%macro HORIZx8_ROW 5 - mova %2, %1 - punpcklbw %1, %1 - punpckhbw %2, %2 - - mova %3, %2 - mova %4, %2 - mova %5, %2 - - palignr %2, %1, 1 - palignr %3, %1, 5 - palignr %4, %1, 9 - palignr %5, %1, 13 - - pmaddubsw %2, k0k1 - pmaddubsw %3, k2k3 - pmaddubsw %4, k4k5 - pmaddubsw %5, k6k7 - paddsw %2, %4 - paddsw %5, %3 - paddsw %2, %5 - paddsw %2, krd - psraw %2, 7 - packuswb %2, %2 - SWAP %1, %2 -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER8 1 -cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define orig_height r7d -%else - %define orig_height heightmp -%endif - mov orig_height, heightd - shr heightd, 1 - -.loop: - movh m0, [srcq - 3] - movh m3, [srcq + 5] - movh m4, [srcq + sstrideq - 3] - movh m7, [srcq + sstrideq + 5] - punpcklqdq m0, m3 - mova m1, m0 - punpcklbw m0, m0 - punpckhbw m1, m1 - mova m5, m1 - palignr m5, m0, 13 - pmaddubsw m5, k6k7 - mova m2, m1 - mova m3, m1 - palignr m1, m0, 1 - pmaddubsw m1, k0k1 - punpcklqdq m4, m7 - mova m6, m4 - punpcklbw m4, m4 - palignr m2, m0, 5 - punpckhbw m6, m6 - palignr m3, m0, 9 - mova m7, m6 - pmaddubsw m2, k2k3 - pmaddubsw m3, k4k5 - - palignr m7, m4, 13 - mova m0, m6 - palignr m0, m4, 5 - pmaddubsw m7, k6k7 - paddsw m1, m3 - paddsw m2, m5 - paddsw m1, m2 - mova m5, m6 - palignr m6, m4, 1 - pmaddubsw m0, k2k3 - pmaddubsw m6, k0k1 - palignr m5, m4, 9 - paddsw m1, krd - pmaddubsw m5, k4k5 - psraw m1, 7 - paddsw m0, m7 -%ifidn %1, h8_avg - movh m7, [dstq] - movh m2, [dstq + dstrideq] -%endif - packuswb m1, m1 - paddsw m6, m5 - paddsw m6, m0 - paddsw m6, krd - psraw m6, 7 - packuswb m6, m6 -%ifidn %1, h8_avg - pavgb m1, m7 - pavgb m6, m2 -%endif - movh [dstq], m1 - movh [dstq + dstrideq], m6 - - lea srcq, [srcq + sstrideq ] - prefetcht0 [srcq + 4 * sstrideq - 3] - lea srcq, [srcq + sstrideq ] - lea dstq, [dstq + 2 * dstrideq ] - prefetcht0 [srcq + 2 * sstrideq - 3] - dec heightd - jnz .loop - - ;Do last row if output_height is odd - mov heightd, orig_height - and heightd, 1 - je .done - - movh m0, [srcq - 3] - movh m3, [srcq + 5] - punpcklqdq m0, m3 - - HORIZx8_ROW m0, m1, m2, m3, m4 - -%ifidn %1, h8_avg - movh m1, [dstq] - pavgb m0, m1 -%endif - movh [dstq], m0 -.done: - RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_HFILTER16 1 -cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -.loop: - prefetcht0 [srcq + 2 * sstrideq -3] - - movh m0, [srcq - 3] - movh m4, [srcq + 5] - movh m6, [srcq + 13] - punpcklqdq m0, m4 - mova m7, m0 - punpckhbw m0, m0 - mova m1, m0 - punpcklqdq m4, m6 - mova m3, m0 - punpcklbw m7, m7 - - palignr m3, m7, 13 - mova m2, m0 - pmaddubsw m3, k6k7 - palignr m0, m7, 1 - pmaddubsw m0, k0k1 - palignr m1, m7, 5 - pmaddubsw m1, k2k3 - palignr m2, m7, 9 - pmaddubsw m2, k4k5 - paddsw m1, m3 - mova m3, m4 - punpckhbw m4, m4 - mova m5, m4 - punpcklbw m3, m3 - mova m7, m4 - palignr m5, m3, 5 - mova m6, m4 - palignr m4, m3, 1 - pmaddubsw m4, k0k1 - pmaddubsw m5, k2k3 - palignr m6, m3, 9 - pmaddubsw m6, k4k5 - palignr m7, m3, 13 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, m1 -%ifidn %1, h8_avg - mova m1, [dstq] -%endif - paddsw m4, m6 - paddsw m5, m7 - 
paddsw m4, m5 - paddsw m0, krd - paddsw m4, krd - psraw m0, 7 - psraw m4, 7 - packuswb m0, m4 -%ifidn %1, h8_avg - pavgb m0, m1 -%endif - lea srcq, [srcq + sstrideq] - mova [dstq], m0 - lea dstq, [dstq + dstrideq] - dec heightd - jnz .loop - RET -%endm - -INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER 2 -cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, sstrideq ;pitch * 6 - -%ifidn %2, 8 - %define movx movh -%else - %define movx movd -%endif -.loop: - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - punpcklbw m0, m1 ;A B - movx m2, [srcq + sstrideq * 2 ] ;C - pmaddubsw m0, k0k1 - mova m6, m2 - movx m3, [src1q + sstrideq * 2] ;D - punpcklbw m2, m3 ;C D - pmaddubsw m2, k2k3 - movx m4, [srcq + sstrideq * 4 ] ;E - mova m7, m4 - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m4, k4k5 - punpcklbw m1, m6 ;A B next iter - movx m6, [srcq + sstride6q ] ;G - punpcklbw m5, m6 ;E F next iter - punpcklbw m3, m7 ;C D next iter - pmaddubsw m5, k4k5 - movx m7, [src1q + sstride6q ] ;H - punpcklbw m6, m7 ;G H - pmaddubsw m6, k6k7 - pmaddubsw m3, k2k3 - pmaddubsw m1, k0k1 - paddsw m0, m4 - paddsw m2, m6 - movx m6, [srcq + sstrideq * 8 ] ;H next iter - punpcklbw m7, m6 - pmaddubsw m7, k6k7 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - paddsw m1, m5 - packuswb m0, m0 - - paddsw m3, m7 - paddsw m1, m3 - paddsw m1, krd - psraw m1, 7 - lea srcq, [srcq + sstrideq * 2 ] - lea src1q, [src1q + sstrideq * 2] - packuswb m1, m1 - -%ifidn %1, v8_avg - movx m2, [dstq] - pavgb m0, m2 -%endif - movx [dstq], m0 - add dstq, dst_stride -%ifidn %1, v8_avg - movx m3, [dstq] - pavgb m1, m3 -%endif - movx [dstq], m1 - add dstq, dst_stride - sub heightd, 2 - cmp heightd, 1 - jg .loop - - cmp heightd, 0 - je .done - - movx m0, [srcq ] ;A - movx m1, [srcq + sstrideq ] ;B - movx m6, [srcq + sstride6q ] ;G - punpcklbw m0, m1 ;A B - movx m7, [src1q + sstride6q ] ;H - pmaddubsw m0, k0k1 - movx m2, [srcq + sstrideq * 2 ] ;C - punpcklbw m6, m7 ;G H - movx m3, [src1q + sstrideq * 2] ;D - pmaddubsw m6, k6k7 - movx m4, [srcq + sstrideq * 4 ] ;E - punpcklbw m2, m3 ;C D - movx m5, [src1q + sstrideq * 4] ;F - punpcklbw m4, m5 ;E F - pmaddubsw m2, k2k3 - pmaddubsw m4, k4k5 - paddsw m2, m6 - paddsw m0, m4 - paddsw m0, m2 - paddsw m0, krd - psraw m0, 7 - packuswb m0, m0 -%ifidn %1, v8_avg - movx m1, [dstq] - pavgb m0, m1 -%endif - movx [dstq], m0 -.done: - RET -%endm - -;------------------------------------------------------------------------------- -%macro SUBPIX_VFILTER16 1 -cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ - src, sstride, dst, dstride, height, filter - mova m4, [filterq] - SETUP_LOCAL_VARS -%if ARCH_X86_64 - %define src1q r7 - %define sstride6q r8 - %define dst_stride dstrideq -%else - %define src1q filterq - %define sstride6q dstrideq - %define dst_stride dstridemp -%endif - mov src1q, srcq - add src1q, sstrideq - lea sstride6q, [sstrideq + sstrideq * 4] - add sstride6q, 
sstrideq ;pitch * 6 - -.loop: - movh m0, [srcq ] ;A - movh m1, [srcq + sstrideq ] ;B - movh m2, [srcq + sstrideq * 2 ] ;C - movh m3, [src1q + sstrideq * 2] ;D - movh m4, [srcq + sstrideq * 4 ] ;E - movh m5, [src1q + sstrideq * 4] ;F - - punpcklbw m0, m1 ;A B - movh m6, [srcq + sstride6q] ;G - punpcklbw m2, m3 ;C D - movh m7, [src1q + sstride6q] ;H - punpcklbw m4, m5 ;E F - pmaddubsw m0, k0k1 - movh m3, [srcq + 8] ;A - pmaddubsw m2, k2k3 - punpcklbw m6, m7 ;G H - movh m5, [srcq + sstrideq + 8] ;B - pmaddubsw m4, k4k5 - punpcklbw m3, m5 ;A B - movh m7, [srcq + sstrideq * 2 + 8] ;C - pmaddubsw m6, k6k7 - movh m5, [src1q + sstrideq * 2 + 8] ;D - punpcklbw m7, m5 ;C D - paddsw m2, m6 - pmaddubsw m3, k0k1 - movh m1, [srcq + sstrideq * 4 + 8] ;E - paddsw m0, m4 - pmaddubsw m7, k2k3 - movh m6, [src1q + sstrideq * 4 + 8] ;F - punpcklbw m1, m6 ;E F - paddsw m0, m2 - paddsw m0, krd - movh m2, [srcq + sstride6q + 8] ;G - pmaddubsw m1, k4k5 - movh m5, [src1q + sstride6q + 8] ;H - psraw m0, 7 - punpcklbw m2, m5 ;G H - pmaddubsw m2, k6k7 -%ifidn %1, v8_avg - mova m4, [dstq] -%endif - movh [dstq], m0 - paddsw m7, m2 - paddsw m3, m1 - paddsw m3, m7 - paddsw m3, krd - psraw m3, 7 - packuswb m0, m3 - - add srcq, sstrideq - add src1q, sstrideq -%ifidn %1, v8_avg - pavgb m0, m4 -%endif - mova [dstq], m0 - add dstq, dst_stride - dec heightd - jnz .loop - RET -%endm - -INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm deleted file mode 100644 index a378dd0402..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ /dev/null @@ -1,448 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    pshuflw     xmm4, xmm3, 11111111b       ;k3
-    psrldq      xmm3, 8
-    pshuflw     xmm3, xmm3, 0b              ;k4
-    punpcklqdq  xmm4, xmm3                  ;k3k4
-
-    movq        xmm3, rcx                   ;rounding
-    pshufd      xmm3, xmm3, 0
-
-    pxor        xmm2, xmm2
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
-    punpckldq   xmm0, xmm1                  ;two rows in one register
-    punpcklbw   xmm0, xmm2                  ;unpack to word
-    pmullw      xmm0, xmm4                  ;multiply the filter factors
-
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-
-    paddsw      xmm0, xmm3                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-
-    movd        [rdi], xmm0
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-
-    pshuflw     xmm6, xmm7, 11111111b       ;k3
-    pshufhw     xmm7, xmm7, 0b              ;k4
-    punpcklwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movq        xmm4, rcx                   ;rounding
-    pshufd      xmm4, xmm4, 0
-
-    pxor        xmm5, xmm5
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm4                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro APPLY_FILTER_16 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-    punpckhbw   xmm2, xmm5
-    punpckhbw   xmm3, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    pmullw      xmm2, xmm6
-    pmullw      xmm3, xmm7
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm2, xmm3
-
-    paddsw      xmm0, xmm4                  ;rounding
-    paddsw      xmm2, xmm4
-    psraw       xmm0, 7                     ;shift
-    psraw       xmm2, 7
-    packuswb    xmm0, xmm2                  ;pack back to byte
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
-sym(vpx_filter_block1d4_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
-sym(vpx_filter_block1d8_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
-sym(vpx_filter_block1d16_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;0
-    movdqu      xmm1, [rsi + rax]           ;1
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-
UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_sse2) PRIVATE -sym(vpx_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_sse2) PRIVATE -sym(vpx_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_sse2) PRIVATE -sym(vpx_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d4_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d8_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE -sym(vpx_filter_block1d16_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - 
pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm deleted file mode 100644 index 3c8cfd2253..0000000000 --- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ /dev/null @@ -1,422 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - psrldq xmm3, 6 - packsswb xmm3, xmm3 - pshuflw xmm3, xmm3, 0b ;k3_k4 - - movq xmm2, rcx ;rounding - pshufd xmm2, xmm2, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm3 - - paddsw xmm0, xmm2 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - psrldq xmm7, 6 - packsswb xmm7, xmm7 - pshuflw xmm7, xmm7, 0b ;k3_k4 - punpcklwd xmm7, xmm7 - - movq xmm6, rcx ;rounding - pshufd xmm6, xmm6, 0 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm1 - pmaddubsw xmm0, xmm7 - - paddsw xmm0, xmm6 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte - -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, xmm7 - pmaddubsw xmm2, xmm7 - - paddsw xmm0, xmm6 ;rounding - paddsw xmm2, xmm6 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte - -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE -sym(vpx_filter_block1d4_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE -sym(vpx_filter_block1d8_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE 
-sym(vpx_filter_block1d16_v2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d16_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE -sym(vpx_filter_block1d4_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE -sym(vpx_filter_block1d8_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE -sym(vpx_filter_block1d16_h2_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE 
-sym(vpx_filter_block1d16_h2_avg_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqu      xmm1, [rsi + 1]
-    movdqa      xmm2, xmm0
-
-    APPLY_FILTER_16 1
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret