diff options
Diffstat (limited to 'thirdparty/libvpx/vp8/common/x86')
21 files changed, 0 insertions, 11049 deletions
diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm deleted file mode 100644 index 86fae26956..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm +++ /dev/null @@ -1,93 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -;void vp8_copy32xn_sse2( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp8_copy32xn_sse2) PRIVATE -sym(vp8_copy32xn_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;dst_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;dst_stride - movsxd rcx, dword ptr arg(4) ;height - -.block_copy_sse2_loopx4: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - movdqu xmm2, XMMWORD PTR [rsi + rax] - movdqu xmm3, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqu xmm4, XMMWORD PTR [rsi] - movdqu xmm5, XMMWORD PTR [rsi + 16] - movdqu xmm6, XMMWORD PTR [rsi + rax] - movdqu xmm7, XMMWORD PTR [rsi + rax + 16] - - lea rsi, [rsi+rax*2] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - movdqa XMMWORD PTR [rdi + rdx], xmm2 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm3 - - lea rdi, [rdi+rdx*2] - - movdqa XMMWORD PTR [rdi], xmm4 - movdqa XMMWORD PTR [rdi + 16], xmm5 - movdqa XMMWORD PTR [rdi + rdx], xmm6 - movdqa XMMWORD PTR [rdi + rdx + 16], xmm7 - - lea rdi, [rdi+rdx*2] - - sub rcx, 4 - cmp rcx, 4 - jge .block_copy_sse2_loopx4 - - cmp rcx, 0 - je .copy_is_done - -.block_copy_sse2_loop: - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi + 16] - lea rsi, [rsi+rax] - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm1 - lea rdi, [rdi+rdx] - - sub rcx, 1 - jne .block_copy_sse2_loop - -.copy_is_done: - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm deleted file mode 100644 index d789a40ccf..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm +++ /dev/null @@ -1,146 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define max_sad arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define max_sad [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define max_sad r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define max_sad - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - - -;void vp8_copy32xn_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; int height); -global sym(vp8_copy32xn_sse3) PRIVATE -sym(vp8_copy32xn_sse3): - - STACK_FRAME_CREATE_X3 - -.block_copy_sse3_loopx4: - lea end_ptr, [src_ptr+src_stride*2] - - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - movdqu xmm2, XMMWORD PTR [src_ptr + src_stride] - movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16] - movdqu xmm4, XMMWORD PTR [end_ptr] - movdqu xmm5, XMMWORD PTR [end_ptr + 16] - movdqu xmm6, XMMWORD PTR [end_ptr + src_stride] - movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16] - - lea src_ptr, [src_ptr+src_stride*4] - - lea end_ptr, [ref_ptr+ref_stride*2] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2 - movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3 - movdqa XMMWORD PTR [end_ptr], xmm4 - movdqa XMMWORD PTR [end_ptr + 16], xmm5 - movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6 - movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7 - - lea ref_ptr, [ref_ptr+ref_stride*4] - - sub height, 4 - cmp height, 4 - jge .block_copy_sse3_loopx4 - - ;Check to see if there is more rows need to be copied. - cmp height, 0 - je .copy_is_done - -.block_copy_sse3_loop: - movdqu xmm0, XMMWORD PTR [src_ptr] - movdqu xmm1, XMMWORD PTR [src_ptr + 16] - lea src_ptr, [src_ptr+src_stride] - - movdqa XMMWORD PTR [ref_ptr], xmm0 - movdqa XMMWORD PTR [ref_ptr + 16], xmm1 - lea ref_ptr, [ref_ptr+ref_stride] - - sub height, 1 - jne .block_copy_sse3_loop - -.copy_is_done: - STACK_FRAME_DESTROY_X3 diff --git a/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm b/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm deleted file mode 100644 index 4e551f00aa..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm +++ /dev/null @@ -1,258 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) -global sym(vp8_dequantize_b_impl_mmx) PRIVATE -sym(vp8_dequantize_b_impl_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;sq - mov rdi, arg(1) ;dq - mov rax, arg(2) ;q - - movq mm1, [rsi] - pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers. - movq [rdi], mm1 - - movq mm1, [rsi+8] - pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers. - movq [rdi+8], mm1 - - movq mm1, [rsi+16] - pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers. - movq [rdi+16], mm1 - - movq mm1, [rsi+24] - pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers. - movq [rdi+24], mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void dequant_idct_add_mmx( -;short *input, 0 -;short *dq, 1 -;unsigned char *dest, 2 -;int stride) 3 -global sym(vp8_dequant_idct_add_mmx) PRIVATE -sym(vp8_dequant_idct_add_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - push rdi - ; end prolog - - mov rax, arg(0) ;input - mov rdx, arg(1) ;dq - - - movq mm0, [rax ] - pmullw mm0, [rdx] - - movq mm1, [rax +8] - pmullw mm1, [rdx +8] - - movq mm2, [rax+16] - pmullw mm2, [rdx+16] - - movq mm3, [rax+24] - pmullw mm3, [rdx+24] - - mov rdx, arg(2) ;dest - - pxor mm7, mm7 - - - movq [rax], mm7 - movq [rax+8], mm7 - - movq [rax+16],mm7 - movq [rax+24],mm7 - - - movsxd rdi, dword ptr arg(3) ;stride - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - movq mm3, mm5 ; 33 23 13 03 - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - paddw mm0, [GLOBAL(fours)] - - paddw mm2, [GLOBAL(fours)] - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - psraw mm2, 3 - - psraw mm0, 3 - psraw mm4, 3 - - psraw mm6, 3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - pxor mm7, mm7 - - movd mm4, [rdx] - punpcklbw mm4, mm7 - paddsw mm0, mm4 - packuswb mm0, mm7 - movd [rdx], mm0 - - movd mm4, [rdx+rdi] - punpcklbw mm4, mm7 - paddsw mm1, mm4 - packuswb mm1, mm7 - movd [rdx+rdi], mm1 - - movd mm4, [rdx+2*rdi] - punpcklbw mm4, mm7 - paddsw mm2, mm4 - packuswb mm2, mm7 - movd [rdx+rdi*2], mm2 - - add rdx, rdi - - movd mm4, [rdx+2*rdi] - punpcklbw mm4, mm7 - paddsw mm5, mm4 - packuswb mm5, mm7 - movd [rdx+rdi*2], mm5 - - ; begin epilog - pop rdi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004 diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.c b/thirdparty/libvpx/vp8/common/x86/filter_x86.c deleted file mode 100644 index 7f496ed7db..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/filter_x86.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp8/common/x86/filter_x86.h" - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = -{ - { 128, 128, 128, 128, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = -{ - { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } -}; diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.h b/thirdparty/libvpx/vp8/common/x86/filter_x86.h deleted file mode 100644 index d282841bee..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/filter_x86.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP8_COMMON_X86_FILTER_X86_H_ -#define VP8_COMMON_X86_FILTER_X86_H_ - -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with - * duplicated values */ - -/* duplicated 4x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); - -/* duplicated 8x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c deleted file mode 100644 index f2532b34da..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vp8/common/blockd.h" -#include "vpx_mem/vpx_mem.h" - -extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); - -void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) -{ - short *sq = (short *) d->qcoeff; - short *dq = (short *) d->dqcoeff; - - vp8_dequantize_b_impl_mmx(sq, dq, DQC); -} - -void vp8_dequant_idct_add_y_block_mmx - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (eobs[0] > 1) - vp8_dequant_idct_add_mmx (q, dq, dst, stride); - else if (eobs[0] == 1) - { - vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride); - else if (eobs[1] == 1) - { - vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride, - dst+4, stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - if (eobs[2] > 1) - vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride); - else if (eobs[2] == 1) - { - vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride, - dst+8, stride); - memset(q + 32, 0, 2 * sizeof(q[0])); - } - - if (eobs[3] > 1) - vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride); - else if (eobs[3] == 1) - { - vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride, - dst+12, stride); - memset(q + 48, 0, 2 * sizeof(q[0])); - } - - q += 64; - dst += 4*stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_uv_block_mmx - (short *q, short *dq, - unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) -{ - int i; - - for (i = 0; i < 2; i++) - { - if (eobs[0] > 1) - vp8_dequant_idct_add_mmx (q, dq, dstu, stride); - else if (eobs[0] == 1) - { - vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride); - else if (eobs[1] == 1) - { - vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride, - dstu+4, stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - q += 32; - dstu += 4*stride; - eobs += 2; - } - - for (i = 0; i < 2; i++) - { - if (eobs[0] > 1) - vp8_dequant_idct_add_mmx (q, dq, dstv, stride); - else if (eobs[0] == 1) - { - vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride); - memset(q, 0, 2 * sizeof(q[0])); - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride); - else if (eobs[1] == 1) - { - vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride, - dstv+4, stride); - memset(q + 16, 0, 2 * sizeof(q[0])); - } - - q += 32; - dstv += 4*stride; - eobs += 2; - } -} diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c deleted file mode 100644 index ae96ec858c..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" - -void vp8_idct_dequant_0_2x_sse2 - (short *q, short *dq , - unsigned char *dst, int dst_stride); -void vp8_idct_dequant_full_2x_sse2 - (short *q, short *dq , - unsigned char *dst, int dst_stride); - -void vp8_dequant_idct_add_y_block_sse2 - (short *q, short *dq, - unsigned char *dst, int stride, char *eobs) -{ - int i; - - for (i = 0; i < 4; i++) - { - if (((short *)(eobs))[0]) - { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride); - } - if (((short *)(eobs))[1]) - { - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride); - else - vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride); - } - q += 64; - dst += stride*4; - eobs += 4; - } -} - -void vp8_dequant_idct_add_uv_block_sse2 - (short *q, short *dq, - unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) -{ - if (((short *)(eobs))[0]) - { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); - } - q += 32; - dstu += stride*4; - - if (((short *)(eobs))[1]) - { - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); - } - q += 32; - - if (((short *)(eobs))[2]) - { - if (((short *)(eobs))[2] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); - } - q += 32; - dstv += stride*4; - - if (((short *)(eobs))[3]) - { - if (((short *)(eobs))[3] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); - } -} diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm deleted file mode 100644 index 96fa2c60d0..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -; /**************************************************************************** -; * Notes: -; * -; * This implementation makes use of 16 bit fixed point version of two multiply -; * constants: -; * 1. sqrt(2) * cos (pi/8) -; * 2. sqrt(2) * sin (pi/8) -; * Because the first constant is bigger than 1, to maintain the same 16 bit -; * fixed point precision as the second one, we use a trick of -; * x * a = x + x*(a-1) -; * so -; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). -; * -; * For the second constant, because of the 16bit version is 35468, which -; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative -; * number. -; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x -; * -; **************************************************************************/ - - -;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, -;int pitch, unsigned char *dest,int stride) -global sym(vp8_short_idct4x4llm_mmx) PRIVATE -sym(vp8_short_idct4x4llm_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(0) ;input - mov rsi, arg(1) ;pred - - movq mm0, [rax ] - movq mm1, [rax+ 8] - movq mm2, [rax+16] - movq mm3, [rax+24] - -%if 0 - pxor mm7, mm7 - movq [rax], mm7 - movq [rax+8], mm7 - movq [rax+16],mm7 - movq [rax+24],mm7 -%endif - movsxd rax, dword ptr arg(2) ;pitch - mov rdx, arg(3) ;dest - movsxd rdi, dword ptr arg(4) ;stride - - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - movq mm3, mm5 ; 33 23 13 03 - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)]; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 - - movq mm5, mm1 - movq mm4, mm3 - - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 - - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 - - paddw mm3, mm5 ; d1 - paddw mm0, [GLOBAL(fours)] - - paddw mm2, [GLOBAL(fours)] - movq mm6, mm2 ; a1 - - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 - - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 - - psubw mm6, mm3 ;3 - psraw mm2, 3 - - psraw mm0, 3 - psraw mm4, 3 - - psraw mm6, 3 - - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 - - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 - - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 - - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 - - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 - - pxor mm7, mm7 - - movd mm4, [rsi] - punpcklbw mm4, mm7 - paddsw mm0, mm4 - packuswb mm0, mm7 - movd [rdx], mm0 - - movd mm4, [rsi+rax] - punpcklbw mm4, mm7 - paddsw mm1, mm4 - packuswb mm1, mm7 - movd [rdx+rdi], mm1 - - movd mm4, [rsi+2*rax] - punpcklbw mm4, mm7 - paddsw mm2, mm4 - packuswb mm2, mm7 - movd [rdx+rdi*2], mm2 - - add rdx, rdi - add rsi, rax - - movd mm4, [rsi+2*rax] - punpcklbw mm4, mm7 - paddsw mm5, mm4 - packuswb mm5, mm7 - movd [rdx+rdi*2], mm5 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_dc_only_idct_add_mmx( -;short input_dc, -;unsigned char *pred_ptr, -;int pred_stride, -;unsigned char *dst_ptr, -;int stride) -global sym(vp8_dc_only_idct_add_mmx) PRIVATE -sym(vp8_dc_only_idct_add_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - ; end prolog - - movd mm5, arg(0) ;input_dc - mov rax, arg(1) ;pred_ptr - movsxd rdx, dword ptr arg(2) ;pred_stride - - pxor mm0, mm0 - - paddw mm5, [GLOBAL(fours)] - lea rcx, [rdx + rdx*2] - - psraw mm5, 3 - - punpcklwd mm5, mm5 - - punpckldq mm5, mm5 - - movd mm1, [rax] - movd mm2, [rax+rdx] - movd mm3, [rax+2*rdx] - movd mm4, [rax+rcx] - - mov rax, arg(3) ;d -- destination - movsxd rdx, dword ptr arg(4) ;dst_stride - - punpcklbw mm1, mm0 - paddsw mm1, mm5 - packuswb mm1, mm0 ; pack and unpack to saturate - lea rcx, [rdx + rdx*2] - - punpcklbw mm2, mm0 - paddsw mm2, mm5 - packuswb mm2, mm0 ; pack and unpack to saturate - - punpcklbw mm3, mm0 - paddsw mm3, mm5 - packuswb mm3, mm0 ; pack and unpack to saturate - - punpcklbw mm4, mm0 - paddsw mm4, mm5 - packuswb mm4, mm0 ; pack and unpack to saturate - - movd [rax], mm1 - movd [rax+rdx], mm2 - movd [rax+2*rdx], mm3 - movd [rax+rcx], mm4 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004 diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm deleted file mode 100644 index bf8e2c4021..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm +++ /dev/null @@ -1,708 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_idct_dequant_0_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *dst - 2 -; int dst_stride - 3 -; ) - -global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE -sym(vp8_idct_dequant_0_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - ; end prolog - - mov rdx, arg(1) ; dequant - mov rax, arg(0) ; qcoeff - - movd xmm4, [rax] - movd xmm5, [rdx] - - pinsrw xmm4, [rax+32], 4 - pinsrw xmm5, [rdx], 4 - - pmullw xmm4, xmm5 - - ; Zero out xmm5, for use unpacking - pxor xmm5, xmm5 - - ; clear coeffs - movd [rax], xmm5 - movd [rax+32], xmm5 -;pshufb - mov rax, arg(2) ; dst - movsxd rdx, dword ptr arg(3) ; dst_stride - - pshuflw xmm4, xmm4, 00000000b - pshufhw xmm4, xmm4, 00000000b - - lea rcx, [rdx + rdx*2] - paddw xmm4, [GLOBAL(fours)] - - psraw xmm4, 3 - - movq xmm0, [rax] - movq xmm1, [rax+rdx] - movq xmm2, [rax+2*rdx] - movq xmm3, [rax+rcx] - - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - - - ; Add to predict buffer - paddw xmm0, xmm4 - paddw xmm1, xmm4 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - - ; pack up before storing - packuswb xmm0, xmm5 - packuswb xmm1, xmm5 - packuswb xmm2, xmm5 - packuswb xmm3, xmm5 - - ; store blocks back out - movq [rax], xmm0 - movq [rax + rdx], xmm1 - - lea rax, [rax + 2*rdx] - - movq [rax], xmm2 - movq [rax + rdx], xmm3 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_idct_dequant_full_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *dst - 2 -; int dst_stride - 3 -; ) -global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE -sym(vp8_idct_dequant_full_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rdx, arg(1) ; dequant - mov rdi, arg(2) ; dst - - - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 - - - ; note the transpose of xmm1 and xmm2, necessary for shuffle - ; to spit out sensicle data - movdqa xmm0, [rax] - movdqa xmm2, [rax+16] - movdqa xmm1, [rax+32] - movdqa xmm3, [rax+48] - - ; Clear out coeffs - movdqa [rax], xmm7 - movdqa [rax+16], xmm7 - movdqa [rax+32], xmm7 - movdqa [rax+48], xmm7 - - ; dequantize qcoeff buffer - pmullw xmm0, [rdx] - pmullw xmm2, [rdx+16] - pmullw xmm1, [rdx] - pmullw xmm3, [rdx+16] - movsxd rdx, dword ptr arg(3) ; dst_stride - - ; repack so block 0 row x and block 1 row x are together - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm4, xmm1 - - pshufd xmm0, xmm0, 11011000b - pshufd xmm1, xmm4, 11011000b - - movdqa xmm4, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm4, xmm3 - - pshufd xmm2, xmm2, 11011000b - pshufd xmm3, xmm4, 11011000b - - ; first pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 ; - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - lea rcx, [rdx + rdx*2] ;dst_stride * 3 - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - - ; transpose for the second pass - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - ; second pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - paddw xmm0, [GLOBAL(fours)] - - paddw xmm2, [GLOBAL(fours)] - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - psraw xmm2, 3 - - psraw xmm0, 3 - psraw xmm4, 3 - - psraw xmm6, 3 - - ; transpose to save - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - pxor xmm7, xmm7 - - ; Load up predict blocks - movq xmm4, [rdi] - movq xmm5, [rdi+rdx] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - - movq xmm4, [rdi+2*rdx] - movq xmm5, [rdi+rcx] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm4 - paddw xmm3, xmm5 - -.finish: - - ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - movq [rdi + rdx*2], xmm2 - movq [rdi + rcx], xmm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_idct_dequant_dc_0_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *dst - 2 -; int dst_stride - 3 -; short *dc - 4 -; ) -global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE -sym(vp8_idct_dequant_dc_0_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - - mov rdi, arg(2) ; dst - mov rdx, arg(4) ; dc - - ; Zero out xmm5, for use unpacking - pxor xmm5, xmm5 - - ; load up 2 dc words here == 2*16 = doubleword - movd xmm4, [rdx] - - movsxd rdx, dword ptr arg(3) ; dst_stride - lea rcx, [rdx + rdx*2] - ; Load up predict blocks - movq xmm0, [rdi] - movq xmm1, [rdi+rdx*1] - movq xmm2, [rdi+rdx*2] - movq xmm3, [rdi+rcx] - - ; Duplicate and expand dc across - punpcklwd xmm4, xmm4 - punpckldq xmm4, xmm4 - - ; Rounding to dequant and downshift - paddw xmm4, [GLOBAL(fours)] - psraw xmm4, 3 - - ; Predict buffer needs to be expanded from bytes to words - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - - ; Add to predict buffer - paddw xmm0, xmm4 - paddw xmm1, xmm4 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - - ; pack up before storing - packuswb xmm0, xmm5 - packuswb xmm1, xmm5 - packuswb xmm2, xmm5 - packuswb xmm3, xmm5 - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - movq [rdi + rdx*2], xmm2 - movq [rdi + rcx], xmm3 - - ; begin epilog - pop rdi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -;void vp8_idct_dequant_dc_full_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *dst - 2 -; int dst_stride - 3 -; short *dc - 4 -; ) -global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE -sym(vp8_idct_dequant_dc_full_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rdx, arg(1) ; dequant - - mov rdi, arg(2) ; dst - - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 - - - ; note the transpose of xmm1 and xmm2, necessary for shuffle - ; to spit out sensicle data - movdqa xmm0, [rax] - movdqa xmm2, [rax+16] - movdqa xmm1, [rax+32] - movdqa xmm3, [rax+48] - - ; Clear out coeffs - movdqa [rax], xmm7 - movdqa [rax+16], xmm7 - movdqa [rax+32], xmm7 - movdqa [rax+48], xmm7 - - ; dequantize qcoeff buffer - pmullw xmm0, [rdx] - pmullw xmm2, [rdx+16] - pmullw xmm1, [rdx] - pmullw xmm3, [rdx+16] - - ; DC component - mov rdx, arg(4) - - ; repack so block 0 row x and block 1 row x are together - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm4, xmm1 - - pshufd xmm0, xmm0, 11011000b - pshufd xmm1, xmm4, 11011000b - - movdqa xmm4, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm4, xmm3 - - pshufd xmm2, xmm2, 11011000b - pshufd xmm3, xmm4, 11011000b - - ; insert DC component - pinsrw xmm0, [rdx], 0 - pinsrw xmm0, [rdx+2], 4 - - ; first pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 ; - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - - ; transpose for the second pass - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - ; second pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - paddw xmm0, [GLOBAL(fours)] - - paddw xmm2, [GLOBAL(fours)] - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - psraw xmm2, 3 - - psraw xmm0, 3 - psraw xmm4, 3 - - psraw xmm6, 3 - - ; transpose to save - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - pxor xmm7, xmm7 - - ; Load up predict blocks - movsxd rdx, dword ptr arg(3) ; dst_stride - movq xmm4, [rdi] - movq xmm5, [rdi+rdx] - lea rcx, [rdx + rdx*2] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - - movq xmm4, [rdi+rdx*2] - movq xmm5, [rdi+rcx] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm4 - paddw xmm3, xmm5 - -.finish: - - ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 - - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(3) ; dst_stride - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 - - - ; begin epilog - pop rdi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -fours: - times 8 dw 0x0004 -align 16 -x_s1sqr2: - times 8 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 8 dw 0x4E7B diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm deleted file mode 100644 index 158c3b7458..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm +++ /dev/null @@ -1,140 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE -sym(vp8_short_inv_walsh4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - ; end prolog - - mov rdx, arg(0) - mov rax, 30003h - - movq mm0, [rdx + 0] ;ip[0] - movq mm1, [rdx + 8] ;ip[4] - movq mm7, rax - - movq mm2, [rdx + 16] ;ip[8] - movq mm3, [rdx + 24] ;ip[12] - punpcklwd mm7, mm7 ;0003000300030003h - mov rdx, arg(1) - - movq mm4, mm0 - movq mm5, mm1 - - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm4 ;temp al - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl - - ; 03 02 01 00 - ; 13 12 11 10 - ; 23 22 21 20 - ; 33 32 31 30 - - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 - - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] -;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - paddw mm1, mm7 - paddw mm6, mm7 - psraw mm1, 3 - psraw mm6, 3 - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl - paddw mm0, mm7 - paddw mm5, mm7 - psraw mm0, 3 - psraw mm5, 3 -;~~~~~~~~~~~~~~~~~~~~~ - - movd eax, mm1 - movd ecx, mm0 - psrlq mm0, 32 - psrlq mm1, 32 - mov word ptr[rdx+32*0], ax - mov word ptr[rdx+32*1], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*4], ax - mov word ptr[rdx+32*5], cx - movd eax, mm1 - movd ecx, mm0 - mov word ptr[rdx+32*8], ax - mov word ptr[rdx+32*9], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*12], ax - mov word ptr[rdx+32*13], cx - - movd eax, mm6 - movd ecx, mm5 - psrlq mm5, 32 - psrlq mm6, 32 - mov word ptr[rdx+32*2], ax - mov word ptr[rdx+32*3], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*6], ax - mov word ptr[rdx+32*7], cx - movd eax, mm6 - movd ecx, mm5 - mov word ptr[rdx+32*10], ax - mov word ptr[rdx+32*11], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*14], ax - mov word ptr[rdx+32*15], cx - - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm deleted file mode 100644 index 06e86a80b6..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ /dev/null @@ -1,121 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE -sym(vp8_short_inv_walsh4x4_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - ; end prolog - - mov rcx, arg(0) - mov rdx, arg(1) - mov rax, 30003h - - movdqa xmm0, [rcx + 0] ;ip[4] ip[0] - movdqa xmm1, [rcx + 16] ;ip[12] ip[8] - - - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm0 ;ip[4] ip[0] - - paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm4, xmm0 - punpcklqdq xmm0, xmm3 ;d1 a1 - punpckhqdq xmm4, xmm3 ;c1 b1 - - movdqa xmm1, xmm4 ;c1 b1 - paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] - - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - movd xmm0, eax - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] - - pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03 - - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm3 ;d1 a1 - punpckhqdq xmm5, xmm3 ;c1 b1 - - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] - - paddw xmm5, xmm0 - paddw xmm4, xmm0 - psraw xmm5, 3 - psraw xmm4, 3 - - movd eax, xmm5 - movd ecx, xmm4 - psrldq xmm5, 4 - psrldq xmm4, 4 - mov word ptr[rdx+32*0], ax - mov word ptr[rdx+32*2], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*4], ax - mov word ptr[rdx+32*6], cx - movd eax, xmm5 - movd ecx, xmm4 - psrldq xmm5, 4 - psrldq xmm4, 4 - mov word ptr[rdx+32*8], ax - mov word ptr[rdx+32*10], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*12], ax - mov word ptr[rdx+32*14], cx - - movd eax, xmm5 - movd ecx, xmm4 - psrldq xmm5, 4 - psrldq xmm4, 4 - mov word ptr[rdx+32*1], ax - mov word ptr[rdx+32*3], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*5], ax - mov word ptr[rdx+32*7], cx - movd eax, xmm5 - movd ecx, xmm4 - mov word ptr[rdx+32*9], ax - mov word ptr[rdx+32*11], cx - shr eax, 16 - shr ecx, 16 - mov word ptr[rdx+32*13], ax - mov word ptr[rdx+32*15], cx - - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm deleted file mode 100644 index 6d5aaa19db..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm +++ /dev/null @@ -1,815 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro LF_ABS 2 - ; %1 value not preserved - ; %2 value preserved - ; output in %1 - movdqa scratch1, %2 ; v2 - - psubusb scratch1, %1 ; v2 - v1 - psubusb %1, %2 ; v1 - v2 - por %1, scratch1 ; abs(v2 - v1) -%endmacro - -%macro LF_FILTER_HEV_MASK 8-9 - - LF_ABS %1, %2 ; abs(p3 - p2) - LF_ABS %2, %3 ; abs(p2 - p1) - pmaxub %1, %2 ; accumulate mask -%if %0 == 8 - movdqa scratch2, %3 ; save p1 - LF_ABS scratch2, %4 ; abs(p1 - p0) -%endif - LF_ABS %4, %5 ; abs(p0 - q0) - LF_ABS %5, %6 ; abs(q0 - q1) -%if %0 == 8 - pmaxub %5, scratch2 ; accumulate hev -%else - pmaxub %5, %9 -%endif - pmaxub %1, %5 ; accumulate mask - - LF_ABS %3, %6 ; abs(p1 - q1) - LF_ABS %6, %7 ; abs(q1 - q2) - pmaxub %1, %6 ; accumulate mask - LF_ABS %7, %8 ; abs(q2 - q3) - pmaxub %1, %7 ; accumulate mask - - paddusb %4, %4 ; 2 * abs(p0 - q0) - pand %3, [GLOBAL(tfe)] - psrlw %3, 1 ; abs(p1 - q1) / 2 - paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - psubusb %1, [limit] - psubusb %4, [blimit] - por %1, %4 - pcmpeqb %1, zero ; mask - - psubusb %5, [thresh] - pcmpeqb %5, zero ; ~hev -%endmacro - -%macro LF_FILTER 6 - ; %1-%4: p1-q1 - ; %5: mask - ; %6: hev - - movdqa scratch2, %6 ; save hev - - pxor %1, [GLOBAL(t80)] ; ps1 - pxor %4, [GLOBAL(t80)] ; qs1 - movdqa scratch1, %1 - psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1) - pandn scratch2, scratch1 ; vp8_filter &= hev - - pxor %2, [GLOBAL(t80)] ; ps0 - pxor %3, [GLOBAL(t80)] ; qs0 - movdqa scratch1, %3 - psubsb scratch1, %2 ; qs0 - ps0 - paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) - paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) - paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) - pand %5, scratch2 ; &= mask - - movdqa scratch2, %5 - paddsb %5, [GLOBAL(t4)] ; Filter1 - paddsb scratch2, [GLOBAL(t3)] ; Filter2 - - ; Filter1 >> 3 - movdqa scratch1, zero - pcmpgtb scratch1, %5 - psrlw %5, 3 - pand scratch1, [GLOBAL(te0)] - pand %5, [GLOBAL(t1f)] - por %5, scratch1 - - psubsb %3, %5 ; qs0 - Filter1 - pxor %3, [GLOBAL(t80)] - - ; Filter2 >> 3 - movdqa scratch1, zero - pcmpgtb scratch1, scratch2 - psrlw scratch2, 3 - pand scratch1, [GLOBAL(te0)] - pand scratch2, [GLOBAL(t1f)] - por scratch2, scratch1 - - paddsb %2, scratch2 ; ps0 + Filter2 - pxor %2, [GLOBAL(t80)] - - ; outer tap adjustments - paddsb %5, [GLOBAL(t1)] - movdqa scratch1, zero - pcmpgtb scratch1, %5 - psrlw %5, 1 - pand scratch1, [GLOBAL(t80)] - pand %5, [GLOBAL(t7f)] - por %5, scratch1 - pand %5, %6 ; vp8_filter &= ~hev - - psubsb %4, %5 ; qs1 - vp8_filter - pxor %4, [GLOBAL(t80)] - - paddsb %1, %5 ; ps1 + vp8_filter - pxor %1, [GLOBAL(t80)] -%endmacro - -;void vp8_loop_filter_bh_y_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh -;) -global sym(vp8_loop_filter_bh_y_sse2) PRIVATE -sym(vp8_loop_filter_bh_y_sse2): - -%if LIBVPX_YASM_WIN64 - %define src rcx ; src_ptr - %define stride rdx ; src_pixel_step - %define blimit r8 - %define limit r9 - %define thresh r10 - - %define spp rax - %define stride3 r11 - %define stride5 r12 - %define stride7 r13 - - push rbp - mov rbp, rsp - SAVE_XMM 11 - push r12 - push r13 - mov thresh, arg(4) -%else - %define src rdi ; src_ptr - %define stride rsi ; src_pixel_step - %define blimit rdx - %define limit rcx - %define thresh r8 - - %define spp rax - %define stride3 r9 - %define stride5 r10 - %define stride7 r11 -%endif - - %define scratch1 xmm5 - %define scratch2 xmm6 - %define zero xmm7 - - %define i0 [src] - %define i1 [spp] - %define i2 [src + 2 * stride] - %define i3 [spp + 2 * stride] - %define i4 [src + 4 * stride] - %define i5 [spp + 4 * stride] - %define i6 [src + 2 * stride3] - %define i7 [spp + 2 * stride3] - %define i8 [src + 8 * stride] - %define i9 [spp + 8 * stride] - %define i10 [src + 2 * stride5] - %define i11 [spp + 2 * stride5] - %define i12 [src + 4 * stride3] - %define i13 [spp + 4 * stride3] - %define i14 [src + 2 * stride7] - %define i15 [spp + 2 * stride7] - - ; prep work - lea spp, [src + stride] - lea stride3, [stride + 2 * stride] - lea stride5, [stride3 + 2 * stride] - lea stride7, [stride3 + 4 * stride] - pxor zero, zero - - ; load the first set into registers - movdqa xmm0, i0 - movdqa xmm1, i1 - movdqa xmm2, i2 - movdqa xmm3, i3 - movdqa xmm4, i4 - movdqa xmm8, i5 - movdqa xmm9, i6 ; q2, will contain abs(p1-p0) - movdqa xmm10, i7 -LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10 - - movdqa xmm1, i2 - movdqa xmm2, i3 - movdqa xmm3, i4 - movdqa xmm8, i5 -LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4 - movdqa i2, xmm1 - movdqa i3, xmm2 - -; second set - movdqa i4, xmm3 - movdqa i5, xmm8 - - movdqa xmm0, i6 - movdqa xmm1, i7 - movdqa xmm2, i8 - movdqa xmm4, i9 - movdqa xmm10, i10 ; q2, will contain abs(p1-p0) - movdqa xmm11, i11 -LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9 - - movdqa xmm0, i6 - movdqa xmm1, i7 - movdqa xmm4, i8 - movdqa xmm8, i9 -LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 - movdqa i6, xmm0 - movdqa i7, xmm1 - -; last set - movdqa i8, xmm4 - movdqa i9, xmm8 - - movdqa xmm0, i10 - movdqa xmm1, i11 - movdqa xmm2, i12 - movdqa xmm3, i13 - movdqa xmm9, i14 ; q2, will contain abs(p1-p0) - movdqa xmm11, i15 -LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10 - - movdqa xmm0, i10 - movdqa xmm1, i11 - movdqa xmm3, i12 - movdqa xmm8, i13 -LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 - movdqa i10, xmm0 - movdqa i11, xmm1 - movdqa i12, xmm3 - movdqa i13, xmm8 - -%if LIBVPX_YASM_WIN64 - pop r13 - pop r12 - RESTORE_XMM - pop rbp -%endif - - ret - - -;void vp8_loop_filter_bv_y_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh -;) - -global sym(vp8_loop_filter_bv_y_sse2) PRIVATE -sym(vp8_loop_filter_bv_y_sse2): - -%if LIBVPX_YASM_WIN64 - %define src rcx ; src_ptr - %define stride rdx ; src_pixel_step - %define blimit r8 - %define limit r9 - %define thresh r10 - - %define spp rax - %define stride3 r11 - %define stride5 r12 - %define stride7 r13 - - push rbp - mov rbp, rsp - SAVE_XMM 15 - push r12 - push r13 - mov thresh, arg(4) -%else - %define src rdi - %define stride rsi - %define blimit rdx - %define limit rcx - %define thresh r8 - - %define spp rax - %define stride3 r9 - %define stride5 r10 - %define stride7 r11 -%endif - - %define scratch1 xmm5 - %define scratch2 xmm6 - %define zero xmm7 - - %define s0 [src] - %define s1 [spp] - %define s2 [src + 2 * stride] - %define s3 [spp + 2 * stride] - %define s4 [src + 4 * stride] - %define s5 [spp + 4 * stride] - %define s6 [src + 2 * stride3] - %define s7 [spp + 2 * stride3] - %define s8 [src + 8 * stride] - %define s9 [spp + 8 * stride] - %define s10 [src + 2 * stride5] - %define s11 [spp + 2 * stride5] - %define s12 [src + 4 * stride3] - %define s13 [spp + 4 * stride3] - %define s14 [src + 2 * stride7] - %define s15 [spp + 2 * stride7] - - %define i0 [rsp] - %define i1 [rsp + 16] - %define i2 [rsp + 32] - %define i3 [rsp + 48] - %define i4 [rsp + 64] - %define i5 [rsp + 80] - %define i6 [rsp + 96] - %define i7 [rsp + 112] - %define i8 [rsp + 128] - %define i9 [rsp + 144] - %define i10 [rsp + 160] - %define i11 [rsp + 176] - %define i12 [rsp + 192] - %define i13 [rsp + 208] - %define i14 [rsp + 224] - %define i15 [rsp + 240] - - ALIGN_STACK 16, rax - - ; reserve stack space - %define temp_storage 0 ; size is 256 (16*16) - %define stack_size 256 - sub rsp, stack_size - - ; prep work - lea spp, [src + stride] - lea stride3, [stride + 2 * stride] - lea stride5, [stride3 + 2 * stride] - lea stride7, [stride3 + 4 * stride] - - ; 8-f - movdqa xmm0, s8 - movdqa xmm1, xmm0 - punpcklbw xmm0, s9 ; 80 90 - punpckhbw xmm1, s9 ; 88 98 - - movdqa xmm2, s10 - movdqa xmm3, xmm2 - punpcklbw xmm2, s11 ; a0 b0 - punpckhbw xmm3, s11 ; a8 b8 - - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm2 ; 80 90 a0 b0 - punpckhwd xmm4, xmm2 ; 84 94 a4 b4 - - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm3 ; 88 98 a8 b8 - punpckhwd xmm2, xmm3 ; 8c 9c ac bc - - ; using xmm[0124] - ; work on next 4 rows - - movdqa xmm3, s12 - movdqa xmm5, xmm3 - punpcklbw xmm3, s13 ; c0 d0 - punpckhbw xmm5, s13 ; c8 d8 - - movdqa xmm6, s14 - movdqa xmm7, xmm6 - punpcklbw xmm6, s15 ; e0 f0 - punpckhbw xmm7, s15 ; e8 f8 - - movdqa xmm8, xmm3 - punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 - punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 - - movdqa xmm6, xmm5 - punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 - punpckhwd xmm6, xmm7 ; cc dc ec fc - - ; pull the third and fourth sets together - - movdqa xmm7, xmm0 - punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 - punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 - - movdqa xmm3, xmm4 - punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 - punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 - - movdqa xmm8, xmm1 - punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 - punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa - - movdqa xmm5, xmm2 - punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc - punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe - - ; save the calculations. we only have 15 registers ... - movdqa i0, xmm0 - movdqa i1, xmm7 - movdqa i2, xmm4 - movdqa i3, xmm3 - movdqa i4, xmm1 - movdqa i5, xmm8 - movdqa i6, xmm2 - movdqa i7, xmm5 - - ; 0-7 - movdqa xmm0, s0 - movdqa xmm1, xmm0 - punpcklbw xmm0, s1 ; 00 10 - punpckhbw xmm1, s1 ; 08 18 - - movdqa xmm2, s2 - movdqa xmm3, xmm2 - punpcklbw xmm2, s3 ; 20 30 - punpckhbw xmm3, s3 ; 28 38 - - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm2 ; 00 10 20 30 - punpckhwd xmm4, xmm2 ; 04 14 24 34 - - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm3 ; 08 18 28 38 - punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c - - ; using xmm[0124] - ; work on next 4 rows - - movdqa xmm3, s4 - movdqa xmm5, xmm3 - punpcklbw xmm3, s5 ; 40 50 - punpckhbw xmm5, s5 ; 48 58 - - movdqa xmm6, s6 - movdqa xmm7, xmm6 - punpcklbw xmm6, s7 ; 60 70 - punpckhbw xmm7, s7 ; 68 78 - - movdqa xmm8, xmm3 - punpcklwd xmm3, xmm6 ; 40 50 60 70 - punpckhwd xmm8, xmm6 ; 44 54 64 74 - - movdqa xmm6, xmm5 - punpcklwd xmm5, xmm7 ; 48 58 68 78 - punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c - - ; pull the first two sets together - - movdqa xmm7, xmm0 - punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 - punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 - - movdqa xmm3, xmm4 - punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 - punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 - - movdqa xmm8, xmm1 - punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 - punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a - - movdqa xmm5, xmm2 - punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c - punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e - ; final combination - - movdqa xmm6, xmm0 - punpcklqdq xmm0, i0 - punpckhqdq xmm6, i0 - - movdqa xmm9, xmm7 - punpcklqdq xmm7, i1 - punpckhqdq xmm9, i1 - - movdqa xmm10, xmm4 - punpcklqdq xmm4, i2 - punpckhqdq xmm10, i2 - - movdqa xmm11, xmm3 - punpcklqdq xmm3, i3 - punpckhqdq xmm11, i3 - - movdqa xmm12, xmm1 - punpcklqdq xmm1, i4 - punpckhqdq xmm12, i4 - - movdqa xmm13, xmm8 - punpcklqdq xmm8, i5 - punpckhqdq xmm13, i5 - - movdqa xmm14, xmm2 - punpcklqdq xmm2, i6 - punpckhqdq xmm14, i6 - - movdqa xmm15, xmm5 - punpcklqdq xmm5, i7 - punpckhqdq xmm15, i7 - - movdqa i0, xmm0 - movdqa i1, xmm6 - movdqa i2, xmm7 - movdqa i3, xmm9 - movdqa i4, xmm4 - movdqa i5, xmm10 - movdqa i6, xmm3 - movdqa i7, xmm11 - movdqa i8, xmm1 - movdqa i9, xmm12 - movdqa i10, xmm8 - movdqa i11, xmm13 - movdqa i12, xmm2 - movdqa i13, xmm14 - movdqa i14, xmm5 - movdqa i15, xmm15 - -; TRANSPOSED DATA AVAILABLE ON THE STACK - - movdqa xmm12, xmm6 - movdqa xmm13, xmm7 - - pxor zero, zero - -LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11 - - movdqa xmm1, i2 - movdqa xmm2, i3 - movdqa xmm8, i4 - movdqa xmm9, i5 -LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4 - movdqa i2, xmm1 - movdqa i3, xmm2 - -; second set - movdqa i4, xmm8 - movdqa i5, xmm9 - - movdqa xmm0, i6 - movdqa xmm1, i7 - movdqa xmm2, i8 - movdqa xmm4, i9 - movdqa xmm10, i10 ; q2, will contain abs(p1-p0) - movdqa xmm11, i11 -LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3 - - movdqa xmm0, i6 - movdqa xmm1, i7 - movdqa xmm3, i8 - movdqa xmm4, i9 -LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2 - movdqa i6, xmm0 - movdqa i7, xmm1 - -; last set - movdqa i8, xmm3 - movdqa i9, xmm4 - - movdqa xmm0, i10 - movdqa xmm1, i11 - movdqa xmm2, i12 - movdqa xmm8, i13 - movdqa xmm9, i14 ; q2, will contain abs(p1-p0) - movdqa xmm11, i15 -LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10 - - movdqa xmm0, i10 - movdqa xmm1, i11 - movdqa xmm4, i12 - movdqa xmm8, i13 -LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 - movdqa i10, xmm0 - movdqa i11, xmm1 - movdqa i12, xmm4 - movdqa i13, xmm8 - - -; RESHUFFLE AND WRITE OUT - ; 8-f - movdqa xmm0, i8 - movdqa xmm1, xmm0 - punpcklbw xmm0, i9 ; 80 90 - punpckhbw xmm1, i9 ; 88 98 - - movdqa xmm2, i10 - movdqa xmm3, xmm2 - punpcklbw xmm2, i11 ; a0 b0 - punpckhbw xmm3, i11 ; a8 b8 - - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm2 ; 80 90 a0 b0 - punpckhwd xmm4, xmm2 ; 84 94 a4 b4 - - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm3 ; 88 98 a8 b8 - punpckhwd xmm2, xmm3 ; 8c 9c ac bc - - ; using xmm[0124] - ; work on next 4 rows - - movdqa xmm3, i12 - movdqa xmm5, xmm3 - punpcklbw xmm3, i13 ; c0 d0 - punpckhbw xmm5, i13 ; c8 d8 - - movdqa xmm6, i14 - movdqa xmm7, xmm6 - punpcklbw xmm6, i15 ; e0 f0 - punpckhbw xmm7, i15 ; e8 f8 - - movdqa xmm8, xmm3 - punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 - punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 - - movdqa xmm6, xmm5 - punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 - punpckhwd xmm6, xmm7 ; cc dc ec fc - - ; pull the third and fourth sets together - - movdqa xmm7, xmm0 - punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 - punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 - - movdqa xmm3, xmm4 - punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 - punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 - - movdqa xmm8, xmm1 - punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 - punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa - - movdqa xmm5, xmm2 - punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc - punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe - - ; save the calculations. we only have 15 registers ... - movdqa i8, xmm0 - movdqa i9, xmm7 - movdqa i10, xmm4 - movdqa i11, xmm3 - movdqa i12, xmm1 - movdqa i13, xmm8 - movdqa i14, xmm2 - movdqa i15, xmm5 - - ; 0-7 - movdqa xmm0, i0 - movdqa xmm1, xmm0 - punpcklbw xmm0, i1 ; 00 10 - punpckhbw xmm1, i1 ; 08 18 - - movdqa xmm2, i2 - movdqa xmm3, xmm2 - punpcklbw xmm2, i3 ; 20 30 - punpckhbw xmm3, i3 ; 28 38 - - movdqa xmm4, xmm0 - punpcklwd xmm0, xmm2 ; 00 10 20 30 - punpckhwd xmm4, xmm2 ; 04 14 24 34 - - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm3 ; 08 18 28 38 - punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c - - ; using xmm[0124] - ; work on next 4 rows - - movdqa xmm3, i4 - movdqa xmm5, xmm3 - punpcklbw xmm3, i5 ; 40 50 - punpckhbw xmm5, i5 ; 48 58 - - movdqa xmm6, i6 - movdqa xmm7, xmm6 - punpcklbw xmm6, i7 ; 60 70 - punpckhbw xmm7, i7 ; 68 78 - - movdqa xmm8, xmm3 - punpcklwd xmm3, xmm6 ; 40 50 60 70 - punpckhwd xmm8, xmm6 ; 44 54 64 74 - - movdqa xmm6, xmm5 - punpcklwd xmm5, xmm7 ; 48 58 68 78 - punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c - - ; pull the first two sets together - - movdqa xmm7, xmm0 - punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 - punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 - - movdqa xmm3, xmm4 - punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 - punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 - - movdqa xmm8, xmm1 - punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 - punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a - - movdqa xmm5, xmm2 - punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c - punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e - ; final combination - - movdqa xmm6, xmm0 - punpcklqdq xmm0, i8 - punpckhqdq xmm6, i8 - - movdqa xmm9, xmm7 - punpcklqdq xmm7, i9 - punpckhqdq xmm9, i9 - - movdqa xmm10, xmm4 - punpcklqdq xmm4, i10 - punpckhqdq xmm10, i10 - - movdqa xmm11, xmm3 - punpcklqdq xmm3, i11 - punpckhqdq xmm11, i11 - - movdqa xmm12, xmm1 - punpcklqdq xmm1, i12 - punpckhqdq xmm12, i12 - - movdqa xmm13, xmm8 - punpcklqdq xmm8, i13 - punpckhqdq xmm13, i13 - - movdqa xmm14, xmm2 - punpcklqdq xmm2, i14 - punpckhqdq xmm14, i14 - - movdqa xmm15, xmm5 - punpcklqdq xmm5, i15 - punpckhqdq xmm15, i15 - - movdqa s0, xmm0 - movdqa s1, xmm6 - movdqa s2, xmm7 - movdqa s3, xmm9 - movdqa s4, xmm4 - movdqa s5, xmm10 - movdqa s6, xmm3 - movdqa s7, xmm11 - movdqa s8, xmm1 - movdqa s9, xmm12 - movdqa s10, xmm8 - movdqa s11, xmm13 - movdqa s12, xmm2 - movdqa s13, xmm14 - movdqa s14, xmm5 - movdqa s15, xmm15 - - ; free stack space - add rsp, stack_size - - ; un-ALIGN_STACK - pop rsp - -%if LIBVPX_YASM_WIN64 - pop r13 - pop r12 - RESTORE_XMM - pop rbp -%endif - - ret - -SECTION_RODATA -align 16 -te0: - times 16 db 0xe0 -align 16 -t7f: - times 16 db 0x7f -align 16 -tfe: - times 16 db 0xfe -align 16 -t1f: - times 16 db 0x1f -align 16 -t80: - times 16 db 0x80 -align 16 -t1: - times 16 db 0x01 -align 16 -t3: - times 16 db 0x03 -align 16 -t4: - times 16 db 0x04 diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm deleted file mode 100644 index 1913abc69b..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ /dev/null @@ -1,1640 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -%define _t0 0 -%define _t1 _t0 + 16 -%define _p3 _t1 + 16 -%define _p2 _p3 + 16 -%define _p1 _p2 + 16 -%define _p0 _p1 + 16 -%define _q0 _p0 + 16 -%define _q1 _q0 + 16 -%define _q2 _q1 + 16 -%define _q3 _q2 + 16 -%define lf_var_size 160 - -; Use of pmaxub instead of psubusb to compute filter mask was seen -; in ffvp8 - -%macro LFH_FILTER_AND_HEV_MASK 1 -%if %1 - movdqa xmm2, [rdi+2*rax] ; q3 - movdqa xmm1, [rsi+2*rax] ; q2 - movdqa xmm4, [rsi+rax] ; q1 - movdqa xmm5, [rsi] ; q0 - neg rax ; negate pitch to deal with above border -%else - movlps xmm2, [rsi + rcx*2] ; q3 - movlps xmm1, [rsi + rcx] ; q2 - movlps xmm4, [rsi] ; q1 - movlps xmm5, [rsi + rax] ; q0 - - movhps xmm2, [rdi + rcx*2] - movhps xmm1, [rdi + rcx] - movhps xmm4, [rdi] - movhps xmm5, [rdi + rax] - - lea rsi, [rsi + rax*4] - lea rdi, [rdi + rax*4] - - movdqa [rsp+_q2], xmm1 ; store q2 - movdqa [rsp+_q1], xmm4 ; store q1 -%endif - movdqa xmm7, [rdx] ;limit - - movdqa xmm6, xmm1 ; q2 - movdqa xmm3, xmm4 ; q1 - - psubusb xmm1, xmm2 ; q2-=q3 - psubusb xmm2, xmm6 ; q3-=q2 - - psubusb xmm4, xmm6 ; q1-=q2 - psubusb xmm6, xmm3 ; q2-=q1 - - por xmm4, xmm6 ; abs(q2-q1) - por xmm1, xmm2 ; abs(q3-q2) - - movdqa xmm0, xmm5 ; q0 - pmaxub xmm1, xmm4 - - psubusb xmm5, xmm3 ; q0-=q1 - psubusb xmm3, xmm0 ; q1-=q0 - - por xmm5, xmm3 ; abs(q0-q1) - movdqa [rsp+_t0], xmm5 ; save to t0 - - pmaxub xmm1, xmm5 - -%if %1 - movdqa xmm2, [rsi+4*rax] ; p3 - movdqa xmm4, [rdi+4*rax] ; p2 - movdqa xmm6, [rsi+2*rax] ; p1 -%else - movlps xmm2, [rsi + rax] ; p3 - movlps xmm4, [rsi] ; p2 - movlps xmm6, [rsi + rcx] ; p1 - - movhps xmm2, [rdi + rax] - movhps xmm4, [rdi] - movhps xmm6, [rdi + rcx] - - movdqa [rsp+_p2], xmm4 ; store p2 - movdqa [rsp+_p1], xmm6 ; store p1 -%endif - - movdqa xmm5, xmm4 ; p2 - movdqa xmm3, xmm6 ; p1 - - psubusb xmm4, xmm2 ; p2-=p3 - psubusb xmm2, xmm5 ; p3-=p2 - - psubusb xmm3, xmm5 ; p1-=p2 - pmaxub xmm1, xmm4 ; abs(p3 - p2) - - psubusb xmm5, xmm6 ; p2-=p1 - pmaxub xmm1, xmm2 ; abs(p3 - p2) - - pmaxub xmm1, xmm5 ; abs(p2 - p1) - movdqa xmm2, xmm6 ; p1 - - pmaxub xmm1, xmm3 ; abs(p2 - p1) -%if %1 - movdqa xmm4, [rsi+rax] ; p0 - movdqa xmm3, [rdi] ; q1 -%else - movlps xmm4, [rsi + rcx*2] ; p0 - movhps xmm4, [rdi + rcx*2] - movdqa xmm3, [rsp+_q1] ; q1 -%endif - - movdqa xmm5, xmm4 ; p0 - psubusb xmm4, xmm6 ; p0-=p1 - - psubusb xmm6, xmm5 ; p1-=p0 - - por xmm6, xmm4 ; abs(p1 - p0) - mov rdx, arg(2) ; get blimit - - movdqa [rsp+_t1], xmm6 ; save to t1 - - movdqa xmm4, xmm3 ; q1 - pmaxub xmm1, xmm6 - - psubusb xmm3, xmm2 ; q1-=p1 - psubusb xmm2, xmm4 ; p1-=q1 - - psubusb xmm1, xmm7 - por xmm2, xmm3 ; abs(p1-q1) - - movdqa xmm7, [rdx] ; blimit - mov rdx, arg(4) ; hev get thresh - - movdqa xmm3, xmm0 ; q0 - pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - - movdqa xmm6, xmm5 ; p0 - psrlw xmm2, 1 ; abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; p0-=q0 - psubusb xmm3, xmm6 ; q0-=p0 - por xmm5, xmm3 ; abs(p0 - q0) - - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - - movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0) - movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0) - - paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - movdqa xmm2, [rdx] ; hev - - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - psubusb xmm4, xmm2 ; hev - - psubusb xmm3, xmm2 ; hev - por xmm1, xmm5 - - pxor xmm7, xmm7 - paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb xmm4, xmm5 ; hev - pcmpeqb xmm3, xmm3 ; hev - - pcmpeqb xmm1, xmm7 ; mask xmm1 - pxor xmm4, xmm3 ; hev -%endmacro - -%macro B_FILTER 1 - movdqa xmm3, [GLOBAL(t80)] -%if %1 == 0 - movdqa xmm2, [rsp+_p1] ; p1 - movdqa xmm7, [rsp+_q1] ; q1 -%elif %1 == 1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 -%elif %1 == 2 - movdqa xmm2, [rsp+_p1] ; p1 - movdqa xmm6, [rsp+_p0] ; p0 - movdqa xmm0, [rsp+_q0] ; q0 - movdqa xmm7, [rsp+_q1] ; q1 -%endif - - pxor xmm2, xmm3 ; p1 offset to convert to signed values - pxor xmm7, xmm3 ; q1 offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, xmm3 ; offset to convert to signed values - - pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) - pxor xmm0, xmm3 ; offset to convert to signed values - - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 - paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - punpckhbw xmm5, xmm2 ; axbxcxdx - punpcklbw xmm2, xmm2 ; exfxgxhx - - punpcklbw xmm0, xmm1 ; exfxgxhx - psraw xmm5, 11 ; sign extended shift right by 3 - - punpckhbw xmm1, xmm1 ; axbxcxdx - psraw xmm2, 11 ; sign extended shift right by 3 - - packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - psraw xmm0, 11 ; sign extended shift right by 3 - - psraw xmm1, 11 ; sign extended shift right by 3 - movdqa xmm5, xmm0 ; save results - - packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - - paddsb xmm6, xmm2 ; p0+= p0 add - - movdqa xmm2, [GLOBAL(ones)] - paddsw xmm5, xmm2 - paddsw xmm1, xmm2 - psraw xmm5, 1 ; partial shifted one more time for 2nd tap - psraw xmm1, 1 ; partial shifted one more time for 2nd tap - packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - movdqa xmm2, [GLOBAL(t80)] - -%if %1 == 0 - movdqa xmm1, [rsp+_p1] ; p1 - lea rsi, [rsi + rcx*2] - lea rdi, [rdi + rcx*2] -%elif %1 == 1 - movdqa xmm1, [rsi+2*rax] ; p1 -%elif %1 == 2 - movdqa xmm1, [rsp+_p1] ; p1 -%endif - - pandn xmm4, xmm5 ; high edge variance additive - pxor xmm6, xmm2 ; unoffset - - pxor xmm1, xmm2 ; reoffset - psubsb xmm3, xmm0 ; q0-= q0 add - - paddsb xmm1, xmm4 ; p1+= p1 add - pxor xmm3, xmm2 ; unoffset - - pxor xmm1, xmm2 ; unoffset - psubsb xmm7, xmm4 ; q1-= q1 add - - pxor xmm7, xmm2 ; unoffset -%if %1 == 0 - movq [rsi], xmm6 ; p0 - movhps [rdi], xmm6 - movq [rsi + rax], xmm1 ; p1 - movhps [rdi + rax], xmm1 - movq [rsi + rcx], xmm3 ; q0 - movhps [rdi + rcx], xmm3 - movq [rsi + rcx*2], xmm7 ; q1 - movhps [rdi + rcx*2], xmm7 -%elif %1 == 1 - movdqa [rsi+rax], xmm6 ; write back - movdqa [rsi+2*rax], xmm1 ; write back - movdqa [rsi], xmm3 ; write back - movdqa [rdi], xmm7 ; write back -%endif - -%endmacro - -%if ABI_IS_32BIT - -;void vp8_loop_filter_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -;) -global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE -sym(vp8_loop_filter_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step - - mov rdx, arg(3) ;limit - - lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 1 - ; filter and write back the result - B_FILTER 1 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -%endif - -;void vp8_loop_filter_horizontal_edge_uv_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE -sym(vp8_loop_filter_horizontal_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ; u - mov rdi, arg(5) ; v - movsxd rax, dword ptr arg(1) ; src_pixel_step - mov rcx, rax - neg rax ; negate pitch to deal with above border - - mov rdx, arg(3) ;limit - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 0 - ; filter and write back the result - B_FILTER 0 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -%macro MB_FILTER_AND_WRITEBACK 1 - movdqa xmm3, [GLOBAL(t80)] -%if %1 == 0 - movdqa xmm2, [rsp+_p1] ; p1 - movdqa xmm7, [rsp+_q1] ; q1 -%elif %1 == 1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 - - mov rcx, rax - neg rcx -%elif %1 == 2 - movdqa xmm2, [rsp+_p1] ; p1 - movdqa xmm6, [rsp+_p0] ; p0 - movdqa xmm0, [rsp+_q0] ; q0 - movdqa xmm7, [rsp+_q1] ; q1 -%endif - - pxor xmm2, xmm3 ; p1 offset to convert to signed values - pxor xmm7, xmm3 ; q1 offset to convert to signed values - pxor xmm6, xmm3 ; offset to convert to signed values - pxor xmm0, xmm3 ; offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb xmm2, xmm0 ; 2 * (q0 - p0) - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1) - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 ; vp8_filter - - pand xmm2, xmm4 ; Filter2 = vp8_filter & hev - pxor xmm0, xmm0 - - pandn xmm4, xmm1 ; vp8_filter&=~hev - pxor xmm1, xmm1 - - punpcklbw xmm0, xmm4 ; Filter 2 (hi) - punpckhbw xmm1, xmm4 ; Filter 2 (lo) - - movdqa xmm5, xmm2 - - movdqa xmm4, [GLOBAL(s9)] - paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3) - paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - - pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9 - pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9 - - punpckhbw xmm7, xmm5 ; axbxcxdx - punpcklbw xmm5, xmm5 ; exfxgxhx - - psraw xmm7, 11 ; sign extended shift right by 3 - - psraw xmm5, 11 ; sign extended shift right by 3 - punpckhbw xmm4, xmm2 ; axbxcxdx - - punpcklbw xmm2, xmm2 ; exfxgxhx - psraw xmm4, 11 ; sign extended shift right by 3 - - packsswb xmm5, xmm7 ; Filter2 >>=3; - psraw xmm2, 11 ; sign extended shift right by 3 - - packsswb xmm2, xmm4 ; Filter1 >>=3; - - paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2 - - psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1 - movdqa xmm7, xmm1 - - movdqa xmm4, [GLOBAL(s63)] - movdqa xmm5, xmm0 - movdqa xmm2, xmm5 - paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63 - paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63 - movdqa xmm4, xmm7 - - paddw xmm5, xmm5 ; Filter 2 (hi) * 18 - - paddw xmm7, xmm7 ; Filter 2 (lo) * 18 - paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63 - - paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63 - paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63 - psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7 - - paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63 - psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7 - psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7 - - packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) - - psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7 - psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7 - psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7 - - packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) - movdqa xmm7, [GLOBAL(t80)] - -%if %1 == 0 - movdqa xmm1, [rsp+_q1] ; q1 - movdqa xmm4, [rsp+_p1] ; p1 - lea rsi, [rsi+rcx*2] - lea rdi, [rdi+rcx*2] - -%elif %1 == 1 - movdqa xmm1, [rdi] ; q1 - movdqa xmm4, [rsi+rax*2] ; p1 -%elif %1 == 2 - movdqa xmm4, [rsp+_p1] ; p1 - movdqa xmm1, [rsp+_q1] ; q1 -%endif - - pxor xmm1, xmm7 - pxor xmm4, xmm7 - - psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3) - paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3) - psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2) - paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2) - -%if %1 == 1 - movdqa xmm2, [rdi+rax*4] ; p2 - movdqa xmm5, [rdi+rcx] ; q2 -%else - movdqa xmm2, [rsp+_p2] ; p2 - movdqa xmm5, [rsp+_q2] ; q2 -%endif - - pxor xmm1, xmm7 ; *oq1 = sq^0x80; - pxor xmm4, xmm7 ; *op1 = sp^0x80; - pxor xmm2, xmm7 - pxor xmm5, xmm7 - paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u) - psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u) - pxor xmm2, xmm7 ; *op2 = sp^0x80; - pxor xmm5, xmm7 ; *oq2 = sq^0x80; - pxor xmm3, xmm7 ; *oq0 = sq^0x80 - pxor xmm6, xmm7 ; *oq0 = sp^0x80 -%if %1 == 0 - movq [rsi], xmm6 ; p0 - movhps [rdi], xmm6 - movq [rsi + rcx], xmm3 ; q0 - movhps [rdi + rcx], xmm3 - lea rdx, [rcx + rcx*2] - movq [rsi+rcx*2], xmm1 ; q1 - movhps [rdi+rcx*2], xmm1 - - movq [rsi + rax], xmm4 ; p1 - movhps [rdi + rax], xmm4 - - movq [rsi+rax*2], xmm2 ; p2 - movhps [rdi+rax*2], xmm2 - - movq [rsi+rdx], xmm5 ; q2 - movhps [rdi+rdx], xmm5 -%elif %1 == 1 - movdqa [rdi+rcx], xmm5 ; q2 - movdqa [rdi], xmm1 ; q1 - movdqa [rsi], xmm3 ; q0 - movdqa [rsi+rax ], xmm6 ; p0 - movdqa [rsi+rax*2], xmm4 ; p1 - movdqa [rdi+rax*4], xmm2 ; p2 -%elif %1 == 2 - movdqa [rsp+_p1], xmm4 ; p1 - movdqa [rsp+_p0], xmm6 ; p0 - movdqa [rsp+_q0], xmm3 ; q0 - movdqa [rsp+_q1], xmm1 ; q1 -%endif - -%endmacro - - -;void vp8_mbloop_filter_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -;) -global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE -sym(vp8_mbloop_filter_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step - mov rdx, arg(3) ;limit - - lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 1 - ; filter and write back the results - MB_FILTER_AND_WRITEBACK 1 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_horizontal_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE -sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ; u - mov rdi, arg(5) ; v - movsxd rax, dword ptr arg(1) ; src_pixel_step - mov rcx, rax - neg rax ; negate pitch to deal with above border - mov rdx, arg(3) ;limit - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 0 - ; filter and write back the results - MB_FILTER_AND_WRITEBACK 0 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -%macro TRANSPOSE_16X8 2 - movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 - movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 - movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 - movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 - movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 - movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 - - punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - - movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 - - movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 - - movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 - - punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 -%if %1 - lea rsi, [rsi+rax*8] - lea rdi, [rdi+rax*8] -%else - mov rsi, arg(5) ; v_ptr -%endif - - movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 - punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 - punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 - punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - -%if %1 == 0 - lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing - lea rsi, [rsi - 4] -%endif - - movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - - movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - - punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - - movdqa [rsp+_t0], xmm2 ; save to free XMM2 - - movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 - movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 - movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 - movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 - movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 - - punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 - - punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 - - movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 - - punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 - - movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 - - punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 - - movdqa xmm6, xmm1 ; - punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 - - punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - - punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - movdqa xmm0, xmm5 - punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - - punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 - - punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 - movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 - - punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 - -%if %2 == 0 - movdqa [rsp+_q3], xmm7 ; save 7 - movdqa [rsp+_q2], xmm6 ; save 6 -%endif - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - movdqa [rsp+_p1], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - movdqa [rsp+_p0], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rsp+_q0], xmm4 ; save 4 - movdqa [rsp+_q1], xmm5 ; save 5 - movdqa xmm1, [rsp+_t0] - - movdqa xmm2, xmm1 ; - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - -%if %2 == 0 - movdqa [rsp+_p2], xmm1 - movdqa [rsp+_p3], xmm2 -%endif - -%endmacro - -%macro LFV_FILTER_MASK_HEV_MASK 0 - movdqa xmm0, xmm6 ; q2 - psubusb xmm0, xmm7 ; q2-q3 - - psubusb xmm7, xmm6 ; q3-q2 - movdqa xmm4, xmm5 ; q1 - - por xmm7, xmm0 ; abs (q3-q2) - psubusb xmm4, xmm6 ; q1-q2 - - movdqa xmm0, xmm1 - psubusb xmm6, xmm5 ; q2-q1 - - por xmm6, xmm4 ; abs (q2-q1) - psubusb xmm0, xmm2 ; p2 - p3; - - psubusb xmm2, xmm1 ; p3 - p2; - por xmm0, xmm2 ; abs(p2-p3) - - movdqa xmm5, [rsp+_p1] ; p1 - pmaxub xmm0, xmm7 - - movdqa xmm2, xmm5 ; p1 - psubusb xmm5, xmm1 ; p1-p2 - psubusb xmm1, xmm2 ; p2-p1 - - movdqa xmm7, xmm3 ; p0 - psubusb xmm7, xmm2 ; p0-p1 - - por xmm1, xmm5 ; abs(p2-p1) - pmaxub xmm0, xmm6 - - pmaxub xmm0, xmm1 - movdqa xmm1, xmm2 ; p1 - - psubusb xmm2, xmm3 ; p1-p0 - - por xmm2, xmm7 ; abs(p1-p0) - - pmaxub xmm0, xmm2 - - movdqa xmm5, [rsp+_q0] ; q0 - movdqa xmm7, [rsp+_q1] ; q1 - - mov rdx, arg(3) ; limit - - movdqa xmm6, xmm5 ; q0 - movdqa xmm4, xmm7 ; q1 - - psubusb xmm5, xmm7 ; q0-q1 - psubusb xmm7, xmm6 ; q1-q0 - - por xmm7, xmm5 ; abs(q1-q0) - - pmaxub xmm0, xmm7 - - psubusb xmm0, [rdx] ; limit - - mov rdx, arg(2) ; blimit - movdqa xmm5, xmm4 ; q1 - - psubusb xmm5, xmm1 ; q1-=p1 - psubusb xmm1, xmm4 ; p1-=q1 - - por xmm5, xmm1 ; abs(p1-q1) - movdqa xmm1, xmm3 ; p0 - - pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psubusb xmm1, xmm6 ; p0-q0 - - movdqa xmm4, [rdx] ; blimit - mov rdx, arg(4) ; get thresh - - psrlw xmm5, 1 ; abs(p1-q1)/2 - psubusb xmm6, xmm3 ; q0-p0 - - por xmm1, xmm6 ; abs(q0-p0) - paddusb xmm1, xmm1 ; abs(q0-p0)*2 - movdqa xmm3, [rdx] - - paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh - - psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh - - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - por xmm1, xmm0 ; mask - pcmpeqb xmm2, xmm0 - - pxor xmm0, xmm0 - pcmpeqb xmm4, xmm4 - - pcmpeqb xmm1, xmm0 - pxor xmm4, xmm2 -%endmacro - -%macro BV_TRANSPOSE 0 - ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - - movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - - punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - - movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - - punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - - punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 - ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 -%endmacro - -%macro BV_WRITEBACK 2 - movd [rsi+2], %1 - movd [rsi+4*rax+2], %2 - psrldq %1, 4 - psrldq %2, 4 - movd [rdi+2], %1 - movd [rdi+4*rax+2], %2 - psrldq %1, 4 - psrldq %2, 4 - movd [rsi+2*rax+2], %1 - movd [rsi+2*rcx+2], %2 - psrldq %1, 4 - psrldq %2, 4 - movd [rdi+2*rax+2], %1 - movd [rdi+2*rcx+2], %2 -%endmacro - -%if ABI_IS_32BIT - -;void vp8_loop_filter_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -;) -global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE -sym(vp8_loop_filter_vertical_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ; src_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax*2+rax] - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. - TRANSPOSE_16X8 1, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK - - ; start work on filters - B_FILTER 2 - - ; transpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - ; store 16-line result - - lea rdx, [rax] - neg rdx - - BV_WRITEBACK xmm1, xmm5 - - lea rsi, [rsi+rdx*8] - lea rdi, [rdi+rdx*8] - BV_WRITEBACK xmm2, xmm6 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -%endif - -;void vp8_loop_filter_vertical_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE -sym(vp8_loop_filter_vertical_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ; u_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax+2*rax] - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. - TRANSPOSE_16X8 0, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK - - ; start work on filters - B_FILTER 2 - - ; transpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ; store 16-line result - BV_WRITEBACK xmm1, xmm5 - - mov rsi, arg(0) ; u_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - BV_WRITEBACK xmm2, xmm6 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -%macro MBV_TRANSPOSE 0 - movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - - punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - - punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - - movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - - movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 - punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06 - - movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04 - - punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44 - movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - - punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00 - punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20 -%endmacro - -%macro MBV_WRITEBACK_1 0 - movq [rsi], xmm0 - movhps [rdi], xmm0 - - movq [rsi+2*rax], xmm6 - movhps [rdi+2*rax], xmm6 - - movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40 - punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60 - - movq [rsi+4*rax], xmm0 - movhps [rdi+4*rax], xmm0 - - movq [rsi+2*rcx], xmm3 - movhps [rdi+2*rcx], xmm3 - - movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86 - - movdqa xmm0, xmm7 - punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84 - punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4 - - movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80 - punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0 -%endmacro - -%macro MBV_WRITEBACK_2 0 - movq [rsi], xmm1 - movhps [rdi], xmm1 - - movq [rsi+2*rax], xmm5 - movhps [rdi+2*rax], xmm5 - - movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0 - punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0 - - movq [rsi+4*rax], xmm1 - movhps [rdi+4*rax], xmm1 - - movq [rsi+2*rcx], xmm4 - movhps [rdi+2*rcx], xmm4 -%endmacro - - -;void vp8_mbloop_filter_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -;) -global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE -sym(vp8_mbloop_filter_vertical_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ; src_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax*2+rax] - - ; Transpose - TRANSPOSE_16X8 1, 0 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK - - neg rax - ; start work on filters - MB_FILTER_AND_WRITEBACK 2 - - lea rsi, [rsi+rax*8] - lea rdi, [rdi+rax*8] - - ; transpose and write back - MBV_TRANSPOSE - - neg rax - - MBV_WRITEBACK_1 - - - lea rsi, [rsi+rax*8] - lea rdi, [rdi+rax*8] - MBV_WRITEBACK_2 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_vertical_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE -sym(vp8_mbloop_filter_vertical_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, lf_var_size - - mov rsi, arg(0) ; u_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax+2*rax] - - ; Transpose - TRANSPOSE_16X8 0, 0 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK - - ; start work on filters - MB_FILTER_AND_WRITEBACK 2 - - ; transpose and write back - MBV_TRANSPOSE - - mov rsi, arg(0) ;u_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] - MBV_WRITEBACK_1 - mov rsi, arg(5) ;v_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] - MBV_WRITEBACK_2 - - add rsp, lf_var_size - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE -sym(vp8_loop_filter_simple_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx - ; end prolog - - mov rcx, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - movdqa xmm6, [GLOBAL(tfe)] - lea rdx, [rcx + rax] - neg rax - - ; calculate mask - movdqa xmm0, [rdx] ; q1 - mov rdx, arg(2) ;blimit - movdqa xmm1, [rcx+2*rax] ; p1 - - movdqa xmm2, xmm1 - movdqa xmm3, xmm0 - - psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm3 ; p1-=q1 - por xmm1, xmm0 ; abs(p1-q1) - pand xmm1, xmm6 ; set lsb of each byte to zero - psrlw xmm1, 1 ; abs(p1-q1)/2 - - movdqa xmm7, XMMWORD PTR [rdx] - - movdqa xmm5, [rcx+rax] ; p0 - movdqa xmm4, [rcx] ; q0 - movdqa xmm0, xmm4 ; q0 - movdqa xmm6, xmm5 ; p0 - psubusb xmm5, xmm4 ; p0-=q0 - psubusb xmm4, xmm6 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - - movdqa xmm4, [GLOBAL(t80)] - - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm7, xmm7 - pcmpeqb xmm5, xmm7 - - - ; start work on filters - pxor xmm2, xmm4 ; p1 offset to convert to signed values - pxor xmm3, xmm4 ; q1 offset to convert to signed values - psubsb xmm2, xmm3 ; p1 - q1 - - pxor xmm6, xmm4 ; offset to convert to signed values - pxor xmm0, xmm4 ; offset to convert to signed values - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) - pand xmm5, xmm2 ; mask filter values we don't care about - - movdqa xmm0, xmm5 - paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 - paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 - - movdqa xmm1, [GLOBAL(te0)] - movdqa xmm2, [GLOBAL(t1f)] - -; pxor xmm7, xmm7 - pcmpgtb xmm7, xmm0 ;save sign - pand xmm7, xmm1 ;preserve the upper 3 bits - psrlw xmm0, 3 - pand xmm0, xmm2 ;clear out upper 3 bits - por xmm0, xmm7 ;add sign - psubsb xmm3, xmm0 ; q0-= q0sz add - - pxor xmm7, xmm7 - pcmpgtb xmm7, xmm5 ;save sign - pand xmm7, xmm1 ;preserve the upper 3 bits - psrlw xmm5, 3 - pand xmm5, xmm2 ;clear out upper 3 bits - por xmm5, xmm7 ;add sign - paddsb xmm6, xmm5 ; p0+= p0 add - - pxor xmm3, xmm4 ; unoffset - movdqa [rcx], xmm3 ; write back - - pxor xmm6, xmm4 ; unoffset - movdqa [rcx+rax], xmm6 ; write back - - ; begin epilog - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE -sym(vp8_loop_filter_simple_vertical_edge_sse2): - push rbp ; save old base pointer value. - mov rbp, rsp ; set new base pointer value. - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx ; save callee-saved reg - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi - 2 ] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 - movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 - movd xmm2, [rdi] ; 13 12 11 10 - movd xmm3, [rcx] ; 53 52 51 50 - punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 - punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 - - movd xmm4, [rsi + rax*2] ; 23 22 21 20 - movd xmm5, [rdx + rax*2] ; 63 62 61 60 - movd xmm6, [rdi + rax*2] ; 33 32 31 30 - movd xmm7, [rcx + rax*2] ; 73 72 71 70 - punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 - punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 - - punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 - punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - movdqa xmm2, xmm0 - punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - lea rsi, [rsi + rax*8] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm4, [rsi] ; 83 82 81 80 - movd xmm1, [rdx] ; c3 c2 c1 c0 - movd xmm6, [rdi] ; 93 92 91 90 - movd xmm3, [rcx] ; d3 d2 d1 d0 - punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 - punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - - movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0 - movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0 - movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 - punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 - punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 - - punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 - punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 - - movdqa xmm7, xmm4 - punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - - movdqa xmm6, xmm4 - punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - - punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - mov rdx, arg(2) ;blimit - - ; calculate mask - movdqa xmm6, xmm0 ; p1 - movdqa xmm7, xmm3 ; q1 - psubusb xmm7, xmm0 ; q1-=p1 - psubusb xmm6, xmm3 ; p1-=q1 - por xmm6, xmm7 ; abs(p1-q1) - pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm6, 1 ; abs(p1-q1)/2 - - movdqa xmm7, [rdx] - - movdqa xmm5, xmm1 ; p0 - movdqa xmm4, xmm2 ; q0 - psubusb xmm5, xmm2 ; p0-=q0 - psubusb xmm4, xmm1 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - movdqa xmm4, [GLOBAL(t80)] - - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm7, xmm7 - pcmpeqb xmm5, xmm7 ; mm5 = mask - - ; start work on filters - movdqa t0, xmm0 - movdqa t1, xmm3 - - pxor xmm0, xmm4 ; p1 offset to convert to signed values - pxor xmm3, xmm4 ; q1 offset to convert to signed values - psubsb xmm0, xmm3 ; p1 - q1 - - pxor xmm1, xmm4 ; offset to convert to signed values - pxor xmm2, xmm4 ; offset to convert to signed values - - movdqa xmm3, xmm2 ; offseted ; q0 - psubsb xmm2, xmm1 ; q0 - p0 - paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0) - paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0) - pand xmm5, xmm0 ; mask filter values we don't care about - - movdqa xmm0, xmm5 - paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4 - paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4 - - movdqa xmm6, [GLOBAL(te0)] - movdqa xmm2, [GLOBAL(t1f)] - -; pxor xmm7, xmm7 - pcmpgtb xmm7, xmm0 ;save sign - pand xmm7, xmm6 ;preserve the upper 3 bits - psrlw xmm0, 3 - pand xmm0, xmm2 ;clear out upper 3 bits - por xmm0, xmm7 ;add sign - psubsb xmm3, xmm0 ; q0-= q0sz add - - pxor xmm7, xmm7 - pcmpgtb xmm7, xmm5 ;save sign - pand xmm7, xmm6 ;preserve the upper 3 bits - psrlw xmm5, 3 - pand xmm5, xmm2 ;clear out upper 3 bits - por xmm5, xmm7 ;add sign - paddsb xmm1, xmm5 ; p0+= p0 add - - pxor xmm3, xmm4 ; unoffset q0 - pxor xmm1, xmm4 ; unoffset p0 - - movdqa xmm0, t0 ; p1 - movdqa xmm4, t1 ; q1 - - ; write out order: xmm0 xmm2 xmm1 xmm3 - lea rdx, [rsi + rax*4] - - ; transpose back to write out - ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - movdqa xmm6, xmm0 - punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - movdqa xmm5, xmm3 - punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - - movdqa xmm3, xmm6 - punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - - movd [rsi], xmm6 ; write the second 8-line result - movd [rdx], xmm3 - psrldq xmm6, 4 - psrldq xmm3, 4 - movd [rdi], xmm6 - movd [rcx], xmm3 - psrldq xmm6, 4 - psrldq xmm3, 4 - movd [rsi + rax*2], xmm6 - movd [rdx + rax*2], xmm3 - psrldq xmm6, 4 - psrldq xmm3, 4 - movd [rdi + rax*2], xmm6 - movd [rcx + rax*2], xmm3 - - neg rax - lea rsi, [rsi + rax*8] - neg rax - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd [rsi], xmm0 ; write the first 8-line result - movd [rdx], xmm2 - psrldq xmm0, 4 - psrldq xmm2, 4 - movd [rdi], xmm0 - movd [rcx], xmm2 - psrldq xmm0, 4 - psrldq xmm2, 4 - movd [rsi + rax*2], xmm0 - movd [rdx + rax*2], xmm2 - psrldq xmm0, 4 - psrldq xmm2, 4 - movd [rdi + rax*2], xmm0 - movd [rcx + rax*2], xmm2 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -tfe: - times 16 db 0xfe -align 16 -t80: - times 16 db 0x80 -align 16 -t1s: - times 16 db 0x01 -align 16 -t3: - times 16 db 0x03 -align 16 -t4: - times 16 db 0x04 -align 16 -ones: - times 8 dw 0x0001 -align 16 -s9: - times 8 dw 0x0900 -align 16 -s63: - times 8 dw 0x003f -align 16 -te0: - times 16 db 0xe0 -align 16 -t1f: - times 16 db 0x1f diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c b/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c deleted file mode 100644 index 6586004600..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vp8/common/loopfilter.h" - -#define prototype_loopfilter(sym) \ - void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ - const unsigned char *limit, const unsigned char *thresh, int count) - -#define prototype_loopfilter_nc(sym) \ - void sym(unsigned char *src, int pitch, const unsigned char *blimit,\ - const unsigned char *limit, const unsigned char *thresh) - -#define prototype_simple_loopfilter(sym) \ - void sym(unsigned char *y, int ystride, const unsigned char *blimit) - -prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx); -prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx); -prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx); -prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx); -prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx); -prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx); - -#if HAVE_SSE2 && ARCH_X86_64 -prototype_loopfilter(vp8_loop_filter_bv_y_sse2); -prototype_loopfilter(vp8_loop_filter_bh_y_sse2); -#else -prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2); -prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2); -#endif -prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2); -prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2); - -extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2; -extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2; -extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2; -extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; - -#if HAVE_MMX -/* Horizontal MB filtering */ -void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); -} - - -/* Vertical MB Filtering */ -void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); -} - - -/* Horizontal B Filtering */ -void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - - -void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) -{ - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit); - vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit); -} - - -/* Vertical B Filtering */ -void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - - -void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) -{ - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); -} -#endif - - -/* Horizontal MB filtering */ -#if HAVE_SSE2 -void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); - - if (u_ptr) - vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); -} - - -/* Vertical MB Filtering */ -void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr); - - if (u_ptr) - vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr); -} - - -/* Horizontal B Filtering */ -void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ -#if ARCH_X86_64 - vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); -#else - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); - vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); -#endif - - if (u_ptr) - vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride); -} - - -void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) -{ - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit); - vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit); -} - - -/* Vertical B Filtering */ -void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ -#if ARCH_X86_64 - vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); -#else - vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); - vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr); -#endif - - if (u_ptr) - vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4); -} - - -void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) -{ - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); -} - -#endif diff --git a/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm b/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm deleted file mode 100644 index 15e98713c7..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm +++ /dev/null @@ -1,274 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -;void copy_mem8x8_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp8_copy_mem8x8_mmx) PRIVATE -sym(vp8_copy_mem8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - add rsi, rax - - movq [rdi+rcx], mm1 - movq [rdi+rcx*2], mm2 - - - lea rdi, [rdi+rcx*2] - movq mm3, [rsi] - - add rdi, rcx - movq mm4, [rsi+rax] - - movq mm5, [rsi+rax*2] - movq [rdi], mm3 - - lea rsi, [rsi+rax*2] - movq [rdi+rcx], mm4 - - movq [rdi+rcx*2], mm5 - lea rdi, [rdi+rcx*2] - - movq mm0, [rsi+rax] - movq mm1, [rsi+rax*2] - - movq [rdi+rcx], mm0 - movq [rdi+rcx*2],mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem8x4_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp8_copy_mem8x4_mmx) PRIVATE -sym(vp8_copy_mem8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - movq [rdi+rcx], mm1 - - movq [rdi+rcx*2], mm2 - lea rdi, [rdi+rcx*2] - - movq mm3, [rsi+rax] - movq [rdi+rcx], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem16x16_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp8_copy_mem16x16_mmx) PRIVATE -sym(vp8_copy_mem16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movsxd rax, dword ptr arg(1) ;src_stride; - - mov rdi, arg(2) ;dst; - movsxd rcx, dword ptr arg(3) ;dst_stride - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq [rdi], mm0 - movq [rdi+8], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm b/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm deleted file mode 100644 index cb89537f76..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm +++ /dev/null @@ -1,116 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void copy_mem16x16_sse2( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp8_copy_mem16x16_sse2) PRIVATE -sym(vp8_copy_mem16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movdqu xmm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movdqu xmm1, [rsi+rax] - movdqu xmm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - lea rdi, [rdi+rcx*2] - movdqu xmm3, [rsi] - - add rdi, rcx - movdqu xmm4, [rsi+rax] - - movdqu xmm5, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm3 - add rsi, rax - - movdqa [rdi+rcx], xmm4 - movdqa [rdi+rcx*2],xmm5 - - lea rdi, [rdi+rcx*2] - movdqu xmm0, [rsi] - - add rdi, rcx - movdqu xmm1, [rsi+rax] - - movdqu xmm2, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - - movdqa [rdi+rcx*2], xmm2 - movdqu xmm3, [rsi] - - movdqu xmm4, [rsi+rax] - lea rdi, [rdi+rcx*2] - - add rdi, rcx - movdqu xmm5, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm3 - - add rsi, rax - movdqa [rdi+rcx], xmm4 - - movdqa [rdi+rcx*2],xmm5 - movdqu xmm0, [rsi] - - lea rdi, [rdi+rcx*2] - movdqu xmm1, [rsi+rax] - - add rdi, rcx - movdqu xmm2, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm0 - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - movdqu xmm3, [rsi+rax] - lea rdi, [rdi+rcx*2] - - movdqa [rdi+rcx], xmm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm deleted file mode 100644 index 47dd452297..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm +++ /dev/null @@ -1,702 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) - - -%define BLOCK_HEIGHT_WIDTH 4 -%define vp8_filter_weight 128 -%define VP8_FILTER_SHIFT 7 - - -;void vp8_filter_block1d_h6_mmx -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp8_filter -;) -global sym(vp8_filter_block1d_h6_mmx) PRIVATE -sym(vp8_filter_block1d_h6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp8_filter - - movq mm1, [rdx + 16] ; do both the negative taps first!!! - movq mm2, [rdx + 32] ; - movq mm6, [rdx + 48] ; - movq mm7, [rdx + 64] ; - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - movq mm3, [rsi-2] ; mm3 = p-2..p5 - movq mm4, mm3 ; mm4 = p-2..p5 - psrlq mm3, 8 ; mm3 = p-1..p5 - punpcklbw mm3, mm0 ; mm3 = p-1..p2 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - movq mm5, mm4 ; mm5 = p-2..p5 - punpckhbw mm4, mm0 ; mm5 = p2..p5 - pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - movq mm4, mm5 ; mm4 = p-2..p5; - psrlq mm5, 16 ; mm5 = p0..p5; - punpcklbw mm5, mm0 ; mm5 = p0..p3 - pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - movq mm5, mm4 ; mm5 = p-2..p5 - psrlq mm4, 24 ; mm4 = p1..p5 - punpcklbw mm4, mm0 ; mm4 = p1..p4 - pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - ; do outer positive taps - movd mm4, [rsi+3] - punpcklbw mm4, mm0 ; mm5 = p3..p6 - pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - paddsw mm3, [GLOBAL(rd)] ; mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and unpack to saturate - punpcklbw mm3, mm0 ; - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line - add rdi, rax; -%else - movsxd r8, dword ptr arg(2) ;src_pixels_per_line - add rdi, rax; - - add rsi, r8 ; next line -%endif - - dec rcx ; decrement count - jnz .nextrow ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_filter_block1dc_v6_mmx -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int output_pitch, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp8_filter -;) -global sym(vp8_filter_block1dc_v6_mmx) PRIVATE -sym(vp8_filter_block1dc_v6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movq mm5, [GLOBAL(rd)] - push rbx - mov rbx, arg(7) ;vp8_filter - movq mm1, [rbx + 16] ; do both the negative taps first!!! - movq mm2, [rbx + 32] ; - movq mm6, [rbx + 48] ; - movq mm7, [rbx + 64] ; - - movsxd rdx, dword ptr arg(3) ;pixels_per_line - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - sub rsi, rdx - sub rsi, rdx - movsxd rcx, DWORD PTR arg(5) ;output_height - movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - - -.nextrow_cv: - movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 - pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 - pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi] ; mm4 = p0..p3 = row -2 - pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - add rsi, rdx ; move source forward 1 line to avoid 3 * pitch - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 - pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 - pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - paddsw mm3, mm5 ; mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and saturate - - movd [rdi],mm3 ; store the results in the destination - ; the subsequent iterations repeat 3 out of 4 of these reads. Since the - ; recon block should be in cache this shouldn't cost much. Its obviously - ; avoidable!!!. - lea rdi, [rdi+rax] ; - dec rcx ; decrement count - jnz .nextrow_cv ; next row - - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict8x8_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_mmx) PRIVATE -sym(vp8_bilinear_predict8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - shl rax, 5 ; offset * 32 - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - - shl rax, 5 ; offset*32 - add rax, rcx ; VFilter - - lea rcx, [rdi+rdx*8] ; - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x8: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 ;dst_pitch -%endif - cmp rdi, rcx ; - jne .next_row_8x8 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict8x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x4_mmx) PRIVATE -sym(vp8_bilinear_predict8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - mov rsi, arg(0) ;src_ptr ; - add rax, rcx - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x4: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 -%endif - cmp rdi, rcx ; - jne .next_row_8x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict4x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict4x4_mmx) PRIVATE -sym(vp8_bilinear_predict4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;ldst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movq mm7, mm3 ; - packuswb mm7, mm0 ; - - add rsi, rdx ; next line -.next_row_4x4: - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - - movq mm5, mm7 ; - punpcklbw mm5, mm0 ; - - pmullw mm5, [rax] ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - movq mm7, mm3 ; - - packuswb mm7, mm0 ; - - pmullw mm3, [rax+16] ; - paddw mm3, mm5 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb mm3, mm0 - movd [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch ; - add rsi, rdx ; next line - add rdi, r8 -%endif - - cmp rdi, rcx ; - jne .next_row_4x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -SECTION_RODATA -align 16 -rd: - times 4 dw 0x40 - -align 16 -global HIDDEN_DATA(sym(vp8_six_tap_mmx)) -sym(vp8_six_tap_mmx): - times 8 dw 0 - times 8 dw 0 - times 8 dw 128 - times 8 dw 0 - times 8 dw 0 - times 8 dw 0 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 123 - times 8 dw 12 - times 8 dw -1 - times 8 dw 0 - - times 8 dw 2 - times 8 dw -11 - times 8 dw 108 - times 8 dw 36 - times 8 dw -8 - times 8 dw 1 - - times 8 dw 0 - times 8 dw -9 - times 8 dw 93 - times 8 dw 50 - times 8 dw -6 - times 8 dw 0 - - times 8 dw 3 - times 8 dw -16 - times 8 dw 77 - times 8 dw 77 - times 8 dw -16 - times 8 dw 3 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 50 - times 8 dw 93 - times 8 dw -9 - times 8 dw 0 - - times 8 dw 1 - times 8 dw -8 - times 8 dw 36 - times 8 dw 108 - times 8 dw -11 - times 8 dw 2 - - times 8 dw 0 - times 8 dw -1 - times 8 dw 12 - times 8 dw 123 - times 8 dw -6 - times 8 dw 0 - - diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm deleted file mode 100644 index 69f8d103c1..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm +++ /dev/null @@ -1,1372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP8_FILTER_WEIGHT 128 -%define VP8_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -;*************************************************************************************/ -;void vp8_filter_block1d8_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp8_filter -;) -global sym(vp8_filter_block1d8_h6_sse2) PRIVATE -sym(vp8_filter_block1d8_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp8_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_filter_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp8_filter -;) -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -;*************************************************************************************/ -global sym(vp8_filter_block1d16_h6_sse2) PRIVATE -sym(vp8_filter_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp8_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi+16], xmm4 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_filter_block1d8_v6_sse2 -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp8_filter -;) -;/************************************************************************************ -; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp8_filter_block1d8_v6_sse2) PRIVATE -sym(vp8_filter_block1d8_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp8_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp8_filter_block1d8_v6_sse2_loop: - movdqa xmm1, XMMWORD PTR [rsi] - pmullw xmm1, [rax] - - movdqa xmm2, XMMWORD PTR [rsi + rdx] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] - pmullw xmm3, [rax + 32] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] - pmullw xmm5, [rax + 64] - - add rsi, rdx - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] - - pmullw xmm4, [rax + 48] - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] - - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp8_filter_block1d8_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_filter_block1d16_v6_sse2 -;( -; unsigned short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; const short *vp8_filter -;) -;/************************************************************************************ -; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp8_filter_block1d16_v6_sse2) PRIVATE -sym(vp8_filter_block1d16_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp8_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp8_filter_block1d16_v6_sse2_loop: -; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. - movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 - movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] - pmullw xmm1, [rax + 16] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm3, [rax + 64] - pmullw xmm4, [rax + 64] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm5, [rax + 32] - pmullw xmm6, [rax + 32] - - movdqa xmm7, XMMWORD PTR [rsi] ; line 1 - movdqa xmm0, XMMWORD PTR [rsi + 16] - pmullw xmm7, [rax] - pmullw xmm0, [rax] - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - paddsw xmm1, xmm7 - paddsw xmm2, xmm0 - - add rsi, rdx - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm3, [rax + 48] - pmullw xmm4, [rax + 48] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm5, [rax + 80] - pmullw xmm6, [rax + 80] - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] - pxor xmm0, xmm0 ; clear xmm0 - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - - paddsw xmm1, xmm7 - paddsw xmm2, xmm7 - - psraw xmm1, 7 - psraw xmm2, 7 - - packuswb xmm1, xmm2 ; pack and saturate - movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp8_filter_block1d16_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_filter_block1d8_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp8_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE -sym(vp8_filter_block1d8_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp8_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_only_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - - movq QWORD PTR [rdi], xmm4 ; store the results in the destination - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_only_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_filter_block1d16_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp8_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE -sym(vp8_filter_block1d16_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp8_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_only_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; lower 8 bytes - - movq QWORD Ptr [rdi], xmm4 ; store the results in the destination - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; higher 8 bytes - - movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_only_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_filter_block1d8_v6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp8_filter -;) -; Second-pass filter only when xoffset==0 -global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE -sym(vp8_filter_block1d8_v6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - mov rax, arg(5) ;vp8_filter - - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ; dst_ptich -%endif - -.vp8_filter_block1d8_v6_only_sse2_loop: - movq xmm1, MMWORD PTR [rsi] - movq xmm2, MMWORD PTR [rsi + rdx] - movq xmm3, MMWORD PTR [rsi + rdx * 2] - movq xmm5, MMWORD PTR [rsi + rdx * 4] - add rsi, rdx - movq xmm4, MMWORD PTR [rsi + rdx * 2] - movq xmm6, MMWORD PTR [rsi + rdx * 4] - - punpcklbw xmm1, xmm0 - pmullw xmm1, [rax] - - punpcklbw xmm2, xmm0 - pmullw xmm2, [rax + 16] - - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax + 32] - - punpcklbw xmm5, xmm0 - pmullw xmm5, [rax + 64] - - punpcklbw xmm4, xmm0 - pmullw xmm4, [rax + 48] - - punpcklbw xmm6, xmm0 - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_unpack_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int output_height, -; unsigned int output_width -;) -global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE -sym(vp8_unpack_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(3) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source - - pxor xmm0, xmm0 ; clear xmm0 for unpack -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source -%endif - -.unpack_block1d16_h6_sse2_rowloop: - movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 - movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - punpcklbw xmm1, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm1 - movdqa XMMWORD Ptr [rdi + 16], xmm3 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(4) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - jnz .unpack_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp8_bilinear_filters_x86_8) -global sym(vp8_bilinear_predict16x16_sse2) PRIVATE -sym(vp8_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_sse2) PRIVATE -sym(vp8_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -rd: - times 8 dw 0x40 diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm deleted file mode 100644 index c06f24556e..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm +++ /dev/null @@ -1,1508 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP8_FILTER_WEIGHT 128 -%define VP8_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ -;void vp8_filter_block1d8_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp8_filter_index -;) -global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE -sym(vp8_filter_block1d8_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 - - movdqa xmm7, [GLOBAL(rd)] - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - mov rdi, arg(2) ;output_ptr - - cmp esi, DWORD PTR [rax] - je vp8_filter_block1d8_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx -;xmm3 free -.filter_block1d8_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - pmaddubsw xmm1, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm1 - paddsw xmm2, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - jnz .filter_block1d8_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -vp8_filter_block1d8_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] - movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] - - mov rsi, arg(0) ;src_ptr - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx - -.filter_block1d8_h4_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm2, xmm0 - pshufb xmm0, xmm3 - - pshufb xmm2, xmm4 - pmaddubsw xmm0, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - - jnz .filter_block1d8_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp8_filter_block1d16_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp8_filter_index -;) -global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE -sym(vp8_filter_block1d16_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - mov rdi, arg(2) ;output_ptr - - mov rsi, arg(0) ;src_ptr - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d16_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - movq xmm3, MMWORD PTR [rsi + 6] - - pmaddubsw xmm1, xmm5 - movq xmm7, MMWORD PTR [rsi + 11] - - pmaddubsw xmm2, xmm6 - punpcklbw xmm3, xmm7 - - paddsw xmm0, xmm1 - movdqa xmm1, xmm3 - - pmaddubsw xmm3, xmm4 - paddsw xmm0, xmm2 - - movdqa xmm2, xmm1 - paddsw xmm0, [GLOBAL(rd)] - - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - - psraw xmm0, 7 - pmaddubsw xmm1, xmm5 - - pmaddubsw xmm2, xmm6 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - paddsw xmm3, xmm1 - - paddsw xmm3, xmm2 - - paddsw xmm3, [GLOBAL(rd)] - - psraw xmm3, 7 - - packuswb xmm3, xmm3 - - punpcklqdq xmm0, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_filter_block1d4_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp8_filter_index -;) -global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE -sym(vp8_filter_block1d4_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - movdqa xmm7, [GLOBAL(rd)] - - cmp esi, DWORD PTR [rax] - je .vp8_filter_block1d4_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -;xmm3 free -.filter_block1d4_h6_rowloop_ssse3: - movdqu xmm0, XMMWORD PTR [rsi - 2] - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf1b)] - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2b)] - pmaddubsw xmm0, xmm4 - pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm0, xmm1 - paddsw xmm0, xmm7 - pxor xmm1, xmm1 - paddsw xmm0, xmm2 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movd DWORD PTR [rdi], xmm0 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp8_filter_block1d4_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d4_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] - pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm1, xmm7 - paddsw xmm1, xmm2 - psraw xmm1, 7 - packuswb xmm1, xmm1 - - movd DWORD PTR [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp8_filter_block1d16_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp8_filter_index -;) -global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE -sym(vp8_filter_block1d16_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - cmp esi, DWORD PTR [rax] - je .vp8_filter_block1d16_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - -.vp8_filter_block1d16_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 ;store the results - - movq xmm1, MMWORD PTR [rsi + 8] ;A - movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi+8], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp8_filter_block1d16_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp8_filter_block1d16_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - -.vp8_filter_block1d16_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - paddsw xmm2, [GLOBAL(rd)] - paddsw xmm2, xmm3 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - punpcklbw xmm5, xmm4 ;B D - punpcklbw xmm1, xmm0 ;C E - - pmaddubsw xmm1, xmm6 - pmaddubsw xmm5, xmm7 - - movdqa xmm4, [GLOBAL(rd)] - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm5, xmm1 - paddsw xmm5, xmm4 - psraw xmm5, 7 - packuswb xmm5, xmm5 - - punpcklqdq xmm2, xmm5 - - movdqa XMMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp8_filter_block1d16_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_filter_block1d8_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp8_filter_index -;) -global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE -sym(vp8_filter_block1d8_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp8_filter_block1d8_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp8_filter_block1d8_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - movdqa xmm4, [GLOBAL(rd)] - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp8_filter_block1d8_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp8_filter_block1d8_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm5, [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp8_filter_block1d8_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm5 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp8_filter_block1d8_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp8_filter_block1d4_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp8_filter_index -;) -global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE -sym(vp8_filter_block1d4_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp8_filter_block1d4_v4_ssse3 - - movq mm5, MMWORD PTR [rax] ;k0_k5 - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp8_filter_block1d4_v6_ssse3_loop: - movd mm1, DWORD PTR [rsi] ;A - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - movd mm0, DWORD PTR [rax + rdx * 4] ;F - - movq mm4, [GLOBAL(rd)] - - pmaddubsw mm3, mm6 - punpcklbw mm1, mm0 ;A F - pmaddubsw mm2, mm7 - pmaddubsw mm1, mm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm1 - paddsw mm2, mm4 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp8_filter_block1d4_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp8_filter_block1d4_v4_ssse3: - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - movq mm5, MMWORD PTR [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp8_filter_block1d4_v4_ssse3_loop: - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - pmaddubsw mm3, mm6 - pmaddubsw mm2, mm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm5 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp8_filter_block1d4_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_bilinear_predict16x16_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE -sym(vp8_bilinear_predict16x16_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(2) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 4 - lea rax, [rax + rcx] ; HFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ; src_pixels_per_line - - movdqa xmm2, [rax] - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ; dst_pitch -%endif - movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 - - punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 - pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm6, xmm5 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm6, xmm1 - - punpcklbw xmm4, xmm5 - pmaddubsw xmm4, xmm1 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128 - - packuswb xmm6, xmm4 - movdqa xmm5, xmm7 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm2 - - punpckhbw xmm7, xmm6 - pmaddubsw xmm7, xmm2 - - paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value - psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm5, xmm7 - movdqa xmm7, xmm6 - - movdqa [rdi], xmm5 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ; dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - - ; get the first horizontal line done - movq xmm4, [rsi] ; load row 0 - movq xmm2, [rsi + 8] ; load row 0 - - lea rsi, [rsi + rax] ; next line -.next_row_sp: - movq xmm3, [rsi] ; load row + 1 - movq xmm5, [rsi + 8] ; load row + 1 - - punpcklbw xmm4, xmm3 - punpcklbw xmm2, xmm5 - - pmaddubsw xmm4, xmm1 - movq xmm7, [rsi + rax] ; load row + 2 - - pmaddubsw xmm2, xmm1 - movq xmm6, [rsi + rax + 8] ; load row + 2 - - punpcklbw xmm3, xmm7 - punpcklbw xmm5, xmm6 - - pmaddubsw xmm3, xmm1 - paddw xmm4, [GLOBAL(rd)] - - pmaddubsw xmm5, xmm1 - paddw xmm2, [GLOBAL(rd)] - - psraw xmm4, VP8_FILTER_SHIFT - psraw xmm2, VP8_FILTER_SHIFT - - packuswb xmm4, xmm2 - paddw xmm3, [GLOBAL(rd)] - - movdqa [rdi], xmm4 ; store row 0 - paddw xmm5, [GLOBAL(rd)] - - psraw xmm3, VP8_FILTER_SHIFT - psraw xmm5, VP8_FILTER_SHIFT - - packuswb xmm3, xmm5 - movdqa xmm4, xmm7 - - movdqa [rdi + rdx],xmm3 ; store row 1 - lea rsi, [rsi + 2*rax] - - movdqa xmm2, xmm6 - lea rdi, [rdi + 2*rdx] - - cmp rdi, rcx - jne .next_row_sp - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - -.next_row_fp: - movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm2, xmm4 - movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 - - pmaddubsw xmm2, xmm1 - movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rax] ; next line - punpcklbw xmm3, xmm4 - - pmaddubsw xmm3, xmm1 - movq xmm5, [rsi] - - paddw xmm2, [GLOBAL(rd)] - movq xmm7, [rsi+1] - - movq xmm6, [rsi+8] - psraw xmm2, VP8_FILTER_SHIFT - - punpcklbw xmm5, xmm7 - movq xmm7, [rsi+9] - - paddw xmm3, [GLOBAL(rd)] - pmaddubsw xmm5, xmm1 - - psraw xmm3, VP8_FILTER_SHIFT - punpcklbw xmm6, xmm7 - - packuswb xmm2, xmm3 - pmaddubsw xmm6, xmm1 - - movdqa [rdi], xmm2 ; store the results in the destination - paddw xmm5, [GLOBAL(rd)] - - lea rdi, [rdi + rdx] ; dst_pitch - psraw xmm5, VP8_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm6, VP8_FILTER_SHIFT - - packuswb xmm5, xmm6 - lea rsi, [rsi + rax] ; next line - - movdqa [rdi], xmm5 ; store the results in the destination - lea rdi, [rdi + rdx] ; dst_pitch - - cmp rdi, rcx - - jne .next_row_fp - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_bilinear_predict8x8_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE -sym(vp8_bilinear_predict8x8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ; xoffset - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b8x8_sp_only - - shl rax, 4 - add rax, rcx ; HFilter - - mov rdi, arg(4) ; dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b8x8_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm1, [rax] - - ; get the first horizontal line done - movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx - - psrldq xmm5, 1 - lea rsp, [rsp + 16] ; next line - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - lea rsp, [rsp + 16] ; next line - - movdqa xmm5, xmm6 - - psrldq xmm5, 1 - - punpcklbw xmm6, xmm5 - pmaddubsw xmm6, xmm0 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 - - packuswb xmm6, xmm6 - - punpcklbw xmm7, xmm6 - pmaddubsw xmm7, xmm1 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm7, xmm7 - - movq [rdi], xmm7 ; store the results in the destination - lea rdi, [rdi + rdx] - - movdqa xmm7, xmm6 - - cmp rdi, rcx - jne .next_row - - jmp .done8x8 - -.b8x8_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] ; VFilter - - movq xmm1, XMMWORD PTR [rsp] - movq xmm2, XMMWORD PTR [rsp+16] - - movq xmm3, XMMWORD PTR [rsp+32] - punpcklbw xmm1, xmm2 - - movq xmm4, XMMWORD PTR [rsp+48] - punpcklbw xmm2, xmm3 - - movq xmm5, XMMWORD PTR [rsp+64] - punpcklbw xmm3, xmm4 - - movq xmm6, XMMWORD PTR [rsp+80] - punpcklbw xmm4, xmm5 - - movq xmm7, XMMWORD PTR [rsp+96] - punpcklbw xmm5, xmm6 - - pmaddubsw xmm1, xmm0 - pmaddubsw xmm2, xmm0 - - pmaddubsw xmm3, xmm0 - pmaddubsw xmm4, xmm0 - - pmaddubsw xmm5, xmm0 - punpcklbw xmm6, xmm7 - - pmaddubsw xmm6, xmm0 - paddw xmm1, [GLOBAL(rd)] - - paddw xmm2, [GLOBAL(rd)] - psraw xmm1, VP8_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm2, VP8_FILTER_SHIFT - - paddw xmm4, [GLOBAL(rd)] - psraw xmm3, VP8_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm5, VP8_FILTER_SHIFT - - psraw xmm6, VP8_FILTER_SHIFT - packuswb xmm1, xmm1 - - packuswb xmm2, xmm2 - movq [rdi], xmm1 - - packuswb xmm3, xmm3 - movq [rdi+rdx], xmm2 - - packuswb xmm4, xmm4 - movq xmm1, XMMWORD PTR [rsp+112] - - lea rdi, [rdi + 2*rdx] - movq xmm2, XMMWORD PTR [rsp+128] - - packuswb xmm5, xmm5 - movq [rdi], xmm3 - - packuswb xmm6, xmm6 - movq [rdi+rdx], xmm4 - - lea rdi, [rdi + 2*rdx] - punpcklbw xmm7, xmm1 - - movq [rdi], xmm5 - pmaddubsw xmm7, xmm0 - - movq [rdi+rdx], xmm6 - punpcklbw xmm1, xmm2 - - pmaddubsw xmm1, xmm0 - paddw xmm7, [GLOBAL(rd)] - - psraw xmm7, VP8_FILTER_SHIFT - paddw xmm1, [GLOBAL(rd)] - - psraw xmm1, VP8_FILTER_SHIFT - packuswb xmm7, xmm7 - - packuswb xmm1, xmm1 - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm7 - - movq [rdi+rdx], xmm1 - lea rsp, [rsp + 144] - - jmp .done8x8 - -.b8x8_fp_only: - lea rcx, [rdi+rdx*8] - -.next_row_fp: - movdqa xmm1, XMMWORD PTR [rsp] - movdqa xmm3, XMMWORD PTR [rsp+16] - - movdqa xmm2, xmm1 - movdqa xmm5, XMMWORD PTR [rsp+32] - - psrldq xmm2, 1 - movdqa xmm7, XMMWORD PTR [rsp+48] - - movdqa xmm4, xmm3 - psrldq xmm4, 1 - - movdqa xmm6, xmm5 - psrldq xmm6, 1 - - punpcklbw xmm1, xmm2 - pmaddubsw xmm1, xmm0 - - punpcklbw xmm3, xmm4 - pmaddubsw xmm3, xmm0 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm0 - - movdqa xmm2, xmm7 - psrldq xmm2, 1 - - punpcklbw xmm7, xmm2 - pmaddubsw xmm7, xmm0 - - paddw xmm1, [GLOBAL(rd)] - psraw xmm1, VP8_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm3, VP8_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm5, VP8_FILTER_SHIFT - - paddw xmm7, [GLOBAL(rd)] - psraw xmm7, VP8_FILTER_SHIFT - - packuswb xmm1, xmm1 - packuswb xmm3, xmm3 - - packuswb xmm5, xmm5 - movq [rdi], xmm1 - - packuswb xmm7, xmm7 - movq [rdi+rdx], xmm3 - - lea rdi, [rdi + 2*rdx] - movq [rdi], xmm5 - - lea rsp, [rsp + 4*16] - movq [rdi+rdx], xmm7 - - lea rdi, [rdi + 2*rdx] - cmp rdi, rcx - - jne .next_row_fp - - lea rsp, [rsp + 16] - -.done8x8: - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -shuf2b: - db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 -shuf3b: - db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 - -align 16 -shuf2bfrom1: - db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 -align 16 -shuf3bfrom1: - db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 - -align 16 -rd: - times 8 dw 0x40 - -align 16 -k0_k5: - times 8 db 0, 0 ;placeholder - times 8 db 0, 0 - times 8 db 2, 1 - times 8 db 0, 0 - times 8 db 3, 3 - times 8 db 0, 0 - times 8 db 1, 2 - times 8 db 0, 0 -k1_k3: - times 8 db 0, 0 ;placeholder - times 8 db -6, 12 - times 8 db -11, 36 - times 8 db -9, 50 - times 8 db -16, 77 - times 8 db -6, 93 - times 8 db -8, 108 - times 8 db -1, 123 -k2_k4: - times 8 db 128, 0 ;placeholder - times 8 db 123, -1 - times 8 db 108, -8 - times 8 db 93, -6 - times 8 db 77, -16 - times 8 db 50, -9 - times 8 db 36, -11 - times 8 db 12, -6 -align 16 -vp8_bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 112, 16 - times 8 db 96, 32 - times 8 db 80, 48 - times 8 db 64, 64 - times 8 db 48, 80 - times 8 db 32, 96 - times 8 db 16, 112 - diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c b/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c deleted file mode 100644 index fb0b57eb1c..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c +++ /dev/null @@ -1,625 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vpx_ports/mem.h" -#include "filter_x86.h" - -extern const short vp8_six_tap_mmx[8][6*8]; - -extern void vp8_filter_block1d_h6_mmx -( - unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -); -extern void vp8_filter_block1dc_v6_mmx -( - unsigned short *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -); -extern void vp8_filter_block1d8_h6_sse2 -( - unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -); -extern void vp8_filter_block1d16_h6_sse2 -( - unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -); -extern void vp8_filter_block1d8_v6_sse2 -( - unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -); -extern void vp8_filter_block1d16_v6_sse2 -( - unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -); -extern void vp8_unpack_block1d16_h6_sse2 -( - unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_height, - unsigned int output_width -); -extern void vp8_filter_block1d8_h6_only_sse2 -( - unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_ptich, - unsigned int output_height, - const short *vp8_filter -); -extern void vp8_filter_block1d16_h6_only_sse2 -( - unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_ptich, - unsigned int output_height, - const short *vp8_filter -); -extern void vp8_filter_block1d8_v6_only_sse2 -( - unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_ptich, - unsigned int output_height, - const short *vp8_filter -); - - -#if HAVE_MMX -void vp8_sixtap_predict4x4_mmx -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - DECLARE_ALIGNED(16, unsigned short, FData2[16*16]); /* Temp data bufffer used in filtering */ - const short *HFilter, *VFilter; - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); - -} - - -void vp8_sixtap_predict16x16_mmx -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - - DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - - HFilter = vp8_six_tap_mmx[xoffset]; - - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); - -} - - -void vp8_sixtap_predict8x8_mmx -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - - DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); - -} - - -void vp8_sixtap_predict8x4_mmx -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - - DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); - vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); - - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); - vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); - -} - - - -void vp8_bilinear_predict16x16_mmx -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); - vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); -} -#endif - - -#if HAVE_SSE2 -void vp8_sixtap_predict16x16_sse2 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch - -) -{ - DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */ - - const short *HFilter, *VFilter; - - if (xoffset) - { - if (yoffset) - { - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); - } - else - { - /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); - } - } - else - { - /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); - vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); - } -} - - -void vp8_sixtap_predict8x8_sse2 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ - const short *HFilter, *VFilter; - - if (xoffset) - { - if (yoffset) - { - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); - } - else - { - /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); - } - } - else - { - /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); - } -} - - -void vp8_sixtap_predict8x4_sse2 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */ - const short *HFilter, *VFilter; - - if (xoffset) - { - if (yoffset) - { - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); - } - else - { - /* First-pass only */ - HFilter = vp8_six_tap_mmx[xoffset]; - vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); - } - } - else - { - /* Second-pass only */ - VFilter = vp8_six_tap_mmx[yoffset]; - vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); - } -} - -#endif - -#if HAVE_SSSE3 - -extern void vp8_filter_block1d8_h6_ssse3 -( - unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp8_filter_index -); - -extern void vp8_filter_block1d16_h6_ssse3 -( - unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp8_filter_index -); - -extern void vp8_filter_block1d16_v6_ssse3 -( - unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp8_filter_index -); - -extern void vp8_filter_block1d8_v6_ssse3 -( - unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp8_filter_index -); - -extern void vp8_filter_block1d4_h6_ssse3 -( - unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp8_filter_index -); - -extern void vp8_filter_block1d4_v6_ssse3 -( - unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp8_filter_index -); - -void vp8_sixtap_predict16x16_ssse3 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch - -) -{ - DECLARE_ALIGNED(16, unsigned char, FData2[24*24]); - - if (xoffset) - { - if (yoffset) - { - vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, FData2, - 16, 21, xoffset); - vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, - 16, yoffset); - } - else - { - /* First-pass only */ - vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, xoffset); - } - } - else - { - if (yoffset) - { - /* Second-pass only */ - vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 16, yoffset); - } - else - { - /* ssse3 second-pass only function couldn't handle (xoffset==0 && - * yoffset==0) case correctly. Add copy function here to guarantee - * six-tap function handles all possible offsets. */ - vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); - } - } -} - -void vp8_sixtap_predict8x8_ssse3 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - DECLARE_ALIGNED(16, unsigned char, FData2[256]); - - if (xoffset) - { - if (yoffset) - { - vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, FData2, - 8, 13, xoffset); - vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, - 8, yoffset); - } - else - { - vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, xoffset); - } - } - else - { - if (yoffset) - { - /* Second-pass only */ - vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, yoffset); - } - else - { - /* ssse3 second-pass only function couldn't handle (xoffset==0 && - * yoffset==0) case correctly. Add copy function here to guarantee - * six-tap function handles all possible offsets. */ - vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); - } - } -} - - -void vp8_sixtap_predict8x4_ssse3 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - DECLARE_ALIGNED(16, unsigned char, FData2[256]); - - if (xoffset) - { - if (yoffset) - { - vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, FData2, - 8, 9, xoffset); - vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, - 4, yoffset); - } - else - { - /* First-pass only */ - vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } - else - { - if (yoffset) - { - /* Second-pass only */ - vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } - else - { - /* ssse3 second-pass only function couldn't handle (xoffset==0 && - * yoffset==0) case correctly. Add copy function here to guarantee - * six-tap function handles all possible offsets. */ - vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); - } - } -} - -void vp8_sixtap_predict4x4_ssse3 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) -{ - DECLARE_ALIGNED(16, unsigned char, FData2[4*9]); - - if (xoffset) - { - if (yoffset) - { - vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - FData2, 4, 9, xoffset); - vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, - 4, yoffset); - } - else - { - vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } - else - { - if (yoffset) - { - vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } - else - { - /* ssse3 second-pass only function couldn't handle (xoffset==0 && - * yoffset==0) case correctly. Add copy function here to guarantee - * six-tap function handles all possible offsets. */ - int r; - - for (r = 0; r < 4; r++) - { - dst_ptr[0] = src_ptr[0]; - dst_ptr[1] = src_ptr[1]; - dst_ptr[2] = src_ptr[2]; - dst_ptr[3] = src_ptr[3]; - dst_ptr += dst_pitch; - src_ptr += src_pixels_per_line; - } - } - } -} - -#endif diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm b/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm deleted file mode 100644 index 88a07b9f3f..0000000000 --- a/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm +++ /dev/null @@ -1,1753 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -;void vp8_loop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE -sym(vp8_loop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_h: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 ; - - - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - - psubusb mm4, mm7 - por mm1, mm4 - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, [rsi] ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - pxor mm0, mm0 ; - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - psraw mm5, 11 - packsswb mm0, mm5 - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - movq mm6, [rsi+2*rax] ; p1 - pxor mm6, [GLOBAL(t80)] ; reoffset - paddsb mm6, mm4 ; p1+= p1 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+2*rax], mm6 ; write back - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - movq [rdi], mm7 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .next8_h - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE -sym(vp8_loop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 64 ; reserve 64 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_v: - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - - ;transpose - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - - punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 - punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - psubusb mm5, mm7 ; q2-q3 - - psubusb mm7, mm6 ; q3-q2 - por mm7, mm5; ; mm7=abs (q3-q2) - - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - - psubusb mm3, mm6 ; q1-q2 - psubusb mm6, mm5 ; q2-q1 - - por mm6, mm3 ; mm6=abs(q2-q1) - lea rdx, srct - - movq [rdx+24], mm5 ; save q1 - movq [rdx+16], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+8], mm3 ; save p0 - - movq [rdx], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 - - psubusb mm0, mm4 - psubusb mm1, mm4 - - psubusb mm6, mm4 - por mm7, mm6 - - por mm0, mm1 - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+16] ; mm5=q0 - movq mm7, [rdx+24] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save abs(q1-q0) - psubusb mm7, mm4 - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - ; start work on filters - lea rdx, srct - - movq mm2, [rdx] ; p1 - movq mm7, [rdx+24] ; q1 - - movq mm6, [rdx+8] ; p0 - movq mm0, [rdx+16] ; q0 - - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - - paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - pxor mm0, mm0 ; - - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - - psraw mm5, 11 - packsswb mm0, mm5 - - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - ; mm6=p0 ; - movq mm1, [rdx] ; p1 - pxor mm1, [GLOBAL(t80)] ; reoffset - - paddsb mm1, mm4 ; p1+= p1 add - pxor mm1, [GLOBAL(t80)] ; unoffset - ; mm6 = p0 mm1 = p1 - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; mm3 = q0 - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - ; mm7 = q1 - - ; transpose and write back - ; mm1 = 72 62 52 42 32 22 12 02 - ; mm6 = 73 63 53 43 33 23 13 03 - ; mm3 = 74 64 54 44 34 24 14 04 - ; mm7 = 75 65 55 45 35 25 15 05 - - movq mm2, mm1 ; 72 62 52 42 32 22 12 02 - punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 - - movq mm4, mm3 ; 74 64 54 44 34 24 14 04 - punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 - - punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 - punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 - - movq mm6, mm2 ; 33 32 23 22 13 12 03 02 - punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 - - punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 - movq mm5, mm1 ; 73 72 63 62 53 52 43 42 - - punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 - punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 - - - ; mm2 = 15 14 13 12 05 04 03 02 - ; mm6 = 35 34 33 32 25 24 23 22 - ; mm5 = 55 54 53 52 45 44 43 42 - ; mm1 = 75 74 73 72 65 64 63 62 - - - - movd [rsi+rax*4+2], mm2 - psrlq mm2, 32 - - movd [rdi+rax*4+2], mm2 - movd [rsi+rax*2+2], mm6 - - psrlq mm6, 32 - movd [rsi+rax+2],mm6 - - movd [rsi+2], mm1 - psrlq mm1, 32 - - movd [rdi+2], mm1 - neg rax - - movd [rdi+rax+2],mm5 - psrlq mm5, 32 - - movd [rdi+rax*2+2], mm5 - - lea rsi, [rsi+rax*8] - dec rcx - jnz .next8_v - - add rsp, 64 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE -sym(vp8_mbloop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbh: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 - - - ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm3=q1, mm7 = limit - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm5 = p0 - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, mm0 ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - - ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) - ; mm6 = p0, mm4=hev - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rsi+rax], mm6 - movq [rsi], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdi] - movq mm6, [rsi+rax*2] ; p1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi], mm3 - movq [rsi+rax*2], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - - movq mm6, [rdi+rax*4] - neg rax - movq mm3, [rdi+rax ] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdi+rax ], mm3 - neg rax - movq [rdi+rax*4], mm6 - -;EARLY_BREAK_OUT: - neg rax - add rsi,8 - dec rcx - jnz .next8_mbh - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_mbloop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE -sym(vp8_mbloop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_mbv: - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ;transpose - movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 - - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - - movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 - - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - lea rdx, srct - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - - movq [rdx+56], mm7 - psubusb mm5, mm7 ; q2-q3 - - - movq [rdx+48], mm6 - psubusb mm7, mm6 ; q3-q2 - - por mm7, mm5; ; mm7=abs (q3-q2) - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - psubusb mm3, mm6 ; q1-q2 - - psubusb mm6, mm5 ; q2-q1 - por mm6, mm3 ; mm6=abs(q2-q1) - - movq [rdx+40], mm5 ; save q1 - movq [rdx+32], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq [rdx], mm0 ; save p3 - movq [rdx+8], mm1 ; save p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+24], mm3 ; save p0 - - movq [rdx+16], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 ; abs(q3-q2) > limit - - psubusb mm0, mm4 ; abs(p3-p2) > limit - psubusb mm1, mm4 ; abs(p2-p1) > limit - - psubusb mm6, mm4 ; abs(q2-q1) > limit - por mm7, mm6 ; or - - por mm0, mm1 ; - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+32] ; mm5=q0 - movq mm7, [rdx+40] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save abs(q1-q0) - psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 ; abs(q1 - q0) > thresh - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 ; abs(p1 - p0)> thresh - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - - ; start work on filters - lea rdx, srct - - ; start work on filters - movq mm2, [rdx+16] ; p1 - movq mm7, [rdx+40] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - movq mm6, [rdx+24] ; p0 - movq mm0, [rdx+32] ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) - paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 - movq mm2, mm1 ; vp8_filter - pand mm2, mm4; ; Filter2 = vp8_filter & hev - - movq mm5, mm2 ; - paddsb mm5, [GLOBAL(t3)]; - - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm5 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm5 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - movq mm5, mm0 ; Filter2 - - paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) - pxor mm0, mm0 ; 0 - pxor mm7, mm7 ; 0 - - punpcklbw mm0, mm2 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - punpckhbw mm7, mm2 ; a0b0c0d0 - psraw mm7, 11 ; sign extended shift right by 3 - packsswb mm0, mm7 ; Filter2 >>=3; - - ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 - psubsb mm3, mm0 ; qs0 =qs0 - filter1 - paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 - - ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 - ; vp8_filter &= ~hev; - ; Filter2 = vp8_filter; - pandn mm4, mm1 ; vp8_filter&=~hev - - - ; mm3=qs0, mm4=filter2, mm6=ps0 - - ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); - ; s = vp8_signed_char_clamp(qs0 - u); - ; *oq0 = s^0x80; - ; s = vp8_signed_char_clamp(ps0 + u); - ; *op0 = s^0x80; - pxor mm0, mm0 - - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s27)] - pmulhw mm2, [GLOBAL(s27)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - psubsb mm3, mm1 - paddsb mm6, mm1 - - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - movq [rdx+24], mm6 - movq [rdx+32], mm3 - - ; roughly 2/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); - ; s = vp8_signed_char_clamp(qs1 - u); - ; *oq1 = s^0x80; - ; s = vp8_signed_char_clamp(ps1 + u); - ; *op1 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s18)] - pmulhw mm2, [GLOBAL(s18)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm3, [rdx + 40] - movq mm6, [rdx + 16] ; p1 - pxor mm3, [GLOBAL(t80)] - pxor mm6, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - movq [rdx + 40], mm3 - movq [rdx + 16], mm6 - - ; roughly 1/7th difference across boundary - ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); - ; s = vp8_signed_char_clamp(qs2 - u); - ; *oq2 = s^0x80; - ; s = vp8_signed_char_clamp(ps2 + u); - ; *op2 = s^0x80; - pxor mm1, mm1 - pxor mm2, mm2 - punpcklbw mm1, mm4 - punpckhbw mm2, mm4 - pmulhw mm1, [GLOBAL(s9)] - pmulhw mm2, [GLOBAL(s9)] - paddw mm1, [GLOBAL(s63)] - paddw mm2, [GLOBAL(s63)] - psraw mm1, 7 - psraw mm2, 7 - packsswb mm1, mm2 - - movq mm6, [rdx+ 8] - movq mm3, [rdx+48] - - pxor mm6, [GLOBAL(t80)] - pxor mm3, [GLOBAL(t80)] - - paddsb mm6, mm1 - psubsb mm3, mm1 - - pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 - pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 - - ; transpose and write back - movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 - movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 - - punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 - punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 - - movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 - movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 - - punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 - punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 - - movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 - punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 - - punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 - movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 - - punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 - punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 - - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 - - movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 - punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 - - movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 - punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 - - punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 - movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 - - punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 - punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 - - movq [rsi+rax*4], mm0 ; write out - movq [rdi+rax*4], mm6 ; write out - - movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 - punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 - - punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 - movq [rsi+rax*2], mm0 ; write out - - movq [rdi+rax*2], mm5 ; write out - movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 - - punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 - punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 - - movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 - punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 - - punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 - movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 - - movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 - punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 - - punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 - movq [rsi], mm0 ; write out - - movq [rdi], mm1 ; write out - neg rax - - punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 - punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 - - movq [rsi+rax*2], mm3 - movq [rdi+rax*2], mm4 - - lea rsi, [rsi+rax*8] - dec rcx - - jnz .next8_mbv - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE -sym(vp8_loop_filter_simple_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - mov rcx, 2 ; count -.nexts8_h: - mov rdx, arg(2) ;blimit ; get blimit - movq mm3, [rdx] ; - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movq mm1, [rsi+2*rax] ; p1 - movq mm0, [rdi] ; q1 - movq mm2, mm1 - movq mm7, mm0 - movq mm4, mm0 - psubusb mm0, mm1 ; q1-=p1 - psubusb mm1, mm4 ; p1-=q1 - por mm1, mm0 ; abs(p1-q1) - pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm1, 1 ; abs(p1-q1)/2 - - movq mm5, [rsi+rax] ; p0 - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - movq mm6, mm5 ; p0 - psubusb mm5, mm4 ; p0-=q0 - psubusb mm4, mm6 ; q0-=p0 - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm3, mm3 - pcmpeqb mm5, mm3 - - ; start work on filters - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) - pand mm5, mm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - movq mm1, mm5 ; get a copy of filters - psraw mm1, 11 ; arithmetic shift right 11 - psllw mm1, 8 ; shift left 8 to put it back - - por mm0, mm1 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .nexts8_h - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_loop_filter_simple_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE -sym(vp8_loop_filter_simple_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4- 2]; ; - mov rcx, 2 ; count -.nexts8_v: - - lea rdi, [rsi + rax]; - movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 - - movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - - movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 - movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 - - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - movq mm5, mm4 ; 53 43 52 42 51 41 50 40 - - punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 - punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 - - neg rax - - movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 - movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 - - punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 - movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 - - movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 - punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 - - movq mm2, mm0 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 13 03 12 02 11 01 10 00 - - punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 - movq mm3, mm2 ; 33 23 13 03 32 22 12 02 - - punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 - punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 - - punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 - - - ; calculate mask - movq mm6, mm0 ; p1 - movq mm7, mm3 ; q1 - psubusb mm7, mm6 ; q1-=p1 - psubusb mm6, mm3 ; p1-=q1 - por mm6, mm7 ; abs(p1-q1) - pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm6, 1 ; abs(p1-q1)/2 - - movq mm5, mm1 ; p0 - movq mm4, mm2 ; q0 - - psubusb mm5, mm2 ; p0-=q0 - psubusb mm4, mm1 ; q0-=p0 - - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] - - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm7, mm7 - pcmpeqb mm5, mm7 ; mm5 = mask - - ; start work on filters - movq t0, mm0 - movq t1, mm3 - - pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm0, mm3 ; p1 - q1 - movq mm6, mm1 ; p0 - - movq mm7, mm2 ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm7 ; offseted ; q0 - - psubsb mm7, mm6 ; q0 - p0 - paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) - - pand mm5, mm0 ; mask filter values we don't care about - - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - movq mm7, mm5 ; get a copy of filters - psraw mm7, 11 ; arithmetic shift right 11 - psllw mm7, 8 ; shift left 8 to put it back - - por mm0, mm7 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0sz add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - - movq mm0, t0 - movq mm4, t1 - - ; mm0 = 70 60 50 40 30 20 10 00 - ; mm6 = 71 61 51 41 31 21 11 01 - ; mm3 = 72 62 52 42 32 22 12 02 - ; mm4 = 73 63 53 43 33 23 13 03 - ; transpose back to write out - - movq mm1, mm0 ; - punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 - - punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 - movq mm2, mm3 ; - - punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 - movq mm5, mm1 ; 71 70 61 60 51 50 41 40 - - punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 - movq mm6, mm0 ; 31 30 21 20 11 10 01 00 - - punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 - punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 - - movd [rsi+rax*4], mm0 ; write 03 02 01 00 - punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 - - psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 - punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 - - movd [rdi+rax*4], mm0 ; write 13 12 11 10 - movd [rsi+rax*2], mm6 ; write 23 22 21 20 - - psrlq mm6, 32 ; 33 32 31 30 - movd [rsi], mm1 ; write 43 42 41 40 - - movd [rsi + rax], mm6 ; write 33 32 31 30 - neg rax - - movd [rsi + rax*2], mm5 ; write 63 62 61 60 - psrlq mm1, 32 ; 53 52 51 50 - - movd [rdi], mm1 ; write out 53 52 51 50 - psrlq mm5, 32 ; 73 72 71 70 - - movd [rdi + rax*2], mm5 ; write 73 72 71 70 - - lea rsi, [rsi+rax*8] ; next 8 - - dec rcx - jnz .nexts8_v - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, -; int y_stride, -; loop_filter_info *lfi) -;{ -; -; -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -;} - -SECTION_RODATA -align 16 -tfe: - times 8 db 0xfe -align 16 -t80: - times 8 db 0x80 -align 16 -t1s: - times 8 db 0x01 -align 16 -t3: - times 8 db 0x03 -align 16 -t4: - times 8 db 0x04 -align 16 -ones: - times 4 dw 0x0001 -align 16 -s27: - times 4 dw 0x1b00 -align 16 -s18: - times 4 dw 0x1200 -align 16 -s9: - times 4 dw 0x0900 -align 16 -s63: - times 4 dw 0x003f |