summaryrefslogtreecommitdiff
path: root/thirdparty/libvpx/vp8/common/x86
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/libvpx/vp8/common/x86')
-rw-r--r--thirdparty/libvpx/vp8/common/x86/copy_sse2.asm93
-rw-r--r--thirdparty/libvpx/vp8/common/x86/copy_sse3.asm146
-rw-r--r--thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm258
-rw-r--r--thirdparty/libvpx/vp8/common/x86/filter_x86.c35
-rw-r--r--thirdparty/libvpx/vp8/common/x86/filter_x86.h33
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c128
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c89
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm295
-rw-r--r--thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm708
-rw-r--r--thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm140
-rw-r--r--thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm121
-rw-r--r--thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm815
-rw-r--r--thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm1640
-rw-r--r--thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c198
-rw-r--r--thirdparty/libvpx/vp8/common/x86/recon_mmx.asm274
-rw-r--r--thirdparty/libvpx/vp8/common/x86/recon_sse2.asm116
-rw-r--r--thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm702
-rw-r--r--thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm1372
-rw-r--r--thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm1508
-rw-r--r--thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c625
-rw-r--r--thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm1753
21 files changed, 0 insertions, 11049 deletions
diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm
deleted file mode 100644
index 86fae26956..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm
+++ /dev/null
@@ -1,93 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vp8_copy32xn_sse2(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp8_copy32xn_sse2) PRIVATE
-sym(vp8_copy32xn_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;dst_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;dst_stride
- movsxd rcx, dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- movdqu xmm2, XMMWORD PTR [rsi + rax]
- movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqu xmm4, XMMWORD PTR [rsi]
- movdqu xmm5, XMMWORD PTR [rsi + 16]
- movdqu xmm6, XMMWORD PTR [rsi + rax]
- movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- movdqa XMMWORD PTR [rdi + rdx], xmm2
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
-
- lea rdi, [rdi+rdx*2]
-
- movdqa XMMWORD PTR [rdi], xmm4
- movdqa XMMWORD PTR [rdi + 16], xmm5
- movdqa XMMWORD PTR [rdi + rdx], xmm6
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
-
- lea rdi, [rdi+rdx*2]
-
- sub rcx, 4
- cmp rcx, 4
- jge .block_copy_sse2_loopx4
-
- cmp rcx, 0
- je .copy_is_done
-
-.block_copy_sse2_loop:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- lea rsi, [rsi+rax]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- lea rdi, [rdi+rdx]
-
- sub rcx, 1
- jne .block_copy_sse2_loop
-
-.copy_is_done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm b/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm
deleted file mode 100644
index d789a40ccf..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/copy_sse3.asm
+++ /dev/null
@@ -1,146 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define max_sad arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define max_sad [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define max_sad r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define max_sad
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-
-;void vp8_copy32xn_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp8_copy32xn_sse3) PRIVATE
-sym(vp8_copy32xn_sse3):
-
- STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
- lea end_ptr, [src_ptr+src_stride*2]
-
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
- movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
- movdqu xmm4, XMMWORD PTR [end_ptr]
- movdqu xmm5, XMMWORD PTR [end_ptr + 16]
- movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
- movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
-
- lea src_ptr, [src_ptr+src_stride*4]
-
- lea end_ptr, [ref_ptr+ref_stride*2]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
- movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
- movdqa XMMWORD PTR [end_ptr], xmm4
- movdqa XMMWORD PTR [end_ptr + 16], xmm5
- movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
- movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
- lea ref_ptr, [ref_ptr+ref_stride*4]
-
- sub height, 4
- cmp height, 4
- jge .block_copy_sse3_loopx4
-
- ;Check to see if there is more rows need to be copied.
- cmp height, 0
- je .copy_is_done
-
-.block_copy_sse3_loop:
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- lea src_ptr, [src_ptr+src_stride]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- lea ref_ptr, [ref_ptr+ref_stride]
-
- sub height, 1
- jne .block_copy_sse3_loop
-
-.copy_is_done:
- STACK_FRAME_DESTROY_X3
diff --git a/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm b/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm
deleted file mode 100644
index 4e551f00aa..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/dequantize_mmx.asm
+++ /dev/null
@@ -1,258 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
-global sym(vp8_dequantize_b_impl_mmx) PRIVATE
-sym(vp8_dequantize_b_impl_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;sq
- mov rdi, arg(1) ;dq
- mov rax, arg(2) ;q
-
- movq mm1, [rsi]
- pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers.
- movq [rdi], mm1
-
- movq mm1, [rsi+8]
- pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers.
- movq [rdi+8], mm1
-
- movq mm1, [rsi+16]
- pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers.
- movq [rdi+16], mm1
-
- movq mm1, [rsi+24]
- pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers.
- movq [rdi+24], mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void dequant_idct_add_mmx(
-;short *input, 0
-;short *dq, 1
-;unsigned char *dest, 2
-;int stride) 3
-global sym(vp8_dequant_idct_add_mmx) PRIVATE
-sym(vp8_dequant_idct_add_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- push rdi
- ; end prolog
-
- mov rax, arg(0) ;input
- mov rdx, arg(1) ;dq
-
-
- movq mm0, [rax ]
- pmullw mm0, [rdx]
-
- movq mm1, [rax +8]
- pmullw mm1, [rdx +8]
-
- movq mm2, [rax+16]
- pmullw mm2, [rdx+16]
-
- movq mm3, [rax+24]
- pmullw mm3, [rdx+24]
-
- mov rdx, arg(2) ;dest
-
- pxor mm7, mm7
-
-
- movq [rax], mm7
- movq [rax+8], mm7
-
- movq [rax+16],mm7
- movq [rax+24],mm7
-
-
- movsxd rdi, dword ptr arg(3) ;stride
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- movq mm3, mm5 ; 33 23 13 03
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- paddw mm0, [GLOBAL(fours)]
-
- paddw mm2, [GLOBAL(fours)]
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
- psraw mm2, 3
-
- psraw mm0, 3
- psraw mm4, 3
-
- psraw mm6, 3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- pxor mm7, mm7
-
- movd mm4, [rdx]
- punpcklbw mm4, mm7
- paddsw mm0, mm4
- packuswb mm0, mm7
- movd [rdx], mm0
-
- movd mm4, [rdx+rdi]
- punpcklbw mm4, mm7
- paddsw mm1, mm4
- packuswb mm1, mm7
- movd [rdx+rdi], mm1
-
- movd mm4, [rdx+2*rdi]
- punpcklbw mm4, mm7
- paddsw mm2, mm4
- packuswb mm2, mm7
- movd [rdx+rdi*2], mm2
-
- add rdx, rdi
-
- movd mm4, [rdx+2*rdi]
- punpcklbw mm4, mm7
- paddsw mm5, mm4
- packuswb mm5, mm7
- movd [rdx+rdi*2], mm5
-
- ; begin epilog
- pop rdi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
- times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 4 dw 0x4E7B
-align 16
-fours:
- times 4 dw 0x0004
diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.c b/thirdparty/libvpx/vp8/common/x86/filter_x86.c
deleted file mode 100644
index 7f496ed7db..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/filter_x86.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/x86/filter_x86.h"
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
-{
- { 128, 128, 128, 128, 0, 0, 0, 0 },
- { 112, 112, 112, 112, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 32, 32, 32, 32 },
- { 80, 80, 80, 80, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64 },
- { 48, 48, 48, 48, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 96, 96, 96, 96 },
- { 16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
-{
- { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
- { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
- { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
- { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
- { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
- { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
- { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
- { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
diff --git a/thirdparty/libvpx/vp8/common/x86/filter_x86.h b/thirdparty/libvpx/vp8/common/x86/filter_x86.h
deleted file mode 100644
index d282841bee..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/filter_x86.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP8_COMMON_X86_FILTER_X86_H_
-#define VP8_COMMON_X86_FILTER_X86_H_
-
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
- * duplicated values */
-
-/* duplicated 4x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
-
-/* duplicated 8x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // VP8_COMMON_X86_FILTER_X86_H_
diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c
deleted file mode 100644
index f2532b34da..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/idct_blk_mmx.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
-
-void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
-{
- short *sq = (short *) d->qcoeff;
- short *dq = (short *) d->dqcoeff;
-
- vp8_dequantize_b_impl_mmx(sq, dq, DQC);
-}
-
-void vp8_dequant_idct_add_y_block_mmx
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_mmx (q, dq, dst, stride);
- else if (eobs[0] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
- memset(q, 0, 2 * sizeof(q[0]));
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
- else if (eobs[1] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
- dst+4, stride);
- memset(q + 16, 0, 2 * sizeof(q[0]));
- }
-
- if (eobs[2] > 1)
- vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
- else if (eobs[2] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
- dst+8, stride);
- memset(q + 32, 0, 2 * sizeof(q[0]));
- }
-
- if (eobs[3] > 1)
- vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
- else if (eobs[3] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
- dst+12, stride);
- memset(q + 48, 0, 2 * sizeof(q[0]));
- }
-
- q += 64;
- dst += 4*stride;
- eobs += 4;
- }
-}
-
-void vp8_dequant_idct_add_uv_block_mmx
- (short *q, short *dq,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
-{
- int i;
-
- for (i = 0; i < 2; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
- else if (eobs[0] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
- memset(q, 0, 2 * sizeof(q[0]));
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
- else if (eobs[1] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
- dstu+4, stride);
- memset(q + 16, 0, 2 * sizeof(q[0]));
- }
-
- q += 32;
- dstu += 4*stride;
- eobs += 2;
- }
-
- for (i = 0; i < 2; i++)
- {
- if (eobs[0] > 1)
- vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
- else if (eobs[0] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
- memset(q, 0, 2 * sizeof(q[0]));
- }
-
- if (eobs[1] > 1)
- vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
- else if (eobs[1] == 1)
- {
- vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
- dstv+4, stride);
- memset(q + 16, 0, 2 * sizeof(q[0]));
- }
-
- q += 32;
- dstv += 4*stride;
- eobs += 2;
- }
-}
diff --git a/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c b/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c
deleted file mode 100644
index ae96ec858c..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/idct_blk_sse2.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-
-void vp8_idct_dequant_0_2x_sse2
- (short *q, short *dq ,
- unsigned char *dst, int dst_stride);
-void vp8_idct_dequant_full_2x_sse2
- (short *q, short *dq ,
- unsigned char *dst, int dst_stride);
-
-void vp8_dequant_idct_add_y_block_sse2
- (short *q, short *dq,
- unsigned char *dst, int stride, char *eobs)
-{
- int i;
-
- for (i = 0; i < 4; i++)
- {
- if (((short *)(eobs))[0])
- {
- if (((short *)(eobs))[0] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
- }
- if (((short *)(eobs))[1])
- {
- if (((short *)(eobs))[1] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
- else
- vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
- }
- q += 64;
- dst += stride*4;
- eobs += 4;
- }
-}
-
-void vp8_dequant_idct_add_uv_block_sse2
- (short *q, short *dq,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
-{
- if (((short *)(eobs))[0])
- {
- if (((short *)(eobs))[0] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
- }
- q += 32;
- dstu += stride*4;
-
- if (((short *)(eobs))[1])
- {
- if (((short *)(eobs))[1] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
- }
- q += 32;
-
- if (((short *)(eobs))[2])
- {
- if (((short *)(eobs))[2] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
- }
- q += 32;
- dstv += stride*4;
-
- if (((short *)(eobs))[3])
- {
- if (((short *)(eobs))[3] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
- }
-}
diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm
deleted file mode 100644
index 96fa2c60d0..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/idctllm_mmx.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; /****************************************************************************
-; * Notes:
-; *
-; * This implementation makes use of 16 bit fixed point version of two multiply
-; * constants:
-; * 1. sqrt(2) * cos (pi/8)
-; * 2. sqrt(2) * sin (pi/8)
-; * Because the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point precision as the second one, we use a trick of
-; * x * a = x + x*(a-1)
-; * so
-; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
-; *
-; * For the second constant, because of the 16bit version is 35468, which
-; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative
-; * number.
-; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
-; *
-; **************************************************************************/
-
-
-;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
-;int pitch, unsigned char *dest,int stride)
-global sym(vp8_short_idct4x4llm_mmx) PRIVATE
-sym(vp8_short_idct4x4llm_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(0) ;input
- mov rsi, arg(1) ;pred
-
- movq mm0, [rax ]
- movq mm1, [rax+ 8]
- movq mm2, [rax+16]
- movq mm3, [rax+24]
-
-%if 0
- pxor mm7, mm7
- movq [rax], mm7
- movq [rax+8], mm7
- movq [rax+16],mm7
- movq [rax+24],mm7
-%endif
- movsxd rax, dword ptr arg(2) ;pitch
- mov rdx, arg(3) ;dest
- movsxd rdi, dword ptr arg(4) ;stride
-
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- movq mm3, mm5 ; 33 23 13 03
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)];
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
-
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
-
- movq mm5, mm1
- movq mm4, mm3
-
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
-
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
-
- paddw mm3, mm5 ; d1
- paddw mm0, [GLOBAL(fours)]
-
- paddw mm2, [GLOBAL(fours)]
- movq mm6, mm2 ; a1
-
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
-
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
-
- psubw mm6, mm3 ;3
- psraw mm2, 3
-
- psraw mm0, 3
- psraw mm4, 3
-
- psraw mm6, 3
-
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
-
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
-
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
-
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
-
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
-
- pxor mm7, mm7
-
- movd mm4, [rsi]
- punpcklbw mm4, mm7
- paddsw mm0, mm4
- packuswb mm0, mm7
- movd [rdx], mm0
-
- movd mm4, [rsi+rax]
- punpcklbw mm4, mm7
- paddsw mm1, mm4
- packuswb mm1, mm7
- movd [rdx+rdi], mm1
-
- movd mm4, [rsi+2*rax]
- punpcklbw mm4, mm7
- paddsw mm2, mm4
- packuswb mm2, mm7
- movd [rdx+rdi*2], mm2
-
- add rdx, rdi
- add rsi, rax
-
- movd mm4, [rsi+2*rax]
- punpcklbw mm4, mm7
- paddsw mm5, mm4
- packuswb mm5, mm7
- movd [rdx+rdi*2], mm5
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_dc_only_idct_add_mmx(
-;short input_dc,
-;unsigned char *pred_ptr,
-;int pred_stride,
-;unsigned char *dst_ptr,
-;int stride)
-global sym(vp8_dc_only_idct_add_mmx) PRIVATE
-sym(vp8_dc_only_idct_add_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- ; end prolog
-
- movd mm5, arg(0) ;input_dc
- mov rax, arg(1) ;pred_ptr
- movsxd rdx, dword ptr arg(2) ;pred_stride
-
- pxor mm0, mm0
-
- paddw mm5, [GLOBAL(fours)]
- lea rcx, [rdx + rdx*2]
-
- psraw mm5, 3
-
- punpcklwd mm5, mm5
-
- punpckldq mm5, mm5
-
- movd mm1, [rax]
- movd mm2, [rax+rdx]
- movd mm3, [rax+2*rdx]
- movd mm4, [rax+rcx]
-
- mov rax, arg(3) ;d -- destination
- movsxd rdx, dword ptr arg(4) ;dst_stride
-
- punpcklbw mm1, mm0
- paddsw mm1, mm5
- packuswb mm1, mm0 ; pack and unpack to saturate
- lea rcx, [rdx + rdx*2]
-
- punpcklbw mm2, mm0
- paddsw mm2, mm5
- packuswb mm2, mm0 ; pack and unpack to saturate
-
- punpcklbw mm3, mm0
- paddsw mm3, mm5
- packuswb mm3, mm0 ; pack and unpack to saturate
-
- punpcklbw mm4, mm0
- paddsw mm4, mm5
- packuswb mm4, mm0 ; pack and unpack to saturate
-
- movd [rax], mm1
- movd [rax+rdx], mm2
- movd [rax+2*rdx], mm3
- movd [rax+rcx], mm4
-
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
- times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 4 dw 0x4E7B
-align 16
-fours:
- times 4 dw 0x0004
diff --git a/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm b/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm
deleted file mode 100644
index bf8e2c4021..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/idctllm_sse2.asm
+++ /dev/null
@@ -1,708 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_idct_dequant_0_2x_sse2
-; (
-; short *qcoeff - 0
-; short *dequant - 1
-; unsigned char *dst - 2
-; int dst_stride - 3
-; )
-
-global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
-sym(vp8_idct_dequant_0_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- ; end prolog
-
- mov rdx, arg(1) ; dequant
- mov rax, arg(0) ; qcoeff
-
- movd xmm4, [rax]
- movd xmm5, [rdx]
-
- pinsrw xmm4, [rax+32], 4
- pinsrw xmm5, [rdx], 4
-
- pmullw xmm4, xmm5
-
- ; Zero out xmm5, for use unpacking
- pxor xmm5, xmm5
-
- ; clear coeffs
- movd [rax], xmm5
- movd [rax+32], xmm5
-;pshufb
- mov rax, arg(2) ; dst
- movsxd rdx, dword ptr arg(3) ; dst_stride
-
- pshuflw xmm4, xmm4, 00000000b
- pshufhw xmm4, xmm4, 00000000b
-
- lea rcx, [rdx + rdx*2]
- paddw xmm4, [GLOBAL(fours)]
-
- psraw xmm4, 3
-
- movq xmm0, [rax]
- movq xmm1, [rax+rdx]
- movq xmm2, [rax+2*rdx]
- movq xmm3, [rax+rcx]
-
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
-
-
- ; Add to predict buffer
- paddw xmm0, xmm4
- paddw xmm1, xmm4
- paddw xmm2, xmm4
- paddw xmm3, xmm4
-
- ; pack up before storing
- packuswb xmm0, xmm5
- packuswb xmm1, xmm5
- packuswb xmm2, xmm5
- packuswb xmm3, xmm5
-
- ; store blocks back out
- movq [rax], xmm0
- movq [rax + rdx], xmm1
-
- lea rax, [rax + 2*rdx]
-
- movq [rax], xmm2
- movq [rax + rdx], xmm3
-
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_idct_dequant_full_2x_sse2
-; (
-; short *qcoeff - 0
-; short *dequant - 1
-; unsigned char *dst - 2
-; int dst_stride - 3
-; )
-global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
-sym(vp8_idct_dequant_full_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ; special case when 2 blocks have 0 or 1 coeffs
- ; dc is set as first coeff, so no need to load qcoeff
- mov rax, arg(0) ; qcoeff
- mov rdx, arg(1) ; dequant
- mov rdi, arg(2) ; dst
-
-
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
-
-
- ; note the transpose of xmm1 and xmm2, necessary for shuffle
- ; to spit out sensicle data
- movdqa xmm0, [rax]
- movdqa xmm2, [rax+16]
- movdqa xmm1, [rax+32]
- movdqa xmm3, [rax+48]
-
- ; Clear out coeffs
- movdqa [rax], xmm7
- movdqa [rax+16], xmm7
- movdqa [rax+32], xmm7
- movdqa [rax+48], xmm7
-
- ; dequantize qcoeff buffer
- pmullw xmm0, [rdx]
- pmullw xmm2, [rdx+16]
- pmullw xmm1, [rdx]
- pmullw xmm3, [rdx+16]
- movsxd rdx, dword ptr arg(3) ; dst_stride
-
- ; repack so block 0 row x and block 1 row x are together
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm4, xmm1
-
- pshufd xmm0, xmm0, 11011000b
- pshufd xmm1, xmm4, 11011000b
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
-
- pshufd xmm2, xmm2, 11011000b
- pshufd xmm3, xmm4, 11011000b
-
- ; first pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2 ;
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- lea rcx, [rdx + rdx*2] ;dst_stride * 3
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
-
- ; transpose for the second pass
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- ; second pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- paddw xmm0, [GLOBAL(fours)]
-
- paddw xmm2, [GLOBAL(fours)]
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
- psraw xmm2, 3
-
- psraw xmm0, 3
- psraw xmm4, 3
-
- psraw xmm6, 3
-
- ; transpose to save
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- pxor xmm7, xmm7
-
- ; Load up predict blocks
- movq xmm4, [rdi]
- movq xmm5, [rdi+rdx]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm0, xmm4
- paddw xmm1, xmm5
-
- movq xmm4, [rdi+2*rdx]
- movq xmm5, [rdi+rcx]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
-.finish:
-
- ; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
-
- ; store blocks back out
- movq [rdi], xmm0
- movq [rdi + rdx], xmm1
- movq [rdi + rdx*2], xmm2
- movq [rdi + rcx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_idct_dequant_dc_0_2x_sse2
-; (
-; short *qcoeff - 0
-; short *dequant - 1
-; unsigned char *dst - 2
-; int dst_stride - 3
-; short *dc - 4
-; )
-global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
-sym(vp8_idct_dequant_dc_0_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rdi
- ; end prolog
-
- ; special case when 2 blocks have 0 or 1 coeffs
- ; dc is set as first coeff, so no need to load qcoeff
- mov rax, arg(0) ; qcoeff
-
- mov rdi, arg(2) ; dst
- mov rdx, arg(4) ; dc
-
- ; Zero out xmm5, for use unpacking
- pxor xmm5, xmm5
-
- ; load up 2 dc words here == 2*16 = doubleword
- movd xmm4, [rdx]
-
- movsxd rdx, dword ptr arg(3) ; dst_stride
- lea rcx, [rdx + rdx*2]
- ; Load up predict blocks
- movq xmm0, [rdi]
- movq xmm1, [rdi+rdx*1]
- movq xmm2, [rdi+rdx*2]
- movq xmm3, [rdi+rcx]
-
- ; Duplicate and expand dc across
- punpcklwd xmm4, xmm4
- punpckldq xmm4, xmm4
-
- ; Rounding to dequant and downshift
- paddw xmm4, [GLOBAL(fours)]
- psraw xmm4, 3
-
- ; Predict buffer needs to be expanded from bytes to words
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
-
- ; Add to predict buffer
- paddw xmm0, xmm4
- paddw xmm1, xmm4
- paddw xmm2, xmm4
- paddw xmm3, xmm4
-
- ; pack up before storing
- packuswb xmm0, xmm5
- packuswb xmm1, xmm5
- packuswb xmm2, xmm5
- packuswb xmm3, xmm5
-
- ; store blocks back out
- movq [rdi], xmm0
- movq [rdi + rdx], xmm1
- movq [rdi + rdx*2], xmm2
- movq [rdi + rcx], xmm3
-
- ; begin epilog
- pop rdi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp8_idct_dequant_dc_full_2x_sse2
-; (
-; short *qcoeff - 0
-; short *dequant - 1
-; unsigned char *dst - 2
-; int dst_stride - 3
-; short *dc - 4
-; )
-global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
-sym(vp8_idct_dequant_dc_full_2x_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rdi
- ; end prolog
-
- ; special case when 2 blocks have 0 or 1 coeffs
- ; dc is set as first coeff, so no need to load qcoeff
- mov rax, arg(0) ; qcoeff
- mov rdx, arg(1) ; dequant
-
- mov rdi, arg(2) ; dst
-
- ; Zero out xmm7, for use unpacking
- pxor xmm7, xmm7
-
-
- ; note the transpose of xmm1 and xmm2, necessary for shuffle
- ; to spit out sensicle data
- movdqa xmm0, [rax]
- movdqa xmm2, [rax+16]
- movdqa xmm1, [rax+32]
- movdqa xmm3, [rax+48]
-
- ; Clear out coeffs
- movdqa [rax], xmm7
- movdqa [rax+16], xmm7
- movdqa [rax+32], xmm7
- movdqa [rax+48], xmm7
-
- ; dequantize qcoeff buffer
- pmullw xmm0, [rdx]
- pmullw xmm2, [rdx+16]
- pmullw xmm1, [rdx]
- pmullw xmm3, [rdx+16]
-
- ; DC component
- mov rdx, arg(4)
-
- ; repack so block 0 row x and block 1 row x are together
- movdqa xmm4, xmm0
- punpckldq xmm0, xmm1
- punpckhdq xmm4, xmm1
-
- pshufd xmm0, xmm0, 11011000b
- pshufd xmm1, xmm4, 11011000b
-
- movdqa xmm4, xmm2
- punpckldq xmm2, xmm3
- punpckhdq xmm4, xmm3
-
- pshufd xmm2, xmm2, 11011000b
- pshufd xmm3, xmm4, 11011000b
-
- ; insert DC component
- pinsrw xmm0, [rdx], 0
- pinsrw xmm0, [rdx+2], 4
-
- ; first pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2 ;
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
-
- ; transpose for the second pass
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- ; second pass
- psubw xmm0, xmm2 ; b1 = 0-2
- paddw xmm2, xmm2
-
- movdqa xmm5, xmm1
- paddw xmm2, xmm0 ; a1 = 0+2
-
- pmulhw xmm5, [GLOBAL(x_s1sqr2)]
- paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
-
- movdqa xmm7, xmm3
- pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
-
- paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw xmm7, xmm5 ; c1
-
- movdqa xmm5, xmm1
- movdqa xmm4, xmm3
-
- pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
- paddw xmm5, xmm1
-
- pmulhw xmm3, [GLOBAL(x_s1sqr2)]
- paddw xmm3, xmm4
-
- paddw xmm3, xmm5 ; d1
- paddw xmm0, [GLOBAL(fours)]
-
- paddw xmm2, [GLOBAL(fours)]
- movdqa xmm6, xmm2 ; a1
-
- movdqa xmm4, xmm0 ; b1
- paddw xmm2, xmm3 ;0
-
- paddw xmm4, xmm7 ;1
- psubw xmm0, xmm7 ;2
-
- psubw xmm6, xmm3 ;3
- psraw xmm2, 3
-
- psraw xmm0, 3
- psraw xmm4, 3
-
- psraw xmm6, 3
-
- ; transpose to save
- movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
- punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
- punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
-
- movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
- punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
- punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
-
-
- movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
- punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
- punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
-
- movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
- punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
- punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
-
-
- movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
- punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
- punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
-
- movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
- punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
- punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
-
- pshufd xmm0, xmm2, 11011000b
- pshufd xmm2, xmm1, 11011000b
-
- pshufd xmm1, xmm5, 11011000b
- pshufd xmm3, xmm7, 11011000b
-
- pxor xmm7, xmm7
-
- ; Load up predict blocks
- movsxd rdx, dword ptr arg(3) ; dst_stride
- movq xmm4, [rdi]
- movq xmm5, [rdi+rdx]
- lea rcx, [rdx + rdx*2]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm0, xmm4
- paddw xmm1, xmm5
-
- movq xmm4, [rdi+rdx*2]
- movq xmm5, [rdi+rcx]
-
- punpcklbw xmm4, xmm7
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm4
- paddw xmm3, xmm5
-
-.finish:
-
- ; pack up before storing
- packuswb xmm0, xmm7
- packuswb xmm1, xmm7
- packuswb xmm2, xmm7
- packuswb xmm3, xmm7
-
- ; Load destination stride before writing out,
- ; doesn't need to persist
- movsxd rdx, dword ptr arg(3) ; dst_stride
-
- ; store blocks back out
- movq [rdi], xmm0
- movq [rdi + rdx], xmm1
-
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm2
- movq [rdi + rdx], xmm3
-
-
- ; begin epilog
- pop rdi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-fours:
- times 8 dw 0x0004
-align 16
-x_s1sqr2:
- times 8 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 8 dw 0x4E7B
diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm
deleted file mode 100644
index 158c3b7458..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/iwalsh_mmx.asm
+++ /dev/null
@@ -1,140 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
-sym(vp8_short_inv_walsh4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- ; end prolog
-
- mov rdx, arg(0)
- mov rax, 30003h
-
- movq mm0, [rdx + 0] ;ip[0]
- movq mm1, [rdx + 8] ;ip[4]
- movq mm7, rax
-
- movq mm2, [rdx + 16] ;ip[8]
- movq mm3, [rdx + 24] ;ip[12]
- punpcklwd mm7, mm7 ;0003000300030003h
- mov rdx, arg(1)
-
- movq mm4, mm0
- movq mm5, mm1
-
- paddw mm4, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm4 ;temp al
- paddw mm4, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm1, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
- paddw mm0, mm1 ;dl + cl
- psubw mm5, mm1 ;dl - cl
-
- ; 03 02 01 00
- ; 13 12 11 10
- ; 23 22 21 20
- ; 33 32 31 30
-
- movq mm3, mm4 ; 03 02 01 00
- punpcklwd mm4, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm1, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm1, mm5 ; 33 23 32 22
-
- movq mm0, mm4 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
-;~~~~~~~~~~~~~~~~~~~~~
- movq mm1, mm0
- movq mm5, mm4
- paddw mm1, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm1 ;temp al
- paddw mm1, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
- paddw mm1, mm7
- paddw mm6, mm7
- psraw mm1, 3
- psraw mm6, 3
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm4, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
- paddw mm0, mm4 ;dl + cl
- psubw mm5, mm4 ;dl - cl
- paddw mm0, mm7
- paddw mm5, mm7
- psraw mm0, 3
- psraw mm5, 3
-;~~~~~~~~~~~~~~~~~~~~~
-
- movd eax, mm1
- movd ecx, mm0
- psrlq mm0, 32
- psrlq mm1, 32
- mov word ptr[rdx+32*0], ax
- mov word ptr[rdx+32*1], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*4], ax
- mov word ptr[rdx+32*5], cx
- movd eax, mm1
- movd ecx, mm0
- mov word ptr[rdx+32*8], ax
- mov word ptr[rdx+32*9], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*12], ax
- mov word ptr[rdx+32*13], cx
-
- movd eax, mm6
- movd ecx, mm5
- psrlq mm5, 32
- psrlq mm6, 32
- mov word ptr[rdx+32*2], ax
- mov word ptr[rdx+32*3], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*6], ax
- mov word ptr[rdx+32*7], cx
- movd eax, mm6
- movd ecx, mm5
- mov word ptr[rdx+32*10], ax
- mov word ptr[rdx+32*11], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*14], ax
- mov word ptr[rdx+32*15], cx
-
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
diff --git a/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm b/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm
deleted file mode 100644
index 06e86a80b6..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/iwalsh_sse2.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
-sym(vp8_short_inv_walsh4x4_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- ; end prolog
-
- mov rcx, arg(0)
- mov rdx, arg(1)
- mov rax, 30003h
-
- movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
- movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
-
-
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm0 ;ip[4] ip[0]
-
- paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm4, xmm0
- punpcklqdq xmm0, xmm3 ;d1 a1
- punpckhqdq xmm4, xmm3 ;c1 b1
-
- movdqa xmm1, xmm4 ;c1 b1
- paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- movd xmm0, eax
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
-
- pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
-
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm3 ;d1 a1
- punpckhqdq xmm5, xmm3 ;c1 b1
-
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-
- paddw xmm5, xmm0
- paddw xmm4, xmm0
- psraw xmm5, 3
- psraw xmm4, 3
-
- movd eax, xmm5
- movd ecx, xmm4
- psrldq xmm5, 4
- psrldq xmm4, 4
- mov word ptr[rdx+32*0], ax
- mov word ptr[rdx+32*2], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*4], ax
- mov word ptr[rdx+32*6], cx
- movd eax, xmm5
- movd ecx, xmm4
- psrldq xmm5, 4
- psrldq xmm4, 4
- mov word ptr[rdx+32*8], ax
- mov word ptr[rdx+32*10], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*12], ax
- mov word ptr[rdx+32*14], cx
-
- movd eax, xmm5
- movd ecx, xmm4
- psrldq xmm5, 4
- psrldq xmm4, 4
- mov word ptr[rdx+32*1], ax
- mov word ptr[rdx+32*3], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*5], ax
- mov word ptr[rdx+32*7], cx
- movd eax, xmm5
- movd ecx, xmm4
- mov word ptr[rdx+32*9], ax
- mov word ptr[rdx+32*11], cx
- shr eax, 16
- shr ecx, 16
- mov word ptr[rdx+32*13], ax
- mov word ptr[rdx+32*15], cx
-
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
deleted file mode 100644
index 6d5aaa19db..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
+++ /dev/null
@@ -1,815 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro LF_ABS 2
- ; %1 value not preserved
- ; %2 value preserved
- ; output in %1
- movdqa scratch1, %2 ; v2
-
- psubusb scratch1, %1 ; v2 - v1
- psubusb %1, %2 ; v1 - v2
- por %1, scratch1 ; abs(v2 - v1)
-%endmacro
-
-%macro LF_FILTER_HEV_MASK 8-9
-
- LF_ABS %1, %2 ; abs(p3 - p2)
- LF_ABS %2, %3 ; abs(p2 - p1)
- pmaxub %1, %2 ; accumulate mask
-%if %0 == 8
- movdqa scratch2, %3 ; save p1
- LF_ABS scratch2, %4 ; abs(p1 - p0)
-%endif
- LF_ABS %4, %5 ; abs(p0 - q0)
- LF_ABS %5, %6 ; abs(q0 - q1)
-%if %0 == 8
- pmaxub %5, scratch2 ; accumulate hev
-%else
- pmaxub %5, %9
-%endif
- pmaxub %1, %5 ; accumulate mask
-
- LF_ABS %3, %6 ; abs(p1 - q1)
- LF_ABS %6, %7 ; abs(q1 - q2)
- pmaxub %1, %6 ; accumulate mask
- LF_ABS %7, %8 ; abs(q2 - q3)
- pmaxub %1, %7 ; accumulate mask
-
- paddusb %4, %4 ; 2 * abs(p0 - q0)
- pand %3, [GLOBAL(tfe)]
- psrlw %3, 1 ; abs(p1 - q1) / 2
- paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
- psubusb %1, [limit]
- psubusb %4, [blimit]
- por %1, %4
- pcmpeqb %1, zero ; mask
-
- psubusb %5, [thresh]
- pcmpeqb %5, zero ; ~hev
-%endmacro
-
-%macro LF_FILTER 6
- ; %1-%4: p1-q1
- ; %5: mask
- ; %6: hev
-
- movdqa scratch2, %6 ; save hev
-
- pxor %1, [GLOBAL(t80)] ; ps1
- pxor %4, [GLOBAL(t80)] ; qs1
- movdqa scratch1, %1
- psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
- pandn scratch2, scratch1 ; vp8_filter &= hev
-
- pxor %2, [GLOBAL(t80)] ; ps0
- pxor %3, [GLOBAL(t80)] ; qs0
- movdqa scratch1, %3
- psubsb scratch1, %2 ; qs0 - ps0
- paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
- paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
- paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
- pand %5, scratch2 ; &= mask
-
- movdqa scratch2, %5
- paddsb %5, [GLOBAL(t4)] ; Filter1
- paddsb scratch2, [GLOBAL(t3)] ; Filter2
-
- ; Filter1 >> 3
- movdqa scratch1, zero
- pcmpgtb scratch1, %5
- psrlw %5, 3
- pand scratch1, [GLOBAL(te0)]
- pand %5, [GLOBAL(t1f)]
- por %5, scratch1
-
- psubsb %3, %5 ; qs0 - Filter1
- pxor %3, [GLOBAL(t80)]
-
- ; Filter2 >> 3
- movdqa scratch1, zero
- pcmpgtb scratch1, scratch2
- psrlw scratch2, 3
- pand scratch1, [GLOBAL(te0)]
- pand scratch2, [GLOBAL(t1f)]
- por scratch2, scratch1
-
- paddsb %2, scratch2 ; ps0 + Filter2
- pxor %2, [GLOBAL(t80)]
-
- ; outer tap adjustments
- paddsb %5, [GLOBAL(t1)]
- movdqa scratch1, zero
- pcmpgtb scratch1, %5
- psrlw %5, 1
- pand scratch1, [GLOBAL(t80)]
- pand %5, [GLOBAL(t7f)]
- por %5, scratch1
- pand %5, %6 ; vp8_filter &= ~hev
-
- psubsb %4, %5 ; qs1 - vp8_filter
- pxor %4, [GLOBAL(t80)]
-
- paddsb %1, %5 ; ps1 + vp8_filter
- pxor %1, [GLOBAL(t80)]
-%endmacro
-
-;void vp8_loop_filter_bh_y_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh
-;)
-global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
-sym(vp8_loop_filter_bh_y_sse2):
-
-%if LIBVPX_YASM_WIN64
- %define src rcx ; src_ptr
- %define stride rdx ; src_pixel_step
- %define blimit r8
- %define limit r9
- %define thresh r10
-
- %define spp rax
- %define stride3 r11
- %define stride5 r12
- %define stride7 r13
-
- push rbp
- mov rbp, rsp
- SAVE_XMM 11
- push r12
- push r13
- mov thresh, arg(4)
-%else
- %define src rdi ; src_ptr
- %define stride rsi ; src_pixel_step
- %define blimit rdx
- %define limit rcx
- %define thresh r8
-
- %define spp rax
- %define stride3 r9
- %define stride5 r10
- %define stride7 r11
-%endif
-
- %define scratch1 xmm5
- %define scratch2 xmm6
- %define zero xmm7
-
- %define i0 [src]
- %define i1 [spp]
- %define i2 [src + 2 * stride]
- %define i3 [spp + 2 * stride]
- %define i4 [src + 4 * stride]
- %define i5 [spp + 4 * stride]
- %define i6 [src + 2 * stride3]
- %define i7 [spp + 2 * stride3]
- %define i8 [src + 8 * stride]
- %define i9 [spp + 8 * stride]
- %define i10 [src + 2 * stride5]
- %define i11 [spp + 2 * stride5]
- %define i12 [src + 4 * stride3]
- %define i13 [spp + 4 * stride3]
- %define i14 [src + 2 * stride7]
- %define i15 [spp + 2 * stride7]
-
- ; prep work
- lea spp, [src + stride]
- lea stride3, [stride + 2 * stride]
- lea stride5, [stride3 + 2 * stride]
- lea stride7, [stride3 + 4 * stride]
- pxor zero, zero
-
- ; load the first set into registers
- movdqa xmm0, i0
- movdqa xmm1, i1
- movdqa xmm2, i2
- movdqa xmm3, i3
- movdqa xmm4, i4
- movdqa xmm8, i5
- movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
- movdqa xmm10, i7
-LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
-
- movdqa xmm1, i2
- movdqa xmm2, i3
- movdqa xmm3, i4
- movdqa xmm8, i5
-LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
- movdqa i2, xmm1
- movdqa i3, xmm2
-
-; second set
- movdqa i4, xmm3
- movdqa i5, xmm8
-
- movdqa xmm0, i6
- movdqa xmm1, i7
- movdqa xmm2, i8
- movdqa xmm4, i9
- movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
- movdqa xmm11, i11
-LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
-
- movdqa xmm0, i6
- movdqa xmm1, i7
- movdqa xmm4, i8
- movdqa xmm8, i9
-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
- movdqa i6, xmm0
- movdqa i7, xmm1
-
-; last set
- movdqa i8, xmm4
- movdqa i9, xmm8
-
- movdqa xmm0, i10
- movdqa xmm1, i11
- movdqa xmm2, i12
- movdqa xmm3, i13
- movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
- movdqa xmm11, i15
-LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
-
- movdqa xmm0, i10
- movdqa xmm1, i11
- movdqa xmm3, i12
- movdqa xmm8, i13
-LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
- movdqa i10, xmm0
- movdqa i11, xmm1
- movdqa i12, xmm3
- movdqa i13, xmm8
-
-%if LIBVPX_YASM_WIN64
- pop r13
- pop r12
- RESTORE_XMM
- pop rbp
-%endif
-
- ret
-
-
-;void vp8_loop_filter_bv_y_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh
-;)
-
-global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
-sym(vp8_loop_filter_bv_y_sse2):
-
-%if LIBVPX_YASM_WIN64
- %define src rcx ; src_ptr
- %define stride rdx ; src_pixel_step
- %define blimit r8
- %define limit r9
- %define thresh r10
-
- %define spp rax
- %define stride3 r11
- %define stride5 r12
- %define stride7 r13
-
- push rbp
- mov rbp, rsp
- SAVE_XMM 15
- push r12
- push r13
- mov thresh, arg(4)
-%else
- %define src rdi
- %define stride rsi
- %define blimit rdx
- %define limit rcx
- %define thresh r8
-
- %define spp rax
- %define stride3 r9
- %define stride5 r10
- %define stride7 r11
-%endif
-
- %define scratch1 xmm5
- %define scratch2 xmm6
- %define zero xmm7
-
- %define s0 [src]
- %define s1 [spp]
- %define s2 [src + 2 * stride]
- %define s3 [spp + 2 * stride]
- %define s4 [src + 4 * stride]
- %define s5 [spp + 4 * stride]
- %define s6 [src + 2 * stride3]
- %define s7 [spp + 2 * stride3]
- %define s8 [src + 8 * stride]
- %define s9 [spp + 8 * stride]
- %define s10 [src + 2 * stride5]
- %define s11 [spp + 2 * stride5]
- %define s12 [src + 4 * stride3]
- %define s13 [spp + 4 * stride3]
- %define s14 [src + 2 * stride7]
- %define s15 [spp + 2 * stride7]
-
- %define i0 [rsp]
- %define i1 [rsp + 16]
- %define i2 [rsp + 32]
- %define i3 [rsp + 48]
- %define i4 [rsp + 64]
- %define i5 [rsp + 80]
- %define i6 [rsp + 96]
- %define i7 [rsp + 112]
- %define i8 [rsp + 128]
- %define i9 [rsp + 144]
- %define i10 [rsp + 160]
- %define i11 [rsp + 176]
- %define i12 [rsp + 192]
- %define i13 [rsp + 208]
- %define i14 [rsp + 224]
- %define i15 [rsp + 240]
-
- ALIGN_STACK 16, rax
-
- ; reserve stack space
- %define temp_storage 0 ; size is 256 (16*16)
- %define stack_size 256
- sub rsp, stack_size
-
- ; prep work
- lea spp, [src + stride]
- lea stride3, [stride + 2 * stride]
- lea stride5, [stride3 + 2 * stride]
- lea stride7, [stride3 + 4 * stride]
-
- ; 8-f
- movdqa xmm0, s8
- movdqa xmm1, xmm0
- punpcklbw xmm0, s9 ; 80 90
- punpckhbw xmm1, s9 ; 88 98
-
- movdqa xmm2, s10
- movdqa xmm3, xmm2
- punpcklbw xmm2, s11 ; a0 b0
- punpckhbw xmm3, s11 ; a8 b8
-
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm2 ; 80 90 a0 b0
- punpckhwd xmm4, xmm2 ; 84 94 a4 b4
-
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm3 ; 88 98 a8 b8
- punpckhwd xmm2, xmm3 ; 8c 9c ac bc
-
- ; using xmm[0124]
- ; work on next 4 rows
-
- movdqa xmm3, s12
- movdqa xmm5, xmm3
- punpcklbw xmm3, s13 ; c0 d0
- punpckhbw xmm5, s13 ; c8 d8
-
- movdqa xmm6, s14
- movdqa xmm7, xmm6
- punpcklbw xmm6, s15 ; e0 f0
- punpckhbw xmm7, s15 ; e8 f8
-
- movdqa xmm8, xmm3
- punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
- punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
-
- movdqa xmm6, xmm5
- punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
- punpckhwd xmm6, xmm7 ; cc dc ec fc
-
- ; pull the third and fourth sets together
-
- movdqa xmm7, xmm0
- punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
- punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
-
- movdqa xmm3, xmm4
- punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
- punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
-
- movdqa xmm8, xmm1
- punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8
- punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
-
- movdqa xmm5, xmm2
- punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
- punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
-
- ; save the calculations. we only have 15 registers ...
- movdqa i0, xmm0
- movdqa i1, xmm7
- movdqa i2, xmm4
- movdqa i3, xmm3
- movdqa i4, xmm1
- movdqa i5, xmm8
- movdqa i6, xmm2
- movdqa i7, xmm5
-
- ; 0-7
- movdqa xmm0, s0
- movdqa xmm1, xmm0
- punpcklbw xmm0, s1 ; 00 10
- punpckhbw xmm1, s1 ; 08 18
-
- movdqa xmm2, s2
- movdqa xmm3, xmm2
- punpcklbw xmm2, s3 ; 20 30
- punpckhbw xmm3, s3 ; 28 38
-
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm2 ; 00 10 20 30
- punpckhwd xmm4, xmm2 ; 04 14 24 34
-
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm3 ; 08 18 28 38
- punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
-
- ; using xmm[0124]
- ; work on next 4 rows
-
- movdqa xmm3, s4
- movdqa xmm5, xmm3
- punpcklbw xmm3, s5 ; 40 50
- punpckhbw xmm5, s5 ; 48 58
-
- movdqa xmm6, s6
- movdqa xmm7, xmm6
- punpcklbw xmm6, s7 ; 60 70
- punpckhbw xmm7, s7 ; 68 78
-
- movdqa xmm8, xmm3
- punpcklwd xmm3, xmm6 ; 40 50 60 70
- punpckhwd xmm8, xmm6 ; 44 54 64 74
-
- movdqa xmm6, xmm5
- punpcklwd xmm5, xmm7 ; 48 58 68 78
- punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
-
- ; pull the first two sets together
-
- movdqa xmm7, xmm0
- punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
- punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
-
- movdqa xmm3, xmm4
- punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
- punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
-
- movdqa xmm8, xmm1
- punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
- punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
-
- movdqa xmm5, xmm2
- punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
- punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
- ; final combination
-
- movdqa xmm6, xmm0
- punpcklqdq xmm0, i0
- punpckhqdq xmm6, i0
-
- movdqa xmm9, xmm7
- punpcklqdq xmm7, i1
- punpckhqdq xmm9, i1
-
- movdqa xmm10, xmm4
- punpcklqdq xmm4, i2
- punpckhqdq xmm10, i2
-
- movdqa xmm11, xmm3
- punpcklqdq xmm3, i3
- punpckhqdq xmm11, i3
-
- movdqa xmm12, xmm1
- punpcklqdq xmm1, i4
- punpckhqdq xmm12, i4
-
- movdqa xmm13, xmm8
- punpcklqdq xmm8, i5
- punpckhqdq xmm13, i5
-
- movdqa xmm14, xmm2
- punpcklqdq xmm2, i6
- punpckhqdq xmm14, i6
-
- movdqa xmm15, xmm5
- punpcklqdq xmm5, i7
- punpckhqdq xmm15, i7
-
- movdqa i0, xmm0
- movdqa i1, xmm6
- movdqa i2, xmm7
- movdqa i3, xmm9
- movdqa i4, xmm4
- movdqa i5, xmm10
- movdqa i6, xmm3
- movdqa i7, xmm11
- movdqa i8, xmm1
- movdqa i9, xmm12
- movdqa i10, xmm8
- movdqa i11, xmm13
- movdqa i12, xmm2
- movdqa i13, xmm14
- movdqa i14, xmm5
- movdqa i15, xmm15
-
-; TRANSPOSED DATA AVAILABLE ON THE STACK
-
- movdqa xmm12, xmm6
- movdqa xmm13, xmm7
-
- pxor zero, zero
-
-LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
-
- movdqa xmm1, i2
- movdqa xmm2, i3
- movdqa xmm8, i4
- movdqa xmm9, i5
-LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
- movdqa i2, xmm1
- movdqa i3, xmm2
-
-; second set
- movdqa i4, xmm8
- movdqa i5, xmm9
-
- movdqa xmm0, i6
- movdqa xmm1, i7
- movdqa xmm2, i8
- movdqa xmm4, i9
- movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
- movdqa xmm11, i11
-LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
-
- movdqa xmm0, i6
- movdqa xmm1, i7
- movdqa xmm3, i8
- movdqa xmm4, i9
-LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
- movdqa i6, xmm0
- movdqa i7, xmm1
-
-; last set
- movdqa i8, xmm3
- movdqa i9, xmm4
-
- movdqa xmm0, i10
- movdqa xmm1, i11
- movdqa xmm2, i12
- movdqa xmm8, i13
- movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
- movdqa xmm11, i15
-LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
-
- movdqa xmm0, i10
- movdqa xmm1, i11
- movdqa xmm4, i12
- movdqa xmm8, i13
-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
- movdqa i10, xmm0
- movdqa i11, xmm1
- movdqa i12, xmm4
- movdqa i13, xmm8
-
-
-; RESHUFFLE AND WRITE OUT
- ; 8-f
- movdqa xmm0, i8
- movdqa xmm1, xmm0
- punpcklbw xmm0, i9 ; 80 90
- punpckhbw xmm1, i9 ; 88 98
-
- movdqa xmm2, i10
- movdqa xmm3, xmm2
- punpcklbw xmm2, i11 ; a0 b0
- punpckhbw xmm3, i11 ; a8 b8
-
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm2 ; 80 90 a0 b0
- punpckhwd xmm4, xmm2 ; 84 94 a4 b4
-
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm3 ; 88 98 a8 b8
- punpckhwd xmm2, xmm3 ; 8c 9c ac bc
-
- ; using xmm[0124]
- ; work on next 4 rows
-
- movdqa xmm3, i12
- movdqa xmm5, xmm3
- punpcklbw xmm3, i13 ; c0 d0
- punpckhbw xmm5, i13 ; c8 d8
-
- movdqa xmm6, i14
- movdqa xmm7, xmm6
- punpcklbw xmm6, i15 ; e0 f0
- punpckhbw xmm7, i15 ; e8 f8
-
- movdqa xmm8, xmm3
- punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
- punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
-
- movdqa xmm6, xmm5
- punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
- punpckhwd xmm6, xmm7 ; cc dc ec fc
-
- ; pull the third and fourth sets together
-
- movdqa xmm7, xmm0
- punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
- punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
-
- movdqa xmm3, xmm4
- punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
- punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
-
- movdqa xmm8, xmm1
- punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8
- punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
-
- movdqa xmm5, xmm2
- punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
- punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
-
- ; save the calculations. we only have 15 registers ...
- movdqa i8, xmm0
- movdqa i9, xmm7
- movdqa i10, xmm4
- movdqa i11, xmm3
- movdqa i12, xmm1
- movdqa i13, xmm8
- movdqa i14, xmm2
- movdqa i15, xmm5
-
- ; 0-7
- movdqa xmm0, i0
- movdqa xmm1, xmm0
- punpcklbw xmm0, i1 ; 00 10
- punpckhbw xmm1, i1 ; 08 18
-
- movdqa xmm2, i2
- movdqa xmm3, xmm2
- punpcklbw xmm2, i3 ; 20 30
- punpckhbw xmm3, i3 ; 28 38
-
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm2 ; 00 10 20 30
- punpckhwd xmm4, xmm2 ; 04 14 24 34
-
- movdqa xmm2, xmm1
- punpcklwd xmm1, xmm3 ; 08 18 28 38
- punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
-
- ; using xmm[0124]
- ; work on next 4 rows
-
- movdqa xmm3, i4
- movdqa xmm5, xmm3
- punpcklbw xmm3, i5 ; 40 50
- punpckhbw xmm5, i5 ; 48 58
-
- movdqa xmm6, i6
- movdqa xmm7, xmm6
- punpcklbw xmm6, i7 ; 60 70
- punpckhbw xmm7, i7 ; 68 78
-
- movdqa xmm8, xmm3
- punpcklwd xmm3, xmm6 ; 40 50 60 70
- punpckhwd xmm8, xmm6 ; 44 54 64 74
-
- movdqa xmm6, xmm5
- punpcklwd xmm5, xmm7 ; 48 58 68 78
- punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
-
- ; pull the first two sets together
-
- movdqa xmm7, xmm0
- punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
- punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
-
- movdqa xmm3, xmm4
- punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
- punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
-
- movdqa xmm8, xmm1
- punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
- punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
-
- movdqa xmm5, xmm2
- punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
- punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
- ; final combination
-
- movdqa xmm6, xmm0
- punpcklqdq xmm0, i8
- punpckhqdq xmm6, i8
-
- movdqa xmm9, xmm7
- punpcklqdq xmm7, i9
- punpckhqdq xmm9, i9
-
- movdqa xmm10, xmm4
- punpcklqdq xmm4, i10
- punpckhqdq xmm10, i10
-
- movdqa xmm11, xmm3
- punpcklqdq xmm3, i11
- punpckhqdq xmm11, i11
-
- movdqa xmm12, xmm1
- punpcklqdq xmm1, i12
- punpckhqdq xmm12, i12
-
- movdqa xmm13, xmm8
- punpcklqdq xmm8, i13
- punpckhqdq xmm13, i13
-
- movdqa xmm14, xmm2
- punpcklqdq xmm2, i14
- punpckhqdq xmm14, i14
-
- movdqa xmm15, xmm5
- punpcklqdq xmm5, i15
- punpckhqdq xmm15, i15
-
- movdqa s0, xmm0
- movdqa s1, xmm6
- movdqa s2, xmm7
- movdqa s3, xmm9
- movdqa s4, xmm4
- movdqa s5, xmm10
- movdqa s6, xmm3
- movdqa s7, xmm11
- movdqa s8, xmm1
- movdqa s9, xmm12
- movdqa s10, xmm8
- movdqa s11, xmm13
- movdqa s12, xmm2
- movdqa s13, xmm14
- movdqa s14, xmm5
- movdqa s15, xmm15
-
- ; free stack space
- add rsp, stack_size
-
- ; un-ALIGN_STACK
- pop rsp
-
-%if LIBVPX_YASM_WIN64
- pop r13
- pop r12
- RESTORE_XMM
- pop rbp
-%endif
-
- ret
-
-SECTION_RODATA
-align 16
-te0:
- times 16 db 0xe0
-align 16
-t7f:
- times 16 db 0x7f
-align 16
-tfe:
- times 16 db 0xfe
-align 16
-t1f:
- times 16 db 0x1f
-align 16
-t80:
- times 16 db 0x80
-align 16
-t1:
- times 16 db 0x01
-align 16
-t3:
- times 16 db 0x03
-align 16
-t4:
- times 16 db 0x04
diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm b/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm
deleted file mode 100644
index 1913abc69b..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/loopfilter_sse2.asm
+++ /dev/null
@@ -1,1640 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%define _t0 0
-%define _t1 _t0 + 16
-%define _p3 _t1 + 16
-%define _p2 _p3 + 16
-%define _p1 _p2 + 16
-%define _p0 _p1 + 16
-%define _q0 _p0 + 16
-%define _q1 _q0 + 16
-%define _q2 _q1 + 16
-%define _q3 _q2 + 16
-%define lf_var_size 160
-
-; Use of pmaxub instead of psubusb to compute filter mask was seen
-; in ffvp8
-
-%macro LFH_FILTER_AND_HEV_MASK 1
-%if %1
- movdqa xmm2, [rdi+2*rax] ; q3
- movdqa xmm1, [rsi+2*rax] ; q2
- movdqa xmm4, [rsi+rax] ; q1
- movdqa xmm5, [rsi] ; q0
- neg rax ; negate pitch to deal with above border
-%else
- movlps xmm2, [rsi + rcx*2] ; q3
- movlps xmm1, [rsi + rcx] ; q2
- movlps xmm4, [rsi] ; q1
- movlps xmm5, [rsi + rax] ; q0
-
- movhps xmm2, [rdi + rcx*2]
- movhps xmm1, [rdi + rcx]
- movhps xmm4, [rdi]
- movhps xmm5, [rdi + rax]
-
- lea rsi, [rsi + rax*4]
- lea rdi, [rdi + rax*4]
-
- movdqa [rsp+_q2], xmm1 ; store q2
- movdqa [rsp+_q1], xmm4 ; store q1
-%endif
- movdqa xmm7, [rdx] ;limit
-
- movdqa xmm6, xmm1 ; q2
- movdqa xmm3, xmm4 ; q1
-
- psubusb xmm1, xmm2 ; q2-=q3
- psubusb xmm2, xmm6 ; q3-=q2
-
- psubusb xmm4, xmm6 ; q1-=q2
- psubusb xmm6, xmm3 ; q2-=q1
-
- por xmm4, xmm6 ; abs(q2-q1)
- por xmm1, xmm2 ; abs(q3-q2)
-
- movdqa xmm0, xmm5 ; q0
- pmaxub xmm1, xmm4
-
- psubusb xmm5, xmm3 ; q0-=q1
- psubusb xmm3, xmm0 ; q1-=q0
-
- por xmm5, xmm3 ; abs(q0-q1)
- movdqa [rsp+_t0], xmm5 ; save to t0
-
- pmaxub xmm1, xmm5
-
-%if %1
- movdqa xmm2, [rsi+4*rax] ; p3
- movdqa xmm4, [rdi+4*rax] ; p2
- movdqa xmm6, [rsi+2*rax] ; p1
-%else
- movlps xmm2, [rsi + rax] ; p3
- movlps xmm4, [rsi] ; p2
- movlps xmm6, [rsi + rcx] ; p1
-
- movhps xmm2, [rdi + rax]
- movhps xmm4, [rdi]
- movhps xmm6, [rdi + rcx]
-
- movdqa [rsp+_p2], xmm4 ; store p2
- movdqa [rsp+_p1], xmm6 ; store p1
-%endif
-
- movdqa xmm5, xmm4 ; p2
- movdqa xmm3, xmm6 ; p1
-
- psubusb xmm4, xmm2 ; p2-=p3
- psubusb xmm2, xmm5 ; p3-=p2
-
- psubusb xmm3, xmm5 ; p1-=p2
- pmaxub xmm1, xmm4 ; abs(p3 - p2)
-
- psubusb xmm5, xmm6 ; p2-=p1
- pmaxub xmm1, xmm2 ; abs(p3 - p2)
-
- pmaxub xmm1, xmm5 ; abs(p2 - p1)
- movdqa xmm2, xmm6 ; p1
-
- pmaxub xmm1, xmm3 ; abs(p2 - p1)
-%if %1
- movdqa xmm4, [rsi+rax] ; p0
- movdqa xmm3, [rdi] ; q1
-%else
- movlps xmm4, [rsi + rcx*2] ; p0
- movhps xmm4, [rdi + rcx*2]
- movdqa xmm3, [rsp+_q1] ; q1
-%endif
-
- movdqa xmm5, xmm4 ; p0
- psubusb xmm4, xmm6 ; p0-=p1
-
- psubusb xmm6, xmm5 ; p1-=p0
-
- por xmm6, xmm4 ; abs(p1 - p0)
- mov rdx, arg(2) ; get blimit
-
- movdqa [rsp+_t1], xmm6 ; save to t1
-
- movdqa xmm4, xmm3 ; q1
- pmaxub xmm1, xmm6
-
- psubusb xmm3, xmm2 ; q1-=p1
- psubusb xmm2, xmm4 ; p1-=q1
-
- psubusb xmm1, xmm7
- por xmm2, xmm3 ; abs(p1-q1)
-
- movdqa xmm7, [rdx] ; blimit
- mov rdx, arg(4) ; hev get thresh
-
- movdqa xmm3, xmm0 ; q0
- pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
-
- movdqa xmm6, xmm5 ; p0
- psrlw xmm2, 1 ; abs(p1-q1)/2
-
- psubusb xmm5, xmm3 ; p0-=q0
- psubusb xmm3, xmm6 ; q0-=p0
- por xmm5, xmm3 ; abs(p0 - q0)
-
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
-
- movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0)
- movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0)
-
- paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- movdqa xmm2, [rdx] ; hev
-
- psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- psubusb xmm4, xmm2 ; hev
-
- psubusb xmm3, xmm2 ; hev
- por xmm1, xmm5
-
- pxor xmm7, xmm7
- paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- pcmpeqb xmm4, xmm5 ; hev
- pcmpeqb xmm3, xmm3 ; hev
-
- pcmpeqb xmm1, xmm7 ; mask xmm1
- pxor xmm4, xmm3 ; hev
-%endmacro
-
-%macro B_FILTER 1
- movdqa xmm3, [GLOBAL(t80)]
-%if %1 == 0
- movdqa xmm2, [rsp+_p1] ; p1
- movdqa xmm7, [rsp+_q1] ; q1
-%elif %1 == 1
- movdqa xmm2, [rsi+2*rax] ; p1
- movdqa xmm7, [rdi] ; q1
-%elif %1 == 2
- movdqa xmm2, [rsp+_p1] ; p1
- movdqa xmm6, [rsp+_p0] ; p0
- movdqa xmm0, [rsp+_q0] ; q0
- movdqa xmm7, [rsp+_q1] ; q1
-%endif
-
- pxor xmm2, xmm3 ; p1 offset to convert to signed values
- pxor xmm7, xmm3 ; q1 offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, xmm3 ; offset to convert to signed values
-
- pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
- pxor xmm0, xmm3 ; offset to convert to signed values
-
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1
- paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- punpckhbw xmm5, xmm2 ; axbxcxdx
- punpcklbw xmm2, xmm2 ; exfxgxhx
-
- punpcklbw xmm0, xmm1 ; exfxgxhx
- psraw xmm5, 11 ; sign extended shift right by 3
-
- punpckhbw xmm1, xmm1 ; axbxcxdx
- psraw xmm2, 11 ; sign extended shift right by 3
-
- packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
- psraw xmm0, 11 ; sign extended shift right by 3
-
- psraw xmm1, 11 ; sign extended shift right by 3
- movdqa xmm5, xmm0 ; save results
-
- packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-
- paddsb xmm6, xmm2 ; p0+= p0 add
-
- movdqa xmm2, [GLOBAL(ones)]
- paddsw xmm5, xmm2
- paddsw xmm1, xmm2
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
- psraw xmm1, 1 ; partial shifted one more time for 2nd tap
- packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
- movdqa xmm2, [GLOBAL(t80)]
-
-%if %1 == 0
- movdqa xmm1, [rsp+_p1] ; p1
- lea rsi, [rsi + rcx*2]
- lea rdi, [rdi + rcx*2]
-%elif %1 == 1
- movdqa xmm1, [rsi+2*rax] ; p1
-%elif %1 == 2
- movdqa xmm1, [rsp+_p1] ; p1
-%endif
-
- pandn xmm4, xmm5 ; high edge variance additive
- pxor xmm6, xmm2 ; unoffset
-
- pxor xmm1, xmm2 ; reoffset
- psubsb xmm3, xmm0 ; q0-= q0 add
-
- paddsb xmm1, xmm4 ; p1+= p1 add
- pxor xmm3, xmm2 ; unoffset
-
- pxor xmm1, xmm2 ; unoffset
- psubsb xmm7, xmm4 ; q1-= q1 add
-
- pxor xmm7, xmm2 ; unoffset
-%if %1 == 0
- movq [rsi], xmm6 ; p0
- movhps [rdi], xmm6
- movq [rsi + rax], xmm1 ; p1
- movhps [rdi + rax], xmm1
- movq [rsi + rcx], xmm3 ; q0
- movhps [rdi + rcx], xmm3
- movq [rsi + rcx*2], xmm7 ; q1
- movhps [rdi + rcx*2], xmm7
-%elif %1 == 1
- movdqa [rsi+rax], xmm6 ; write back
- movdqa [rsi+2*rax], xmm1 ; write back
- movdqa [rsi], xmm3 ; write back
- movdqa [rdi], xmm7 ; write back
-%endif
-
-%endmacro
-
-%if ABI_IS_32BIT
-
-;void vp8_loop_filter_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-;)
-global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
-sym(vp8_loop_filter_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step
-
- mov rdx, arg(3) ;limit
-
- lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 1
- ; filter and write back the result
- B_FILTER 1
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%endif
-
-;void vp8_loop_filter_horizontal_edge_uv_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
-sym(vp8_loop_filter_horizontal_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ; u
- mov rdi, arg(5) ; v
- movsxd rax, dword ptr arg(1) ; src_pixel_step
- mov rcx, rax
- neg rax ; negate pitch to deal with above border
-
- mov rdx, arg(3) ;limit
-
- lea rsi, [rsi + rcx]
- lea rdi, [rdi + rcx]
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 0
- ; filter and write back the result
- B_FILTER 0
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-%macro MB_FILTER_AND_WRITEBACK 1
- movdqa xmm3, [GLOBAL(t80)]
-%if %1 == 0
- movdqa xmm2, [rsp+_p1] ; p1
- movdqa xmm7, [rsp+_q1] ; q1
-%elif %1 == 1
- movdqa xmm2, [rsi+2*rax] ; p1
- movdqa xmm7, [rdi] ; q1
-
- mov rcx, rax
- neg rcx
-%elif %1 == 2
- movdqa xmm2, [rsp+_p1] ; p1
- movdqa xmm6, [rsp+_p0] ; p0
- movdqa xmm0, [rsp+_q0] ; q0
- movdqa xmm7, [rsp+_q1] ; q1
-%endif
-
- pxor xmm2, xmm3 ; p1 offset to convert to signed values
- pxor xmm7, xmm3 ; q1 offset to convert to signed values
- pxor xmm6, xmm3 ; offset to convert to signed values
- pxor xmm0, xmm3 ; offset to convert to signed values
-
- psubsb xmm2, xmm7 ; p1 - q1
-
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
- paddsb xmm2, xmm0 ; 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
- pand xmm1, xmm2 ; mask filter values we don't care about
-
- movdqa xmm2, xmm1 ; vp8_filter
-
- pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
- pxor xmm0, xmm0
-
- pandn xmm4, xmm1 ; vp8_filter&=~hev
- pxor xmm1, xmm1
-
- punpcklbw xmm0, xmm4 ; Filter 2 (hi)
- punpckhbw xmm1, xmm4 ; Filter 2 (lo)
-
- movdqa xmm5, xmm2
-
- movdqa xmm4, [GLOBAL(s9)]
- paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
- paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
-
- pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
- pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
-
- punpckhbw xmm7, xmm5 ; axbxcxdx
- punpcklbw xmm5, xmm5 ; exfxgxhx
-
- psraw xmm7, 11 ; sign extended shift right by 3
-
- psraw xmm5, 11 ; sign extended shift right by 3
- punpckhbw xmm4, xmm2 ; axbxcxdx
-
- punpcklbw xmm2, xmm2 ; exfxgxhx
- psraw xmm4, 11 ; sign extended shift right by 3
-
- packsswb xmm5, xmm7 ; Filter2 >>=3;
- psraw xmm2, 11 ; sign extended shift right by 3
-
- packsswb xmm2, xmm4 ; Filter1 >>=3;
-
- paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
-
- psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
- movdqa xmm7, xmm1
-
- movdqa xmm4, [GLOBAL(s63)]
- movdqa xmm5, xmm0
- movdqa xmm2, xmm5
- paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
- paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
- movdqa xmm4, xmm7
-
- paddw xmm5, xmm5 ; Filter 2 (hi) * 18
-
- paddw xmm7, xmm7 ; Filter 2 (lo) * 18
- paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
-
- paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
- paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
- psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
-
- paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
- psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
- psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
-
- packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
-
- psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
- psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
- psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
-
- packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
- packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
- movdqa xmm7, [GLOBAL(t80)]
-
-%if %1 == 0
- movdqa xmm1, [rsp+_q1] ; q1
- movdqa xmm4, [rsp+_p1] ; p1
- lea rsi, [rsi+rcx*2]
- lea rdi, [rdi+rcx*2]
-
-%elif %1 == 1
- movdqa xmm1, [rdi] ; q1
- movdqa xmm4, [rsi+rax*2] ; p1
-%elif %1 == 2
- movdqa xmm4, [rsp+_p1] ; p1
- movdqa xmm1, [rsp+_q1] ; q1
-%endif
-
- pxor xmm1, xmm7
- pxor xmm4, xmm7
-
- psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
- paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3)
- psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
- paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2)
-
-%if %1 == 1
- movdqa xmm2, [rdi+rax*4] ; p2
- movdqa xmm5, [rdi+rcx] ; q2
-%else
- movdqa xmm2, [rsp+_p2] ; p2
- movdqa xmm5, [rsp+_q2] ; q2
-%endif
-
- pxor xmm1, xmm7 ; *oq1 = sq^0x80;
- pxor xmm4, xmm7 ; *op1 = sp^0x80;
- pxor xmm2, xmm7
- pxor xmm5, xmm7
- paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u)
- psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
- pxor xmm2, xmm7 ; *op2 = sp^0x80;
- pxor xmm5, xmm7 ; *oq2 = sq^0x80;
- pxor xmm3, xmm7 ; *oq0 = sq^0x80
- pxor xmm6, xmm7 ; *oq0 = sp^0x80
-%if %1 == 0
- movq [rsi], xmm6 ; p0
- movhps [rdi], xmm6
- movq [rsi + rcx], xmm3 ; q0
- movhps [rdi + rcx], xmm3
- lea rdx, [rcx + rcx*2]
- movq [rsi+rcx*2], xmm1 ; q1
- movhps [rdi+rcx*2], xmm1
-
- movq [rsi + rax], xmm4 ; p1
- movhps [rdi + rax], xmm4
-
- movq [rsi+rax*2], xmm2 ; p2
- movhps [rdi+rax*2], xmm2
-
- movq [rsi+rdx], xmm5 ; q2
- movhps [rdi+rdx], xmm5
-%elif %1 == 1
- movdqa [rdi+rcx], xmm5 ; q2
- movdqa [rdi], xmm1 ; q1
- movdqa [rsi], xmm3 ; q0
- movdqa [rsi+rax ], xmm6 ; p0
- movdqa [rsi+rax*2], xmm4 ; p1
- movdqa [rdi+rax*4], xmm2 ; p2
-%elif %1 == 2
- movdqa [rsp+_p1], xmm4 ; p1
- movdqa [rsp+_p0], xmm6 ; p0
- movdqa [rsp+_q0], xmm3 ; q0
- movdqa [rsp+_q1], xmm1 ; q1
-%endif
-
-%endmacro
-
-
-;void vp8_mbloop_filter_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
-sym(vp8_mbloop_filter_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step
- mov rdx, arg(3) ;limit
-
- lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 1
- ; filter and write back the results
- MB_FILTER_AND_WRITEBACK 1
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_mbloop_filter_horizontal_edge_uv_sse2
-;(
-; unsigned char *u,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; unsigned char *v
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
-sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ; u
- mov rdi, arg(5) ; v
- movsxd rax, dword ptr arg(1) ; src_pixel_step
- mov rcx, rax
- neg rax ; negate pitch to deal with above border
- mov rdx, arg(3) ;limit
-
- lea rsi, [rsi + rcx]
- lea rdi, [rdi + rcx]
-
- ; calculate breakout conditions and high edge variance
- LFH_FILTER_AND_HEV_MASK 0
- ; filter and write back the results
- MB_FILTER_AND_WRITEBACK 0
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-%macro TRANSPOSE_16X8 2
- movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
- movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
- movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
- movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
- movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
- movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
-
- punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
- movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-
- movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
-
- movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
-
- punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-%if %1
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
-%else
- mov rsi, arg(5) ; v_ptr
-%endif
-
- movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
- punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
- punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
- punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
- punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-
-%if %1 == 0
- lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
- lea rsi, [rsi - 4]
-%endif
-
- movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
- movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
- punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
- punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
- movdqa [rsp+_t0], xmm2 ; save to free XMM2
-
- movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
- movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
- movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
- movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
- movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
-
- punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
- punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
- movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
- punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
-
- movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-
- punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
- movdqa xmm6, xmm1 ;
- punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
- punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
- movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
- punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
- punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- movdqa xmm0, xmm5
- punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
- punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
- movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
- punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
- punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
- movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
- punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
- punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-
-%if %2 == 0
- movdqa [rsp+_q3], xmm7 ; save 7
- movdqa [rsp+_q2], xmm6 ; save 6
-%endif
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- movdqa [rsp+_p1], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- movdqa [rsp+_p0], xmm3 ; save 3
-
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rsp+_q0], xmm4 ; save 4
- movdqa [rsp+_q1], xmm5 ; save 5
- movdqa xmm1, [rsp+_t0]
-
- movdqa xmm2, xmm1 ;
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
-%if %2 == 0
- movdqa [rsp+_p2], xmm1
- movdqa [rsp+_p3], xmm2
-%endif
-
-%endmacro
-
-%macro LFV_FILTER_MASK_HEV_MASK 0
- movdqa xmm0, xmm6 ; q2
- psubusb xmm0, xmm7 ; q2-q3
-
- psubusb xmm7, xmm6 ; q3-q2
- movdqa xmm4, xmm5 ; q1
-
- por xmm7, xmm0 ; abs (q3-q2)
- psubusb xmm4, xmm6 ; q1-q2
-
- movdqa xmm0, xmm1
- psubusb xmm6, xmm5 ; q2-q1
-
- por xmm6, xmm4 ; abs (q2-q1)
- psubusb xmm0, xmm2 ; p2 - p3;
-
- psubusb xmm2, xmm1 ; p3 - p2;
- por xmm0, xmm2 ; abs(p2-p3)
-
- movdqa xmm5, [rsp+_p1] ; p1
- pmaxub xmm0, xmm7
-
- movdqa xmm2, xmm5 ; p1
- psubusb xmm5, xmm1 ; p1-p2
- psubusb xmm1, xmm2 ; p2-p1
-
- movdqa xmm7, xmm3 ; p0
- psubusb xmm7, xmm2 ; p0-p1
-
- por xmm1, xmm5 ; abs(p2-p1)
- pmaxub xmm0, xmm6
-
- pmaxub xmm0, xmm1
- movdqa xmm1, xmm2 ; p1
-
- psubusb xmm2, xmm3 ; p1-p0
-
- por xmm2, xmm7 ; abs(p1-p0)
-
- pmaxub xmm0, xmm2
-
- movdqa xmm5, [rsp+_q0] ; q0
- movdqa xmm7, [rsp+_q1] ; q1
-
- mov rdx, arg(3) ; limit
-
- movdqa xmm6, xmm5 ; q0
- movdqa xmm4, xmm7 ; q1
-
- psubusb xmm5, xmm7 ; q0-q1
- psubusb xmm7, xmm6 ; q1-q0
-
- por xmm7, xmm5 ; abs(q1-q0)
-
- pmaxub xmm0, xmm7
-
- psubusb xmm0, [rdx] ; limit
-
- mov rdx, arg(2) ; blimit
- movdqa xmm5, xmm4 ; q1
-
- psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm4 ; p1-=q1
-
- por xmm5, xmm1 ; abs(p1-q1)
- movdqa xmm1, xmm3 ; p0
-
- pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psubusb xmm1, xmm6 ; p0-q0
-
- movdqa xmm4, [rdx] ; blimit
- mov rdx, arg(4) ; get thresh
-
- psrlw xmm5, 1 ; abs(p1-q1)/2
- psubusb xmm6, xmm3 ; q0-p0
-
- por xmm1, xmm6 ; abs(q0-p0)
- paddusb xmm1, xmm1 ; abs(q0-p0)*2
- movdqa xmm3, [rdx]
-
- paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh
-
- psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh
-
- psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- por xmm1, xmm0 ; mask
- pcmpeqb xmm2, xmm0
-
- pxor xmm0, xmm0
- pcmpeqb xmm4, xmm4
-
- pcmpeqb xmm1, xmm0
- pxor xmm4, xmm2
-%endmacro
-
-%macro BV_TRANSPOSE 0
- ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
- movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
- punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
- punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
- punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
- ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
- ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
- ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
- ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-%endmacro
-
-%macro BV_WRITEBACK 2
- movd [rsi+2], %1
- movd [rsi+4*rax+2], %2
- psrldq %1, 4
- psrldq %2, 4
- movd [rdi+2], %1
- movd [rdi+4*rax+2], %2
- psrldq %1, 4
- psrldq %2, 4
- movd [rsi+2*rax+2], %1
- movd [rsi+2*rcx+2], %2
- psrldq %1, 4
- psrldq %2, 4
- movd [rdi+2*rax+2], %1
- movd [rdi+2*rcx+2], %2
-%endmacro
-
-%if ABI_IS_32BIT
-
-;void vp8_loop_filter_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-;)
-global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
-sym(vp8_loop_filter_vertical_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ; src_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax*2+rax]
-
- ;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8 1, 1
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK
-
- ; start work on filters
- B_FILTER 2
-
- ; transpose and write back - only work on q1, q0, p0, p1
- BV_TRANSPOSE
- ; store 16-line result
-
- lea rdx, [rax]
- neg rdx
-
- BV_WRITEBACK xmm1, xmm5
-
- lea rsi, [rsi+rdx*8]
- lea rdi, [rdi+rdx*8]
- BV_WRITEBACK xmm2, xmm6
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%endif
-
-;void vp8_loop_filter_vertical_edge_uv_sse2
-;(
-; unsigned char *u,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; unsigned char *v
-;)
-global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
-sym(vp8_loop_filter_vertical_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ; u_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax+2*rax]
-
- ;transpose 16x8 to 8x16, and store the 8-line result on stack.
- TRANSPOSE_16X8 0, 1
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK
-
- ; start work on filters
- B_FILTER 2
-
- ; transpose and write back - only work on q1, q0, p0, p1
- BV_TRANSPOSE
-
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
- ; store 16-line result
- BV_WRITEBACK xmm1, xmm5
-
- mov rsi, arg(0) ; u_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- BV_WRITEBACK xmm2, xmm6
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%macro MBV_TRANSPOSE 0
- movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
- punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-
- punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
- movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
- movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
- punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
-
- movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
-
- punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
- movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-
- punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
- punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
-%endmacro
-
-%macro MBV_WRITEBACK_1 0
- movq [rsi], xmm0
- movhps [rdi], xmm0
-
- movq [rsi+2*rax], xmm6
- movhps [rdi+2*rax], xmm6
-
- movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
- punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
-
- movq [rsi+4*rax], xmm0
- movhps [rdi+4*rax], xmm0
-
- movq [rsi+2*rcx], xmm3
- movhps [rdi+2*rcx], xmm3
-
- movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
- punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
-
- movdqa xmm0, xmm7
- punpcklwd xmm0, xmm5 ; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
- punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
-
- movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
- punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
-%endmacro
-
-%macro MBV_WRITEBACK_2 0
- movq [rsi], xmm1
- movhps [rdi], xmm1
-
- movq [rsi+2*rax], xmm5
- movhps [rdi+2*rax], xmm5
-
- movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
- punpckhdq xmm4, xmm7 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
-
- movq [rsi+4*rax], xmm1
- movhps [rdi+4*rax], xmm1
-
- movq [rsi+2*rcx], xmm4
- movhps [rdi+2*rcx], xmm4
-%endmacro
-
-
-;void vp8_mbloop_filter_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-;)
-global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
-sym(vp8_mbloop_filter_vertical_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ; src_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax*2+rax]
-
- ; Transpose
- TRANSPOSE_16X8 1, 0
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK
-
- neg rax
- ; start work on filters
- MB_FILTER_AND_WRITEBACK 2
-
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
-
- ; transpose and write back
- MBV_TRANSPOSE
-
- neg rax
-
- MBV_WRITEBACK_1
-
-
- lea rsi, [rsi+rax*8]
- lea rdi, [rdi+rax*8]
- MBV_WRITEBACK_2
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_mbloop_filter_vertical_edge_uv_sse2
-;(
-; unsigned char *u,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; unsigned char *v
-;)
-global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
-sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, lf_var_size
-
- mov rsi, arg(0) ; u_ptr
- movsxd rax, dword ptr arg(1) ; src_pixel_step
-
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
- lea rcx, [rax+2*rax]
-
- ; Transpose
- TRANSPOSE_16X8 0, 0
-
- ; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK
-
- ; start work on filters
- MB_FILTER_AND_WRITEBACK 2
-
- ; transpose and write back
- MBV_TRANSPOSE
-
- mov rsi, arg(0) ;u_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax]
- MBV_WRITEBACK_1
- mov rsi, arg(5) ;v_ptr
- lea rsi, [rsi - 4]
- lea rdi, [rsi + rax]
- MBV_WRITEBACK_2
-
- add rsp, lf_var_size
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_loop_filter_simple_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-;)
-global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
-sym(vp8_loop_filter_simple_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx
- ; end prolog
-
- mov rcx, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
- movdqa xmm6, [GLOBAL(tfe)]
- lea rdx, [rcx + rax]
- neg rax
-
- ; calculate mask
- movdqa xmm0, [rdx] ; q1
- mov rdx, arg(2) ;blimit
- movdqa xmm1, [rcx+2*rax] ; p1
-
- movdqa xmm2, xmm1
- movdqa xmm3, xmm0
-
- psubusb xmm0, xmm1 ; q1-=p1
- psubusb xmm1, xmm3 ; p1-=q1
- por xmm1, xmm0 ; abs(p1-q1)
- pand xmm1, xmm6 ; set lsb of each byte to zero
- psrlw xmm1, 1 ; abs(p1-q1)/2
-
- movdqa xmm7, XMMWORD PTR [rdx]
-
- movdqa xmm5, [rcx+rax] ; p0
- movdqa xmm4, [rcx] ; q0
- movdqa xmm0, xmm4 ; q0
- movdqa xmm6, xmm5 ; p0
- psubusb xmm5, xmm4 ; p0-=q0
- psubusb xmm4, xmm6 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
-
- movdqa xmm4, [GLOBAL(t80)]
-
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm7, xmm7
- pcmpeqb xmm5, xmm7
-
-
- ; start work on filters
- pxor xmm2, xmm4 ; p1 offset to convert to signed values
- pxor xmm3, xmm4 ; q1 offset to convert to signed values
- psubsb xmm2, xmm3 ; p1 - q1
-
- pxor xmm6, xmm4 ; offset to convert to signed values
- pxor xmm0, xmm4 ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
- pand xmm5, xmm2 ; mask filter values we don't care about
-
- movdqa xmm0, xmm5
- paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4
- paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4
-
- movdqa xmm1, [GLOBAL(te0)]
- movdqa xmm2, [GLOBAL(t1f)]
-
-; pxor xmm7, xmm7
- pcmpgtb xmm7, xmm0 ;save sign
- pand xmm7, xmm1 ;preserve the upper 3 bits
- psrlw xmm0, 3
- pand xmm0, xmm2 ;clear out upper 3 bits
- por xmm0, xmm7 ;add sign
- psubsb xmm3, xmm0 ; q0-= q0sz add
-
- pxor xmm7, xmm7
- pcmpgtb xmm7, xmm5 ;save sign
- pand xmm7, xmm1 ;preserve the upper 3 bits
- psrlw xmm5, 3
- pand xmm5, xmm2 ;clear out upper 3 bits
- por xmm5, xmm7 ;add sign
- paddsb xmm6, xmm5 ; p0+= p0 add
-
- pxor xmm3, xmm4 ; unoffset
- movdqa [rcx], xmm3 ; write back
-
- pxor xmm6, xmm4 ; unoffset
- movdqa [rcx+rax], xmm6 ; write back
-
- ; begin epilog
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_loop_filter_simple_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-;)
-global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
-sym(vp8_loop_filter_simple_vertical_edge_sse2):
- push rbp ; save old base pointer value.
- mov rbp, rsp ; set new base pointer value.
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx ; save callee-saved reg
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi - 2 ]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
- movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
- movd xmm2, [rdi] ; 13 12 11 10
- movd xmm3, [rcx] ; 53 52 51 50
- punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
- punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
-
- movd xmm4, [rsi + rax*2] ; 23 22 21 20
- movd xmm5, [rdx + rax*2] ; 63 62 61 60
- movd xmm6, [rdi + rax*2] ; 33 32 31 30
- movd xmm7, [rcx + rax*2] ; 73 72 71 70
- punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
- punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
-
- punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
- punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- movdqa xmm2, xmm0
- punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- lea rsi, [rsi + rax*8]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm4, [rsi] ; 83 82 81 80
- movd xmm1, [rdx] ; c3 c2 c1 c0
- movd xmm6, [rdi] ; 93 92 91 90
- movd xmm3, [rcx] ; d3 d2 d1 d0
- punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
- punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
-
- movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
- movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
- movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
- movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
- punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
- punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
-
- punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
- punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
- movdqa xmm7, xmm4
- punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
- movdqa xmm6, xmm4
- punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
- punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
-
- punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- mov rdx, arg(2) ;blimit
-
- ; calculate mask
- movdqa xmm6, xmm0 ; p1
- movdqa xmm7, xmm3 ; q1
- psubusb xmm7, xmm0 ; q1-=p1
- psubusb xmm6, xmm3 ; p1-=q1
- por xmm6, xmm7 ; abs(p1-q1)
- pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw xmm6, 1 ; abs(p1-q1)/2
-
- movdqa xmm7, [rdx]
-
- movdqa xmm5, xmm1 ; p0
- movdqa xmm4, xmm2 ; q0
- psubusb xmm5, xmm2 ; p0-=q0
- psubusb xmm4, xmm1 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- movdqa xmm4, [GLOBAL(t80)]
-
- psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm7, xmm7
- pcmpeqb xmm5, xmm7 ; mm5 = mask
-
- ; start work on filters
- movdqa t0, xmm0
- movdqa t1, xmm3
-
- pxor xmm0, xmm4 ; p1 offset to convert to signed values
- pxor xmm3, xmm4 ; q1 offset to convert to signed values
- psubsb xmm0, xmm3 ; p1 - q1
-
- pxor xmm1, xmm4 ; offset to convert to signed values
- pxor xmm2, xmm4 ; offset to convert to signed values
-
- movdqa xmm3, xmm2 ; offseted ; q0
- psubsb xmm2, xmm1 ; q0 - p0
- paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
- paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
- pand xmm5, xmm0 ; mask filter values we don't care about
-
- movdqa xmm0, xmm5
- paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 4
- paddsb xmm0, [GLOBAL(t4)] ; +3 instead of +4
-
- movdqa xmm6, [GLOBAL(te0)]
- movdqa xmm2, [GLOBAL(t1f)]
-
-; pxor xmm7, xmm7
- pcmpgtb xmm7, xmm0 ;save sign
- pand xmm7, xmm6 ;preserve the upper 3 bits
- psrlw xmm0, 3
- pand xmm0, xmm2 ;clear out upper 3 bits
- por xmm0, xmm7 ;add sign
- psubsb xmm3, xmm0 ; q0-= q0sz add
-
- pxor xmm7, xmm7
- pcmpgtb xmm7, xmm5 ;save sign
- pand xmm7, xmm6 ;preserve the upper 3 bits
- psrlw xmm5, 3
- pand xmm5, xmm2 ;clear out upper 3 bits
- por xmm5, xmm7 ;add sign
- paddsb xmm1, xmm5 ; p0+= p0 add
-
- pxor xmm3, xmm4 ; unoffset q0
- pxor xmm1, xmm4 ; unoffset p0
-
- movdqa xmm0, t0 ; p1
- movdqa xmm4, t1 ; q1
-
- ; write out order: xmm0 xmm2 xmm1 xmm3
- lea rdx, [rsi + rax*4]
-
- ; transpose back to write out
- ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa xmm6, xmm0
- punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- movdqa xmm5, xmm3
- punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
- movdqa xmm3, xmm6
- punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
- movd [rsi], xmm6 ; write the second 8-line result
- movd [rdx], xmm3
- psrldq xmm6, 4
- psrldq xmm3, 4
- movd [rdi], xmm6
- movd [rcx], xmm3
- psrldq xmm6, 4
- psrldq xmm3, 4
- movd [rsi + rax*2], xmm6
- movd [rdx + rax*2], xmm3
- psrldq xmm6, 4
- psrldq xmm3, 4
- movd [rdi + rax*2], xmm6
- movd [rcx + rax*2], xmm3
-
- neg rax
- lea rsi, [rsi + rax*8]
- neg rax
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd [rsi], xmm0 ; write the first 8-line result
- movd [rdx], xmm2
- psrldq xmm0, 4
- psrldq xmm2, 4
- movd [rdi], xmm0
- movd [rcx], xmm2
- psrldq xmm0, 4
- psrldq xmm2, 4
- movd [rsi + rax*2], xmm0
- movd [rdx + rax*2], xmm2
- psrldq xmm0, 4
- psrldq xmm2, 4
- movd [rdi + rax*2], xmm0
- movd [rcx + rax*2], xmm2
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-tfe:
- times 16 db 0xfe
-align 16
-t80:
- times 16 db 0x80
-align 16
-t1s:
- times 16 db 0x01
-align 16
-t3:
- times 16 db 0x03
-align 16
-t4:
- times 16 db 0x04
-align 16
-ones:
- times 8 dw 0x0001
-align 16
-s9:
- times 8 dw 0x0900
-align 16
-s63:
- times 8 dw 0x003f
-align 16
-te0:
- times 16 db 0xe0
-align 16
-t1f:
- times 16 db 0x1f
diff --git a/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c b/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c
deleted file mode 100644
index 6586004600..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/loopfilter_x86.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/loopfilter.h"
-
-#define prototype_loopfilter(sym) \
- void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
- const unsigned char *limit, const unsigned char *thresh, int count)
-
-#define prototype_loopfilter_nc(sym) \
- void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
- const unsigned char *limit, const unsigned char *thresh)
-
-#define prototype_simple_loopfilter(sym) \
- void sym(unsigned char *y, int ystride, const unsigned char *blimit)
-
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
-prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
-prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
-prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
-prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
-prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
-
-#if HAVE_SSE2 && ARCH_X86_64
-prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
-prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
-#else
-prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
-prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
-#endif
-prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
-prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
-
-extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
-extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
-
-#if HAVE_MMX
-/* Horizontal MB filtering */
-void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-
-/* Vertical MB Filtering */
-void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-
-/* Horizontal B Filtering */
-void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-
-void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
-{
- vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
- vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
- vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
-}
-
-
-/* Vertical B Filtering */
-void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
- if (u_ptr)
- vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
- if (v_ptr)
- vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-
-void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
-{
- vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
- vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
- vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
-#endif
-
-
-/* Horizontal MB filtering */
-#if HAVE_SSE2
-void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
-
- if (u_ptr)
- vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
-}
-
-
-/* Vertical MB Filtering */
-void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
-
- if (u_ptr)
- vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
-}
-
-
-/* Horizontal B Filtering */
-void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-#if ARCH_X86_64
- vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-#else
- vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
- vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
- vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
-#endif
-
- if (u_ptr)
- vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
-}
-
-
-void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
-{
- vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
- vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
- vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
-}
-
-
-/* Vertical B Filtering */
-void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-#if ARCH_X86_64
- vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-#else
- vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
- vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
- vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
-#endif
-
- if (u_ptr)
- vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
-}
-
-
-void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
-{
- vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
- vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
- vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
-
-#endif
diff --git a/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm b/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm
deleted file mode 100644
index 15e98713c7..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/recon_mmx.asm
+++ /dev/null
@@ -1,274 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void copy_mem8x8_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp8_copy_mem8x8_mmx) PRIVATE
-sym(vp8_copy_mem8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- add rsi, rax
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx*2], mm2
-
-
- lea rdi, [rdi+rcx*2]
- movq mm3, [rsi]
-
- add rdi, rcx
- movq mm4, [rsi+rax]
-
- movq mm5, [rsi+rax*2]
- movq [rdi], mm3
-
- lea rsi, [rsi+rax*2]
- movq [rdi+rcx], mm4
-
- movq [rdi+rcx*2], mm5
- lea rdi, [rdi+rcx*2]
-
- movq mm0, [rsi+rax]
- movq mm1, [rsi+rax*2]
-
- movq [rdi+rcx], mm0
- movq [rdi+rcx*2],mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem8x4_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp8_copy_mem8x4_mmx) PRIVATE
-sym(vp8_copy_mem8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- movq [rdi+rcx], mm1
-
- movq [rdi+rcx*2], mm2
- lea rdi, [rdi+rcx*2]
-
- movq mm3, [rsi+rax]
- movq [rdi+rcx], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem16x16_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp8_copy_mem16x16_mmx) PRIVATE
-sym(vp8_copy_mem16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movsxd rax, dword ptr arg(1) ;src_stride;
-
- mov rdi, arg(2) ;dst;
- movsxd rcx, dword ptr arg(3) ;dst_stride
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm b/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm
deleted file mode 100644
index cb89537f76..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/recon_sse2.asm
+++ /dev/null
@@ -1,116 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void copy_mem16x16_sse2(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp8_copy_mem16x16_sse2) PRIVATE
-sym(vp8_copy_mem16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movdqu xmm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movdqu xmm1, [rsi+rax]
- movdqu xmm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm3, [rsi]
-
- add rdi, rcx
- movdqu xmm4, [rsi+rax]
-
- movdqu xmm5, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm3
- add rsi, rax
-
- movdqa [rdi+rcx], xmm4
- movdqa [rdi+rcx*2],xmm5
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm0, [rsi]
-
- add rdi, rcx
- movdqu xmm1, [rsi+rax]
-
- movdqu xmm2, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
-
- movdqa [rdi+rcx*2], xmm2
- movdqu xmm3, [rsi]
-
- movdqu xmm4, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- add rdi, rcx
- movdqu xmm5, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm3
-
- add rsi, rax
- movdqa [rdi+rcx], xmm4
-
- movdqa [rdi+rcx*2],xmm5
- movdqu xmm0, [rsi]
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm1, [rsi+rax]
-
- add rdi, rcx
- movdqu xmm2, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm0
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- movdqu xmm3, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- movdqa [rdi+rcx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm
deleted file mode 100644
index 47dd452297..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/subpixel_mmx.asm
+++ /dev/null
@@ -1,702 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp8_filter_weight 128
-%define VP8_FILTER_SHIFT 7
-
-
-;void vp8_filter_block1d_h6_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp8_filter
-;)
-global sym(vp8_filter_block1d_h6_mmx) PRIVATE
-sym(vp8_filter_block1d_h6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp8_filter
-
- movq mm1, [rdx + 16] ; do both the negative taps first!!!
- movq mm2, [rdx + 32] ;
- movq mm6, [rdx + 48] ;
- movq mm7, [rdx + 64] ;
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
- movq mm3, [rsi-2] ; mm3 = p-2..p5
- movq mm4, mm3 ; mm4 = p-2..p5
- psrlq mm3, 8 ; mm3 = p-1..p5
- punpcklbw mm3, mm0 ; mm3 = p-1..p2
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
- movq mm5, mm4 ; mm5 = p-2..p5
- punpckhbw mm4, mm0 ; mm5 = p2..p5
- pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- movq mm4, mm5 ; mm4 = p-2..p5;
- psrlq mm5, 16 ; mm5 = p0..p5;
- punpcklbw mm5, mm0 ; mm5 = p0..p3
- pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- movq mm5, mm4 ; mm5 = p-2..p5
- psrlq mm4, 24 ; mm4 = p1..p5
- punpcklbw mm4, mm0 ; mm4 = p1..p4
- pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- ; do outer positive taps
- movd mm4, [rsi+3]
- punpcklbw mm4, mm0 ; mm5 = p3..p6
- pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and unpack to saturate
- punpcklbw mm3, mm0 ;
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
- add rdi, rax;
-%else
- movsxd r8, dword ptr arg(2) ;src_pixels_per_line
- add rdi, rax;
-
- add rsi, r8 ; next line
-%endif
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_filter_block1dc_v6_mmx
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-; int output_pitch,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp8_filter
-;)
-global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
-sym(vp8_filter_block1dc_v6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movq mm5, [GLOBAL(rd)]
- push rbx
- mov rbx, arg(7) ;vp8_filter
- movq mm1, [rbx + 16] ; do both the negative taps first!!!
- movq mm2, [rbx + 32] ;
- movq mm6, [rbx + 48] ;
- movq mm7, [rbx + 64] ;
-
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- sub rsi, rdx
- sub rsi, rdx
- movsxd rcx, DWORD PTR arg(5) ;output_height
- movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-
-.nextrow_cv:
- movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
- pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
- pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi] ; mm4 = p0..p3 = row -2
- pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
- pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
- pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- paddsw mm3, mm5 ; mm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
-
- movd [rdi],mm3 ; store the results in the destination
- ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
- ; recon block should be in cache this shouldn't cost much. Its obviously
- ; avoidable!!!.
- lea rdi, [rdi+rax] ;
- dec rcx ; decrement count
- jnz .nextrow_cv ; next row
-
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void bilinear_predict8x8_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
-sym(vp8_bilinear_predict8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- shl rax, 5 ; offset * 32
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
-
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
-
- shl rax, 5 ; offset*32
- add rax, rcx ; VFilter
-
- lea rcx, [rdi+rdx*8] ;
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
-
-
- ; get the first horizontal line done ;
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
- add rsi, rdx ; next line
-.next_row_8x8:
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- movq mm5, mm7 ;
- movq mm6, mm7 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0
-
- pmullw mm5, [rax] ;
- pmullw mm6, [rax] ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
-
- pmullw mm3, [rax+16] ;
- pmullw mm4, [rax+16] ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- packuswb mm3, mm4
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch
- add rsi, rdx ; next line
- add rdi, r8 ;dst_pitch
-%endif
- cmp rdi, rcx ;
- jne .next_row_8x8
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void bilinear_predict8x4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
-sym(vp8_bilinear_predict8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
- shl rax, 5
-
- mov rsi, arg(0) ;src_ptr ;
- add rax, rcx
-
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
- shl rax, 5
-
- add rax, rcx
- lea rcx, [rdi+rdx*4] ;
-
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
- ; get the first horizontal line done ;
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
- add rsi, rdx ; next line
-.next_row_8x4:
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- movq mm5, mm7 ;
- movq mm6, mm7 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0
-
- pmullw mm5, [rax] ;
- pmullw mm6, [rax] ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
-
- pmullw mm3, [rax+16] ;
- pmullw mm4, [rax+16] ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- packuswb mm3, mm4
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch
- add rsi, rdx ; next line
- add rdi, r8
-%endif
- cmp rdi, rcx ;
- jne .next_row_8x4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void bilinear_predict4x4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
-sym(vp8_bilinear_predict4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
- shl rax, 5
-
- add rax, rcx ; HFilter
- mov rsi, arg(0) ;src_ptr ;
-
- movsxd rdx, dword ptr arg(5) ;ldst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
- shl rax, 5
-
- add rax, rcx
- lea rcx, [rdi+rdx*4] ;
-
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
- ; get the first horizontal line done ;
- movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
-
- pmullw mm3, mm1 ;
- movd mm5, [rsi+1] ;
-
- punpcklbw mm5, mm0 ;
- pmullw mm5, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
-
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- movq mm7, mm3 ;
- packuswb mm7, mm0 ;
-
- add rsi, rdx ; next line
-.next_row_4x4:
- movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
-
- pmullw mm3, mm1 ;
- movd mm5, [rsi+1] ;
-
- punpcklbw mm5, mm0 ;
- pmullw mm5, mm2 ;
-
- paddw mm3, mm5 ;
-
- movq mm5, mm7 ;
- punpcklbw mm5, mm0 ;
-
- pmullw mm5, [rax] ;
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
-
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
- movq mm7, mm3 ;
-
- packuswb mm7, mm0 ;
-
- pmullw mm3, [rax+16] ;
- paddw mm3, mm5 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb mm3, mm0
- movd [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch ;
- add rsi, rdx ; next line
- add rdi, r8
-%endif
-
- cmp rdi, rcx ;
- jne .next_row_4x4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-SECTION_RODATA
-align 16
-rd:
- times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp8_six_tap_mmx))
-sym(vp8_six_tap_mmx):
- times 8 dw 0
- times 8 dw 0
- times 8 dw 128
- times 8 dw 0
- times 8 dw 0
- times 8 dw 0
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 123
- times 8 dw 12
- times 8 dw -1
- times 8 dw 0
-
- times 8 dw 2
- times 8 dw -11
- times 8 dw 108
- times 8 dw 36
- times 8 dw -8
- times 8 dw 1
-
- times 8 dw 0
- times 8 dw -9
- times 8 dw 93
- times 8 dw 50
- times 8 dw -6
- times 8 dw 0
-
- times 8 dw 3
- times 8 dw -16
- times 8 dw 77
- times 8 dw 77
- times 8 dw -16
- times 8 dw 3
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 50
- times 8 dw 93
- times 8 dw -9
- times 8 dw 0
-
- times 8 dw 1
- times 8 dw -8
- times 8 dw 36
- times 8 dw 108
- times 8 dw -11
- times 8 dw 2
-
- times 8 dw 0
- times 8 dw -1
- times 8 dw 12
- times 8 dw 123
- times 8 dw -6
- times 8 dw 0
-
-
diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm
deleted file mode 100644
index 69f8d103c1..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP8_FILTER_WEIGHT 128
-%define VP8_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-;void vp8_filter_block1d8_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp8_filter
-;)
-global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
-sym(vp8_filter_block1d8_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp8_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_filter_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp8_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
-sym(vp8_filter_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp8_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi+16], xmm4
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_filter_block1d8_v6_sse2
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp8_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
-sym(vp8_filter_block1d8_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp8_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp8_filter_block1d8_v6_sse2_loop:
- movdqa xmm1, XMMWORD PTR [rsi]
- pmullw xmm1, [rax]
-
- movdqa xmm2, XMMWORD PTR [rsi + rdx]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
- pmullw xmm3, [rax + 32]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
- pmullw xmm5, [rax + 64]
-
- add rsi, rdx
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
-
- pmullw xmm4, [rax + 48]
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
-
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_filter_block1d16_v6_sse2
-;(
-; unsigned short *src_ptr,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; const short *vp8_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
-sym(vp8_filter_block1d16_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp8_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp8_filter_block1d16_v6_sse2_loop:
-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
- movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
- movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
- pmullw xmm1, [rax + 16]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm3, [rax + 64]
- pmullw xmm4, [rax + 64]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm5, [rax + 32]
- pmullw xmm6, [rax + 32]
-
- movdqa xmm7, XMMWORD PTR [rsi] ; line 1
- movdqa xmm0, XMMWORD PTR [rsi + 16]
- pmullw xmm7, [rax]
- pmullw xmm0, [rax]
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
- paddsw xmm1, xmm7
- paddsw xmm2, xmm0
-
- add rsi, rdx
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm3, [rax + 48]
- pmullw xmm4, [rax + 48]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm5, [rax + 80]
- pmullw xmm6, [rax + 80]
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
- pxor xmm0, xmm0 ; clear xmm0
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
-
- paddsw xmm1, xmm7
- paddsw xmm2, xmm7
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packuswb xmm1, xmm2 ; pack and saturate
- movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_filter_block1d8_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp8_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
-sym(vp8_filter_block1d8_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp8_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
-
- movq QWORD PTR [rdi], xmm4 ; store the results in the destination
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_only_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_filter_block1d16_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp8_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
-sym(vp8_filter_block1d16_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp8_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; lower 8 bytes
-
- movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; higher 8 bytes
-
- movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_filter_block1d8_v6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp8_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
-sym(vp8_filter_block1d8_v6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- mov rax, arg(5) ;vp8_filter
-
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ; dst_ptich
-%endif
-
-.vp8_filter_block1d8_v6_only_sse2_loop:
- movq xmm1, MMWORD PTR [rsi]
- movq xmm2, MMWORD PTR [rsi + rdx]
- movq xmm3, MMWORD PTR [rsi + rdx * 2]
- movq xmm5, MMWORD PTR [rsi + rdx * 4]
- add rsi, rdx
- movq xmm4, MMWORD PTR [rsi + rdx * 2]
- movq xmm6, MMWORD PTR [rsi + rdx * 4]
-
- punpcklbw xmm1, xmm0
- pmullw xmm1, [rax]
-
- punpcklbw xmm2, xmm0
- pmullw xmm2, [rax + 16]
-
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax + 32]
-
- punpcklbw xmm5, xmm0
- pmullw xmm5, [rax + 64]
-
- punpcklbw xmm4, xmm0
- pmullw xmm4, [rax + 48]
-
- punpcklbw xmm6, xmm0
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_unpack_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int output_height,
-; unsigned int output_width
-;)
-global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
-sym(vp8_unpack_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(3) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
- movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
- movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- punpcklbw xmm1, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm1
- movdqa XMMWORD Ptr [rdi + 16], xmm3
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(4) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .unpack_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_bilinear_predict16x16_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp8_bilinear_filters_x86_8)
-global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
-sym(vp8_bilinear_predict16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
-
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
- movsxd rax, dword ptr arg(2) ;xoffset
-
- cmp rax, 0 ;skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
-
- cmp rax, 0 ;skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;dst_pitch
-%endif
- ; get the first horizontal line done
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- add rsi, rdx ; next line
-.next_row:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, [rax]
- pmullw xmm6, [rax]
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- pmullw xmm3, [rax+16]
- pmullw xmm4, [rax+16]
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rdx ; next line
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ;dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- add rsi, rax ; next line
-.next_row_spo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- movdqa xmm4, xmm3 ; make a copy of current line
- movdqa xmm7, xmm3
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm5, xmm1
- pmullw xmm6, xmm1
- pmullw xmm3, xmm2
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ;dst_pitch
- cmp rdi, rcx
- jne .next_row_spo
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.next_row_fpo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ; dst_pitch
- cmp rdi, rcx
- jne .next_row_fpo
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_bilinear_predict8x8_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
-sym(vp8_bilinear_predict8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read 9-line unaligned data in and put them on stack. This gives a big
- ;performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ;xoffset
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm5, [rax]
- movdqa xmm6, [rax+16]
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqa xmm3, XMMWORD PTR [rsp]
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- add rsp, 16 ; next line
-.next_row8x8:
- movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
- pmullw xmm7, xmm5
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm4, xmm3
-
- pmullw xmm3, xmm6
- paddw xmm3, xmm7
-
- movdqa xmm7, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb xmm3, xmm0
- movq [rdi], xmm3 ; store the results in the destination
-
- add rsp, 16 ; next line
- add rdi, rdx
-
- cmp rdi, rcx
- jne .next_row8x8
-
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-rd:
- times 8 dw 0x40
diff --git a/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm b/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm
deleted file mode 100644
index c06f24556e..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/subpixel_ssse3.asm
+++ /dev/null
@@ -1,1508 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP8_FILTER_WEIGHT 128
-%define VP8_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-;void vp8_filter_block1d8_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
-sym(vp8_filter_block1d8_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4
-
- movdqa xmm7, [GLOBAL(rd)]
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- mov rdi, arg(2) ;output_ptr
-
- cmp esi, DWORD PTR [rax]
- je vp8_filter_block1d8_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- pmaddubsw xmm1, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
- jnz .filter_block1d8_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp8_filter_block1d8_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
- movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
- mov rsi, arg(0) ;src_ptr
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm2, xmm0
- pshufb xmm0, xmm3
-
- pshufb xmm2, xmm4
- pmaddubsw xmm0, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
-
- jnz .filter_block1d8_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp8_filter_block1d16_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
-sym(vp8_filter_block1d16_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- mov rdi, arg(2) ;output_ptr
-
- mov rsi, arg(0) ;src_ptr
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- movq xmm3, MMWORD PTR [rsi + 6]
-
- pmaddubsw xmm1, xmm5
- movq xmm7, MMWORD PTR [rsi + 11]
-
- pmaddubsw xmm2, xmm6
- punpcklbw xmm3, xmm7
-
- paddsw xmm0, xmm1
- movdqa xmm1, xmm3
-
- pmaddubsw xmm3, xmm4
- paddsw xmm0, xmm2
-
- movdqa xmm2, xmm1
- paddsw xmm0, [GLOBAL(rd)]
-
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
-
- psraw xmm0, 7
- pmaddubsw xmm1, xmm5
-
- pmaddubsw xmm2, xmm6
- packuswb xmm0, xmm0
-
- lea rsi, [rsi + rax]
- paddsw xmm3, xmm1
-
- paddsw xmm3, xmm2
-
- paddsw xmm3, [GLOBAL(rd)]
-
- psraw xmm3, 7
-
- packuswb xmm3, xmm3
-
- punpcklqdq xmm0, xmm3
-
- movdqa XMMWORD Ptr [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .filter_block1d16_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_filter_block1d4_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
-sym(vp8_filter_block1d4_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- movdqa xmm7, [GLOBAL(rd)]
-
- cmp esi, DWORD PTR [rax]
- je .vp8_filter_block1d4_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf1b)]
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2b)]
- pmaddubsw xmm0, xmm4
- pshufb xmm2, [GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm0, xmm1
- paddsw xmm0, xmm7
- pxor xmm1, xmm1
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movd DWORD PTR [rdi], xmm0
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp8_filter_block1d4_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
- movdqu xmm1, XMMWORD PTR [rsi - 2]
-
- movdqa xmm2, xmm1
- pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
- pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm1, xmm7
- paddsw xmm1, xmm2
- psraw xmm1, 7
- packuswb xmm1, xmm1
-
- movd DWORD PTR [rdi], xmm1
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void vp8_filter_block1d16_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
-sym(vp8_filter_block1d16_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- cmp esi, DWORD PTR [rax]
- je .vp8_filter_block1d16_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-
-.vp8_filter_block1d16_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2 ;store the results
-
- movq xmm1, MMWORD PTR [rsi + 8] ;A
- movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp8_filter_block1d16_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp8_filter_block1d16_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-.vp8_filter_block1d16_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- paddsw xmm2, [GLOBAL(rd)]
- paddsw xmm2, xmm3
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- punpcklbw xmm5, xmm4 ;B D
- punpcklbw xmm1, xmm0 ;C E
-
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm5, xmm7
-
- movdqa xmm4, [GLOBAL(rd)]
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm5, xmm1
- paddsw xmm5, xmm4
- psraw xmm5, 7
- packuswb xmm5, xmm5
-
- punpcklqdq xmm2, xmm5
-
- movdqa XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp8_filter_block1d16_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_filter_block1d8_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
-sym(vp8_filter_block1d8_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp8_filter_block1d8_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp8_filter_block1d8_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
- movdqa xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp8_filter_block1d8_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp8_filter_block1d8_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm5, [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp8_filter_block1d8_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm5
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp8_filter_block1d8_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp8_filter_block1d4_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp8_filter_index
-;)
-global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
-sym(vp8_filter_block1d4_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp8_filter_block1d4_v4_ssse3
-
- movq mm5, MMWORD PTR [rax] ;k0_k5
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp8_filter_block1d4_v6_ssse3_loop:
- movd mm1, DWORD PTR [rsi] ;A
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- movd mm0, DWORD PTR [rax + rdx * 4] ;F
-
- movq mm4, [GLOBAL(rd)]
-
- pmaddubsw mm3, mm6
- punpcklbw mm1, mm0 ;A F
- pmaddubsw mm2, mm7
- pmaddubsw mm1, mm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm1
- paddsw mm2, mm4
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp8_filter_block1d4_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp8_filter_block1d4_v4_ssse3:
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
- movq mm5, MMWORD PTR [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp8_filter_block1d4_v4_ssse3_loop:
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- pmaddubsw mm3, mm6
- pmaddubsw mm2, mm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm5
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp8_filter_block1d4_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_bilinear_predict16x16_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
-sym(vp8_bilinear_predict16x16_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(2) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
-
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
-
- movdqa xmm2, [rax]
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ; dst_pitch
-%endif
- movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
-
- punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
- pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm6, xmm5
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm6, xmm1
-
- punpcklbw xmm4, xmm5
- pmaddubsw xmm4, xmm1
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
-
- packuswb xmm6, xmm4
- movdqa xmm5, xmm7
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm2
-
- punpckhbw xmm7, xmm6
- pmaddubsw xmm7, xmm2
-
- paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
- psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm5, xmm7
- movdqa xmm7, xmm6
-
- movdqa [rdi], xmm5 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ; dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
- ; get the first horizontal line done
- movq xmm4, [rsi] ; load row 0
- movq xmm2, [rsi + 8] ; load row 0
-
- lea rsi, [rsi + rax] ; next line
-.next_row_sp:
- movq xmm3, [rsi] ; load row + 1
- movq xmm5, [rsi + 8] ; load row + 1
-
- punpcklbw xmm4, xmm3
- punpcklbw xmm2, xmm5
-
- pmaddubsw xmm4, xmm1
- movq xmm7, [rsi + rax] ; load row + 2
-
- pmaddubsw xmm2, xmm1
- movq xmm6, [rsi + rax + 8] ; load row + 2
-
- punpcklbw xmm3, xmm7
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm3, xmm1
- paddw xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm5, xmm1
- paddw xmm2, [GLOBAL(rd)]
-
- psraw xmm4, VP8_FILTER_SHIFT
- psraw xmm2, VP8_FILTER_SHIFT
-
- packuswb xmm4, xmm2
- paddw xmm3, [GLOBAL(rd)]
-
- movdqa [rdi], xmm4 ; store row 0
- paddw xmm5, [GLOBAL(rd)]
-
- psraw xmm3, VP8_FILTER_SHIFT
- psraw xmm5, VP8_FILTER_SHIFT
-
- packuswb xmm3, xmm5
- movdqa xmm4, xmm7
-
- movdqa [rdi + rdx],xmm3 ; store row 1
- lea rsi, [rsi + 2*rax]
-
- movdqa xmm2, xmm6
- lea rdi, [rdi + 2*rdx]
-
- cmp rdi, rcx
- jne .next_row_sp
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
-.next_row_fp:
- movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm2, xmm4
- movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- pmaddubsw xmm2, xmm1
- movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rax] ; next line
- punpcklbw xmm3, xmm4
-
- pmaddubsw xmm3, xmm1
- movq xmm5, [rsi]
-
- paddw xmm2, [GLOBAL(rd)]
- movq xmm7, [rsi+1]
-
- movq xmm6, [rsi+8]
- psraw xmm2, VP8_FILTER_SHIFT
-
- punpcklbw xmm5, xmm7
- movq xmm7, [rsi+9]
-
- paddw xmm3, [GLOBAL(rd)]
- pmaddubsw xmm5, xmm1
-
- psraw xmm3, VP8_FILTER_SHIFT
- punpcklbw xmm6, xmm7
-
- packuswb xmm2, xmm3
- pmaddubsw xmm6, xmm1
-
- movdqa [rdi], xmm2 ; store the results in the destination
- paddw xmm5, [GLOBAL(rd)]
-
- lea rdi, [rdi + rdx] ; dst_pitch
- psraw xmm5, VP8_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm6, VP8_FILTER_SHIFT
-
- packuswb xmm5, xmm6
- lea rsi, [rsi + rax] ; next line
-
- movdqa [rdi], xmm5 ; store the results in the destination
- lea rdi, [rdi + rdx] ; dst_pitch
-
- cmp rdi, rcx
-
- jne .next_row_fp
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_bilinear_predict8x8_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
-sym(vp8_bilinear_predict8x8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read 9-line unaligned data in and put them on stack. This gives a big
- ;performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ; xoffset
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b8x8_sp_only
-
- shl rax, 4
- add rax, rcx ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b8x8_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm1, [rax]
-
- ; get the first horizontal line done
- movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-
- psrldq xmm5, 1
- lea rsp, [rsp + 16] ; next line
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- lea rsp, [rsp + 16] ; next line
-
- movdqa xmm5, xmm6
-
- psrldq xmm5, 1
-
- punpcklbw xmm6, xmm5
- pmaddubsw xmm6, xmm0
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
-
- packuswb xmm6, xmm6
-
- punpcklbw xmm7, xmm6
- pmaddubsw xmm7, xmm1
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm7, xmm7
-
- movq [rdi], xmm7 ; store the results in the destination
- lea rdi, [rdi + rdx]
-
- movdqa xmm7, xmm6
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done8x8
-
-.b8x8_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax] ; VFilter
-
- movq xmm1, XMMWORD PTR [rsp]
- movq xmm2, XMMWORD PTR [rsp+16]
-
- movq xmm3, XMMWORD PTR [rsp+32]
- punpcklbw xmm1, xmm2
-
- movq xmm4, XMMWORD PTR [rsp+48]
- punpcklbw xmm2, xmm3
-
- movq xmm5, XMMWORD PTR [rsp+64]
- punpcklbw xmm3, xmm4
-
- movq xmm6, XMMWORD PTR [rsp+80]
- punpcklbw xmm4, xmm5
-
- movq xmm7, XMMWORD PTR [rsp+96]
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm1, xmm0
- pmaddubsw xmm2, xmm0
-
- pmaddubsw xmm3, xmm0
- pmaddubsw xmm4, xmm0
-
- pmaddubsw xmm5, xmm0
- punpcklbw xmm6, xmm7
-
- pmaddubsw xmm6, xmm0
- paddw xmm1, [GLOBAL(rd)]
-
- paddw xmm2, [GLOBAL(rd)]
- psraw xmm1, VP8_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm2, VP8_FILTER_SHIFT
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm3, VP8_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm5, VP8_FILTER_SHIFT
-
- psraw xmm6, VP8_FILTER_SHIFT
- packuswb xmm1, xmm1
-
- packuswb xmm2, xmm2
- movq [rdi], xmm1
-
- packuswb xmm3, xmm3
- movq [rdi+rdx], xmm2
-
- packuswb xmm4, xmm4
- movq xmm1, XMMWORD PTR [rsp+112]
-
- lea rdi, [rdi + 2*rdx]
- movq xmm2, XMMWORD PTR [rsp+128]
-
- packuswb xmm5, xmm5
- movq [rdi], xmm3
-
- packuswb xmm6, xmm6
- movq [rdi+rdx], xmm4
-
- lea rdi, [rdi + 2*rdx]
- punpcklbw xmm7, xmm1
-
- movq [rdi], xmm5
- pmaddubsw xmm7, xmm0
-
- movq [rdi+rdx], xmm6
- punpcklbw xmm1, xmm2
-
- pmaddubsw xmm1, xmm0
- paddw xmm7, [GLOBAL(rd)]
-
- psraw xmm7, VP8_FILTER_SHIFT
- paddw xmm1, [GLOBAL(rd)]
-
- psraw xmm1, VP8_FILTER_SHIFT
- packuswb xmm7, xmm7
-
- packuswb xmm1, xmm1
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm7
-
- movq [rdi+rdx], xmm1
- lea rsp, [rsp + 144]
-
- jmp .done8x8
-
-.b8x8_fp_only:
- lea rcx, [rdi+rdx*8]
-
-.next_row_fp:
- movdqa xmm1, XMMWORD PTR [rsp]
- movdqa xmm3, XMMWORD PTR [rsp+16]
-
- movdqa xmm2, xmm1
- movdqa xmm5, XMMWORD PTR [rsp+32]
-
- psrldq xmm2, 1
- movdqa xmm7, XMMWORD PTR [rsp+48]
-
- movdqa xmm4, xmm3
- psrldq xmm4, 1
-
- movdqa xmm6, xmm5
- psrldq xmm6, 1
-
- punpcklbw xmm1, xmm2
- pmaddubsw xmm1, xmm0
-
- punpcklbw xmm3, xmm4
- pmaddubsw xmm3, xmm0
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm0
-
- movdqa xmm2, xmm7
- psrldq xmm2, 1
-
- punpcklbw xmm7, xmm2
- pmaddubsw xmm7, xmm0
-
- paddw xmm1, [GLOBAL(rd)]
- psraw xmm1, VP8_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm3, VP8_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm5, VP8_FILTER_SHIFT
-
- paddw xmm7, [GLOBAL(rd)]
- psraw xmm7, VP8_FILTER_SHIFT
-
- packuswb xmm1, xmm1
- packuswb xmm3, xmm3
-
- packuswb xmm5, xmm5
- movq [rdi], xmm1
-
- packuswb xmm7, xmm7
- movq [rdi+rdx], xmm3
-
- lea rdi, [rdi + 2*rdx]
- movq [rdi], xmm5
-
- lea rsp, [rsp + 4*16]
- movq [rdi+rdx], xmm7
-
- lea rdi, [rdi + 2*rdx]
- cmp rdi, rcx
-
- jne .next_row_fp
-
- lea rsp, [rsp + 16]
-
-.done8x8:
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-shuf1b:
- db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
- db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
- db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
- db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
-align 16
-shuf3bfrom1:
- db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
-
-align 16
-rd:
- times 8 dw 0x40
-
-align 16
-k0_k5:
- times 8 db 0, 0 ;placeholder
- times 8 db 0, 0
- times 8 db 2, 1
- times 8 db 0, 0
- times 8 db 3, 3
- times 8 db 0, 0
- times 8 db 1, 2
- times 8 db 0, 0
-k1_k3:
- times 8 db 0, 0 ;placeholder
- times 8 db -6, 12
- times 8 db -11, 36
- times 8 db -9, 50
- times 8 db -16, 77
- times 8 db -6, 93
- times 8 db -8, 108
- times 8 db -1, 123
-k2_k4:
- times 8 db 128, 0 ;placeholder
- times 8 db 123, -1
- times 8 db 108, -8
- times 8 db 93, -6
- times 8 db 77, -16
- times 8 db 50, -9
- times 8 db 36, -11
- times 8 db 12, -6
-align 16
-vp8_bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 112, 16
- times 8 db 96, 32
- times 8 db 80, 48
- times 8 db 64, 64
- times 8 db 48, 80
- times 8 db 32, 96
- times 8 db 16, 112
-
diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c b/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c
deleted file mode 100644
index fb0b57eb1c..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/vp8_asm_stubs.c
+++ /dev/null
@@ -1,625 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "filter_x86.h"
-
-extern const short vp8_six_tap_mmx[8][6*8];
-
-extern void vp8_filter_block1d_h6_mmx
-(
- unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-);
-extern void vp8_filter_block1dc_v6_mmx
-(
- unsigned short *src_ptr,
- unsigned char *output_ptr,
- int output_pitch,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-);
-extern void vp8_filter_block1d8_h6_sse2
-(
- unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-);
-extern void vp8_filter_block1d16_h6_sse2
-(
- unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-);
-extern void vp8_filter_block1d8_v6_sse2
-(
- unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-);
-extern void vp8_filter_block1d16_v6_sse2
-(
- unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp8_filter
-);
-extern void vp8_unpack_block1d16_h6_sse2
-(
- unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_height,
- unsigned int output_width
-);
-extern void vp8_filter_block1d8_h6_only_sse2
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int output_height,
- const short *vp8_filter
-);
-extern void vp8_filter_block1d16_h6_only_sse2
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int output_height,
- const short *vp8_filter
-);
-extern void vp8_filter_block1d8_v6_only_sse2
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int output_height,
- const short *vp8_filter
-);
-
-
-#if HAVE_MMX
-void vp8_sixtap_predict4x4_mmx
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED(16, unsigned short, FData2[16*16]); /* Temp data bufffer used in filtering */
- const short *HFilter, *VFilter;
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
-
-}
-
-
-void vp8_sixtap_predict16x16_mmx
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
-
- DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */
-
- const short *HFilter, *VFilter;
-
-
- HFilter = vp8_six_tap_mmx[xoffset];
-
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
-
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
- vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
- vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
- vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
-
-}
-
-
-void vp8_sixtap_predict8x8_mmx
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
-
- DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
-
- const short *HFilter, *VFilter;
-
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
-
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter);
- vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
-
-}
-
-
-void vp8_sixtap_predict8x4_mmx
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
-
- DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
-
- const short *HFilter, *VFilter;
-
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
- vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
-
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter);
- vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
-
-}
-
-
-
-void vp8_bilinear_predict16x16_mmx
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);
- vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
- vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);
- vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
-}
-#endif
-
-
-#if HAVE_SSE2
-void vp8_sixtap_predict16x16_sse2
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-
-)
-{
- DECLARE_ALIGNED(16, unsigned short, FData2[24*24]); /* Temp data bufffer used in filtering */
-
- const short *HFilter, *VFilter;
-
- if (xoffset)
- {
- if (yoffset)
- {
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
- }
- else
- {
- /* First-pass only */
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
- }
- }
- else
- {
- /* Second-pass only */
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
- vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
- }
-}
-
-
-void vp8_sixtap_predict8x8_sse2
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
- const short *HFilter, *VFilter;
-
- if (xoffset)
- {
- if (yoffset)
- {
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
- }
- else
- {
- /* First-pass only */
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
- }
- }
- else
- {
- /* Second-pass only */
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
- }
-}
-
-
-void vp8_sixtap_predict8x4_sse2
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED(16, unsigned short, FData2[256]); /* Temp data bufffer used in filtering */
- const short *HFilter, *VFilter;
-
- if (xoffset)
- {
- if (yoffset)
- {
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
- }
- else
- {
- /* First-pass only */
- HFilter = vp8_six_tap_mmx[xoffset];
- vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
- }
- }
- else
- {
- /* Second-pass only */
- VFilter = vp8_six_tap_mmx[yoffset];
- vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
- }
-}
-
-#endif
-
-#if HAVE_SSSE3
-
-extern void vp8_filter_block1d8_h6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d16_h6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d16_v6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d8_v6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d4_h6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-extern void vp8_filter_block1d4_v6_ssse3
-(
- unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp8_filter_index
-);
-
-void vp8_sixtap_predict16x16_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-
-)
-{
- DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, FData2,
- 16, 21, xoffset);
- vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
- 16, yoffset);
- }
- else
- {
- /* First-pass only */
- vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, xoffset);
- }
- }
- else
- {
- if (yoffset)
- {
- /* Second-pass only */
- vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 16, yoffset);
- }
- else
- {
- /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
- * yoffset==0) case correctly. Add copy function here to guarantee
- * six-tap function handles all possible offsets. */
- vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
- }
- }
-}
-
-void vp8_sixtap_predict8x8_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED(16, unsigned char, FData2[256]);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, FData2,
- 8, 13, xoffset);
- vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
- 8, yoffset);
- }
- else
- {
- vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, xoffset);
- }
- }
- else
- {
- if (yoffset)
- {
- /* Second-pass only */
- vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, yoffset);
- }
- else
- {
- /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
- * yoffset==0) case correctly. Add copy function here to guarantee
- * six-tap function handles all possible offsets. */
- vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
- }
- }
-}
-
-
-void vp8_sixtap_predict8x4_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED(16, unsigned char, FData2[256]);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, FData2,
- 8, 9, xoffset);
- vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
- 4, yoffset);
- }
- else
- {
- /* First-pass only */
- vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- }
- else
- {
- if (yoffset)
- {
- /* Second-pass only */
- vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
- else
- {
- /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
- * yoffset==0) case correctly. Add copy function here to guarantee
- * six-tap function handles all possible offsets. */
- vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
- }
- }
-}
-
-void vp8_sixtap_predict4x4_ssse3
-(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch
-)
-{
- DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);
-
- if (xoffset)
- {
- if (yoffset)
- {
- vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- FData2, 4, 9, xoffset);
- vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
- 4, yoffset);
- }
- else
- {
- vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- }
- else
- {
- if (yoffset)
- {
- vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
- else
- {
- /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
- * yoffset==0) case correctly. Add copy function here to guarantee
- * six-tap function handles all possible offsets. */
- int r;
-
- for (r = 0; r < 4; r++)
- {
- dst_ptr[0] = src_ptr[0];
- dst_ptr[1] = src_ptr[1];
- dst_ptr[2] = src_ptr[2];
- dst_ptr[3] = src_ptr[3];
- dst_ptr += dst_pitch;
- src_ptr += src_pixels_per_line;
- }
- }
- }
-}
-
-#endif
diff --git a/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm b/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
deleted file mode 100644
index 88a07b9f3f..0000000000
--- a/thirdparty/libvpx/vp8/common/x86/vp8_loopfilter_mmx.asm
+++ /dev/null
@@ -1,1753 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vp8_loop_filter_horizontal_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
-sym(vp8_loop_filter_horizontal_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- movsxd rcx, dword ptr arg(5) ;count
-.next8_h:
- mov rdx, arg(3) ;limit
- movq mm7, [rdx]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
- ; calculate breakout conditions
- movq mm2, [rdi+2*rax] ; q3
- movq mm1, [rsi+2*rax] ; q2
- movq mm6, mm1 ; q2
- psubusb mm1, mm2 ; q2-=q3
- psubusb mm2, mm6 ; q3-=q2
- por mm1, mm2 ; abs(q3-q2)
- psubusb mm1, mm7 ;
-
-
- movq mm4, [rsi+rax] ; q1
- movq mm3, mm4 ; q1
- psubusb mm4, mm6 ; q1-=q2
- psubusb mm6, mm3 ; q2-=q1
- por mm4, mm6 ; abs(q2-q1)
-
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- psubusb mm4, mm3 ; q0-=q1
- psubusb mm3, mm0 ; q1-=q0
- por mm4, mm3 ; abs(q0-q1)
- movq t0, mm4 ; save to t0
- psubusb mm4, mm7
- por mm1, mm4
-
-
- neg rax ; negate pitch to deal with above border
-
- movq mm2, [rsi+4*rax] ; p3
- movq mm4, [rdi+4*rax] ; p2
- movq mm5, mm4 ; p2
- psubusb mm4, mm2 ; p2-=p3
- psubusb mm2, mm5 ; p3-=p2
- por mm4, mm2 ; abs(p3 - p2)
- psubusb mm4, mm7
- por mm1, mm4
-
-
- movq mm4, [rsi+2*rax] ; p1
- movq mm3, mm4 ; p1
- psubusb mm4, mm5 ; p1-=p2
- psubusb mm5, mm3 ; p2-=p1
- por mm4, mm5 ; abs(p2 - p1)
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm2, mm3 ; p1
-
- movq mm4, [rsi+rax] ; p0
- movq mm5, mm4 ; p0
- psubusb mm4, mm3 ; p0-=p1
- psubusb mm3, mm5 ; p1-=p0
- por mm4, mm3 ; abs(p1 - p0)
- movq t1, mm4 ; save to t1
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm3, [rdi] ; q1
- movq mm4, mm3 ; q1
- psubusb mm3, mm2 ; q1-=p1
- psubusb mm2, mm4 ; p1-=q1
- por mm2, mm3 ; abs(p1-q1)
- pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm2, 1 ; abs(p1-q1)/2
-
- movq mm6, mm5 ; p0
- movq mm3, [rsi] ; q0
- psubusb mm5, mm3 ; p0-=q0
- psubusb mm3, mm6 ; q0-=p0
- por mm5, mm3 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx] ; blimit
-
- psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm5
- pxor mm5, mm5
- pcmpeqb mm1, mm5 ; mask mm1
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx] ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7
- paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- pcmpeqb mm4, mm5
-
- pcmpeqb mm5, mm5
- pxor mm4, mm5
-
-
- ; start work on filters
- movq mm2, [rsi+2*rax] ; p1
- movq mm7, [rdi] ; q1
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
- pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
- movq mm2, mm1
- paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- pxor mm0, mm0 ;
- pxor mm5, mm5
- punpcklbw mm0, mm2 ;
- punpckhbw mm5, mm2 ;
- psraw mm0, 11 ;
- psraw mm5, 11
- packsswb mm0, mm5
- movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor mm0, mm0 ; 0
- movq mm5, mm1 ; abcdefgh
- punpcklbw mm0, mm1 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- pxor mm1, mm1 ; 0
- punpckhbw mm1, mm5 ; a0b0c0d0
- psraw mm1, 11 ; sign extended shift right by 3
- movq mm5, mm0 ; save results
-
- packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [GLOBAL(ones)]
- paddsw mm1, [GLOBAL(ones)]
- psraw mm5, 1 ; partial shifted one more time for 2nd tap
- psraw mm1, 1 ; partial shifted one more time for 2nd tap
- packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
- pandn mm4, mm5 ; high edge variance additive
-
- paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+rax], mm6 ; write back
-
- movq mm6, [rsi+2*rax] ; p1
- pxor mm6, [GLOBAL(t80)] ; reoffset
- paddsb mm6, mm4 ; p1+= p1 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+2*rax], mm6 ; write back
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
- movq [rsi], mm3 ; write back
-
- psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [GLOBAL(t80)] ; unoffset
- movq [rdi], mm7 ; write back
-
- add rsi,8
- neg rax
- dec rcx
- jnz .next8_h
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_loop_filter_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
-sym(vp8_loop_filter_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 64 ; reserve 64 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4 - 4]
-
- movsxd rcx, dword ptr arg(5) ;count
-.next8_v:
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
-
- ;transpose
- movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
- movq mm7, mm6 ; 77 76 75 74 73 72 71 70
-
- punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
- punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
-
- movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
- movq mm5, mm4 ; 47 46 45 44 43 42 41 40
-
- punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
- punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
-
- movq mm3, mm5 ; 57 47 56 46 55 45 54 44
- punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
-
- punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
- movq mm2, mm4 ; 53 43 52 42 51 41 50 40
-
- punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
- punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
-
- neg rax
- movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
-
- movq mm1, mm6 ; 27 26 25 24 23 22 21 20
- punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24
-
- punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
- movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
-
- punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
- movq mm0, mm7 ; 17 07 16 06 15 05 14 04
-
- punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
- punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
-
- movq mm6, mm7 ; 37 27 17 07 36 26 16 06
- punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
-
- punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
-
- movq mm5, mm6 ; 76 66 56 46 36 26 16 06
- psubusb mm5, mm7 ; q2-q3
-
- psubusb mm7, mm6 ; q3-q2
- por mm7, mm5; ; mm7=abs (q3-q2)
-
- movq mm5, mm0 ; 35 25 15 05 34 24 14 04
- punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
-
- punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
- movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
-
- psubusb mm3, mm6 ; q1-q2
- psubusb mm6, mm5 ; q2-q1
-
- por mm6, mm3 ; mm6=abs(q2-q1)
- lea rdx, srct
-
- movq [rdx+24], mm5 ; save q1
- movq [rdx+16], mm0 ; save q0
-
- movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
- punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
-
- movq mm0, mm3 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 31 21 11 01 30 20 10 00
-
- punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
- punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
-
- movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
- psubusb mm2, mm0 ; p2-p3
-
- psubusb mm0, mm1 ; p3-p2
- por mm0, mm2 ; mm0=abs(p3-p2)
-
- movq mm2, mm3 ; 33 23 13 03 32 22 12 02
- punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
-
- punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
- movq [rdx+8], mm3 ; save p0
-
- movq [rdx], mm2 ; save p1
- movq mm5, mm2 ; mm5 = p1
-
- psubusb mm2, mm1 ; p1-p2
- psubusb mm1, mm5 ; p2-p1
-
- por mm1, mm2 ; mm1=abs(p2-p1)
- mov rdx, arg(3) ;limit
-
- movq mm4, [rdx] ; mm4 = limit
- psubusb mm7, mm4
-
- psubusb mm0, mm4
- psubusb mm1, mm4
-
- psubusb mm6, mm4
- por mm7, mm6
-
- por mm0, mm1
- por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movq mm1, mm5 ; p1
-
- movq mm7, mm3 ; mm3=mm7=p0
- psubusb mm7, mm5 ; p0 - p1
-
- psubusb mm5, mm3 ; p1 - p0
- por mm5, mm7 ; abs(p1-p0)
-
- movq t0, mm5 ; save abs(p1-p0)
- lea rdx, srct
-
- psubusb mm5, mm4
- por mm0, mm5 ; mm0=mask
-
- movq mm5, [rdx+16] ; mm5=q0
- movq mm7, [rdx+24] ; mm7=q1
-
- movq mm6, mm5 ; mm6=q0
- movq mm2, mm7 ; q1
- psubusb mm5, mm7 ; q0-q1
-
- psubusb mm7, mm6 ; q1-q0
- por mm7, mm5 ; abs(q1-q0)
-
- movq t1, mm7 ; save abs(q1-q0)
- psubusb mm7, mm4
-
- por mm0, mm7 ; mask
-
- movq mm5, mm2 ; q1
- psubusb mm5, mm1 ; q1-=p1
- psubusb mm1, mm2 ; p1-=q1
- por mm5, mm1 ; abs(p1-q1)
- pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ;
-
- movq mm4, [rdx] ;blimit
- movq mm1, mm3 ; mm1=mm3=p0
-
- movq mm7, mm6 ; mm7=mm6=q0
- psubusb mm1, mm7 ; p0-q0
-
- psubusb mm7, mm3 ; q0-p0
- por mm1, mm7 ; abs(q0-p0)
- paddusb mm1, mm1 ; abs(q0-p0)*2
- paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm0; ; mask
-
- pxor mm0, mm0
- pcmpeqb mm1, mm0
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx]
- ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7
-
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7
-
- por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb mm4, mm0
-
- pcmpeqb mm0, mm0
- pxor mm4, mm0
-
-
-
- ; start work on filters
- lea rdx, srct
-
- movq mm2, [rdx] ; p1
- movq mm7, [rdx+24] ; q1
-
- movq mm6, [rdx+8] ; p0
- movq mm0, [rdx+16] ; q0
-
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb mm2, mm7 ; p1 - q1
- pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
-
- paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
-
- movq mm2, mm1
- paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
- paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
- pxor mm0, mm0 ;
-
- pxor mm5, mm5
- punpcklbw mm0, mm2 ;
-
- punpckhbw mm5, mm2 ;
- psraw mm0, 11 ;
-
- psraw mm5, 11
- packsswb mm0, mm5
-
- movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor mm0, mm0 ; 0
- movq mm5, mm1 ; abcdefgh
-
- punpcklbw mm0, mm1 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
-
- pxor mm1, mm1 ; 0
- punpckhbw mm1, mm5 ; a0b0c0d0
-
- psraw mm1, 11 ; sign extended shift right by 3
- movq mm5, mm0 ; save results
-
- packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [GLOBAL(ones)]
-
- paddsw mm1, [GLOBAL(ones)]
- psraw mm5, 1 ; partial shifted one more time for 2nd tap
-
- psraw mm1, 1 ; partial shifted one more time for 2nd tap
- packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
- pandn mm4, mm5 ; high edge variance additive
-
- paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
-
- ; mm6=p0 ;
- movq mm1, [rdx] ; p1
- pxor mm1, [GLOBAL(t80)] ; reoffset
-
- paddsb mm1, mm4 ; p1+= p1 add
- pxor mm1, [GLOBAL(t80)] ; unoffset
- ; mm6 = p0 mm1 = p1
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
-
- ; mm3 = q0
- psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [GLOBAL(t80)] ; unoffset
- ; mm7 = q1
-
- ; transpose and write back
- ; mm1 = 72 62 52 42 32 22 12 02
- ; mm6 = 73 63 53 43 33 23 13 03
- ; mm3 = 74 64 54 44 34 24 14 04
- ; mm7 = 75 65 55 45 35 25 15 05
-
- movq mm2, mm1 ; 72 62 52 42 32 22 12 02
- punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
-
- movq mm4, mm3 ; 74 64 54 44 34 24 14 04
- punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
-
- punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
- punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
-
- movq mm6, mm2 ; 33 32 23 22 13 12 03 02
- punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
-
- punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
- movq mm5, mm1 ; 73 72 63 62 53 52 43 42
-
- punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
- punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
-
-
- ; mm2 = 15 14 13 12 05 04 03 02
- ; mm6 = 35 34 33 32 25 24 23 22
- ; mm5 = 55 54 53 52 45 44 43 42
- ; mm1 = 75 74 73 72 65 64 63 62
-
-
-
- movd [rsi+rax*4+2], mm2
- psrlq mm2, 32
-
- movd [rdi+rax*4+2], mm2
- movd [rsi+rax*2+2], mm6
-
- psrlq mm6, 32
- movd [rsi+rax+2],mm6
-
- movd [rsi+2], mm1
- psrlq mm1, 32
-
- movd [rdi+2], mm1
- neg rax
-
- movd [rdi+rax+2],mm5
- psrlq mm5, 32
-
- movd [rdi+rax*2+2], mm5
-
- lea rsi, [rsi+rax*8]
- dec rcx
- jnz .next8_v
-
- add rsp, 64
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_mbloop_filter_horizontal_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
-sym(vp8_mbloop_filter_horizontal_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- movsxd rcx, dword ptr arg(5) ;count
-.next8_mbh:
- mov rdx, arg(3) ;limit
- movq mm7, [rdx]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
- ; calculate breakout conditions
- movq mm2, [rdi+2*rax] ; q3
-
- movq mm1, [rsi+2*rax] ; q2
- movq mm6, mm1 ; q2
- psubusb mm1, mm2 ; q2-=q3
- psubusb mm2, mm6 ; q3-=q2
- por mm1, mm2 ; abs(q3-q2)
- psubusb mm1, mm7
-
-
- ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
- movq mm4, [rsi+rax] ; q1
- movq mm3, mm4 ; q1
- psubusb mm4, mm6 ; q1-=q2
- psubusb mm6, mm3 ; q2-=q1
- por mm4, mm6 ; abs(q2-q1)
- psubusb mm4, mm7
- por mm1, mm4
-
-
- ; mm1 = mask, mm3=q1, mm7 = limit
-
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- psubusb mm4, mm3 ; q0-=q1
- psubusb mm3, mm0 ; q1-=q0
- por mm4, mm3 ; abs(q0-q1)
- movq t0, mm4 ; save to t0
- psubusb mm4, mm7
- por mm1, mm4
-
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
-
- neg rax ; negate pitch to deal with above border
-
- movq mm2, [rsi+4*rax] ; p3
- movq mm4, [rdi+4*rax] ; p2
- movq mm5, mm4 ; p2
- psubusb mm4, mm2 ; p2-=p3
- psubusb mm2, mm5 ; p3-=p2
- por mm4, mm2 ; abs(p3 - p2)
- psubusb mm4, mm7
- por mm1, mm4
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
-
- movq mm4, [rsi+2*rax] ; p1
- movq mm3, mm4 ; p1
- psubusb mm4, mm5 ; p1-=p2
- psubusb mm5, mm3 ; p2-=p1
- por mm4, mm5 ; abs(p2 - p1)
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm2, mm3 ; p1
-
-
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
-
- movq mm4, [rsi+rax] ; p0
- movq mm5, mm4 ; p0
- psubusb mm4, mm3 ; p0-=p1
- psubusb mm3, mm5 ; p1-=p0
- por mm4, mm3 ; abs(p1 - p0)
- movq t1, mm4 ; save to t1
- psubusb mm4, mm7
- por mm1, mm4
- ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm5 = p0
- movq mm3, [rdi] ; q1
- movq mm4, mm3 ; q1
- psubusb mm3, mm2 ; q1-=p1
- psubusb mm2, mm4 ; p1-=q1
- por mm2, mm3 ; abs(p1-q1)
- pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm2, 1 ; abs(p1-q1)/2
-
- movq mm6, mm5 ; p0
- movq mm3, mm0 ; q0
- psubusb mm5, mm3 ; p0-=q0
- psubusb mm3, mm6 ; q0-=p0
- por mm5, mm3 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx] ; blimit
-
- psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm5
- pxor mm5, mm5
- pcmpeqb mm1, mm5 ; mask mm1
-
- ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm6 = p0,
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx] ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7
- paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- pcmpeqb mm4, mm5
-
- pcmpeqb mm5, mm5
- pxor mm4, mm5
-
-
-
- ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
- ; mm6 = p0, mm4=hev
- ; start work on filters
- movq mm2, [rsi+2*rax] ; p1
- movq mm7, [rdi] ; q1
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0)
- paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
-
-
- ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
- movq mm2, mm1 ; vp8_filter
- pand mm2, mm4; ; Filter2 = vp8_filter & hev
-
- movq mm5, mm2 ;
- paddsb mm5, [GLOBAL(t3)];
-
- pxor mm0, mm0 ; 0
- pxor mm7, mm7 ; 0
-
- punpcklbw mm0, mm5 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- punpckhbw mm7, mm5 ; a0b0c0d0
- psraw mm7, 11 ; sign extended shift right by 3
- packsswb mm0, mm7 ; Filter2 >>=3;
-
- movq mm5, mm0 ; Filter2
-
- paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
- pxor mm0, mm0 ; 0
- pxor mm7, mm7 ; 0
-
- punpcklbw mm0, mm2 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- punpckhbw mm7, mm2 ; a0b0c0d0
- psraw mm7, 11 ; sign extended shift right by 3
- packsswb mm0, mm7 ; Filter2 >>=3;
-
- ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
- psubsb mm3, mm0 ; qs0 =qs0 - filter1
- paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
-
- ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
- pandn mm4, mm1 ; vp8_filter&=~hev
-
-
- ; mm3=qs0, mm4=filter2, mm6=ps0
-
- ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
- ; s = vp8_signed_char_clamp(qs0 - u);
- ; *oq0 = s^0x80;
- ; s = vp8_signed_char_clamp(ps0 + u);
- ; *op0 = s^0x80;
- pxor mm0, mm0
-
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s27)]
- pmulhw mm2, [GLOBAL(s27)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- psubsb mm3, mm1
- paddsb mm6, mm1
-
- pxor mm3, [GLOBAL(t80)]
- pxor mm6, [GLOBAL(t80)]
- movq [rsi+rax], mm6
- movq [rsi], mm3
-
- ; roughly 2/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
- ; s = vp8_signed_char_clamp(qs1 - u);
- ; *oq1 = s^0x80;
- ; s = vp8_signed_char_clamp(ps1 + u);
- ; *op1 = s^0x80;
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s18)]
- pmulhw mm2, [GLOBAL(s18)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- movq mm3, [rdi]
- movq mm6, [rsi+rax*2] ; p1
-
- pxor mm3, [GLOBAL(t80)]
- pxor mm6, [GLOBAL(t80)]
-
- paddsb mm6, mm1
- psubsb mm3, mm1
-
- pxor mm6, [GLOBAL(t80)]
- pxor mm3, [GLOBAL(t80)]
- movq [rdi], mm3
- movq [rsi+rax*2], mm6
-
- ; roughly 1/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
- ; s = vp8_signed_char_clamp(qs2 - u);
- ; *oq2 = s^0x80;
- ; s = vp8_signed_char_clamp(ps2 + u);
- ; *op2 = s^0x80;
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s9)]
- pmulhw mm2, [GLOBAL(s9)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
-
- movq mm6, [rdi+rax*4]
- neg rax
- movq mm3, [rdi+rax ]
-
- pxor mm6, [GLOBAL(t80)]
- pxor mm3, [GLOBAL(t80)]
-
- paddsb mm6, mm1
- psubsb mm3, mm1
-
- pxor mm6, [GLOBAL(t80)]
- pxor mm3, [GLOBAL(t80)]
- movq [rdi+rax ], mm3
- neg rax
- movq [rdi+rax*4], mm6
-
-;EARLY_BREAK_OUT:
- neg rax
- add rsi,8
- dec rcx
- jnz .next8_mbh
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_mbloop_filter_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh,
-; int count
-;)
-global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
-sym(vp8_mbloop_filter_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4 - 4]
-
- movsxd rcx, dword ptr arg(5) ;count
-.next8_mbv:
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
-
- ;transpose
- movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
- movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
-
- movq mm7, mm6 ; 77 76 75 74 73 72 71 70
- punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
-
- punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
- movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
-
- movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
- movq mm5, mm4 ; 47 46 45 44 43 42 41 40
-
- punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
- punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
-
- movq mm3, mm5 ; 57 47 56 46 55 45 54 44
- punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
-
- punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
- movq mm2, mm4 ; 53 43 52 42 51 41 50 40
-
- punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
- punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
-
- neg rax
-
- movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
- movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
-
- movq mm1, mm6 ; 27 26 25 24 23 22 21 20
- punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24
-
- punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
-
- movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
- punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
-
- movq mm0, mm7 ; 17 07 16 06 15 05 14 04
- punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
-
- punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
- movq mm6, mm7 ; 37 27 17 07 36 26 16 06
-
- punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
- punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
-
- lea rdx, srct
- movq mm5, mm6 ; 76 66 56 46 36 26 16 06
-
- movq [rdx+56], mm7
- psubusb mm5, mm7 ; q2-q3
-
-
- movq [rdx+48], mm6
- psubusb mm7, mm6 ; q3-q2
-
- por mm7, mm5; ; mm7=abs (q3-q2)
- movq mm5, mm0 ; 35 25 15 05 34 24 14 04
-
- punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
- punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
-
- movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
- psubusb mm3, mm6 ; q1-q2
-
- psubusb mm6, mm5 ; q2-q1
- por mm6, mm3 ; mm6=abs(q2-q1)
-
- movq [rdx+40], mm5 ; save q1
- movq [rdx+32], mm0 ; save q0
-
- movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
- punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
-
- movq mm0, mm3 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 31 21 11 01 30 20 10 00
-
- punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
- punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
-
- movq [rdx], mm0 ; save p3
- movq [rdx+8], mm1 ; save p2
-
- movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
- psubusb mm2, mm0 ; p2-p3
-
- psubusb mm0, mm1 ; p3-p2
- por mm0, mm2 ; mm0=abs(p3-p2)
-
- movq mm2, mm3 ; 33 23 13 03 32 22 12 02
- punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
-
- punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
- movq [rdx+24], mm3 ; save p0
-
- movq [rdx+16], mm2 ; save p1
- movq mm5, mm2 ; mm5 = p1
-
- psubusb mm2, mm1 ; p1-p2
- psubusb mm1, mm5 ; p2-p1
-
- por mm1, mm2 ; mm1=abs(p2-p1)
- mov rdx, arg(3) ;limit
-
- movq mm4, [rdx] ; mm4 = limit
- psubusb mm7, mm4 ; abs(q3-q2) > limit
-
- psubusb mm0, mm4 ; abs(p3-p2) > limit
- psubusb mm1, mm4 ; abs(p2-p1) > limit
-
- psubusb mm6, mm4 ; abs(q2-q1) > limit
- por mm7, mm6 ; or
-
- por mm0, mm1 ;
- por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movq mm1, mm5 ; p1
-
- movq mm7, mm3 ; mm3=mm7=p0
- psubusb mm7, mm5 ; p0 - p1
-
- psubusb mm5, mm3 ; p1 - p0
- por mm5, mm7 ; abs(p1-p0)
-
- movq t0, mm5 ; save abs(p1-p0)
- lea rdx, srct
-
- psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
- por mm0, mm5 ; mm0=mask
-
- movq mm5, [rdx+32] ; mm5=q0
- movq mm7, [rdx+40] ; mm7=q1
-
- movq mm6, mm5 ; mm6=q0
- movq mm2, mm7 ; q1
- psubusb mm5, mm7 ; q0-q1
-
- psubusb mm7, mm6 ; q1-q0
- por mm7, mm5 ; abs(q1-q0)
-
- movq t1, mm7 ; save abs(q1-q0)
- psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
-
- por mm0, mm7 ; mask
-
- movq mm5, mm2 ; q1
- psubusb mm5, mm1 ; q1-=p1
- psubusb mm1, mm2 ; p1-=q1
- por mm5, mm1 ; abs(p1-q1)
- pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ;
-
- movq mm4, [rdx] ;blimit
- movq mm1, mm3 ; mm1=mm3=p0
-
- movq mm7, mm6 ; mm7=mm6=q0
- psubusb mm1, mm7 ; p0-q0
-
- psubusb mm7, mm3 ; q0-p0
- por mm1, mm7 ; abs(q0-p0)
- paddusb mm1, mm1 ; abs(q0-p0)*2
- paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm0; ; mask
-
- pxor mm0, mm0
- pcmpeqb mm1, mm0
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx]
- ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7 ; abs(q1 - q0) > thresh
-
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7 ; abs(p1 - p0)> thresh
-
- por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb mm4, mm0
-
- pcmpeqb mm0, mm0
- pxor mm4, mm0
-
-
-
-
- ; start work on filters
- lea rdx, srct
-
- ; start work on filters
- movq mm2, [rdx+16] ; p1
- movq mm7, [rdx+40] ; q1
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
-
- movq mm6, [rdx+24] ; p0
- movq mm0, [rdx+32] ; q0
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0)
- paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
-
- ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
- movq mm2, mm1 ; vp8_filter
- pand mm2, mm4; ; Filter2 = vp8_filter & hev
-
- movq mm5, mm2 ;
- paddsb mm5, [GLOBAL(t3)];
-
- pxor mm0, mm0 ; 0
- pxor mm7, mm7 ; 0
-
- punpcklbw mm0, mm5 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- punpckhbw mm7, mm5 ; a0b0c0d0
- psraw mm7, 11 ; sign extended shift right by 3
- packsswb mm0, mm7 ; Filter2 >>=3;
-
- movq mm5, mm0 ; Filter2
-
- paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
- pxor mm0, mm0 ; 0
- pxor mm7, mm7 ; 0
-
- punpcklbw mm0, mm2 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- punpckhbw mm7, mm2 ; a0b0c0d0
- psraw mm7, 11 ; sign extended shift right by 3
- packsswb mm0, mm7 ; Filter2 >>=3;
-
- ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
- psubsb mm3, mm0 ; qs0 =qs0 - filter1
- paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
-
- ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
- ; vp8_filter &= ~hev;
- ; Filter2 = vp8_filter;
- pandn mm4, mm1 ; vp8_filter&=~hev
-
-
- ; mm3=qs0, mm4=filter2, mm6=ps0
-
- ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
- ; s = vp8_signed_char_clamp(qs0 - u);
- ; *oq0 = s^0x80;
- ; s = vp8_signed_char_clamp(ps0 + u);
- ; *op0 = s^0x80;
- pxor mm0, mm0
-
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s27)]
- pmulhw mm2, [GLOBAL(s27)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- psubsb mm3, mm1
- paddsb mm6, mm1
-
- pxor mm3, [GLOBAL(t80)]
- pxor mm6, [GLOBAL(t80)]
- movq [rdx+24], mm6
- movq [rdx+32], mm3
-
- ; roughly 2/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
- ; s = vp8_signed_char_clamp(qs1 - u);
- ; *oq1 = s^0x80;
- ; s = vp8_signed_char_clamp(ps1 + u);
- ; *op1 = s^0x80;
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s18)]
- pmulhw mm2, [GLOBAL(s18)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- movq mm3, [rdx + 40]
- movq mm6, [rdx + 16] ; p1
- pxor mm3, [GLOBAL(t80)]
- pxor mm6, [GLOBAL(t80)]
-
- paddsb mm6, mm1
- psubsb mm3, mm1
-
- pxor mm6, [GLOBAL(t80)]
- pxor mm3, [GLOBAL(t80)]
- movq [rdx + 40], mm3
- movq [rdx + 16], mm6
-
- ; roughly 1/7th difference across boundary
- ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
- ; s = vp8_signed_char_clamp(qs2 - u);
- ; *oq2 = s^0x80;
- ; s = vp8_signed_char_clamp(ps2 + u);
- ; *op2 = s^0x80;
- pxor mm1, mm1
- pxor mm2, mm2
- punpcklbw mm1, mm4
- punpckhbw mm2, mm4
- pmulhw mm1, [GLOBAL(s9)]
- pmulhw mm2, [GLOBAL(s9)]
- paddw mm1, [GLOBAL(s63)]
- paddw mm2, [GLOBAL(s63)]
- psraw mm1, 7
- psraw mm2, 7
- packsswb mm1, mm2
-
- movq mm6, [rdx+ 8]
- movq mm3, [rdx+48]
-
- pxor mm6, [GLOBAL(t80)]
- pxor mm3, [GLOBAL(t80)]
-
- paddsb mm6, mm1
- psubsb mm3, mm1
-
- pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
- pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
-
- ; transpose and write back
- movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
- movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
-
- punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
- punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40
-
- movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
- movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02
-
- punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
- punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42
-
- movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
- punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
-
- punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
- movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
-
- punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
- punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
-
- movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
- punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
-
- movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06
- punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
-
- movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
- punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
-
- punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
- movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
-
- punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
- punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
-
- movq [rsi+rax*4], mm0 ; write out
- movq [rdi+rax*4], mm6 ; write out
-
- movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
- punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20
-
- punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
- movq [rsi+rax*2], mm0 ; write out
-
- movq [rdi+rax*2], mm5 ; write out
- movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
-
- punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44
- punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
-
- movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44
- punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
-
- punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
- movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
-
- movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60
- punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
-
- punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
- movq [rsi], mm0 ; write out
-
- movq [rdi], mm1 ; write out
- neg rax
-
- punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
- punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60
-
- movq [rsi+rax*2], mm3
- movq [rdi+rax*2], mm4
-
- lea rsi, [rsi+rax*8]
- dec rcx
-
- jnz .next8_mbv
-
- add rsp, 96
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_loop_filter_simple_horizontal_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
-sym(vp8_loop_filter_simple_horizontal_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- mov rcx, 2 ; count
-.nexts8_h:
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm3, [rdx] ;
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
- neg rax
-
- ; calculate mask
- movq mm1, [rsi+2*rax] ; p1
- movq mm0, [rdi] ; q1
- movq mm2, mm1
- movq mm7, mm0
- movq mm4, mm0
- psubusb mm0, mm1 ; q1-=p1
- psubusb mm1, mm4 ; p1-=q1
- por mm1, mm0 ; abs(p1-q1)
- pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm1, 1 ; abs(p1-q1)/2
-
- movq mm5, [rsi+rax] ; p0
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- movq mm6, mm5 ; p0
- psubusb mm5, mm4 ; p0-=q0
- psubusb mm4, mm6 ; q0-=p0
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm3, mm3
- pcmpeqb mm5, mm3
-
- ; start work on filters
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
- pand mm5, mm2 ; mask filter values we don't care about
-
- ; do + 4 side
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
- movq mm1, mm5 ; get a copy of filters
- psraw mm1, 11 ; arithmetic shift right 11
- psllw mm1, 8 ; shift left 8 to put it back
-
- por mm0, mm1 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
- movq [rsi], mm3 ; write back
-
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+rax], mm6 ; write back
-
- add rsi,8
- neg rax
- dec rcx
- jnz .nexts8_h
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_loop_filter_simple_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
-sym(vp8_loop_filter_simple_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4- 2]; ;
- mov rcx, 2 ; count
-.nexts8_v:
-
- lea rdi, [rsi + rax];
- movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
-
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
- punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
-
- movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
- movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
-
- punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
- movq mm5, mm4 ; 53 43 52 42 51 41 50 40
-
- punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
- punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
-
- neg rax
-
- movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
-
- punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
- movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
-
- movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
- punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
-
- movq mm2, mm0 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 13 03 12 02 11 01 10 00
-
- punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
- movq mm3, mm2 ; 33 23 13 03 32 22 12 02
-
- punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
- punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
-
- punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
-
-
- ; calculate mask
- movq mm6, mm0 ; p1
- movq mm7, mm3 ; q1
- psubusb mm7, mm6 ; q1-=p1
- psubusb mm6, mm3 ; p1-=q1
- por mm6, mm7 ; abs(p1-q1)
- pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm6, 1 ; abs(p1-q1)/2
-
- movq mm5, mm1 ; p0
- movq mm4, mm2 ; q0
-
- psubusb mm5, mm2 ; p0-=q0
- psubusb mm4, mm1 ; q0-=p0
-
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx]
-
- psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm7, mm7
- pcmpeqb mm5, mm7 ; mm5 = mask
-
- ; start work on filters
- movq t0, mm0
- movq t1, mm3
-
- pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb mm0, mm3 ; p1 - q1
- movq mm6, mm1 ; p0
-
- movq mm7, mm2 ; q0
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm7 ; offseted ; q0
-
- psubsb mm7, mm6 ; q0 - p0
- paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
-
- paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
-
- pand mm5, mm0 ; mask filter values we don't care about
-
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
-
- movq mm7, mm5 ; get a copy of filters
- psraw mm7, 11 ; arithmetic shift right 11
- psllw mm7, 8 ; shift left 8 to put it back
-
- por mm0, mm7 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0sz add
- pxor mm3, [GLOBAL(t80)] ; unoffset
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
-
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
-
-
- movq mm0, t0
- movq mm4, t1
-
- ; mm0 = 70 60 50 40 30 20 10 00
- ; mm6 = 71 61 51 41 31 21 11 01
- ; mm3 = 72 62 52 42 32 22 12 02
- ; mm4 = 73 63 53 43 33 23 13 03
- ; transpose back to write out
-
- movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
-
- punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
- movq mm2, mm3 ;
-
- punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
- movq mm5, mm1 ; 71 70 61 60 51 50 41 40
-
- punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
- movq mm6, mm0 ; 31 30 21 20 11 10 01 00
-
- punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
- punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
-
- movd [rsi+rax*4], mm0 ; write 03 02 01 00
- punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
-
- psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
- punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
-
- movd [rdi+rax*4], mm0 ; write 13 12 11 10
- movd [rsi+rax*2], mm6 ; write 23 22 21 20
-
- psrlq mm6, 32 ; 33 32 31 30
- movd [rsi], mm1 ; write 43 42 41 40
-
- movd [rsi + rax], mm6 ; write 33 32 31 30
- neg rax
-
- movd [rsi + rax*2], mm5 ; write 63 62 61 60
- psrlq mm1, 32 ; 53 52 51 50
-
- movd [rdi], mm1 ; write out 53 52 51 50
- psrlq mm5, 32 ; 73 72 71 70
-
- movd [rdi + rax*2], mm5 ; write 73 72 71 70
-
- lea rsi, [rsi+rax*8] ; next 8
-
- dec rcx
- jnz .nexts8_v
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-; int y_stride,
-; loop_filter_info *lfi)
-;{
-;
-;
-; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
-SECTION_RODATA
-align 16
-tfe:
- times 8 db 0xfe
-align 16
-t80:
- times 8 db 0x80
-align 16
-t1s:
- times 8 db 0x01
-align 16
-t3:
- times 8 db 0x03
-align 16
-t4:
- times 8 db 0x04
-align 16
-ones:
- times 4 dw 0x0001
-align 16
-s27:
- times 4 dw 0x1b00
-align 16
-s18:
- times 4 dw 0x1200
-align 16
-s9:
- times 4 dw 0x0900
-align 16
-s63:
- times 4 dw 0x003f