summaryrefslogtreecommitdiff
path: root/thirdparty/libvpx/vp8/common/x86/copy_sse2.asm
blob: 86fae269563195bc7772ec55d3fa9b41d6e6910f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;dst_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;dst_stride
        movsxd          rcx,        dword ptr arg(4) ;height

.block_copy_sse2_loopx4:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,    [rsi+rax*2]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,    [rdi+rdx*2]

        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,    [rdi+rdx*2]

        sub             rcx,     4
        cmp             rcx,     4
        jge             .block_copy_sse2_loopx4

        cmp             rcx, 0
        je              .copy_is_done

.block_copy_sse2_loop:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,    [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,    [rdi+rdx]

        sub             rcx,     1
        jne             .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret