1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
* *
********************************************************************
function:
last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
********************************************************************/
/*MMX acceleration of complete fragment reconstruction algorithm.
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
#include "mmxfrag.h"
#include "mmxloop.h"
#if defined(OC_X86_ASM)
/*Reconstructs one fragment: inverse-transforms its coefficients in place and
   then writes the result into the OC_FRAME_SELF reference frame, either
   directly (intra) or on top of a prediction from another reference frame.
  _state:      The decoder state; supplies the fragment buffer offsets,
                fragment modes, motion vectors, strides and frame buffers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The color plane the fragment lies in.
  _dct_coeffs: The 64 coefficients of the fragment.
               This array is overwritten with the residual.
  _last_zzi:   Values below 2 select the DC-only path; otherwise it is handed
                on to oc_idct8x8_mmx().
  _dc_quant:   The multiplier used to dequantize the DC coefficient.*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      buf_off;
  int            ystride;
  int            mb_mode;
  /*Stage 1: turn the coefficients into a block of residual values.*/
  if(_last_zzi>=2){
    /*General case: dequantize the DC coefficient and run the iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
  }
  else{
    /*DC-only case: every output value is the same, so skip the iDCT.
      The fill value is kept in an unsigned type so that it is not
       sign-extended when the __asm__ block loads it into a register.*/
    ogg_uint16_t dc_fill;
    /*Only this dequantization product is rounded, since no iDCT follows to
       do the rounding for us.*/
    dc_fill=(ogg_int16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant)+15>>5);
    /*Broadcast the fill value to all four words of mm0, then store it over
       the whole 128-byte coefficient array.*/
    __asm__ __volatile__(
      /*mm0=0000 0000 0000 AAAA*/
      "movd %[p],%%mm0\n\t"
      /*mm0=0000 0000 AAAA AAAA*/
      "punpcklwd %%mm0,%%mm0\n\t"
      /*mm0=AAAA AAAA AAAA AAAA*/
      "punpckldq %%mm0,%%mm0\n\t"
      "movq %%mm0,(%[y])\n\t"
      "movq %%mm0,8(%[y])\n\t"
      "movq %%mm0,16(%[y])\n\t"
      "movq %%mm0,24(%[y])\n\t"
      "movq %%mm0,32(%[y])\n\t"
      "movq %%mm0,40(%[y])\n\t"
      "movq %%mm0,48(%[y])\n\t"
      "movq %%mm0,56(%[y])\n\t"
      "movq %%mm0,64(%[y])\n\t"
      "movq %%mm0,72(%[y])\n\t"
      "movq %%mm0,80(%[y])\n\t"
      "movq %%mm0,88(%[y])\n\t"
      "movq %%mm0,96(%[y])\n\t"
      "movq %%mm0,104(%[y])\n\t"
      "movq %%mm0,112(%[y])\n\t"
      "movq %%mm0,120(%[y])\n\t"
      :
      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)dc_fill)
      :"memory"
    );
  }
  /*Stage 2: locate the destination fragment in the current frame.*/
  mb_mode=_state->frags[_fragi].mb_mode;
  ystride=_state->ref_ystride[_pli];
  buf_off=_state->frag_buf_offs[_fragi];
  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+buf_off;
  /*Stage 3: write the pixels.
    Intra fragments need no predictor, so handle them first and leave.*/
  if(mb_mode==OC_MODE_INTRA){
    oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
    return;
  }
  {
    const unsigned char *ref;
    int                  mvoffsets[2];
    /*The predictor comes from the reference frame selected by the coding
       mode, at the same fragment offset as the destination.*/
    ref=
     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
     +buf_off;
    /*A result greater than 1 means both offsets are to be used.*/
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
       _dct_coeffs);
    }
    else{
      oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
    }
  }
}
/*We copy these entire functions in order to inline the actual MMX routines,
so that we use only a single indirect call.*/
/*Copies a list of fragments from one reference frame to another.
  Each listed fragment is copied to the same buffer offset in the destination
   frame that it occupies in the source frame.
  _state:     The decoder state; supplies the frame buffers, the per-plane
               stride and the fragment buffer offsets.
  _fragis:    A pointer to a list of fragment indices.
  _nfragis:   The number of fragment indices to copy.
  _dst_frame: The reference frame to copy to.
  _src_frame: The reference frame to copy from.
  _pli:       The color plane the fragments lie in.*/
void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
 int _dst_frame,int _src_frame,int _pli){
  const ptrdiff_t     *offs_table;
  const unsigned char *src_frame_data;
  unsigned char       *dst_frame_data;
  ptrdiff_t            listi;
  int                  ystride;
  offs_table=_state->frag_buf_offs;
  ystride=_state->ref_ystride[_pli];
  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
  /*Walk the list front to back, copying one fragment per entry.*/
  listi=0;
  while(listi<_nfragis){
    ptrdiff_t frag_buf_off;
    frag_buf_off=offs_table[_fragis[listi]];
    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
     src_frame_data+frag_buf_off,ystride);
    listi++;
  }
}
/*Runs the loop filter over a range of fragment rows in a single plane.
  Because the bottom edge of a fragment may be filtered, pixels in the
   following fragment row can be modified, so that row must be available too.
  _state:     The decoder state; supplies the fragment tables, the plane
               layout, the stride, the frame buffers and the filter limit.
  _bv:        The bounding values array.
              This implementation does not read it; the filter limit is
               taken from _state->loop_filter_limits[_state->qis[0]] instead.
  _refi:      The index of the frame buffer to filter.
  _pli:       The color plane to filter.
  _fragy0:    The Y coordinate of the first fragment row to filter.
  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
 int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
  OC_ALIGN8(unsigned char ll[8]);
  const oc_fragment_plane *fplane;
  const oc_fragment       *frags;
  const ptrdiff_t         *frag_buf_offs;
  unsigned char           *ref_frame_data;
  ptrdiff_t                plane_first;
  ptrdiff_t                plane_end;
  ptrdiff_t                row_first;
  ptrdiff_t                rows_end;
  int                      ystride;
  int                      nhfrags;
  frags=_state->frags;
  frag_buf_offs=_state->frag_buf_offs;
  ref_frame_data=_state->ref_frame_data[_refi];
  ystride=_state->ref_ystride[_pli];
  /*Work out the fragment index range of the plane and of the requested
     rows within it.*/
  fplane=_state->fplanes+_pli;
  nhfrags=fplane->nhfrags;
  plane_first=fplane->froffset;
  plane_end=plane_first+fplane->nfrags;
  row_first=plane_first+_fragy0*(ptrdiff_t)nhfrags;
  rows_end=row_first+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
  /*Replicate the filter limit into all eight bytes of the aligned buffer
     handed to the MMX filter macros.*/
  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
  /*The loops below are deliberately structured in a non-obvious way.
    The rule is that any block boundary touched by at least one coded
     fragment gets filtered.
    The order in which the filters run matters, however, and the somewhat
     odd ordering used here is the one VP3 chose.*/
  for(;row_first<rows_end;row_first+=nhfrags){
    ptrdiff_t row_end;
    ptrdiff_t fragi;
    row_end=row_first+nhfrags;
    for(fragi=row_first;fragi<row_end;fragi++){
      unsigned char *ref;
      /*Uncoded fragments never initiate any filtering themselves.*/
      if(!frags[fragi].coded)continue;
      ref=ref_frame_data+frag_buf_offs[fragi];
      /*Left edge, unless this is the first fragment in its row.*/
      if(fragi>row_first){
        OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
      }
      /*Top edge, unless this is the first row of the plane.*/
      if(row_first>plane_first){
        OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
      }
      /*Right edge, only when the right-hand neighbor exists and is not
         coded.*/
      if(fragi+1<row_end&&!frags[fragi+1].coded){
        OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
      }
      /*Bottom edge, only when the neighbor below exists and is not coded.*/
      if(fragi+nhfrags<plane_end&&!frags[fragi+nhfrags].coded){
        OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
      }
    }
  }
}
#endif
|