/*Source path: drivers/theora/x86_vc/mmxfrag.c
  (blob 4eb2084dc6f0e25961980fb399a96f9b0e89cba9)*/
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"
#include "mmxfrag.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     Where the block is written.
  _src:     Where the block is read from.
  _ystride: The byte offset between consecutive rows.
  All of the work is done by the OC_FRAG_COPY_MMX macro, which is not defined
   in this file (presumably it comes from mmxfrag.h; confirm there).
  This function only chooses the register names that macro refers to.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
/*Register names made visible to OC_FRAG_COPY_MMX through the preprocessor.
  NOTE(review): the names suggest YSTRIDE3 holds 3*_ystride, but that is
   decided inside the macro; verify against its definition.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
/*Drop the register names again so the functions below can pick their own.*/
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}

/*Reconstructs an 8x8 block from a residue alone: every output pixel is the
   corresponding 16-bit residue value plus 128, clamped to 0...255.
  _dst:     Where the 8 rows of 8 bytes are written.
  _ystride: The byte offset between consecutive rows of _dst.
  _residue: 64 16-bit values read as 16 consecutive 8-byte loads; each row
             uses two loads (low four words, then high four words).
  Register roles inside the asm block:
   DST      (edx): _dst, i.e. row 0.
   DST4     (esi): _dst+4*_ystride, i.e. row 4.
   YSTRIDE  (eax): _ystride.
   YSTRIDE3 (edi): 3*_ystride.
   RESIDUE  (ecx): _residue.
   mm0           : the bias, 0x0080 in each of its four words.
  The rows are handled in three batches (#0-#2, #3-#5, #6-#7), so that all
   loads of a batch are issued before its adds, packs and stores.
  No emms is issued here; the MMX state is left as-is on return.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    /*DST4 addresses row 4, so rows 4...7 can reuse the same three offsets
       (YSTRIDE, YSTRIDE*2, YSTRIDE3) as rows 1...3.*/
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*The bias constant is built in registers in three steps, which are
       interleaved with the first residue loads.*/
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*The bias is added with signed saturation (paddsw) and the pack then
       saturates each word to an unsigned byte (packuswb).*/
    /*#0 Bias low  residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low  residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low  residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low  residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low  residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low  residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low  residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low  residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}

/*Reconstructs an 8x8 block from a predictor plus a residue: every output
   pixel is the source pixel plus the corresponding 16-bit residue value,
   clamped to 0...255.
  _dst:     Where the 8 rows of 8 bytes are written.
  _src:     The predictor block; 8 bytes are read from each of 8 rows.
  _ystride: The byte offset between consecutive rows; the same value is used
             for both _dst and _src.
  _residue: 64 16-bit values, consumed 32 bytes (two rows) per iteration.
  The C loop runs 4 times and each pass handles two rows (#0 and #1), with
   the instructions of the two rows interleaved.
  No emms is issued here; the MMX state is left as-is on return.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.
    It is the zero operand of every byte-to-word unpack below and is never
     reloaded, so it has to survive from one __asm block to the next.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      /*Every pass reloads its pointers from the parameter variables, which
         the end of the previous pass advanced by two rows.*/
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low  source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw  mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low  source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Store the advanced pointers back into the parameter variables for
         the next pass.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}

/*Reconstructs an 8x8 block from the average of two predictors plus a
   residue: every output pixel is (src1+src2)>>1 (no rounding term is added)
   plus the corresponding 16-bit residue value, clamped to 0...255.
  _dst:     Where the 8 rows of 8 bytes are written.
  _src1:    The first predictor block; 8 bytes are read from each of 8 rows.
  _src2:    The second predictor block, read the same way.
  _ystride: The byte offset between consecutive rows; the same value is used
             for _dst, _src1 and _src2.
  _residue: 64 16-bit values, consumed 32 bytes (two rows) per iteration.
  The C loop runs 4 times and each pass handles two rows (#0 and #1), with
   the instructions of the two rows interleaved.
  No emms is issued here; the MMX state is left as-is on return.*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.
    It is the zero operand of every byte-to-word unpack below and is never
     reloaded, so it has to survive from one __asm block to the next.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      /*Every pass reloads its pointers from the parameter variables, which
         the end of the previous pass advanced by two rows.*/
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb  mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Store the advanced pointers back into the parameter variables for
         the next pass.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}

void oc_restore_fpu_mmx(void){
  __asm emms;
}

#endif