summaryrefslogtreecommitdiff
path: root/thirdparty/libtheora/x86_vc/mmxencfrag.c
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/libtheora/x86_vc/mmxencfrag.c')
-rw-r--r--thirdparty/libtheora/x86_vc/mmxencfrag.c122
1 files changed, 68 insertions, 54 deletions
diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c
index 94f1d06513..a6be819135 100644
--- a/thirdparty/libtheora/x86_vc/mmxencfrag.c
+++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c
@@ -266,7 +266,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
The transform is performed in place, except that outputs 0-3 are swapped with
outputs 4-7.
- Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+ Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
perform this stage in place with no temporary registers).*/
#define OC_HADAMARD_AB_8x4 __asm{ \
/*Stage A: \
@@ -299,7 +299,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
}
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
- Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+ Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
place with no temporary registers).*/
#define OC_HADAMARD_C_8x4 __asm{ \
/*Stage C:*/ \
@@ -468,12 +468,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
mm7 = d3 c3 b3 a3*/ \
}
-static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
- OC_ALIGN8(ogg_int16_t buf[64]);
- ogg_int16_t *bufp;
- unsigned ret1;
- unsigned ret2;
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+ OC_ALIGN8(ogg_int16_t buf[64]);
+ ogg_int16_t *bufp;
+ unsigned ret;
+ unsigned ret2;
+ int dc;
bufp=buf;
__asm{
#define SRC esi
@@ -481,8 +483,10 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
#define SRC_YSTRIDE ecx
#define REF_YSTRIDE edx
#define BUF edi
-#define RET eax
-#define RET2 edx
+#define RET edx
+#define RET2 ecx
+#define DC eax
+#define DC_WORD ax
mov SRC,_src
mov SRC_YSTRIDE,_src_ystride
mov REF,_ref
@@ -508,14 +512,18 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
movq mm2,[0x20+BUF]
movq mm3,[0x30+BUF]
movq mm0,[0x00+BUF]
- OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
+ /*We split out the stages here so we can save the DC coefficient in the
+ middle.*/
+ OC_HADAMARD_AB_8x4
+ OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+ movd DC,mm1
+ OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
for the factor of two we dropped + 3 for the vertical accumulation).
Now we finally have to promote things to dwords.
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
latency of pmaddwd by starting the next series of loads now.*/
- mov RET2,_thresh
pmaddwd mm0,mm7
movq mm1,[0x50+BUF]
movq mm5,[0x58+BUF]
@@ -525,29 +533,28 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
movq mm6,[0x68+BUF]
paddd mm4,mm0
movq mm3,[0x70+BUF]
- movd RET,mm4
+ movd RET2,mm4
movq mm7,[0x78+BUF]
- /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
- added to them, and a factor of two removed; correct the final sum here.*/
- lea RET,[RET+RET-32]
movq mm0,[0x40+BUF]
- cmp RET,RET2
movq mm4,[0x48+BUF]
- jae at_end
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
pmaddwd mm0,mm7
- /*There isn't much to stick in here to hide the latency this time, but the
- alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
- latency is even worse.*/
- sub RET,32
+ /*Subtract abs(dc) from 2*ret2.*/
+ movsx DC,DC_WORD
+ cdq
+ lea RET2,[RET+RET2*2]
movq mm4,mm0
punpckhdq mm0,mm0
+ xor RET,DC
paddd mm4,mm0
- movd RET2,mm4
- lea RET,[RET+RET2*2]
- align 16
-at_end:
- mov ret1,RET
+ /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+ added to them, a factor of two removed, and the DC value included;
+ correct the final sum here.*/
+ sub RET2,RET
+ movd RET,mm4
+ lea RET,[RET2+RET*2-64]
+ mov ret,RET
+ mov dc,DC
#undef SRC
#undef REF
#undef SRC_YSTRIDE
@@ -555,18 +562,21 @@ at_end:
#undef BUF
#undef RET
#undef RET2
+#undef DC
+#undef DC_WORD
}
- return ret1;
+ *_dc=dc;
+ return ret;
}
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
- return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+ return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
}
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
- we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
+ we can share code with oc_enc_frag_satd2_mmxext().*/
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
__asm{
@@ -694,30 +704,31 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
}
}
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
- return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+ return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
}
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
int _ystride){
- OC_ALIGN8(ogg_int16_t buf[64]);
- ogg_int16_t *bufp;
- unsigned ret1;
- unsigned ret2;
+ OC_ALIGN8(ogg_int16_t buf[64]);
+ ogg_int16_t *bufp;
+ unsigned ret1;
+ unsigned ret2;
+ int dc;
bufp=buf;
__asm{
#define SRC eax
#define SRC4 esi
#define BUF edi
-#define RET eax
-#define RET_WORD ax
-#define RET2 ecx
#define YSTRIDE edx
#define YSTRIDE3 ecx
+#define RET eax
+#define RET2 ecx
+#define DC edx
+#define DC_WORD dx
mov SRC,_src
mov BUF,bufp
mov YSTRIDE,_ystride
@@ -749,7 +760,7 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
middle.*/
OC_HADAMARD_AB_8x4
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
- movd RET,mm1
+ movd DC,mm1
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
@@ -767,31 +778,34 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
movq mm3,[0x70+BUF]
paddd mm4,mm0
movq mm7,[0x78+BUF]
- movd RET2,mm4
+ movd RET,mm4
movq mm0,[0x40+BUF]
movq mm4,[0x48+BUF]
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
pmaddwd mm0,mm7
/*We assume that the DC coefficient is always positive (which is true,
because the input to the INTRA transform was not a difference).*/
- movzx RET,RET_WORD
- add RET2,RET2
- sub RET2,RET
+ movzx DC,DC_WORD
+ add RET,RET
+ sub RET,DC
movq mm4,mm0
punpckhdq mm0,mm0
paddd mm4,mm0
- movd RET,mm4
- lea RET,[-64+RET2+RET*2]
+ movd RET2,mm4
+ lea RET,[-64+RET+RET2*2]
+ mov [dc],DC
mov [ret1],RET
#undef SRC
#undef SRC4
#undef BUF
-#undef RET
-#undef RET_WORD
-#undef RET2
#undef YSTRIDE
#undef YSTRIDE3
+#undef RET
+#undef RET2
+#undef DC
+#undef DC_WORD
}
+ *_dc=dc;
return ret1;
}