Diffstat (limited to 'thirdparty/libtheora/x86_vc/mmxfdct.c')
-rw-r--r--  thirdparty/libtheora/x86_vc/mmxfdct.c | 128
1 file changed, 72 insertions(+), 56 deletions(-)
diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c
index d908ce2413..c9ee530ea2 100644
--- a/thirdparty/libtheora/x86_vc/mmxfdct.c
+++ b/thirdparty/libtheora/x86_vc/mmxfdct.c
@@ -12,6 +12,7 @@
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
+#include "x86zigzag.h"
#if defined(OC_X86_ASM)
@@ -462,18 +463,22 @@
}
/*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+ OC_ALIGN8(ogg_int16_t buf[64]);
+ ogg_int16_t *bufp;
+ bufp=buf;
__asm{
+#define X edx
#define Y eax
#define A ecx
-#define X edx
+#define BUF esi
/*Add two extra bits of working precision to improve accuracy; any more and
we could overflow.*/
/*We also add biases to correct for some systematic error that remains in
the full fDCT->iDCT round trip.*/
mov X, _x
mov Y, _y
+ mov BUF, bufp
movq mm0,[0x00+X]
movq mm1,[0x10+X]
movq mm2,[0x20+X]
@@ -591,79 +596,90 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
movq mm3,[0x30+Y]
OC_FDCT_STAGE1_8x4
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
- OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
/*mm0={-2}x4*/
- pcmpeqw mm0,mm0
- paddw mm0,mm0
- /*Round the results.*/
- psubw mm1,mm0
- psubw mm2,mm0
- psraw mm1,2
- psubw mm3,mm0
- movq [0x18+Y],mm1
- psraw mm2,2
- psubw mm4,mm0
- movq mm1,[0x08+Y]
- psraw mm3,2
- psubw mm5,mm0
+ pcmpeqw mm2,mm2
+ paddw mm2,mm2
+ /*Round and store the results (no transpose).*/
+ movq mm7,[Y+0x10]
+ psubw mm4,mm2
+ psubw mm6,mm2
psraw mm4,2
- psubw mm6,mm0
- psraw mm5,2
- psubw mm7,mm0
+ psubw mm0,mm2
+ movq [BUF+0x00],mm4
+ movq mm4,[Y+0x30]
psraw mm6,2
- psubw mm1,mm0
+ psubw mm5,mm2
+ movq [BUF+0x20],mm6
+ psraw mm0,2
+ psubw mm3,mm2
+ movq [BUF+0x40],mm0
+ psraw mm5,2
+ psubw mm1,mm2
+ movq [BUF+0x50],mm5
+ psraw mm3,2
+ psubw mm7,mm2
+ movq [BUF+0x60],mm3
+ psraw mm1,2
+ psubw mm4,mm2
+ movq [BUF+0x70],mm1
psraw mm7,2
+ movq [BUF+0x10],mm7
+ psraw mm4,2
+ movq [BUF+0x30],mm4
+ /*Load the next block.*/
movq mm0,[0x40+Y]
- psraw mm1,2
- movq [0x30+Y],mm7
movq mm7,[0x78+Y]
- movq [0x08+Y],mm1
movq mm1,[0x50+Y]
- movq [0x20+Y],mm6
movq mm6,[0x68+Y]
- movq [0x28+Y],mm2
movq mm2,[0x60+Y]
- movq [0x10+Y],mm5
movq mm5,[0x58+Y]
- movq [0x38+Y],mm3
movq mm3,[0x70+Y]
- movq [0x00+Y],mm4
movq mm4,[0x48+Y]
OC_FDCT_STAGE1_8x4
OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
- OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
/*mm0={-2}x4*/
- pcmpeqw mm0,mm0
- paddw mm0,mm0
- /*Round the results.*/
- psubw mm1,mm0
- psubw mm2,mm0
- psraw mm1,2
- psubw mm3,mm0
- movq [0x58+Y],mm1
- psraw mm2,2
- psubw mm4,mm0
- movq mm1,[0x48+Y]
- psraw mm3,2
- psubw mm5,mm0
- movq [0x68+Y],mm2
+ pcmpeqw mm2,mm2
+ paddw mm2,mm2
+ /*Round and store the results (no transpose).*/
+ movq mm7,[Y+0x50]
+ psubw mm4,mm2
+ psubw mm6,mm2
psraw mm4,2
- psubw mm6,mm0
- movq [0x78+Y],mm3
- psraw mm5,2
- psubw mm7,mm0
- movq [0x40+Y],mm4
+ psubw mm0,mm2
+ movq [BUF+0x08],mm4
+ movq mm4,[Y+0x70]
psraw mm6,2
- psubw mm1,mm0
- movq [0x50+Y],mm5
- psraw mm7,2
- movq [0x60+Y],mm6
+ psubw mm5,mm2
+ movq [BUF+0x28],mm6
+ psraw mm0,2
+ psubw mm3,mm2
+ movq [BUF+0x48],mm0
+ psraw mm5,2
+ psubw mm1,mm2
+ movq [BUF+0x58],mm5
+ psraw mm3,2
+ psubw mm7,mm2
+ movq [BUF+0x68],mm3
psraw mm1,2
- movq [0x70+Y],mm7
- movq [0x48+Y],mm1
+ psubw mm4,mm2
+ movq [BUF+0x78],mm1
+ psraw mm7,2
+ movq [BUF+0x18],mm7
+ psraw mm4,2
+ movq [BUF+0x38],mm4
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+ __asm movq _reg,[BUF+16*(_row)] \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+ __asm movq _reg,[BUF+16*(_row)+8] \
+
+ OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+#undef X
#undef Y
#undef A
-#undef X
+#undef BUF
}
}
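
Notes on the patch above (not part of the diff):

The rounding idiom used in both hunks works as follows: `pcmpeqw mm2,mm2` fills every 16-bit lane with all ones (-1), `paddw mm2,mm2` doubles that to -2, `psubw x,mm2` therefore adds a bias of +2 (half of the 1<<2 scale introduced by the "two extra bits of working precision" mentioned in the source comments), and `psraw x,2` shifts those extra bits back out. A minimal scalar sketch of the same per-lane arithmetic; the function name is made up for illustration:

```c
#include <stdint.h>

/* Scalar equivalent of the per-lane rounding in the patch:
   pcmpeqw mm2,mm2 ; paddw mm2,mm2   -> mm2 = {-2,-2,-2,-2}
   psubw   v,mm2                     -> v += 2 (rounding bias)
   psraw   v,2                       -> drop the two extra precision bits */
static int16_t oc_round_down2(int16_t v){
  v = (int16_t)(v + 2);
  /* Assumes signed >> is an arithmetic shift, which is what psraw does
     and what common compilers provide for int16_t. */
  return (int16_t)(v >> 2);
}
```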
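Structurally, the old `oc_enc_fdct8x8_mmx` transposed each half-block with `OC_TRANSPOSE8x4` and wrote the rounded coefficients straight back to `Y`. The new `oc_enc_fdct8x8_mmxext` instead stores the rounded, untransposed rows to the 8-byte-aligned scratch buffer addressed by `BUF`, then runs `OC_TRANSPOSE_ZIG_ZAG_MMXEXT` (from the newly included `x86zigzag.h`), which reads the rows back through the `OC_ZZ_LOAD_ROW_LO`/`OC_ZZ_LOAD_ROW_HI` hooks so the transpose and the zig-zag reordering of the output happen in a single pass. A rough scalar sketch of what that combined pass amounts to; the `zz[]` table and the direction of the mapping are assumptions for illustration, not taken from this patch:

```c
#include <stdint.h>

/* Hypothetical sketch only: fold the 8x8 transpose into a zig-zag
   reordering pass over a scratch buffer, analogous to what
   OC_TRANSPOSE_ZIG_ZAG_MMXEXT does with MMX registers. */
static void transpose_zig_zag_c(int16_t y[64], const int16_t buf[64],
                                const unsigned char zz[64]){
  int i;
  for(i = 0; i < 64; i++){
    int row = i >> 3;
    int col = i & 7;
    /* Read buf column-major (i.e. transposed) while writing each
       coefficient to its zig-zag position in y. */
    y[zz[i]] = buf[col*8 + row];
  }
}
```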