summary refs log tree commit diff
path: root/thirdparty/libtheora/x86/mmxfdct.c
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/libtheora/x86/mmxfdct.c')
-rw-r--r-- thirdparty/libtheora/x86/mmxfdct.c 127
1 files changed, 70 insertions, 57 deletions
diff --git a/thirdparty/libtheora/x86/mmxfdct.c b/thirdparty/libtheora/x86/mmxfdct.c
index 211875255e..17668358b8 100644
--- a/thirdparty/libtheora/x86/mmxfdct.c
+++ b/thirdparty/libtheora/x86/mmxfdct.c
@@ -12,6 +12,7 @@
/*MMX fDCT implementation for x86_32*/
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
#include "x86enc.h"
+#include "x86zigzag.h"
#if defined(OC_X86_ASM)
@@ -462,8 +463,9 @@
mm7 = d3 c3 b3 a3*/ \
/*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+ OC_ALIGN8(ogg_int16_t buf[64]);
+ ptrdiff_t a;
__asm__ __volatile__(
/*Add two extra bits of working precision to improve accuracy; any more and
we could overflow.*/
@@ -586,77 +588,88 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
"movq 0x30(%[y]),%%mm3\n\t"
OC_FDCT_STAGE1_8x4
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
- OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
- /*mm0={-2}x4*/
- "pcmpeqw %%mm0,%%mm0\n\t"
- "paddw %%mm0,%%mm0\n\t"
- /*Round the results.*/
- "psubw %%mm0,%%mm1\n\t"
- "psubw %%mm0,%%mm2\n\t"
- "psraw $2,%%mm1\n\t"
- "psubw %%mm0,%%mm3\n\t"
- "movq %%mm1,0x18(%[y])\n\t"
- "psraw $2,%%mm2\n\t"
- "psubw %%mm0,%%mm4\n\t"
- "movq 0x08(%[y]),%%mm1\n\t"
- "psraw $2,%%mm3\n\t"
- "psubw %%mm0,%%mm5\n\t"
+ /*mm2={-2}x4*/
+ "pcmpeqw %%mm2,%%mm2\n\t"
+ "paddw %%mm2,%%mm2\n\t"
+ /*Round and store the results (no transpose).*/
+ "movq 0x10(%[y]),%%mm7\n\t"
+ "psubw %%mm2,%%mm4\n\t"
+ "psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
- "psubw %%mm0,%%mm6\n\t"
- "psraw $2,%%mm5\n\t"
- "psubw %%mm0,%%mm7\n\t"
+ "psubw %%mm2,%%mm0\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+ "movq 0x30(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
- "psubw %%mm0,%%mm1\n\t"
+ "psubw %%mm2,%%mm5\n\t"
+ "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+ "psraw $2,%%mm0\n\t"
+ "psubw %%mm2,%%mm3\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
+ "psraw $2,%%mm5\n\t"
+ "psubw %%mm2,%%mm1\n\t"
+ "movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
+ "psraw $2,%%mm3\n\t"
+ "psubw %%mm2,%%mm7\n\t"
+ "movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
+ "psraw $2,%%mm1\n\t"
+ "psubw %%mm2,%%mm4\n\t"
+ "movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
"psraw $2,%%mm7\n\t"
+ "movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
+ "psraw $2,%%mm4\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
+ /*Load the next block.*/
"movq 0x40(%[y]),%%mm0\n\t"
- "psraw $2,%%mm1\n\t"
- "movq %%mm7,0x30(%[y])\n\t"
"movq 0x78(%[y]),%%mm7\n\t"
- "movq %%mm1,0x08(%[y])\n\t"
"movq 0x50(%[y]),%%mm1\n\t"
- "movq %%mm6,0x20(%[y])\n\t"
"movq 0x68(%[y]),%%mm6\n\t"
- "movq %%mm2,0x28(%[y])\n\t"
"movq 0x60(%[y]),%%mm2\n\t"
- "movq %%mm5,0x10(%[y])\n\t"
"movq 0x58(%[y]),%%mm5\n\t"
- "movq %%mm3,0x38(%[y])\n\t"
"movq 0x70(%[y]),%%mm3\n\t"
- "movq %%mm4,0x00(%[y])\n\t"
"movq 0x48(%[y]),%%mm4\n\t"
OC_FDCT_STAGE1_8x4
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
- OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
- /*mm0={-2}x4*/
- "pcmpeqw %%mm0,%%mm0\n\t"
- "paddw %%mm0,%%mm0\n\t"
- /*Round the results.*/
- "psubw %%mm0,%%mm1\n\t"
- "psubw %%mm0,%%mm2\n\t"
- "psraw $2,%%mm1\n\t"
- "psubw %%mm0,%%mm3\n\t"
- "movq %%mm1,0x58(%[y])\n\t"
- "psraw $2,%%mm2\n\t"
- "psubw %%mm0,%%mm4\n\t"
- "movq 0x48(%[y]),%%mm1\n\t"
- "psraw $2,%%mm3\n\t"
- "psubw %%mm0,%%mm5\n\t"
- "movq %%mm2,0x68(%[y])\n\t"
+ /*mm2={-2}x4*/
+ "pcmpeqw %%mm2,%%mm2\n\t"
+ "paddw %%mm2,%%mm2\n\t"
+ /*Round and store the results (no transpose).*/
+ "movq 0x50(%[y]),%%mm7\n\t"
+ "psubw %%mm2,%%mm4\n\t"
+ "psubw %%mm2,%%mm6\n\t"
"psraw $2,%%mm4\n\t"
- "psubw %%mm0,%%mm6\n\t"
- "movq %%mm3,0x78(%[y])\n\t"
- "psraw $2,%%mm5\n\t"
- "psubw %%mm0,%%mm7\n\t"
- "movq %%mm4,0x40(%[y])\n\t"
+ "psubw %%mm2,%%mm0\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
+ "movq 0x70(%[y]),%%mm4\n\t"
"psraw $2,%%mm6\n\t"
- "psubw %%mm0,%%mm1\n\t"
- "movq %%mm5,0x50(%[y])\n\t"
- "psraw $2,%%mm7\n\t"
- "movq %%mm6,0x60(%[y])\n\t"
+ "psubw %%mm2,%%mm5\n\t"
+ "movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
+ "psraw $2,%%mm0\n\t"
+ "psubw %%mm2,%%mm3\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
+ "psraw $2,%%mm5\n\t"
+ "psubw %%mm2,%%mm1\n\t"
+ "movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
+ "psraw $2,%%mm3\n\t"
+ "psubw %%mm2,%%mm7\n\t"
+ "movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
"psraw $2,%%mm1\n\t"
- "movq %%mm7,0x70(%[y])\n\t"
- "movq %%mm1,0x48(%[y])\n\t"
- :[a]"=&r"(a)
+ "psubw %%mm2,%%mm4\n\t"
+ "movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
+ "psraw $2,%%mm7\n\t"
+ "movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
+ "psraw $2,%%mm4\n\t"
+ "movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
+ /*Final transpose and zig-zag.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+ "movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+ "movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
+
+ OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+ :[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
:[y]"r"(_y),[x]"r"(_x)
:"memory"
);