diff options
Diffstat (limited to 'thirdparty/libtheora/x86/mmxidct.c')
-rw-r--r-- | thirdparty/libtheora/x86/mmxidct.c | 292 |
1 files changed, 143 insertions, 149 deletions
diff --git a/thirdparty/libtheora/x86/mmxidct.c b/thirdparty/libtheora/x86/mmxidct.c index 76424e6364..b8e3077066 100644 --- a/thirdparty/libtheora/x86/mmxidct.c +++ b/thirdparty/libtheora/x86/mmxidct.c @@ -11,7 +11,7 @@ ******************************************************************** function: - last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ + last mod: $Id$ ********************************************************************/ @@ -30,89 +30,66 @@ -/*A table of constants used by the MMX routines.*/ -static const ogg_uint16_t __attribute__((aligned(8),used)) - OC_IDCT_CONSTS[(7+1)*4]={ - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - 8, 8, 8, 8 -}; - -/*Converts the expression in the argument to a string.*/ -#define OC_M2STR(_s) #_s - /*38 cycles*/ -#define OC_IDCT_BEGIN \ +#define OC_IDCT_BEGIN(_y,_x) \ "#OC_IDCT_BEGIN\n\t" \ - "movq "OC_I(3)",%%mm2\n\t" \ - "movq "OC_C(3)",%%mm6\n\t" \ + "movq "OC_I(3,_x)",%%mm2\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ "movq %%mm2,%%mm4\n\t" \ - "movq "OC_J(5)",%%mm7\n\t" \ + "movq "OC_J(5,_x)",%%mm7\n\t" \ "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_C(5)",%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ "pmulhw %%mm7,%%mm6\n\t" \ "movq %%mm1,%%mm5\n\t" \ "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_I(1)",%%mm3\n\t" \ + "movq "OC_I(1,_x)",%%mm3\n\t" \ "pmulhw %%mm7,%%mm5\n\t" \ - "movq "OC_C(1)",%%mm0\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ "paddw %%mm2,%%mm4\n\t" \ "paddw %%mm7,%%mm6\n\t" \ "paddw %%mm1,%%mm2\n\t" \ - "movq "OC_J(7)",%%mm1\n\t" \ + "movq "OC_J(7,_x)",%%mm1\n\t" \ "paddw %%mm5,%%mm7\n\t" \ "movq %%mm0,%%mm5\n\t" \ "pmulhw %%mm3,%%mm0\n\t" \ "paddw %%mm7,%%mm4\n\t" \ "pmulhw %%mm1,%%mm5\n\t" \ - "movq "OC_C(7)",%%mm7\n\t" \ + "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \ "psubw %%mm2,%%mm6\n\t" \ "paddw %%mm3,%%mm0\n\t" \ "pmulhw %%mm7,%%mm3\n\t" \ - "movq "OC_I(2)",%%mm2\n\t" \ + "movq "OC_I(2,_x)",%%mm2\n\t" \ "pmulhw %%mm1,%%mm7\n\t" \ "paddw %%mm1,%%mm5\n\t" \ "movq %%mm2,%%mm1\n\t" \ - "pmulhw "OC_C(2)",%%mm2\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \ "psubw %%mm5,%%mm3\n\t" \ - "movq "OC_J(6)",%%mm5\n\t" \ + "movq "OC_J(6,_x)",%%mm5\n\t" \ "paddw %%mm7,%%mm0\n\t" \ "movq %%mm5,%%mm7\n\t" \ "psubw %%mm4,%%mm0\n\t" \ - "pmulhw "OC_C(2)",%%mm5\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ "paddw %%mm1,%%mm2\n\t" \ - "pmulhw "OC_C(6)",%%mm1\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ "paddw %%mm4,%%mm4\n\t" \ "paddw %%mm0,%%mm4\n\t" \ "psubw %%mm6,%%mm3\n\t" \ "paddw %%mm7,%%mm5\n\t" \ "paddw %%mm6,%%mm6\n\t" \ - "pmulhw "OC_C(6)",%%mm7\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ "paddw %%mm3,%%mm6\n\t" \ - "movq %%mm4,"OC_I(1)"\n\t" \ + "movq %%mm4,"OC_I(1,_y)"\n\t" \ "psubw %%mm5,%%mm1\n\t" \ - "movq "OC_C(4)",%%mm4\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ "movq %%mm3,%%mm5\n\t" \ "pmulhw %%mm4,%%mm3\n\t" \ "paddw %%mm2,%%mm7\n\t" \ - "movq %%mm6,"OC_I(2)"\n\t" \ + "movq %%mm6,"OC_I(2,_y)"\n\t" \ "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0)",%%mm6\n\t" \ + "movq "OC_I(0,_x)",%%mm6\n\t" \ "pmulhw %%mm4,%%mm0\n\t" \ "paddw %%mm3,%%mm5\n\t" \ - "movq "OC_J(4)",%%mm3\n\t" \ + "movq "OC_J(4,_x)",%%mm3\n\t" \ "psubw %%mm1,%%mm5\n\t" \ "paddw %%mm0,%%mm2\n\t" \ "psubw %%mm3,%%mm6\n\t" \ @@ -126,18 +103,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "paddw %%mm0,%%mm6\n\t" \ "psubw %%mm2,%%mm6\n\t" \ "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1)",%%mm0\n\t" \ + "movq "OC_I(1,_y)",%%mm0\n\t" \ "paddw %%mm6,%%mm2\n\t" \ "paddw %%mm3,%%mm4\n\t" \ "psubw %%mm1,%%mm2\n\t" \ "#end OC_IDCT_BEGIN\n\t" \ /*38+8=46 cycles.*/ -#define OC_ROW_IDCT \ +#define OC_ROW_IDCT(_y,_x) \ "#OC_ROW_IDCT\n" \ - OC_IDCT_BEGIN \ + OC_IDCT_BEGIN(_y,_x) \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r4=E'=E-G*/ \ "psubw %%mm7,%%mm4\n\t" \ /*r1=H'+H'*/ \ @@ -162,7 +139,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "psubw %%mm0,%%mm7\n\t" \ "paddw %%mm0,%%mm0\n\t" \ /*Save R1.*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r0=R0=G.+C.*/ \ "paddw %%mm7,%%mm0\n\t" \ "#end OC_ROW_IDCT\n\t" \ @@ -195,11 +172,11 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) Since r1 is free at entry, we calculate the Js first.*/ /*19 cycles.*/ -#define OC_TRANSPOSE \ +#define OC_TRANSPOSE(_y) \ "#OC_TRANSPOSE\n\t" \ "movq %%mm4,%%mm1\n\t" \ "punpcklwd %%mm5,%%mm4\n\t" \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "punpckhwd %%mm5,%%mm1\n\t" \ "movq %%mm6,%%mm0\n\t" \ "punpcklwd %%mm7,%%mm6\n\t" \ @@ -207,17 +184,17 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "punpckldq %%mm6,%%mm4\n\t" \ "punpckhdq %%mm6,%%mm5\n\t" \ "movq %%mm1,%%mm6\n\t" \ - "movq %%mm4,"OC_J(4)"\n\t" \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ "punpckhwd %%mm7,%%mm0\n\t" \ - "movq %%mm5,"OC_J(5)"\n\t" \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ "punpckhdq %%mm0,%%mm6\n\t" \ - "movq "OC_I(0)",%%mm4\n\t" \ + "movq "OC_I(0,_y)",%%mm4\n\t" \ "punpckldq %%mm0,%%mm1\n\t" \ - "movq "OC_I(1)",%%mm5\n\t" \ + "movq "OC_I(1,_y)",%%mm5\n\t" \ "movq %%mm4,%%mm0\n\t" \ - "movq %%mm6,"OC_J(7)"\n\t" \ + "movq %%mm6,"OC_J(7,_y)"\n\t" \ "punpcklwd %%mm5,%%mm0\n\t" \ - "movq %%mm1,"OC_J(6)"\n\t" \ + "movq %%mm1,"OC_J(6,_y)"\n\t" \ "punpckhwd %%mm5,%%mm4\n\t" \ "movq %%mm2,%%mm5\n\t" \ "punpcklwd %%mm3,%%mm2\n\t" \ @@ -225,20 +202,20 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "punpckldq %%mm2,%%mm0\n\t" \ "punpckhdq %%mm2,%%mm1\n\t" \ "movq %%mm4,%%mm2\n\t" \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "punpckhwd %%mm3,%%mm5\n\t" \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ "punpckhdq %%mm5,%%mm4\n\t" \ "punpckldq %%mm5,%%mm2\n\t" \ - "movq %%mm4,"OC_I(3)"\n\t" \ - "movq %%mm2,"OC_I(2)"\n\t" \ + "movq %%mm4,"OC_I(3,_y)"\n\t" \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ "#end OC_TRANSPOSE\n\t" \ /*38+19=57 cycles.*/ -#define OC_COLUMN_IDCT \ +#define OC_COLUMN_IDCT(_y) \ "#OC_COLUMN_IDCT\n" \ - OC_IDCT_BEGIN \ - "paddw "OC_8",%%mm2\n\t" \ + OC_IDCT_BEGIN(_y,_y) \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ /*r1=H'+H'*/ \ "paddw %%mm1,%%mm1\n\t" \ /*r1=R1=A''+H'*/ \ @@ -250,18 +227,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) /*r1=NR1*/ \ "psraw $4,%%mm1\n\t" \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r7=G+G*/ \ "paddw %%mm7,%%mm7\n\t" \ /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2)"\n\t" \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ /*r7=G'=E+G*/ \ "paddw %%mm4,%%mm7\n\t" \ /*Store NR1 at I(1).*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r4=R4=E'-D'*/ \ "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_8",%%mm4\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ /*r3=D'+D'*/ \ "paddw %%mm3,%%mm3\n\t" \ /*r3=R3=E'+D'*/ \ @@ -272,7 +249,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) "psubw %%mm5,%%mm6\n\t" \ /*r3=NR3*/ \ "psraw $4,%%mm3\n\t" \ - "paddw "OC_8",%%mm6\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ /*r5=B''+B''*/ \ "paddw %%mm5,%%mm5\n\t" \ /*r5=R5=F'+B''*/ \ @@ -280,14 +257,14 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) /*r6=NR6*/ \ "psraw $4,%%mm6\n\t" \ /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4)"\n\t" \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ /*r5=NR5*/ \ "psraw $4,%%mm5\n\t" \ /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3)"\n\t" \ + "movq %%mm3,"OC_I(3,_y)"\n\t" \ /*r7=R7=G'-C'*/ \ "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_8",%%mm7\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ /*r0=C'+C'*/ \ "paddw %%mm0,%%mm0\n\t" \ /*r0=R0=G'+C'*/ \ @@ -295,113 +272,121 @@ static const ogg_uint16_t __attribute__((aligned(8),used)) /*r7=NR7*/ \ "psraw $4,%%mm7\n\t" \ /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6)"\n\t" \ + "movq %%mm6,"OC_J(6,_y)"\n\t" \ /*r0=NR0*/ \ "psraw $4,%%mm0\n\t" \ /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5)"\n\t" \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7)"\n\t" \ + "movq %%mm7,"OC_J(7,_y)"\n\t" \ /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "#end OC_COLUMN_IDCT\n\t" \ -#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])" -#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1) -#define OC_8 OC_MID(OC_EIGHT_OFFSET,0) - -static void oc_idct8x8_slow(ogg_int16_t _y[64]){ +static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ + int i; /*This routine accepts an 8x8 matrix, but in partially transposed form. Every 4x4 block is transposed.*/ __asm__ __volatile__( -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])" - OC_ROW_IDCT - OC_TRANSPOSE +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) + OC_ROW_IDCT(y,x) + OC_TRANSPOSE(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+64)"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+72)"(%[y])" - OC_ROW_IDCT - OC_TRANSPOSE +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y) + OC_ROW_IDCT(y,x) + OC_TRANSPOSE(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT(y) #undef OC_I #undef OC_J - : - :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS) + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) ); + __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); + for(i=0;i<4;i++){ + __asm__ __volatile__( + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t" + :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) + ); + } } /*25 cycles.*/ -#define OC_IDCT_BEGIN_10 \ +#define OC_IDCT_BEGIN_10(_y,_x) \ "#OC_IDCT_BEGIN_10\n\t" \ - "movq "OC_I(3)",%%mm2\n\t" \ + "movq "OC_I(3,_x)",%%mm2\n\t" \ "nop\n\t" \ - "movq "OC_C(3)",%%mm6\n\t" \ + "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ "movq %%mm2,%%mm4\n\t" \ - "movq "OC_C(5)",%%mm1\n\t" \ + "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_I(1)",%%mm3\n\t" \ + "movq "OC_I(1,_x)",%%mm3\n\t" \ "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_C(1)",%%mm0\n\t" \ + "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ "paddw %%mm2,%%mm4\n\t" \ "pxor %%mm6,%%mm6\n\t" \ "paddw %%mm1,%%mm2\n\t" \ - "movq "OC_I(2)",%%mm5\n\t" \ + "movq "OC_I(2,_x)",%%mm5\n\t" \ "pmulhw %%mm3,%%mm0\n\t" \ "movq %%mm5,%%mm1\n\t" \ "paddw %%mm3,%%mm0\n\t" \ - "pmulhw "OC_C(7)",%%mm3\n\t" \ + "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ "psubw %%mm2,%%mm6\n\t" \ - "pmulhw "OC_C(2)",%%mm5\n\t" \ + "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ "psubw %%mm4,%%mm0\n\t" \ - "movq "OC_I(2)",%%mm7\n\t" \ + "movq "OC_I(2,_x)",%%mm7\n\t" \ "paddw %%mm4,%%mm4\n\t" \ "paddw %%mm5,%%mm7\n\t" \ "paddw %%mm0,%%mm4\n\t" \ - "pmulhw "OC_C(6)",%%mm1\n\t" \ + "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ "psubw %%mm6,%%mm3\n\t" \ - "movq %%mm4,"OC_I(1)"\n\t" \ + "movq %%mm4,"OC_I(1,_y)"\n\t" \ "paddw %%mm6,%%mm6\n\t" \ - "movq "OC_C(4)",%%mm4\n\t" \ + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ "paddw %%mm3,%%mm6\n\t" \ "movq %%mm3,%%mm5\n\t" \ "pmulhw %%mm4,%%mm3\n\t" \ - "movq %%mm6,"OC_I(2)"\n\t" \ + "movq %%mm6,"OC_I(2,_y)"\n\t" \ "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0)",%%mm6\n\t" \ + "movq "OC_I(0,_x)",%%mm6\n\t" \ "pmulhw %%mm4,%%mm0\n\t" \ "paddw %%mm3,%%mm5\n\t" \ "paddw %%mm0,%%mm2\n\t" \ "psubw %%mm1,%%mm5\n\t" \ "pmulhw %%mm4,%%mm6\n\t" \ - "paddw "OC_I(0)",%%mm6\n\t" \ + "paddw "OC_I(0,_x)",%%mm6\n\t" \ "paddw %%mm1,%%mm1\n\t" \ "movq %%mm6,%%mm4\n\t" \ "paddw %%mm5,%%mm1\n\t" \ "psubw %%mm2,%%mm6\n\t" \ "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1)",%%mm0\n\t" \ + "movq "OC_I(1,_y)",%%mm0\n\t" \ "paddw %%mm6,%%mm2\n\t" \ "psubw %%mm1,%%mm2\n\t" \ "nop\n\t" \ "#end OC_IDCT_BEGIN_10\n\t" \ /*25+8=33 cycles.*/ -#define OC_ROW_IDCT_10 \ +#define OC_ROW_IDCT_10(_y,_x) \ "#OC_ROW_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10 \ + OC_IDCT_BEGIN_10(_y,_x) \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r4=E'=E-G*/ \ "psubw %%mm7,%%mm4\n\t" \ /*r1=H'+H'*/ \ @@ -426,16 +411,16 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ "psubw %%mm0,%%mm7\n\t" \ "paddw %%mm0,%%mm0\n\t" \ /*Save R1.*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r0=R0=G'+C'*/ \ "paddw %%mm7,%%mm0\n\t" \ "#end OC_ROW_IDCT_10\n\t" \ /*25+19=44 cycles'*/ -#define OC_COLUMN_IDCT_10 \ +#define OC_COLUMN_IDCT_10(_y) \ "#OC_COLUMN_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10 \ - "paddw "OC_8",%%mm2\n\t" \ + OC_IDCT_BEGIN_10(_y,_y) \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ /*r1=H'+H'*/ \ "paddw %%mm1,%%mm1\n\t" \ /*r1=R1=A''+H'*/ \ @@ -447,18 +432,18 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r1=NR1*/ \ "psraw $4,%%mm1\n\t" \ /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ + "movq "OC_I(2,_y)",%%mm3\n\t" \ /*r7=G+G*/ \ "paddw %%mm7,%%mm7\n\t" \ /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2)"\n\t" \ + "movq %%mm2,"OC_I(2,_y)"\n\t" \ /*r7=G'=E+G*/ \ "paddw %%mm4,%%mm7\n\t" \ /*Store NR1 at I(1).*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ + "movq %%mm1,"OC_I(1,_y)"\n\t" \ /*r4=R4=E'-D'*/ \ "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_8",%%mm4\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ /*r3=D'+D'*/ \ "paddw %%mm3,%%mm3\n\t" \ /*r3=R3=E'+D'*/ \ @@ -469,7 +454,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ "psubw %%mm5,%%mm6\n\t" \ /*r3=NR3*/ \ "psraw $4,%%mm3\n\t" \ - "paddw "OC_8",%%mm6\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ /*r5=B''+B''*/ \ "paddw %%mm5,%%mm5\n\t" \ /*r5=R5=F'+B''*/ \ @@ -477,14 +462,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r6=NR6*/ \ "psraw $4,%%mm6\n\t" \ /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4)"\n\t" \ + "movq %%mm4,"OC_J(4,_y)"\n\t" \ /*r5=NR5*/ \ "psraw $4,%%mm5\n\t" \ /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3)"\n\t" \ + "movq %%mm3,"OC_I(3,_y)"\n\t" \ /*r7=R7=G'-C'*/ \ "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_8",%%mm7\n\t" \ + "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ /*r0=C'+C'*/ \ "paddw %%mm0,%%mm0\n\t" \ /*r0=R0=G'+C'*/ \ @@ -492,46 +477,55 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){ /*r7=NR7*/ \ "psraw $4,%%mm7\n\t" \ /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6)"\n\t" \ + "movq %%mm6,"OC_J(6,_y)"\n\t" \ /*r0=NR0*/ \ "psraw $4,%%mm0\n\t" \ /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5)"\n\t" \ + "movq %%mm5,"OC_J(5,_y)"\n\t" \ /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7)"\n\t" \ + "movq %%mm7,"OC_J(7,_y)"\n\t" \ /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0)"\n\t" \ + "movq %%mm0,"OC_I(0,_y)"\n\t" \ "#end OC_COLUMN_IDCT_10\n\t" \ -static void oc_idct8x8_10(ogg_int16_t _y[64]){ +static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ __asm__ __volatile__( -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])" +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) /*Done with dequant, descramble, and partial transpose. Now do the iDCT itself.*/ - OC_ROW_IDCT_10 - OC_TRANSPOSE + OC_ROW_IDCT_10(y,x) + OC_TRANSPOSE(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(y) #undef OC_I #undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) +#define OC_J(_k,_y) OC_I(_k,_y) + OC_COLUMN_IDCT_10(y) #undef OC_I #undef OC_J - : - :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS) + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), + [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) + ); + __asm__ __volatile__( + "pxor %%mm0,%%mm0\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" + "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" + :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28) ); } /*Performs an inverse 8x8 Type-II DCT transform. The input is assumed to be scaled by a factor of 4 relative to orthonormal version of the transform.*/ -void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ /*_last_zzi is subtly different from an actual count of the number of coefficients we decoded for this block. It contains the value of zzi BEFORE the final token in the block was @@ -557,8 +551,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ gets. Needless to say we inherited this approach from VP3.*/ /*Then perform the iDCT.*/ - if(_last_zzi<10)oc_idct8x8_10(_y); - else oc_idct8x8_slow(_y); + if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x); + else oc_idct8x8_slow_mmx(_y,_x); } #endif |