13 files changed, 882 insertions, 359 deletions
diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c
index 94f1d06513..a6be819135 100644
--- a/thirdparty/libtheora/x86_vc/mmxencfrag.c
+++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c
@@ -266,7 +266,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 /*Performs the first two stages of an 8-point 1-D Hadamard transform.
   The transform is performed in place, except that outputs 0-3 are swapped with
    outputs 4-7.
-  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
    perform this stage in place with no temporary registers).*/
 #define OC_HADAMARD_AB_8x4 __asm{ \
   /*Stage A: \
@@ -299,7 +299,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 }
 
 /*Performs the last stage of an 8-point 1-D Hadamard transform in place.
-  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
    place with no temporary registers).*/
 #define OC_HADAMARD_C_8x4 __asm{ \
   /*Stage C:*/ \
@@ -468,12 +468,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
     mm7 = d3 c3 b3 a3*/ \
 }
 
-static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ogg_int16_t *bufp;
+  unsigned     ret;
+  unsigned     ret2;
+  int          dc;
   bufp=buf;
   __asm{
 #define SRC esi
@@ -481,8 +483,10 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
 #define SRC_YSTRIDE ecx
 #define REF_YSTRIDE edx
 #define BUF edi
-#define RET eax
-#define RET2 edx
+#define RET edx
+#define RET2 ecx
+#define DC eax
+#define DC_WORD ax
     mov SRC,_src
     mov SRC_YSTRIDE,_src_ystride
     mov REF,_ref
@@ -508,14 +512,18 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
     movq mm2,[0x20+BUF]
     movq mm3,[0x30+BUF]
     movq mm0,[0x00+BUF]
-    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    movd DC,mm1
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
       Now we finally have to promote things to dwords.
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
-    mov RET2,_thresh
     pmaddwd mm0,mm7
     movq mm1,[0x50+BUF]
     movq mm5,[0x58+BUF]
@@ -525,29 +533,28 @@ static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
     movq mm6,[0x68+BUF]
     paddd mm4,mm0
     movq mm3,[0x70+BUF]
-    movd RET,mm4
+    movd RET2,mm4
     movq mm7,[0x78+BUF]
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    lea RET,[RET+RET-32]
     movq mm0,[0x40+BUF]
-    cmp RET,RET2
     movq mm4,[0x48+BUF]
-    jae at_end
     OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     pmaddwd mm0,mm7
-    /*There isn't much to stick in here to hide the latency this time, but the
-       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
-       latency is even worse.*/
-    sub RET,32
+    /*Subtract abs(dc) from 2*ret2.*/
+    movsx DC,DC_WORD
+    cdq
+    lea RET2,[RET+RET2*2]
     movq mm4,mm0
     punpckhdq mm0,mm0
+    xor RET,DC
     paddd mm4,mm0
-    movd RET2,mm4
-    lea RET,[RET+RET2*2]
-    align 16
-at_end:
-    mov ret1,RET
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    sub RET2,RET
+    movd RET,mm4
+    lea RET,[RET2+RET*2-64]
+    mov ret,RET
+    mov dc,DC
 #undef SRC
 #undef REF
 #undef SRC_YSTRIDE
@@ -555,18 +562,21 @@ at_end:
 #undef BUF
 #undef RET
 #undef RET2
+#undef DC
+#undef DC_WORD
   }
-  return ret1;
+  *_dc=dc;
+  return ret;
 }
 
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
 }
 
 
 /*Our internal implementation of frag_copy2 takes an extra stride parameter so
-   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
+   we can share code with oc_enc_frag_satd2_mmxext().*/
 static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
   __asm{
@@ -694,30 +704,31 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
   }
 }
 
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
 }
 
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
  int _ystride){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t           *bufp;
-  unsigned               ret1;
-  unsigned               ret2;
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ogg_int16_t *bufp;
+  unsigned     ret1;
+  unsigned     ret2;
+  int          dc;
   bufp=buf;
   __asm{
 #define SRC eax
 #define SRC4 esi
 #define BUF edi
-#define RET eax
-#define RET_WORD ax
-#define RET2 ecx
 #define YSTRIDE edx
 #define YSTRIDE3 ecx
+#define RET eax
+#define RET2 ecx
+#define DC edx
+#define DC_WORD dx
     mov SRC,_src
     mov BUF,bufp
     mov YSTRIDE,_ystride
@@ -749,7 +760,7 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
       middle.*/
     OC_HADAMARD_AB_8x4
     OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
-    movd RET,mm1
+    movd DC,mm1
     OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
@@ -767,31 +778,34 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
     movq mm3,[0x70+BUF]
     paddd mm4,mm0
     movq mm7,[0x78+BUF]
-    movd RET2,mm4
+    movd RET,mm4
     movq mm0,[0x40+BUF]
     movq mm4,[0x48+BUF]
     OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     pmaddwd mm0,mm7
     /*We assume that the DC coefficient is always positive (which is true,
     because the input to the INTRA transform was not a difference).*/
-    movzx RET,RET_WORD
-    add RET2,RET2
-    sub RET2,RET
+    movzx DC,DC_WORD
+    add RET,RET
+    sub RET,DC
     movq mm4,mm0
     punpckhdq mm0,mm0
     paddd mm4,mm0
-    movd RET,mm4
-    lea RET,[-64+RET2+RET*2]
+    movd RET2,mm4
+    lea RET,[-64+RET+RET2*2]
+    mov [dc],DC
     mov [ret1],RET
 #undef SRC
 #undef SRC4
 #undef BUF
-#undef RET
-#undef RET_WORD
-#undef RET2
 #undef YSTRIDE
 #undef YSTRIDE3
+#undef RET
+#undef RET2
+#undef DC
+#undef DC_WORD
   }
+  *_dc=dc;
   return ret1;
 }
 
diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c
index d908ce2413..c9ee530ea2 100644
--- a/thirdparty/libtheora/x86_vc/mmxfdct.c
+++ b/thirdparty/libtheora/x86_vc/mmxfdct.c
@@ -12,6 +12,7 @@
  /*MMX fDCT implementation for x86_32*/
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include "x86enc.h"
+#include "x86zigzag.h"
 
 #if defined(OC_X86_ASM)
 
@@ -462,18 +463,22 @@
 }
 
 /*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ogg_int16_t *bufp;
+  bufp=buf;
   __asm{
+#define X edx
 #define Y eax
 #define A ecx
-#define X edx
+#define BUF esi
     /*Add two extra bits of working precision to improve accuracy; any more and
        we could overflow.*/
     /*We also add biases to correct for some systematic error that remains in
        the full fDCT->iDCT round trip.*/
     mov X, _x
     mov Y, _y
+	mov BUF, bufp
     movq mm0,[0x00+X]
     movq mm1,[0x10+X]
     movq mm2,[0x20+X]
@@ -591,79 +596,90 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     movq mm3,[0x30+Y]
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
-    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
     /*mm0={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x18+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x08+Y]
-    psraw mm3,2
-    psubw mm5,mm0
+    pcmpeqw mm2,mm2
+    paddw mm2,mm2
+    /*Round and store the results (no transpose).*/
+    movq mm7,[Y+0x10]
+    psubw mm4,mm2
+    psubw mm6,mm2
     psraw mm4,2
-    psubw mm6,mm0
-    psraw mm5,2
-    psubw mm7,mm0
+    psubw mm0,mm2
+    movq [BUF+0x00],mm4
+    movq mm4,[Y+0x30]
     psraw mm6,2
-    psubw mm1,mm0
+    psubw mm5,mm2
+    movq [BUF+0x20],mm6
+    psraw mm0,2
+    psubw mm3,mm2
+    movq [BUF+0x40],mm0
+    psraw mm5,2
+    psubw mm1,mm2
+    movq [BUF+0x50],mm5
+    psraw mm3,2
+    psubw mm7,mm2
+    movq [BUF+0x60],mm3
+    psraw mm1,2
+    psubw mm4,mm2
+    movq [BUF+0x70],mm1
     psraw mm7,2
+    movq [BUF+0x10],mm7
+    psraw mm4,2
+    movq [BUF+0x30],mm4
+    /*Load the next block.*/
     movq mm0,[0x40+Y]
-    psraw mm1,2
-    movq [0x30+Y],mm7
     movq mm7,[0x78+Y]
-    movq [0x08+Y],mm1
     movq mm1,[0x50+Y]
-    movq [0x20+Y],mm6
     movq mm6,[0x68+Y]
-    movq [0x28+Y],mm2
     movq mm2,[0x60+Y]
-    movq [0x10+Y],mm5
     movq mm5,[0x58+Y]
-    movq [0x38+Y],mm3
     movq mm3,[0x70+Y]
-    movq [0x00+Y],mm4
     movq mm4,[0x48+Y]
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
-    OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
     /*mm0={-2}x4*/
-    pcmpeqw mm0,mm0
-    paddw mm0,mm0
-    /*Round the results.*/
-    psubw mm1,mm0
-    psubw mm2,mm0
-    psraw mm1,2
-    psubw mm3,mm0
-    movq [0x58+Y],mm1
-    psraw mm2,2
-    psubw mm4,mm0
-    movq mm1,[0x48+Y]
-    psraw mm3,2
-    psubw mm5,mm0
-    movq [0x68+Y],mm2
+    pcmpeqw mm2,mm2
+    paddw mm2,mm2
+    /*Round and store the results (no transpose).*/
+    movq mm7,[Y+0x50]
+    psubw mm4,mm2
+    psubw mm6,mm2
     psraw mm4,2
-    psubw mm6,mm0
-    movq [0x78+Y],mm3
-    psraw mm5,2
-    psubw mm7,mm0
-    movq [0x40+Y],mm4
+    psubw mm0,mm2
+    movq [BUF+0x08],mm4
+    movq mm4,[Y+0x70]
     psraw mm6,2
-    psubw mm1,mm0
-    movq [0x50+Y],mm5
-    psraw mm7,2
-    movq [0x60+Y],mm6
+    psubw mm5,mm2
+    movq [BUF+0x28],mm6
+    psraw mm0,2
+    psubw mm3,mm2
+    movq [BUF+0x48],mm0
+    psraw mm5,2
+    psubw mm1,mm2
+    movq [BUF+0x58],mm5
+    psraw mm3,2
+    psubw mm7,mm2
+    movq [BUF+0x68],mm3
     psraw mm1,2
-    movq [0x70+Y],mm7
-    movq [0x48+Y],mm1
+    psubw mm4,mm2
+    movq [BUF+0x78],mm1
+    psraw mm7,2
+    movq [BUF+0x18],mm7
+    psraw mm4,2
+    movq [BUF+0x38],mm4
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    __asm movq _reg,[BUF+16*(_row)] \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    __asm movq _reg,[BUF+16*(_row)+8] \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+#undef X
 #undef Y
 #undef A
-#undef X
+#undef BUF
   }
 }
 
diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.c b/thirdparty/libtheora/x86_vc/mmxfrag.c
index 4eb2084dc6..248312ff90 100644
--- a/thirdparty/libtheora/x86_vc/mmxfrag.c
+++ b/thirdparty/libtheora/x86_vc/mmxfrag.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -22,12 +22,63 @@
   The iteration each instruction belongs to is marked in the comments as #i.*/
 #include <stddef.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 
 #if defined(OC_X86_ASM)
 
 /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
    between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ /*SRC, DST, YSTRIDE, and YSTRIDE3 must be defined as registers.*/ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm  mov SRC,src \
+    __asm  mov DST,dst \
+    __asm  mov YSTRIDE,_ystride \
+    /*mm0=row 0: src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*mm1=row 1: src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*ystride3=ystride*3*/ \
+    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*mm2=row 2: src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*mm3=row 3: src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*row 0: dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*row 1: dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*Advance SRC to row 4.*/ \
+    __asm  lea SRC,[SRC+YSTRIDE*4] \
+    /*row 2: dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*row 3: dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+    /*Advance DST to row 4.*/ \
+    __asm  lea DST,[DST+YSTRIDE*4] \
+    /*mm0=row 4: SRC+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*mm1=row 5: SRC+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*mm2=row 6: SRC+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*mm3=row 7: SRC+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*row 4: DST+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*row 5: DST+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*row 6: DST+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*row 7: DST+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+  } \
+  while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.*/
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride){
 #define SRC edx
@@ -41,6 +92,34 @@ void oc_frag_copy_mmx(unsigned char *_dst,
 #undef YSTRIDE3
 }
 
+/*Copies a list of 8x8 fragments from one reference frame to another.
+  Each fragment is written at the same offset it was read from.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride shared by both reference frames.
+  _fragis:        The list of indices of the fragments to copy.
+  _nfragis:       The number of entries in _fragis.
+  _frag_buf_offs: The offset of each fragment in a frame, by fragment index.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t i;
+/*Register assignments consumed by OC_FRAG_COPY_MMX.*/
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+  for(i=0;i<_nfragis;i++){
+    ptrdiff_t frag_off;
+    frag_off=_frag_buf_offs[_fragis[i]];
+    OC_FRAG_COPY_MMX(_dst_frame+frag_off,_src_frame+frag_off,_ystride);
+  }
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+}
+
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue){
   __asm{
diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.h b/thirdparty/libtheora/x86_vc/mmxfrag.h
deleted file mode 100644
index 45ee93e777..0000000000
--- a/thirdparty/libtheora/x86_vc/mmxfrag.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#if !defined(_x86_vc_mmxfrag_H)
-# define _x86_vc_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
-   between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
-  do{ \
-    const unsigned char *src; \
-    unsigned char       *dst; \
-    src=(_src); \
-    dst=(_dst); \
-    __asm  mov SRC,src \
-    __asm  mov DST,dst \
-    __asm  mov YSTRIDE,_ystride \
-    /*src+0*ystride*/ \
-    __asm  movq mm0,[SRC] \
-    /*src+1*ystride*/ \
-    __asm  movq mm1,[SRC+YSTRIDE] \
-    /*ystride3=ystride*3*/ \
-    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
-    /*src+2*ystride*/ \
-    __asm  movq mm2,[SRC+YSTRIDE*2] \
-    /*src+3*ystride*/ \
-    __asm  movq mm3,[SRC+YSTRIDE3] \
-    /*dst+0*ystride*/ \
-    __asm  movq [DST],mm0 \
-    /*dst+1*ystride*/ \
-    __asm  movq [DST+YSTRIDE],mm1 \
-    /*Pointer to next 4.*/ \
-    __asm  lea SRC,[SRC+YSTRIDE*4] \
-    /*dst+2*ystride*/ \
-    __asm  movq [DST+YSTRIDE*2],mm2 \
-    /*dst+3*ystride*/ \
-    __asm  movq [DST+YSTRIDE3],mm3 \
-    /*Pointer to next 4.*/ \
-    __asm  lea DST,[DST+YSTRIDE*4] \
-    /*src+0*ystride*/ \
-    __asm  movq mm0,[SRC] \
-    /*src+1*ystride*/ \
-    __asm  movq mm1,[SRC+YSTRIDE] \
-    /*src+2*ystride*/ \
-    __asm  movq mm2,[SRC+YSTRIDE*2] \
-    /*src+3*ystride*/ \
-    __asm  movq mm3,[SRC+YSTRIDE3] \
-    /*dst+0*ystride*/ \
-    __asm  movq [DST],mm0 \
-    /*dst+1*ystride*/ \
-    __asm  movq [DST+YSTRIDE],mm1 \
-    /*dst+2*ystride*/ \
-    __asm  movq [DST+YSTRIDE*2],mm2 \
-    /*dst+3*ystride*/ \
-    __asm  movq [DST+YSTRIDE3],mm3 \
-  } \
-  while(0)
-
-# endif
-#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxidct.c b/thirdparty/libtheora/x86_vc/mmxidct.c
index 8f5ff6803c..55e00aedcf 100644
--- a/thirdparty/libtheora/x86_vc/mmxidct.c
+++ b/thirdparty/libtheora/x86_vc/mmxidct.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -24,15 +24,15 @@
 
 /*These are offsets into the table of constants below.*/
 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
-#define OC_COSINE_OFFSET (0)
+#define OC_COSINE_OFFSET (8)
 /*A row of 8's.*/
-#define OC_EIGHT_OFFSET  (56)
+#define OC_EIGHT_OFFSET  (0)
 
 
 
 /*A table of constants used by the MMX routines.*/
-static const __declspec(align(16))ogg_uint16_t
- OC_IDCT_CONSTS[(7+1)*4]={
+static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
+      8,    8,    8,    8,
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
@@ -46,28 +46,27 @@ static const __declspec(align(16))ogg_uint16_t
   (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
   (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
   (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
 };
 
 /*38 cycles*/
-#define OC_IDCT_BEGIN __asm{ \
-  __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN(_y,_x) __asm{ \
+  __asm movq mm2,OC_I(3,_x) \
   __asm movq mm6,OC_C(3) \
   __asm movq mm4,mm2 \
-  __asm movq mm7,OC_J(5) \
+  __asm movq mm7,OC_J(5,_x) \
   __asm pmulhw mm4,mm6 \
   __asm movq mm1,OC_C(5) \
   __asm pmulhw mm6,mm7 \
   __asm movq mm5,mm1 \
   __asm pmulhw mm1,mm2 \
-  __asm movq mm3,OC_I(1) \
+  __asm movq mm3,OC_I(1,_x) \
   __asm pmulhw mm5,mm7 \
   __asm movq mm0,OC_C(1) \
   __asm paddw mm4,mm2 \
   __asm paddw mm6,mm7 \
   __asm paddw mm2,mm1 \
-  __asm movq mm1,OC_J(7) \
+  __asm movq mm1,OC_J(7,_x) \
   __asm paddw mm7,mm5 \
   __asm movq mm5,mm0 \
   __asm pmulhw mm0,mm3 \
@@ -77,13 +76,13 @@ static const __declspec(align(16))ogg_uint16_t
   __asm psubw mm6,mm2 \
   __asm paddw mm0,mm3 \
   __asm pmulhw mm3,mm7 \
-  __asm movq mm2,OC_I(2) \
+  __asm movq mm2,OC_I(2,_x) \
   __asm pmulhw mm7,mm1 \
   __asm paddw mm5,mm1 \
   __asm movq mm1,mm2 \
   __asm pmulhw mm2,OC_C(2) \
   __asm psubw mm3,mm5 \
-  __asm movq mm5,OC_J(6) \
+  __asm movq mm5,OC_J(6,_x) \
   __asm paddw mm0,mm7 \
   __asm movq mm7,mm5 \
   __asm psubw mm0,mm4 \
@@ -97,18 +96,18 @@ static const __declspec(align(16))ogg_uint16_t
   __asm paddw mm6,mm6 \
   __asm pmulhw mm7,OC_C(6) \
   __asm paddw mm6,mm3 \
-  __asm movq OC_I(1),mm4 \
+  __asm movq OC_I(1,_y),mm4 \
   __asm psubw mm1,mm5 \
   __asm movq mm4,OC_C(4) \
   __asm movq mm5,mm3 \
   __asm pmulhw mm3,mm4 \
   __asm paddw mm7,mm2 \
-  __asm movq OC_I(2),mm6 \
+  __asm movq OC_I(2,_y),mm6 \
   __asm movq mm2,mm0 \
-  __asm movq mm6,OC_I(0) \
+  __asm movq mm6,OC_I(0,_x) \
   __asm pmulhw mm0,mm4 \
   __asm paddw mm5,mm3 \
-  __asm movq mm3,OC_J(4) \
+  __asm movq mm3,OC_J(4,_x) \
   __asm psubw mm5,mm1 \
   __asm paddw mm2,mm0 \
   __asm psubw mm6,mm3 \
@@ -122,17 +121,17 @@ static const __declspec(align(16))ogg_uint16_t
   __asm paddw mm6,mm0 \
   __asm psubw mm6,mm2 \
   __asm paddw mm2,mm2 \
-  __asm movq mm0,OC_I(1) \
+  __asm movq mm0,OC_I(1,_y) \
   __asm paddw mm2,mm6 \
   __asm paddw mm4,mm3 \
   __asm psubw mm2,mm1 \
 }
 
 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm{ \
-  OC_IDCT_BEGIN \
+#define OC_ROW_IDCT(_y,_x) __asm{ \
+  OC_IDCT_BEGIN(_y,_x) \
   /*r3=D'*/ \
-  __asm  movq mm3,OC_I(2) \
+  __asm  movq mm3,OC_I(2,_y) \
   /*r4=E'=E-G*/ \
   __asm  psubw mm4,mm7 \
   /*r1=H'+H'*/ \
@@ -157,7 +156,7 @@ static const __declspec(align(16))ogg_uint16_t
   __asm  psubw mm7,mm0 \
   __asm  paddw mm0,mm0 \
   /*Save R1.*/ \
-  __asm  movq OC_I(1),mm1 \
+  __asm  movq OC_I(1,_y),mm1 \
   /*r0=R0=G.+C.*/ \
   __asm  paddw mm0,mm7 \
 }
@@ -190,10 +189,10 @@ static const __declspec(align(16))ogg_uint16_t
 
   Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE __asm{ \
+#define OC_TRANSPOSE(_y) __asm{ \
   __asm movq mm1,mm4 \
   __asm punpcklwd mm4,mm5 \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
   __asm punpckhwd mm1,mm5 \
   __asm movq mm0,mm6 \
   __asm punpcklwd mm6,mm7 \
@@ -201,17 +200,17 @@ static const __declspec(align(16))ogg_uint16_t
   __asm punpckldq mm4,mm6 \
   __asm punpckhdq mm5,mm6 \
   __asm movq mm6,mm1 \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
   __asm punpckhwd mm0,mm7 \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
   __asm punpckhdq mm6,mm0 \
-  __asm movq mm4,OC_I(0) \
+  __asm movq mm4,OC_I(0,_y) \
   __asm punpckldq mm1,mm0 \
-  __asm movq mm5,OC_I(1) \
+  __asm movq mm5,OC_I(1,_y) \
   __asm movq mm0,mm4 \
-  __asm movq OC_J(7),mm6 \
+  __asm movq OC_J(7,_y),mm6 \
   __asm punpcklwd mm0,mm5 \
-  __asm movq OC_J(6),mm1 \
+  __asm movq OC_J(6,_y),mm1 \
   __asm punpckhwd mm4,mm5 \
   __asm movq mm5,mm2 \
   __asm punpcklwd mm2,mm3 \
@@ -219,18 +218,18 @@ static const __declspec(align(16))ogg_uint16_t
   __asm punpckldq mm0,mm2 \
   __asm punpckhdq mm1,mm2 \
   __asm movq mm2,mm4 \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
   __asm punpckhwd mm5,mm3 \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
   __asm punpckhdq mm4,mm5 \
   __asm punpckldq mm2,mm5 \
-  __asm movq OC_I(3),mm4 \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(3,_y),mm4 \
+  __asm movq OC_I(2,_y),mm2 \
 }
 
 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm{ \
-  OC_IDCT_BEGIN \
+#define OC_COLUMN_IDCT(_y) __asm{ \
+  OC_IDCT_BEGIN(_y,_y) \
   __asm paddw mm2,OC_8 \
   /*r1=H'+H'*/ \
   __asm paddw mm1,mm1 \
@@ -243,15 +242,15 @@ static const __declspec(align(16))ogg_uint16_t
   /*r1=NR1*/ \
   __asm psraw mm1,4 \
   /*r3=D'*/ \
-  __asm movq mm3,OC_I(2) \
+  __asm movq mm3,OC_I(2,_y) \
   /*r7=G+G*/ \
   __asm paddw mm7,mm7 \
   /*Store NR2 at I(2).*/ \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(2,_y),mm2 \
   /*r7=G'=E+G*/ \
   __asm paddw mm7,mm4 \
   /*Store NR1 at I(1).*/ \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
   /*r4=R4=E'-D'*/ \
   __asm psubw mm4,mm3 \
   __asm paddw mm4,OC_8 \
@@ -273,11 +272,11 @@ static const __declspec(align(16))ogg_uint16_t
   /*r6=NR6*/ \
   __asm psraw mm6,4 \
   /*Store NR4 at J(4).*/ \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
   /*r5=NR5*/ \
   __asm psraw mm5,4 \
   /*Store NR3 at I(3).*/ \
-  __asm movq OC_I(3),mm3 \
+  __asm movq OC_I(3,_y),mm3 \
   /*r7=R7=G'-C'*/ \
   __asm psubw mm7,mm0 \
   __asm paddw mm7,OC_8 \
@@ -288,71 +287,89 @@ static const __declspec(align(16))ogg_uint16_t
   /*r7=NR7*/ \
   __asm psraw mm7,4 \
   /*Store NR6 at J(6).*/ \
-  __asm movq OC_J(6),mm6 \
+  __asm movq OC_J(6,_y),mm6 \
   /*r0=NR0*/ \
   __asm psraw mm0,4 \
   /*Store NR5 at J(5).*/ \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
   /*Store NR7 at J(7).*/ \
-  __asm movq OC_J(7),mm7 \
+  __asm movq OC_J(7,_y),mm7 \
   /*Store NR0 at I(0).*/ \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
 }
 
 #define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  int i;
   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     Every 4x4 block is transposed.*/
   __asm{
 #define CONSTS eax
 #define Y edx
+#define X ecx
     mov CONSTS,offset OC_IDCT_CONSTS
     mov Y,_y
-#define OC_I(_k)      [Y+_k*16]
-#define OC_J(_k)      [Y+(_k-4)*16+8]
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+    mov X,_x
+#define OC_I(_k,_y)   [(_y)+(_k)*16]
+#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+8]
+    OC_ROW_IDCT(Y,X) /*First 64 bytes: read from _x, write to _y.*/
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+(_k*16)+64]
-#define OC_J(_k)      [Y+(_k-4)*16+72]
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   [(_y)+(_k)*16+64]
+#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+72]
+    OC_ROW_IDCT(Y,X) /*Second 64 bytes: read from _x, write to _y.*/
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+_k*16]
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   [(_y)+(_k)*16]
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(Y) /*Columns 0...3, in place in _y.*/
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      [Y+_k*16+8]
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   [(_y)+(_k)*16+8]
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(Y) /*Columns 4...7, in place in _y.*/
 #undef  OC_I
 #undef  OC_J
 #undef  CONSTS
 #undef  Y
+#undef  X
+  }
+  __asm pxor mm0,mm0;
+  for(i=0;i<4;i++){ /*Zero all 64 entries of _x, 16 per iteration.*/
+    ogg_int16_t *x;
+    x=_x+16*i;
+#define X ecx
+    __asm{
+      mov X,x
+      movq [X+0x00],mm0
+      movq [X+0x08],mm0
+      movq [X+0x10],mm0
+      movq [X+0x18],mm0
+    }
+#undef  X
   }
 }
 
 /*25 cycles.*/
-#define OC_IDCT_BEGIN_10 __asm{ \
-  __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
+  __asm movq mm2,OC_I(3,_x) \
   __asm nop \
   __asm movq mm6,OC_C(3) \
   __asm movq mm4,mm2 \
   __asm movq mm1,OC_C(5) \
   __asm pmulhw mm4,mm6 \
-  __asm movq mm3,OC_I(1) \
+  __asm movq mm3,OC_I(1,_x) \
   __asm pmulhw mm1,mm2 \
   __asm movq mm0,OC_C(1) \
   __asm paddw mm4,mm2 \
   __asm pxor mm6,mm6 \
   __asm paddw mm2,mm1 \
-  __asm movq mm5,OC_I(2) \
+  __asm movq mm5,OC_I(2,_x) \
   __asm pmulhw mm0,mm3 \
   __asm movq mm1,mm5 \
   __asm paddw mm0,mm3 \
@@ -360,43 +377,43 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   __asm psubw mm6,mm2 \
   __asm pmulhw mm5,OC_C(2) \
   __asm psubw mm0,mm4 \
-  __asm movq mm7,OC_I(2) \
+  __asm movq mm7,OC_I(2,_x) \
   __asm paddw mm4,mm4 \
   __asm paddw mm7,mm5 \
   __asm paddw mm4,mm0 \
   __asm pmulhw mm1,OC_C(6) \
   __asm psubw mm3,mm6 \
-  __asm movq OC_I(1),mm4 \
+  __asm movq OC_I(1,_y),mm4 \
   __asm paddw mm6,mm6 \
   __asm movq mm4,OC_C(4) \
   __asm paddw mm6,mm3 \
   __asm movq mm5,mm3 \
   __asm pmulhw mm3,mm4 \
-  __asm movq OC_I(2),mm6 \
+  __asm movq OC_I(2,_y),mm6 \
   __asm movq mm2,mm0 \
-  __asm movq mm6,OC_I(0) \
+  __asm movq mm6,OC_I(0,_x) \
   __asm pmulhw mm0,mm4 \
   __asm paddw mm5,mm3 \
   __asm paddw mm2,mm0 \
   __asm psubw mm5,mm1 \
   __asm pmulhw mm6,mm4 \
-  __asm paddw mm6,OC_I(0) \
+  __asm paddw mm6,OC_I(0,_x) \
   __asm paddw mm1,mm1 \
   __asm movq mm4,mm6 \
   __asm paddw mm1,mm5 \
   __asm psubw mm6,mm2 \
   __asm paddw mm2,mm2 \
-  __asm movq mm0,OC_I(1) \
+  __asm movq mm0,OC_I(1,_y) \
   __asm paddw mm2,mm6 \
   __asm psubw mm2,mm1 \
   __asm nop \
 }
 
 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm{ \
-  OC_IDCT_BEGIN_10 \
+#define OC_ROW_IDCT_10(_y,_x) __asm{ \
+  OC_IDCT_BEGIN_10(_y,_x) \
   /*r3=D'*/ \
-   __asm movq mm3,OC_I(2) \
+   __asm movq mm3,OC_I(2,_y) \
   /*r4=E'=E-G*/ \
    __asm psubw mm4,mm7 \
   /*r1=H'+H'*/ \
@@ -421,14 +438,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
    __asm psubw mm7,mm0 \
    __asm paddw mm0,mm0 \
   /*Save R1.*/ \
-   __asm movq OC_I(1),mm1 \
+   __asm movq OC_I(1,_y),mm1 \
   /*r0=R0=G'+C'*/ \
    __asm paddw mm0,mm7 \
 }
 
 /*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 __asm{ \
-  OC_IDCT_BEGIN_10 \
+#define OC_COLUMN_IDCT_10(_y) __asm{ \
+  OC_IDCT_BEGIN_10(_y,_y) \
   __asm paddw mm2,OC_8 \
   /*r1=H'+H'*/ \
   __asm paddw mm1,mm1 \
@@ -441,15 +458,15 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   /*r1=NR1*/ \
   __asm psraw mm1,4 \
   /*r3=D'*/ \
-  __asm movq mm3,OC_I(2) \
+  __asm movq mm3,OC_I(2,_y) \
   /*r7=G+G*/ \
   __asm paddw mm7,mm7 \
   /*Store NR2 at I(2).*/ \
-  __asm movq OC_I(2),mm2 \
+  __asm movq OC_I(2,_y),mm2 \
   /*r7=G'=E+G*/ \
   __asm paddw mm7,mm4 \
   /*Store NR1 at I(1).*/ \
-  __asm movq OC_I(1),mm1 \
+  __asm movq OC_I(1,_y),mm1 \
   /*r4=R4=E'-D'*/ \
   __asm psubw mm4,mm3 \
   __asm paddw mm4,OC_8 \
@@ -471,11 +488,11 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   /*r6=NR6*/ \
   __asm psraw mm6,4 \
   /*Store NR4 at J(4).*/ \
-  __asm movq OC_J(4),mm4 \
+  __asm movq OC_J(4,_y),mm4 \
   /*r5=NR5*/ \
   __asm psraw mm5,4 \
   /*Store NR3 at I(3).*/ \
-  __asm movq OC_I(3),mm3 \
+  __asm movq OC_I(3,_y),mm3 \
   /*r7=R7=G'-C'*/ \
   __asm psubw mm7,mm0 \
   __asm paddw mm7,OC_8 \
@@ -486,50 +503,63 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
   /*r7=NR7*/ \
   __asm psraw mm7,4 \
   /*Store NR6 at J(6).*/ \
-  __asm movq OC_J(6),mm6 \
+  __asm movq OC_J(6,_y),mm6 \
   /*r0=NR0*/ \
   __asm psraw mm0,4 \
   /*Store NR5 at J(5).*/ \
-  __asm movq OC_J(5),mm5 \
+  __asm movq OC_J(5,_y),mm5 \
   /*Store NR7 at J(7).*/ \
-  __asm movq OC_J(7),mm7 \
+  __asm movq OC_J(7,_y),mm7 \
   /*Store NR0 at I(0).*/ \
-  __asm movq OC_I(0),mm0 \
+  __asm movq OC_I(0,_y),mm0 \
 }
 
-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   __asm{
 #define CONSTS eax
 #define Y edx
+#define X ecx
     mov CONSTS,offset OC_IDCT_CONSTS
     mov Y,_y
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) [Y+(_k-4)*16+8]
+    mov X,_x
+#define OC_I(_k,_y) [(_y)+(_k)*16]
+#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
     /*Done with dequant, descramble, and partial transpose.
       Now do the iDCT itself.*/
-    OC_ROW_IDCT_10
-    OC_TRANSPOSE
+    OC_ROW_IDCT_10(Y,X) /*Read from _x, write to _y.*/
+    OC_TRANSPOSE(Y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+(_k)*16]
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(Y) /*Columns 0...3, in place in _y.*/
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) [Y+_k*16+8]
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+(_k)*16+8]
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(Y) /*Columns 4...7, in place in _y.*/
 #undef  OC_I
 #undef  OC_J
 #undef  CONSTS
 #undef  Y
+#undef  X
+  }
+#define X ecx
+  __asm{ /*Zero entries 0...3 of each of the first four rows of _x.*/
+    pxor mm0,mm0;
+    mov X,_x
+    movq [X+0x00],mm0
+    movq [X+0x10],mm0
+    movq [X+0x20],mm0
+    movq [X+0x30],mm0
   }
+#undef  X
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -555,8 +585,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Perform the iDCT.*/
-  if(_last_zzi<10)oc_idct8x8_10(_y);
-  else oc_idct8x8_slow(_y);
+  if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+  else oc_idct8x8_slow(_y,_x);
 }
 
 #endif
diff --git a/thirdparty/libtheora/x86_vc/mmxstate.c b/thirdparty/libtheora/x86_vc/mmxstate.c
index 73bd1981cf..f532ee1b6f 100644
--- a/thirdparty/libtheora/x86_vc/mmxstate.c
+++ b/thirdparty/libtheora/x86_vc/mmxstate.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,17 +19,16 @@
   Originally written by Rudolf Marek.*/
 #include <string.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 #include "mmxloop.h"
 
 #if defined(OC_X86_ASM)
 
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            mb_mode;
+  int            refi;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
@@ -45,6 +44,7 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 #define P ecx
       mov Y,_dct_coeffs
       movzx P,p
+      lea Y,[Y+128]
       /*mm0=0000 0000 0000 AAAA*/
       movd mm0,P
       /*mm0=0000 0000 AAAA AAAA*/
@@ -74,65 +74,32 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
   else{
     /*Dequantize the DC coefficient.*/
     _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+    oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       _dct_coeffs);
+       _dct_coeffs+64);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
   }
 }
 
 /*We copy these entire function to inline the actual MMX routines so that we
    use only a single indirect call.*/
 
-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-#define SRC edx
-#define DST eax
-#define YSTRIDE ecx
-#define YSTRIDE3 edi
-    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
-#undef SRC
-#undef DST
-#undef YSTRIDE
-#undef YSTRIDE3
-  }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+  memset(_bv,~(_flimit<<1),8);
 }
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -144,8 +111,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
-  OC_ALIGN8(unsigned char  ll[8]);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
   const ptrdiff_t         *frag_buf_offs;
@@ -156,13 +122,12 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
   ptrdiff_t                fragi0_end;
   int                      ystride;
   int                      nhfrags;
-  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
   fplane=_state->fplanes+_pli;
   nhfrags=fplane->nhfrags;
   fragi_top=fplane->froffset;
   fragi_bot=fragi_top+fplane->nfrags;
   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
-  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
   ystride=_state->ref_ystride[_pli];
   frags=_state->frags;
   frag_buf_offs=_state->frag_buf_offs;
@@ -187,13 +152,13 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
 #define LL edx
 #define D esi
 #define D_WORD si
-        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
-        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
+        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
-          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+          OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
         }
         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
-          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
         }
 #undef PIX
 #undef YSTRIDE3
diff --git a/thirdparty/libtheora/x86_vc/x86cpu.c b/thirdparty/libtheora/x86_vc/x86cpu.c
new file mode 100644
index 0000000000..6a1d8d850c
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86cpu.c
@@ -0,0 +1,219 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
+ function:
+  last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0;
+}
+#else
+/*Why does MSVC need this complicated rigamarole?
+  At this point I honestly do not care.*/
+
+/*Visual C cpuid helper function.
+  For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
+   for VS2003 users, so we do it in inline assembler.
+  _cpu_info: Receives eax, ebx, ecx, and edx (in that order) after cpuid.
+  _op:       The cpuid function number to load into eax.*/
+static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
+  _asm{
+    mov eax,[_op]
+    mov esi,_cpu_info
+    cpuid
+    mov [esi+0],eax
+    mov [esi+4],ebx
+    mov [esi+8],ecx
+    mov [esi+12],edx
+  }
+}
+
+/*Executes cpuid function _op and stores the resulting register values in
+   _eax, _ebx, _ecx, and _edx.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  do{ \
+    ogg_uint32_t cpu_info[4]; \
+    oc_cpuid_helper(cpu_info,_op); \
+    (_eax)=cpu_info[0]; \
+    (_ebx)=cpu_info[1]; \
+    (_ecx)=cpu_info[2]; \
+    (_edx)=cpu_info[3]; \
+  }while(0)
+
+/*Tests for cpuid support by trying to flip bit 21 (the ID flag) of EFLAGS.
+  The original EFLAGS value is restored before returning.
+  _eax: Receives EFLAGS as read back after the attempted flip.
+  _ebx: Receives the original EFLAGS value.*/
+static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
+  _asm{
+    pushfd
+    pushfd
+    pop eax
+    mov ebx,eax
+    xor eax,200000h
+    push eax
+    popfd
+    pushfd
+    pop eax
+    popfd
+    mov ecx,_eax
+    mov [ecx],eax
+    mov ecx,_ebx
+    mov [ecx],ebx
+  }
+}
+
+/*Translates the feature bits reported by cpuid function 1 into OC_CPU_X86_*
+   flags.
+  _edx: The value of edx returned by cpuid function 1.
+  _ecx: The value of ecx returned by cpuid function 1.
+  Return: The detected feature flags, or 0 if MMX is not available.*/
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*If there isn't even MMX, give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  /*SSSE3 is bit 9; bit 8 (0x00000100) reports Thermal Monitor 2.*/
+  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3;
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+/*Translates the feature bits reported by cpuid function 0x80000001 into
+   OC_CPU_X86_* flags.
+  _edx: The value of edx returned by cpuid function 0x80000001.
+  _ecx: The value of ecx returned by cpuid function 0x80000001.
+  Return: The detected feature flags, or 0 if MMX is not available.*/
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*If there isn't even MMX, give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+  if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+  if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+  if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+  if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+  return flags;
+}
+
+/*Detects the instruction set extensions supported by the current CPU.
+  Return: A bitmask of OC_CPU_X86_* flags (0 if nothing was detected, e.g.,
+   when cpuid is unavailable or the vendor string is not recognized).*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips support cpuid, so we have to check.*/
+  oc_detect_cpuid_helper(&eax,&ebx);
+  /*No cpuid.*/
+  if(eax==ebx)return 0;
+# endif
+  cpuid(0,eax,ebx,ecx,edx);
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    /*Fold in the extended model (bits 16...19) so that newer family 6 chips
+       whose base model is also 9, 13, or 14 are not mistaken for the ones
+       below.*/
+    model=((eax>>12)&0xF0)|((eax>>4)&0xF);
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so don't use it.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
+  }
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+    /*AMD, Geode:*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)flags=0;
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      flags=oc_parse_amd_flags(edx,ecx);
+    }
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
+  }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    There is a special detection method that can be used to identify such
+     processors, but in my opinion, if the user really wants to change it, they
+     deserve what they get.*/
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA:*/
+    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+       chips (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    /*Function 1 leaves the processor signature in eax, so query the highest
+       supported extended function before comparing against it.*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us that they will all be zero (at least on the C7 and
+         Isaiah chips).
+        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
+    }
+  }
+  else{
+    /*Implement me.*/
+    flags=0;
+  }
+  return flags;
+}
+#endif
diff --git a/thirdparty/libtheora/x86_vc/x86cpu.h b/thirdparty/libtheora/x86_vc/x86cpu.h
new file mode 100644
index 0000000000..eea261d448
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86cpu.h
@@ -0,0 +1,36 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86cpu_H)
+# define _x86_vc_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/x86enc.c b/thirdparty/libtheora/x86_vc/x86enc.c
index e1960e1f0b..e9d59e85e3 100644
--- a/thirdparty/libtheora/x86_vc/x86enc.c
+++ b/thirdparty/libtheora/x86_vc/x86enc.c
@@ -18,27 +18,25 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
   ogg_uint32_t cpu_flags;
-  cpu_flags=oc_cpu_flags_get();
-  oc_enc_vtable_init_c(_enc);
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
   if(cpu_flags&OC_CPU_X86_MMX){
     _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
     _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
     _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
-    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
   }
   if(cpu_flags&OC_CPU_X86_MMXEXT){
     _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
     _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
     _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
-    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
-    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
     _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
 # if defined(OC_X86_64_ASM)
diff --git a/thirdparty/libtheora/x86_vc/x86enc.h b/thirdparty/libtheora/x86_vc/x86enc.h
index 581484641f..885406a54d 100644
--- a/thirdparty/libtheora/x86_vc/x86enc.h
+++ b/thirdparty/libtheora/x86_vc/x86enc.h
@@ -17,10 +17,14 @@
 
 #if !defined(_x86_vc_x86enc_H)
 # define _x86_vc_x86enc_H (1)
-# include "../encint.h"
 # include "x86int.h"
+# if defined(OC_X86_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_x86
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+# include "../encint.h"
 
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
 
 unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
@@ -29,19 +33,19 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
+ const unsigned char *_src,int _ystride);
 void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
  const unsigned char *_x,const unsigned char *_y,int _stride);
 void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
  const unsigned char *_x,int _stride);
 void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
 
 #endif
diff --git a/thirdparty/libtheora/x86_vc/x86int.h b/thirdparty/libtheora/x86_vc/x86int.h
index 4cca485311..318a09dca0 100644
--- a/thirdparty/libtheora/x86_vc/x86int.h
+++ b/thirdparty/libtheora/x86_vc/x86int.h
@@ -11,32 +11,39 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
 #if !defined(_x86_vc_x86int_H)
 # define _x86_vc_x86int_H (1)
 # include "../internal.h"
+# if defined(OC_X86_ASM)
+#  define oc_state_accel_init oc_state_accel_init_x86
+#  define OC_STATE_USE_VTABLE (1)
+# endif
+# include "../state.h"
+# include "x86cpu.h"
 
-void oc_state_vtable_init_x86(oc_theora_state *_state);
+void oc_state_accel_init_x86(oc_theora_state *_state);
 
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue);
 void oc_frag_recon_inter_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);
 
 #endif
diff --git a/thirdparty/libtheora/x86_vc/x86state.c b/thirdparty/libtheora/x86_vc/x86state.c
index a786bec284..fa3a0d42fc 100644
--- a/thirdparty/libtheora/x86_vc/x86state.c
+++ b/thirdparty/libtheora/x86_vc/x86state.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,8 +19,6 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
    each quadrant of the destination.*/
 static const unsigned char OC_FZIG_ZAG_MMX[128]={
@@ -42,21 +40,22 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
   64,64,64,64,64,64,64,64,
 };
 
-void oc_state_vtable_init_x86(oc_theora_state *_state){
+void oc_state_accel_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();
   if(_state->cpu_flags&OC_CPU_X86_MMX){
     _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
     _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
-    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
     _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
-  else oc_state_vtable_init_c(_state);
+  else oc_state_accel_init_c(_state);
 }
 #endif
diff --git a/thirdparty/libtheora/x86_vc/x86zigzag.h b/thirdparty/libtheora/x86_vc/x86zigzag.h
new file mode 100644
index 0000000000..26f5ed2ea5
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86zigzag.h
@@ -0,0 +1,244 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86zigzag_H)
+# define _x86_vc_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+   stores them at [Y], a base address the expanding code must provide.
+  This relies on two macros to load the contents of each row:
+   OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
+   first four and second four entries of each row into the specified register,
+   respectively.
+  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+   (because when the rows are already in SSE2 registers, loading the high half
+   destructively modifies the register).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  8  1  2   9 16 24 17 B
+    C 10  3  4 11  18 25 32 40 E
+    F 33 26 19 12   5  6 13 20 D
+    G 27 34 41 48  56 49 42 35 I
+    L 28 21 14  7  15 22 29 36 M
+    H 43 50 57 58  51 44 37 30 O
+    N 23 31 38 45  52 59 60 53 J
+    P 46 39 47 54  61 62 55 63 K
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+  OC_ZZ_LOAD_ROW_LO(0,mm0)  /*mm0=03 02 01 00*/ \
+  OC_ZZ_LOAD_ROW_LO(1,mm1)  /*mm1=11 10 09 08*/ \
+  OC_ZZ_LOAD_ROW_LO(2,mm2)  /*mm2=19 18 17 16*/ \
+  OC_ZZ_LOAD_ROW_LO(3,mm3)  /*mm3=27 26 25 24*/ \
+  OC_ZZ_LOAD_ROW_HI(0,mm4)  /*mm4=07 06 05 04*/ \
+  OC_ZZ_LOAD_ROW_HI(1,mm5)  /*mm5=15 14 13 12*/ \
+  OC_ZZ_LOAD_ROW_HI(2,mm6)  /*mm6=23 22 21 20*/ \
+  __asm movq mm7,mm0        /*mm7=03 02 01 00*/ \
+  __asm punpckhdq mm0,mm1   /*mm0=11 10 03 02*/ \
+  __asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \
+  __asm punpcklwd mm1,mm0   /*mm1=03 09 02 08*/ \
+  __asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \
+  __asm punpcklwd mm7,mm1   /*mm7=02 01 08 00 *A*/ \
+  __asm movq [Y+0x00],mm7 /*store A: output words 0-3*/ \
+  __asm punpckhwd mm1,mm4   /*mm1=04 03 07 09*/ \
+  __asm movq mm7,mm2        /*mm7=19 18 17 16*/ \
+  __asm punpckhdq mm0,mm1   /*mm0=04 03 11 10*/ \
+  __asm punpckhwd mm7,mm5   /*mm7=12 19 15 18*/ \
+  __asm punpcklwd mm1,mm3   /*mm1=25 07 24 09*/ \
+  __asm punpcklwd mm5,mm6   /*mm5=21 14 20 13*/ \
+  __asm punpcklwd mm1,mm2   /*mm1=17 24 16 09 *B*/ \
+  OC_ZZ_LOAD_ROW_LO(4,mm2)  /*mm2=35 34 33 32*/ \
+  __asm movq [Y+0x08],mm1 /*store B: output words 4-7*/ \
+  OC_ZZ_LOAD_ROW_LO(5,mm1)  /*mm1=43 42 41 40*/ \
+  __asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \
+  __asm movq [Y+0x10],mm0 /*store C: output words 8-11*/ \
+  __asm punpckhdq mm6,mm4   /*mm6=?? 07 23 22*/ \
+  __asm punpckldq mm4,mm5   /*mm4=20 13 06 05 *D*/ \
+  __asm movq [Y+0x28],mm4 /*store D: output words 20-23*/ \
+  __asm psrlq mm3,16        /*mm3=.. 27 26 25*/ \
+  __asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \
+  __asm movq mm4,mm7        /*mm4=12 19 15 18*/ \
+  __asm punpcklwd mm2,mm3   /*mm2=26 33 25 32*/ \
+  __asm punpcklwd mm4,mm1   /*mm4=41 15 40 18*/ \
+  __asm punpckhwd mm3,mm1   /*mm3=43 .. 42 27*/ \
+  __asm punpckldq mm4,mm2   /*mm4=25 32 40 18*/ \
+  __asm punpcklwd mm3,mm0   /*mm3=35 42 34 27*/ \
+  OC_ZZ_LOAD_ROW_LO(6,mm0)  /*mm0=51 50 49 48*/ \
+  __asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \
+  __asm movq [Y+0x18],mm4 /*store E: output words 12-15*/ \
+  OC_ZZ_LOAD_ROW_LO(7,mm4)  /*mm4=59 58 57 56*/ \
+  __asm punpckhdq mm2,mm7   /*mm2=12 19 26 33 *F*/ \
+  __asm movq [Y+0x20],mm2 /*store F: output words 16-19*/ \
+  __asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \
+  __asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \
+  __asm movq mm2,mm3        /*mm2=35 42 34 27*/ \
+  __asm punpckhwd mm1,mm0   /*mm1=50 43 48 41*/ \
+  __asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \
+  __asm punpckldq mm3,mm1   /*mm3=48 41 34 27 *G*/ \
+  __asm movq [Y+0x30],mm3 /*store G: output words 24-27*/ \
+  __asm punpckhdq mm1,mm4   /*mm1=58 57 50 43 *H*/ \
+  __asm movq [Y+0x50],mm1 /*store H: output words 40-43*/ \
+  OC_ZZ_LOAD_ROW_HI(7,mm1)  /*mm1=63 62 61 60*/ \
+  __asm punpcklwd mm4,mm0   /*mm4=49 56 51 59*/ \
+  OC_ZZ_LOAD_ROW_HI(6,mm0)  /*mm0=55 54 53 52*/ \
+  __asm psllq mm6,16        /*mm6=07 23 22 ..*/ \
+  __asm movq mm3,mm4        /*mm3=49 56 51 59*/ \
+  __asm punpckhdq mm4,mm2   /*mm4=35 42 49 56 *I*/ \
+  OC_ZZ_LOAD_ROW_HI(3,mm2)  /*mm2=31 30 29 28*/ \
+  __asm movq [Y+0x38],mm4 /*store I: output words 28-31*/ \
+  __asm punpcklwd mm3,mm1   /*mm3=61 51 60 59*/ \
+  __asm punpcklwd mm7,mm6   /*mm7=22 15 .. ??*/ \
+  __asm movq mm4,mm3        /*mm4=61 51 60 59*/ \
+  __asm punpcklwd mm3,mm0   /*mm3=53 60 52 59*/ \
+  __asm punpckhwd mm4,mm0   /*mm4=55 61 54 51*/ \
+  OC_ZZ_LOAD_ROW_HI(4,mm0)  /*mm0=39 38 37 36*/ \
+  __asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \
+  __asm movq [Y+0x68],mm3 /*store J: output words 52-55*/ \
+  __asm movq mm3,mm4        /*mm3=?? ?? 54 51*/ \
+  __asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \
+  __asm punpckhwd mm4,mm1   /*mm4=63 55 62 61 *K*/ \
+  OC_ZZ_LOAD_ROW_HI(5,mm1)  /*mm1=47 46 45 44*/ \
+  __asm movq [Y+0x78],mm4 /*store K: output words 60-63*/ \
+  __asm punpckhwd mm6,mm2   /*mm6=28 07 31 23*/ \
+  __asm punpcklwd mm2,mm0   /*mm2=37 30 36 29*/ \
+  __asm punpckhdq mm5,mm6   /*mm5=28 07 21 14*/ \
+  __asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \
+  __asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \
+  __asm movq [Y+0x40],mm5 /*store L: output words 32-35*/ \
+  __asm punpckhdq mm7,mm2   /*mm7=36 29 22 15 *M*/ \
+  __asm movq [Y+0x48],mm7 /*store M: output words 36-39*/ \
+  __asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \
+  __asm punpckhwd mm0,mm1   /*mm0=46 39 45 38*/ \
+  __asm punpcklwd mm3,mm1   /*mm3=47 54 44 51*/ \
+  __asm punpckldq mm6,mm0   /*mm6=45 38 31 23 *N*/ \
+  __asm movq [Y+0x60],mm6 /*store N: output words 48-51*/ \
+  __asm punpckhdq mm0,mm3   /*mm0=47 54 46 39*/ \
+  __asm punpckldq mm3,mm2   /*mm3=30 37 44 51 *O*/ \
+  __asm movq [Y+0x58],mm3 /*store O: output words 44-47*/ \
+  __asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \
+  __asm movq [Y+0x70],mm0 /*store P: output words 56-59*/ \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+   order and stores them in %[qdct]; the body is an AT&T-syntax asm string.
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    D 17 24 32 25  18 11  4  5 C
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" /*store A: output words 0-3*/ \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" /*store B: output words 4-7*/ \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D*/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" /*store D: output words 8-11*/ \
+  "movq %%mm3,0x18(%[qdct])\n\t" /*store C: output words 12-15*/ \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" /*store E: output words 16-19*/ \
+  "movq %%mm3,0x50(%[qdct])\n\t" /*store F: output words 40-43*/ \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" /*store H: output words 24-27*/ \
+  "movq %%mm6,0x38(%[qdct])\n\t" /*store G: output words 28-31*/ \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" /*store I: output words 20-23*/ \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" /*store K: output words 32-35*/ \
+  "movq %%mm6,0x48(%[qdct])\n\t" /*store J: output words 36-39*/ \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" /*store L: output words 52-55*/ \
+  "movq %%mm1,0x58(%[qdct])\n\t" /*store M: output words 44-47*/ \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=54 61 60 53 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" /*store N: output words 56-59*/ \
+  "movq %%mm5,0x78(%[qdct])\n\t" /*store O: output words 60-63*/ \
+  "movq %%mm7,0x60(%[qdct])\n\t" /*store P: output words 48-51*/ \
+
+#endif