11 files changed, 3229 insertions, 0 deletions
diff --git a/thirdparty/libtheora/x86_vc/mmxencfrag.c b/thirdparty/libtheora/x86_vc/mmxencfrag.c
new file mode 100644
index 0000000000..ac9dacf377
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/mmxencfrag.c
@@ -0,0 +1,969 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){ /*Returns the sum of absolute differences (SAD) over 8 rows of 8 bytes of _src and _ref, rows being _ystride bytes apart.*/
+  ptrdiff_t ret; /*Receives the low dword of mm0, i.e. the accumulated SAD.*/
+  __asm{
+#define SRC esi
+#define REF edx
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF,_ref
+    /*Load the first 4 rows (rows 0...3) of each block.*/
+    movq mm0,[SRC]
+    movq mm1,[REF]
+    movq mm2,[SRC][YSTRIDE]
+    movq mm3,[REF][YSTRIDE]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    movq mm4,[SRC+YSTRIDE*2]
+    movq mm5,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE3]
+    movq mm7,[REF+YSTRIDE3]
+    /*Compute the SADs of rows 0 and 1 and start summing them in mm0.*/
+    psadbw mm0,mm1
+    psadbw mm2,mm3
+    lea SRC,[SRC+YSTRIDE*4]
+    paddw mm0,mm2
+    lea REF,[REF+YSTRIDE*4]
+    /*Load rows 4, 5 and 6 as registers become available, while finishing rows 2 and 3.*/
+    movq mm2,[SRC]
+    movq mm3,[REF]
+    psadbw mm4,mm5
+    psadbw mm6,mm7
+    paddw mm0,mm4
+    movq mm5,[REF+YSTRIDE]
+    movq mm4,[SRC+YSTRIDE]
+    paddw mm0,mm6
+    movq mm7,[REF+YSTRIDE*2]
+    movq mm6,[SRC+YSTRIDE*2]
+    /*Start adding the SADs of rows 4, 5 and 6 to mm0.*/
+    psadbw mm2,mm3
+    psadbw mm4,mm5
+    paddw mm0,mm2
+    psadbw mm6,mm7
+    /*Load the last row (row 7) as registers become available.*/
+    movq mm2,[SRC+YSTRIDE3]
+    movq mm3,[REF+YSTRIDE3]
+    /*And finish adding up their SADs.*/
+    paddw mm0,mm4
+    psadbw mm2,mm3
+    paddw mm0,mm6
+    paddw mm0,mm2
+    movd [ret],mm0
+#undef SRC
+#undef REF
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+  return (unsigned)ret;
+}
+
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  /*_thresh is ignored and the full SAD is always computed: early termination is for suckers.*/
+  return oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
+}
+
+#define OC_SAD2_LOOP __asm{ \
+  /*Adds to RET the SADs of two rows of SRC against the rounded-down average \
+     of REF1 and REF2, then advances the pointers and pre-loads the next two rows. \
+   We want (mm0+mm1>>1) on unsigned bytes without overflow, but pavgb computes (mm0+mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand with mm7 (set to {1}x8 by the caller) to keep the low bits, and psubb to correct the output of pavgb.*/ \
+  __asm  movq mm6,mm0 \
+  __asm  lea REF1,[REF1+YSTRIDE*2] \
+  __asm  pxor mm0,mm1 \
+  __asm  pavgb mm6,mm1 \
+  __asm  lea REF2,[REF2+YSTRIDE*2] \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm0,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  movq mm3,[REF2+YSTRIDE] \
+  __asm  psubb mm6,mm0 \
+  __asm  movq mm0,[REF1] \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm6 \
+  __asm  movd mm6,RET \
+  __asm  psubb mm2,mm1 \
+  __asm  movq mm1,[REF2] \
+  __asm  lea SRC,[SRC+YSTRIDE*2] \
+  __asm  psadbw mm5,mm2 \
+  __asm  movq mm2,[REF1+YSTRIDE] \
+  __asm  paddw mm5,mm4 \
+  __asm  movq mm4,[SRC] \
+  __asm  paddw mm6,mm5 \
+  __asm  movq mm5,[SRC+YSTRIDE] \
+  __asm  movd RET,mm6 \
+}
+
+/*Same as OC_SAD2_LOOP, but does not advance the pointers or pre-load the next two rows.*/
+#define OC_SAD2_TAIL __asm{ \
+  __asm  movq mm6,mm0 \
+  __asm  pavgb mm0,mm1 \
+  __asm  pxor mm6,mm1 \
+  __asm  movq mm1,mm2 \
+  __asm  pand mm6,mm7 \
+  __asm  pavgb mm2,mm3 \
+  __asm  pxor mm1,mm3 \
+  __asm  psubb mm0,mm6 \
+  __asm  pand mm1,mm7 \
+  __asm  psadbw mm4,mm0 \
+  __asm  psubb mm2,mm1 \
+  __asm  movd mm6,RET \
+  __asm  psadbw mm5,mm2 \
+  __asm  paddw mm5,mm4 \
+  __asm  paddw mm6,mm5 \
+  __asm  movd RET,mm6 \
+}
+
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){ /*Returns the SAD of 8 rows of 8 bytes of _src against the rounded-down average of _ref1 and _ref2; _thresh is never read.*/
+  ptrdiff_t ret; /*Receives the final value of RET (eax).*/
+  __asm{
+#define REF1 ecx
+#define REF2 edi
+#define YSTRIDE esi
+#define SRC edx
+#define RET eax
+    mov YSTRIDE,_ystride
+    mov SRC,_src
+    mov REF1,_ref1
+    mov REF2,_ref2
+    movq mm0,[REF1]
+    movq mm1,[REF2]
+    movq mm2,[REF1+YSTRIDE]
+    movq mm3,[REF2+YSTRIDE]
+    xor RET,RET
+    movq mm4,[SRC]
+    pxor mm7,mm7
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC+YSTRIDE]
+    psubb mm7,mm6 /*mm7=0-(-1)={1}x8, the low-bit mask used by OC_SAD2_LOOP and OC_SAD2_TAIL.*/
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_TAIL
+    mov [ret],RET
+#undef REF1
+#undef REF2
+#undef YSTRIDE
+#undef SRC
+#undef RET
+  }
+  return (unsigned)ret;
+}
+
+/*Load an 8x4 array of pixel values (8 rows of 4 bytes at byte offset _off) from SRC and REF and compute their
+  16-bit difference in mm0...mm7; SRC, REF and both strides are restored on exit, and _off*2+BUF is used as scratch for mm0.*/
+#define OC_LOAD_SUB_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm4,[_off+REF] \
+  __asm  movd mm1,[_off+SRC+SRC_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  movd mm5,[_off+REF+REF_YSTRIDE] \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  movd mm2,[_off+SRC] \
+  __asm  movd mm7,[_off+REF] \
+  __asm  movd mm3,[_off+SRC+SRC_YSTRIDE] \
+  __asm  movd mm6,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm0,mm4 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  psubw mm0,mm4 \
+  __asm  movd mm4,[_off+SRC] \
+  __asm  movq [_off*2+BUF],mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm1,mm5 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psubw mm1,mm5 \
+  __asm  movd mm5,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm2,mm7 \
+  __asm  movd mm7,[_off+REF+REF_YSTRIDE] \
+  __asm  punpcklbw mm3,mm6 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  psubw mm3,mm6 \
+  __asm  movd mm6,[_off+SRC] \
+  __asm  punpcklbw mm4,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*2] \
+  __asm  psubw mm4,mm0 \
+  __asm  movd mm0,[_off+REF] \
+  __asm  punpcklbw mm5,mm7 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psubw mm5,mm7 \
+  __asm  movd mm7,[_off+SRC+SRC_YSTRIDE] \
+  __asm  punpcklbw mm6,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*2] \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  psubw mm6,mm0 \
+  __asm  movd mm0,[_off+REF+REF_YSTRIDE] \
+  __asm  lea SRC,[SRC+SRC_YSTRIDE*8] \
+  __asm  punpcklbw mm7,mm0 \
+  __asm  neg SRC_YSTRIDE \
+  __asm  punpcklbw mm0,mm0 \
+  __asm  lea REF,[REF+REF_YSTRIDE*8] \
+  __asm  psubw mm7,mm0 \
+  __asm  neg REF_YSTRIDE \
+  __asm  movq mm0,[_off*2+BUF] \
+}
+
+/*Load an 8x4 array of pixel values (8 rows of 4 bytes at byte offset _off, rows 0...3 via SRC and rows 4...7 via SRC4) into mm0...mm7, zero-extended to 16 bits.*/
+#define OC_LOAD_8x4(_off) __asm{ \
+  __asm  movd mm0,[_off+SRC] \
+  __asm  movd mm1,[_off+SRC+YSTRIDE] \
+  __asm  movd mm2,[_off+SRC+YSTRIDE*2] \
+  __asm  pxor mm7,mm7 \
+  __asm  movd mm3,[_off+SRC+YSTRIDE3] \
+  __asm  punpcklbw mm0,mm7 \
+  __asm  movd mm4,[_off+SRC4] \
+  __asm  punpcklbw mm1,mm7 \
+  __asm  movd mm5,[_off+SRC4+YSTRIDE] \
+  __asm  punpcklbw mm2,mm7 \
+  __asm  movd mm6,[_off+SRC4+YSTRIDE*2] \
+  __asm  punpcklbw mm3,mm7 \
+  __asm  movd mm7,[_off+SRC4+YSTRIDE3] \
+  __asm  punpcklbw mm4,mm4 \
+  __asm  punpcklbw mm5,mm5 \
+  __asm  psrlw mm4,8 \
+  __asm  psrlw mm5,8 \
+  __asm  punpcklbw mm6,mm6 \
+  __asm  punpcklbw mm7,mm7 \
+  __asm  psrlw mm6,8 \
+  __asm  psrlw mm7,8 \
+}
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform on mm0...mm7.
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x4 __asm{ \
+  /*Stage A: \
+    Outputs 0-3 are swapped with 4-7 here: mm4...mm7 receive the sums and mm0...mm3 the differences.*/ \
+  __asm  paddw mm5,mm1 \
+  __asm  paddw mm6,mm2 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm2,mm2 \
+  __asm  psubw mm1,mm5 \
+  __asm  psubw mm2,mm6 \
+  __asm  paddw mm7,mm3 \
+  __asm  paddw mm4,mm0 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm0,mm0 \
+  __asm  psubw mm3,mm7 \
+  __asm  psubw mm0,mm4 \
+   /*Stage B: mm2, mm3, mm6 and mm7 receive the negated differences.*/ \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm1,mm3 \
+  __asm  paddw mm4,mm6 \
+  __asm  paddw mm5,mm7 \
+  __asm  paddw mm2,mm2 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm6,mm6 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm2,mm0 \
+  __asm  psubw mm3,mm1 \
+  __asm  psubw mm6,mm4 \
+  __asm  psubw mm7,mm5 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x4 __asm{ \
+  /*Stage C: butterflies on the register pairs (mm0,mm1), (mm2,mm3), (mm4,mm5) and (mm6,mm7).*/ \
+  __asm  paddw mm0,mm1 \
+  __asm  paddw mm2,mm3 \
+  __asm  paddw mm4,mm5 \
+  __asm  paddw mm6,mm7 \
+  __asm  paddw mm1,mm1 \
+  __asm  paddw mm3,mm3 \
+  __asm  paddw mm5,mm5 \
+  __asm  paddw mm7,mm7 \
+  __asm  psubw mm1,mm0 \
+  __asm  psubw mm3,mm2 \
+  __asm  psubw mm5,mm4 \
+  __asm  psubw mm7,mm6 \
+}
+
+/*Performs an 8-point 1-D Hadamard transform on mm0...mm7 (stages A and B, then stage C).
+  The transform is performed in place, except that outputs 0-3 are swapped with
+   outputs 4-7.
+  Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x4 __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_8x4 \
+}
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values; mm6 and mm7 are spilled to _r6+BUF and _r7+BUF.
+  At the end of this part, mm1 will contain the DC coefficient of the
+   transform, and mm3 holds the value that was spilled from mm7.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
+  /*We use the fact that \
+      (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+     to merge the final butterfly with the abs and the first stage of \
+     accumulation. \
+    Thus we can avoid using pabsw, which is not available until SSSE3. \
+    Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
+     implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+     registers). \
+    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+    This implementation is only 26 (+4 for spilling registers).*/ \
+  __asm  movq [_r7+BUF],mm7 \
+  __asm  movq [_r6+BUF],mm6 \
+  /*mm7={0x7FFF}x4 \
+    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
+  __asm  pcmpeqb mm7,mm7 \
+  __asm  movq mm6,mm0 \
+  __asm  psrlw mm7,1 \
+  __asm  paddw mm6,mm1 \
+  __asm  pmaxsw mm0,mm1 \
+  __asm  paddsw mm6,mm7 \
+  __asm  psubw mm0,mm6 \
+  /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
+    mm4=max(abs(mm4),abs(mm5))-0x7FFF \
+    (both are finished in OC_HADAMARD_C_ABS_ACCUM_B_8x4)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm1,mm4 \
+  __asm  pmaxsw mm2,mm3 \
+  __asm  pmaxsw mm4,mm5 \
+  __asm  paddw mm6,mm3 \
+  __asm  paddw mm1,mm5 \
+  __asm  movq mm3,[_r7+BUF] \
+}
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values, starting from the register state left by OC_HADAMARD_C_ABS_ACCUM_A_8x4; the value spilled to _r6+BUF is reloaded into mm5.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
+  __asm  paddsw mm6,mm7 \
+  __asm  movq mm5,[_r6+BUF] \
+  __asm  paddsw mm1,mm7 \
+  __asm  psubw mm2,mm6 \
+  __asm  psubw mm4,mm1 \
+  /*mm7={1}x4 (needed for the horizontal add that follows) \
+    mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
+  __asm  movq mm6,mm3 \
+  __asm  pmaxsw mm3,mm5 \
+  __asm  paddw mm0,mm2 \
+  __asm  paddw mm6,mm5 \
+  __asm  paddw mm0,mm4 \
+  __asm  paddsw mm6,mm7 \
+  __asm  paddw mm0,mm3 \
+  __asm  psrlw mm7,14 \
+  __asm  psubw mm0,mm6 \
+}
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into mm0 (_r6 and _r7 are the BUF offsets used to spill mm6 and mm7).
+  This is the only portion of SATD which requires MMXEXT (we could use plain
+   MMX, but it takes 4 instructions and an extra register to work around the
+   lack of a pmaxsw, which is a pretty serious penalty).*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
+  OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
+}
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into mm0 (_r6 and _r7 are the BUF offsets used to spill mm6 and mm7).
+  Note that mm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
+  OC_HADAMARD_AB_8x4 \
+  OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
+}
+
+/*Performs two 4x4 transposes (mostly) in place.
+  On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
+   contains rows {a,b,c,d}.
+  On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
+   {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T; 0x10+_off+BUF is used as scratch for mm5.*/
+#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
+  /*First 4x4 transpose (mm5 is spilled first to serve as a temporary):*/ \
+  __asm  movq [0x10+_off+BUF],mm5 \
+  /*mm0 = e3 e2 e1 e0 \
+    mm1 = f3 f2 f1 f0 \
+    mm2 = g3 g2 g1 g0 \
+    mm3 = h3 h2 h1 h0*/ \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm2,mm3 \
+  __asm  punpckhwd mm5,mm3 \
+  __asm  movq mm3,mm0 \
+  __asm  punpcklwd mm0,mm1 \
+  __asm  punpckhwd mm3,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm3 = f3 e3 f2 e2 \
+    mm2 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm2 \
+  __asm  punpckhdq mm1,mm2 \
+  __asm  movq mm2,mm3 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq [0x40+_off+BUF],mm0 \
+  __asm  punpckldq mm2,mm5 \
+  /*mm0 = h0 g0 f0 e0 \
+    mm1 = h1 g1 f1 e1 \
+    mm2 = h2 g2 f2 e2 \
+    mm3 = h3 g3 f3 e3*/ \
+  __asm  movq mm5,[0x10+_off+BUF] \
+  /*Second 4x4 transpose (the stores of mm1...mm3 from the first one are interleaved below):*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm5 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm7 = d3 d2 d1 d0*/ \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm7 \
+  __asm  movq [0x50+_off+BUF],mm1 \
+  __asm  punpckhwd mm0,mm7 \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm5 \
+  __asm  movq [0x60+_off+BUF],mm2 \
+  __asm  punpckhwd mm7,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  movq [0x70+_off+BUF],mm3 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
+}
+
+static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
+ int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){ /*Returns the SATD (sum of absolute Hadamard-transformed differences) of the 8x8 blocks at _src and _ref, each with its own row stride; returns early with a partial sum once it reaches _thresh.*/
+  OC_ALIGN8(ogg_int16_t  buf[64]); /*Scratch space for the spilled and transposed 16-bit coefficients.*/
+  ogg_int16_t           *bufp;
+  unsigned               ret1; /*Receives the final value of RET (eax).*/
+  unsigned               ret2; /*Never referenced below.*/
+  bufp=buf;
+  __asm{
+#define SRC esi
+#define REF eax
+#define SRC_YSTRIDE ecx
+#define REF_YSTRIDE edx
+#define BUF edi
+#define RET eax
+#define RET2 edx
+    mov SRC,_src
+    mov SRC_YSTRIDE,_src_ystride
+    mov REF,_ref
+    mov REF_YSTRIDE,_ref_ystride
+    mov BUF,bufp
+    OC_LOAD_SUB_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_SUB_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+       we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+       latency of pmaddwd by starting the next series of loads now.
+      REF and REF_YSTRIDE are no longer needed, so eax and edx are reused as RET and RET2.*/
+    mov RET2,_thresh
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm4,mm0
+    movq mm2,[0x60+BUF]
+    punpckhdq mm0,mm0
+    movq mm6,[0x68+BUF]
+    paddd mm4,mm0
+    movq mm3,[0x70+BUF]
+    movd RET,mm4
+    movq mm7,[0x78+BUF]
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, and a factor of two removed; correct the final sum here.*/
+    lea RET,[RET+RET-32]
+    movq mm0,[0x40+BUF]
+    cmp RET,RET2
+    movq mm4,[0x48+BUF]
+    jae at_end /*Early exit: the first half alone already reaches _thresh (unsigned compare).*/
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*There isn't much to stick in here to hide the latency this time, but the
+       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
+       latency is even worse.*/
+    sub RET,32
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET2,mm4
+    lea RET,[RET+RET2*2]
+    align 16
+at_end:
+    mov ret1,RET
+#undef SRC
+#undef REF
+#undef SRC_YSTRIDE
+#undef REF_YSTRIDE
+#undef BUF
+#undef RET
+#undef RET2
+  }
+  return ret1;
+}
+
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){ /*SATD of _src against _ref where both blocks share the row stride _ystride.*/
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+}
+
+
+/*Writes to _dst (row stride _dst_ystride) the rounded-down byte average of 8 rows of 8 bytes of _src1 and _src2 (row stride _src_ystride).
+  Our internal implementation of frag_copy2 takes an extra stride parameter so we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
+static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
+  __asm{
+    /*Load the first 3 rows.*/
+#define DST_YSTRIDE edi
+#define SRC_YSTRIDE esi
+#define DST eax
+#define SRC1 edx
+#define SRC2 ecx
+    mov DST_YSTRIDE,_dst_ystride
+    mov SRC_YSTRIDE,_src_ystride
+    mov DST,_dst
+    mov SRC1,_src1
+    mov SRC2,_src2
+    movq mm0,[SRC1]
+    movq mm1,[SRC2]
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pxor mm7,mm7
+    movq mm4,[SRC1]
+    pcmpeqb mm6,mm6
+    movq mm5,[SRC2]
+    /*mm7={1}x8 (the low-bit mask that corrects pavgb's round-up).*/
+    psubb mm7,mm6
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm1 is free, start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm1,mm3
+    /*mm3 is free.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1+SRC_YSTRIDE]
+    /*Start averaging mm5 and mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 0] is done; write it out.*/
+    movq [DST],mm6
+    pand mm1,mm7
+    pavgb mm4,mm5
+    psubb mm2,mm1
+    /*mm1 is free, continue loading the next row.*/
+    movq mm1,[SRC2+SRC_YSTRIDE]
+    pxor mm3,mm5
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    /*mm2 [row 1] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1]
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm4,mm3
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    /*mm4 [row 2] is done; write it out.*/
+    movq [DST],mm4
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2]
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    pxor mm0,mm1
+    /*Start loading the next row.*/
+    movq mm4,[SRC1+SRC_YSTRIDE]
+    pavgb mm6,mm1
+    /*mm1 is free; start averaging mm3 into mm2 using mm1.*/
+    movq mm1,mm2
+    pand mm0,mm7
+    /*Continue loading the next row.*/
+    movq mm5,[SRC2+SRC_YSTRIDE]
+    pavgb mm2,mm3
+    lea SRC1,[SRC1+SRC_YSTRIDE*2]
+    pxor mm1,mm3
+    /*mm3 is free.*/
+    psubb mm6,mm0
+    /*mm0 is free, start loading the next row.*/
+    movq mm0,[SRC1]
+    /*Start averaging mm5 into mm4 using mm3.*/
+    movq mm3,mm4
+    /*mm6 [row 3] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm6
+    pand mm1,mm7
+    lea SRC2,[SRC2+SRC_YSTRIDE*2]
+    pavgb mm4,mm5
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm2,mm1
+    /*mm1 is free; continue loading the next row.*/
+    movq mm1,[SRC2]
+    pxor mm3,mm5
+    /*mm2 [row 4] is done; write it out.*/
+    movq [DST],mm2
+    pand mm3,mm7
+    /*Start loading the next row.*/
+    movq mm2,[SRC1+SRC_YSTRIDE]
+    psubb mm4,mm3
+    /*Start averaging mm0 and mm1 into mm6.*/
+    movq mm6,mm0
+    /*Continue loading the next row.*/
+    movq mm3,[SRC2+SRC_YSTRIDE]
+    /*mm4 [row 5] is done; write it out.*/
+    movq [DST+DST_YSTRIDE],mm4
+    pxor mm0,mm1
+    pavgb mm6,mm1
+    /*mm4 is free; start averaging mm3 into mm2 using mm4.*/
+    movq mm4,mm2
+    pand mm0,mm7
+    pavgb mm2,mm3
+    pxor mm4,mm3
+    lea DST,[DST+DST_YSTRIDE*2]
+    psubb mm6,mm0
+    pand mm4,mm7
+    /*mm6 [row 6] is done, write it out.*/
+    movq [DST],mm6
+    psubb mm2,mm4
+    /*mm2 [row 7] is done, write it out.*/
+    movq [DST+DST_YSTRIDE],mm2
+#undef SRC1
+#undef SRC2
+#undef SRC_YSTRIDE
+#undef DST_YSTRIDE
+#undef DST
+  }
+}
+
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){ /*SATD of _src against the rounded-down average of _ref1 and _ref2.*/
+  OC_ALIGN8(unsigned char ref[64]); /*Holds the averaged reference block with a row stride of 8 bytes.*/
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+}
+
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
+ int _ystride){ /*Returns the sum of absolute Hadamard coefficients of the 8x8 block at _src (row stride _ystride), with the DC coefficient subtracted back out.*/
+  OC_ALIGN8(ogg_int16_t  buf[64]); /*Scratch space for the spilled and transposed 16-bit coefficients.*/
+  ogg_int16_t           *bufp;
+  unsigned               ret1; /*Receives the final value of RET (eax).*/
+  unsigned               ret2; /*Never referenced below.*/
+  bufp=buf;
+  __asm{
+#define SRC eax
+#define SRC4 esi
+#define BUF edi
+#define RET eax
+#define RET_WORD ax
+#define RET2 ecx
+#define YSTRIDE edx
+#define YSTRIDE3 ecx
+    mov SRC,_src
+    mov BUF,bufp
+    mov YSTRIDE,_ystride
+    /* src4 = src+4*ystride */
+    lea SRC4,[SRC+YSTRIDE*4]
+    /* ystride3 = 3*ystride */
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    OC_LOAD_8x4(0x00)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x00)
+    /*Finish swapping out this 8x4 block to make room for the next one.
+      mm0...mm3 have been swapped out already.*/
+    movq [0x00+BUF],mm4
+    movq [0x10+BUF],mm5
+    movq [0x20+BUF],mm6
+    movq [0x30+BUF],mm7
+    OC_LOAD_8x4(0x04)
+    OC_HADAMARD_8x4
+    OC_TRANSPOSE_4x4x2(0x08)
+    /*Here the first 4x4 block of output from the last transpose is the second
+      4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place, so
+      we only have to do half the loads.*/
+    movq mm1,[0x10+BUF]
+    movq mm2,[0x20+BUF]
+    movq mm3,[0x30+BUF]
+    movq mm0,[0x00+BUF]
+    /*We split out the stages here so we can save the DC coefficient in the
+      middle (SRC is no longer needed, so eax is reused as RET).*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    movd RET,mm1
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+      difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+      for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
+      latency of pmaddwd by starting the next series of loads now.*/
+    pmaddwd mm0,mm7
+    movq mm1,[0x50+BUF]
+    movq mm5,[0x58+BUF]
+    movq mm2,[0x60+BUF]
+    movq mm4,mm0
+    movq mm6,[0x68+BUF]
+    punpckhdq mm0,mm0
+    movq mm3,[0x70+BUF]
+    paddd mm4,mm0
+    movq mm7,[0x78+BUF]
+    movd RET2,mm4
+    movq mm0,[0x40+BUF]
+    movq mm4,[0x48+BUF]
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
+    pmaddwd mm0,mm7
+    /*We assume that the DC coefficient is always positive (which is true,
+    because the input to the INTRA transform was not a difference), so its low word can be zero-extended.*/
+    movzx RET,RET_WORD
+    add RET2,RET2
+    sub RET2,RET
+    movq mm4,mm0
+    punpckhdq mm0,mm0
+    paddd mm4,mm0
+    movd RET,mm4
+    lea RET,[-64+RET2+RET*2]
+    mov [ret1],RET
+#undef SRC
+#undef SRC4
+#undef BUF
+#undef RET
+#undef RET_WORD
+#undef RET2
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+  return ret1;
+}
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src, const unsigned char *_ref,int _ystride){ /*Stores in _residue the 64 16-bit differences _src-_ref of an 8x8 block whose rows are _ystride bytes apart.*/
+  int i; /*Counts the 4 passes below; mm7 is zeroed once beforehand for the byte->word unpacks.*/
+  __asm  pxor mm7,mm7
+  for(i=4;i-->0;){ /*Each pass handles two rows and writes the advanced pointers back to the parameters.*/
+    __asm{
+#define SRC edx
+#define YSTRIDE esi
+#define RESIDUE eax
+#define REF ecx
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
+      mov SRC,_src
+      mov REF,_ref
+      /*mm0=[src]*/
+      movq mm0,[SRC]
+      /*mm1=[ref]*/
+      movq mm1,[REF]
+      /*mm4=[src+ystride]*/
+      movq mm4,[SRC+YSTRIDE]
+      /*mm5=[ref+ystride]*/
+      movq mm5,[REF+YSTRIDE]
+      /*Compute [src]-[ref] (low four pixels in mm0, high four in mm2).*/
+      movq mm2,mm0
+      punpcklbw mm0,mm7
+      movq mm3,mm1
+      punpckhbw mm2,mm7
+      punpcklbw mm1,mm7
+      punpckhbw mm3,mm7
+      psubw mm0,mm1
+      psubw mm2,mm3
+      /*Compute [src+ystride]-[ref+ystride] (low four pixels in mm4, high four in mm1).*/
+      movq mm1,mm4
+      punpcklbw mm4,mm7
+      movq mm3,mm5
+      punpckhbw mm1,mm7
+      lea SRC,[SRC+YSTRIDE*2]
+      punpcklbw mm5,mm7
+      lea REF,[REF+YSTRIDE*2]
+      punpckhbw mm3,mm7
+      psubw mm4,mm5
+      psubw mm1,mm3
+      /*Write the answer out.*/
+      movq [RESIDUE+0x00],mm0
+      movq [RESIDUE+0x08],mm2
+      movq [RESIDUE+0x10],mm4
+      movq [RESIDUE+0x18],mm1
+      lea RESIDUE,[RESIDUE+0x20]
+      mov _residue,RESIDUE
+      mov _src,SRC
+      mov _ref,REF
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+#undef REF
+    }
+  }
+}
+
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,int _ystride){ /*Stores in _residue the 64 16-bit values _src-128 of an 8x8 block whose rows are _ystride bytes apart.*/
+   __asm{
+#define YSTRIDE edx
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+#define SRC eax
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    mov SRC,_src
+    /*mm0=[src]*/
+    movq mm0,[SRC]
+    /*mm1=[src+ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*mm6={-1}x4*/
+    pcmpeqw mm6,mm6
+    /*mm2=[src+2*ystride]*/
+    movq mm2,[SRC+YSTRIDE*2]
+    /*[ystride3]=3*[ystride]*/
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*mm6={0x8000}x4*/
+    psllw mm6,15
+    /*mm3=[src+3*ystride]*/
+    movq mm3,[SRC+YSTRIDE3]
+    /*mm6={128}x4*/
+    psrlw mm6,8
+    /*mm7=0*/ 
+    pxor mm7,mm7
+    /*[src]=[src]+4*[ystride]*/
+    lea SRC,[SRC+YSTRIDE*4]
+    /*Compute [src]-128 and [src+ystride]-128*/
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x00],mm0
+    movq [RESIDUE+0x08],mm4
+    movq [RESIDUE+0x10],mm1
+    movq [RESIDUE+0x18],mm5
+    /*mm0=[src+4*ystride]*/
+    movq mm0,[SRC]
+    /*mm1=[src+5*ystride]*/
+    movq mm1,[SRC+YSTRIDE]
+    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x20],mm2
+    movq [RESIDUE+0x28],mm4
+    movq [RESIDUE+0x30],mm3
+    movq [RESIDUE+0x38],mm5
+    /*Load [src+6*ystride] and [src+7*ystride], then compute [src+4*ystride]-128 and [src+5*ystride]-128*/
+    movq mm2,[SRC+YSTRIDE*2]
+    movq mm3,[SRC+YSTRIDE3]
+    movq mm4,mm0
+    punpcklbw mm0,mm7
+    movq mm5,mm1
+    punpckhbw mm4,mm7
+    psubw mm0,mm6
+    punpcklbw mm1,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm1,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x40],mm0
+    movq [RESIDUE+0x48],mm4
+    movq [RESIDUE+0x50],mm1
+    movq [RESIDUE+0x58],mm5
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    movq mm4,mm2
+    punpcklbw mm2,mm7
+    movq mm5,mm3
+    punpckhbw mm4,mm7
+    psubw mm2,mm6
+    punpcklbw mm3,mm7
+    psubw mm4,mm6
+    punpckhbw mm5,mm7
+    psubw mm3,mm6
+    psubw mm5,mm6
+    /*Write the answer out.*/
+    movq [RESIDUE+0x60],mm2
+    movq [RESIDUE+0x68],mm4
+    movq [RESIDUE+0x70],mm3
+    movq [RESIDUE+0x78],mm5
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+#undef SRC
+  }
+}
+
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){ /*Averages _src1 and _src2 into _dst, using _ystride as the row stride of both the sources and the destination.*/
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride);
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxfdct.c b/thirdparty/libtheora/x86_vc/mmxfdct.c
new file mode 100644
index 0000000000..dcf17c9fa7
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/mmxfdct.c
@@ -0,0 +1,670 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/ 
+ /*MMX fDCT implementation for x86_32*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+#define OC_FDCT_STAGE1_8x4  __asm{ \
+  /*Stage 1: butterflies on mm0...mm7 in place; each difference a-b is formed first, then the sum is rebuilt as 2*b+(a-b).*/ \
+  /*mm0=t7'=t0-t7*/ \
+  __asm  psubw mm0,mm7 \
+  __asm  paddw mm7,mm7 \
+  /*mm1=t6'=t1-t6*/ \
+  __asm  psubw mm1, mm6 \
+  __asm  paddw mm6,mm6 \
+  /*mm2=t5'=t2-t5*/ \
+  __asm  psubw mm2,mm5 \
+  __asm  paddw mm5,mm5 \
+  /*mm3=t4'=t3-t4*/ \
+  __asm  psubw mm3,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm7=t0'=t0+t7*/ \
+  __asm  paddw mm7,mm0 \
+  /*mm6=t1'=t1+t6*/  \
+  __asm  paddw mm6,mm1 \
+  /*mm5=t2'=t2+t5*/ \
+  __asm  paddw mm5,mm2 \
+  /*mm4=t3'=t3+t4*/ \
+  __asm  paddw mm4,mm3\
+}
+
+#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*Stage 2: (This macro runs stages 2-4 on the stage 1 outputs in mm0...mm7, using [Y+_r0]...[Y+_r7] as scratch slots and clobbering A.)*/ \
+  /*mm7=t3''=t0'-t3'*/ \
+  __asm  psubw mm7,mm4 \
+  __asm  paddw mm4,mm4 \
+  /*mm6=t2''=t1'-t2'*/ \
+  __asm  psubw mm6,mm5 \
+  __asm  movq [Y+_r6],mm7 \
+  __asm  paddw mm5,mm5 \
+  /*mm1=t5''=t6'-t5'*/ \
+  __asm  psubw mm1,mm2 \
+  __asm  movq [Y+_r2],mm6 \
+  /*mm4=t0''=t0'+t3'*/ \
+  __asm  paddw mm4,mm7 \
+  __asm  paddw mm2,mm2 \
+  /*mm5=t1''=t1'+t2'*/ \
+  __asm  movq [Y+_r0],mm4 \
+  __asm  paddw mm5,mm6 \
+  /*mm2=t6''=t6'+t5'*/ \
+  __asm  paddw mm2,mm1 \
+  __asm  movq [Y+_r4],mm5 \
+  /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
+  /*mm4, mm5, mm6, mm7 are free.*/ \
+  /*Stage 3:*/ \
+  /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
+  __asm  mov A,0x5A806A0A \
+  __asm  pcmpeqb mm6,mm6 \
+  __asm  movd mm7,A \
+  __asm  psrlw mm6,15 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm6,mm6 \
+  /*mm0=0, mm2={-1}x4 \
+    mm5:mm4=t5''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm2,mm2 \
+  /*mm2=t6'', mm1=t5''+(t5''!=0) \
+    mm4=(t5''*27146+0xB500>>16)*/ \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm0,mm2 \
+  __asm  movq mm2, [Y+_r3] \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm1,mm0 \
+  __asm  packssdw mm4,mm5 \
+  /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm0, [Y+_r7] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm1,mm3 \
+  /*mm3=t4''=t4'+s*/ \
+  __asm  paddw mm3,mm4 \
+  /*mm1=t5'''=t4'-s*/ \
+  __asm  psubw mm1,mm4 \
+  /*mm1=0, mm3={-1}x4 \
+    mm5:mm4=t6''*27146+0xB500*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r1],mm3 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  pxor mm1,mm1 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqb mm3,mm3 \
+  /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqw mm1,mm2 \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm2,mm1 \
+  /*mm1=t1'' \
+    mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+  __asm  paddw mm4,mm2 \
+  __asm  movq mm1,[Y+_r4] \
+  __asm  psraw mm4,1 \
+  __asm  movq mm2,mm0 \
+  /*(mm7 keeps {27146,0xB500>>1}x2 until stage 4 reloads it) \
+    mm0=t7''=t7'+s*/ \
+  __asm  paddw mm0,mm4 \
+  /*mm2=t6'''=t7'-s*/ \
+  __asm  psubw mm2,mm4 \
+  /*Stage 4:*/ \
+  /*mm0=0, mm2=t0'' \
+    mm5:mm4=t1''*27146+0xB500*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq mm2,[Y+_r0] \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movq [Y+_r7],mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm0,mm0 \
+  /*mm7={27146,0x4000>>1}x2 \
+    mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  mov A,0x20006A0A \
+  __asm  pcmpeqw mm0,mm1 \
+  __asm  movd mm7,A \
+  __asm  psrad mm5,16 \
+  __asm  psubw mm0,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  paddw mm0,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm0,mm4 \
+  /*mm6={0x00000E3D}x2 \
+    mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  mov A,0x0E3D \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pxor mm1,mm1 \
+  __asm  punpckldq mm6,mm6 \
+  __asm  pcmpeqw mm1,mm2 \
+  /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+  __asm  psrad mm4,16 \
+  __asm  psubw mm1,mm3 \
+  __asm  psrad mm5,16 \
+  __asm  paddw mm2,mm1 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movq mm1,[Y+_r5] \
+  __asm  paddw mm4,mm2 \
+  /*mm2=t6''', mm0=_y[0]=u=r+s>>1 \
+    The naive implementation could cause overflow, so we use \
+     u=(r&s)+((r^s)>>1).*/ \
+  __asm  movq mm2,[Y+_r3] \
+  __asm  movq mm7,mm0 \
+  __asm  pxor mm0,mm4 \
+  __asm  pand mm7,mm4 \
+  __asm  psraw mm0,1 \
+  __asm  mov A,0x7FFF54DC \
+  __asm  paddw mm0,mm7 \
+  __asm  movd mm7,A \
+  /*mm7={54491-0x7FFF,0x7FFF}x2 \
+    mm4=_y[4]=v=r-u*/ \
+  __asm  psubw mm4,mm0 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  movq [Y+_r4],mm4 \
+  /*mm0=0, mm7={36410}x4 \
+    mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  mov A,0x8E3A8E3A \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  movq [Y+_r0],mm0 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm0=0 \
+    mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  paddw mm1,mm2 \
+  __asm  pmullw mm3,mm7 \
+  __asm  pxor mm0,mm0 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t7'', mm7={26568,0x3400}x2 \
+    mm2=s=t6'''-(36410*u>>16)*/ \
+  __asm  movq mm1,mm4 \
+  __asm  mov A,0x340067C8 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  movd mm7,A \
+  __asm  movq [Y+_r5],mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddw mm4,mm1 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  psubw mm2,mm4 \
+  /*mm6={0x00007B1B}x2 \
+    mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
+  __asm  movq mm4,mm2 \
+  __asm  movq mm5,mm2 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x7B1B \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={64277-0x7FFF,0x7FFF}x2 \
+    mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+  __asm  psrad mm4,17 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,17 \
+  __asm  mov A,0x7FFF7B16 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={12785}x4 \
+    mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r3],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r1] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x31F131F1 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  movd mm7,A \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=12785*t4''+((t7''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm3={-1}x4, mm6={1}x4 \
+    mm4=_y[1]=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  pxor mm6,mm6 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm3,mm3 \
+  __asm  packssdw mm4,mm5 \
+  __asm  psubw mm6,mm3 \
+  /*mm1=t3'', mm7={20539,0x3000}x2 \
+    mm4=s=(12785*u>>16)-t4''*/ \
+  __asm  movq [Y+_r1],mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  mov A,0x3000503B \
+  __asm  movq mm1,[Y+_r6] \
+  __asm  movd mm7,A \
+  __asm  psubw mm4,mm2 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm6={0x00006CB7}x2 \
+    mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
+  __asm  movq mm5,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpcklwd mm4,mm6 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x6CB7 \
+  __asm  punpckhwd mm5,mm6 \
+  __asm  movd mm6,A \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  psubw mm0,mm3 \
+  __asm  punpckldq mm6,mm6 \
+  /*mm7={60547-0x7FFF,0x7FFF}x2 \
+    mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+  __asm  psrad mm4,20 \
+  __asm  paddw mm2,mm0 \
+  __asm  psrad mm5,20 \
+  __asm  mov A,0x7FFF6C84 \
+  __asm  packssdw mm4,mm5 \
+  __asm  movd mm7,A \
+  __asm  paddw mm2,mm4 \
+  __asm  punpckldq mm7,mm7 \
+  /*mm0=0, mm7={25080}x4 \
+    mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
+  __asm  movq mm4,mm1 \
+  __asm  movq mm5,mm1 \
+  __asm  movq [Y+_r7],mm2 \
+  __asm  punpcklwd mm4,mm1 \
+  __asm  movq mm2,[Y+_r2] \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  mov A,0x61F861F8 \
+  __asm  punpckhwd mm5,mm1 \
+  __asm  pxor mm0,mm0 \
+  __asm  pmaddwd mm5,mm7 \
+  __asm  movd mm7,A \
+  __asm  pcmpeqw mm1,mm0 \
+  __asm  psubw mm1,mm3 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  paddd mm4,mm6 \
+  __asm  paddd mm5,mm6 \
+  /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
+  __asm  movq mm6,mm2 \
+  __asm  movq mm3,mm2 \
+  __asm  pmulhw mm6,mm7 \
+  __asm  pmullw mm3,mm7 \
+  __asm  paddw mm6,mm1 \
+  __asm  movq mm1,mm3 \
+  __asm  punpckhwd mm3,mm6 \
+  __asm  punpcklwd mm1,mm6 \
+  /*mm1={-1}x4 \
+    mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+  __asm  paddd mm5,mm3 \
+  __asm  paddd mm4,mm1 \
+  __asm  psrad mm5,16 \
+  __asm  mov A,0x28005460 \
+  __asm  psrad mm4,16 \
+  __asm  pcmpeqb mm1,mm1 \
+  __asm  packssdw mm4,mm5 \
+  /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
+    mm4=s=(25080*u>>16)-t2''*/ \
+  __asm  movq mm6,mm4 \
+  __asm  pmulhw mm4,mm7 \
+  __asm  pxor mm5,mm5 \
+  __asm  movd mm7,A \
+  __asm  psubw mm5,mm1 \
+  __asm  punpckldq mm7,mm7 \
+  __asm  psubw mm4,mm2 \
+  /*mm2=s+(s!=0) \
+    mm4:mm3=s*21600+0x2800*/ \
+  __asm  movq mm3,mm4 \
+  __asm  movq mm2,mm4 \
+  __asm  punpckhwd mm4,mm5 \
+  __asm  pcmpeqw mm0,mm2 \
+  __asm  pmaddwd mm4,mm7 \
+  __asm  psubw mm0,mm1 \
+  __asm  punpcklwd mm3,mm5 \
+  __asm  paddw mm2,mm0 \
+  __asm  pmaddwd mm3,mm7 \
+  /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
+    mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+  __asm  movq mm0,[Y+_r4] \
+  __asm  psrad mm4,18 \
+  __asm  movq mm5,[Y+_r5] \
+  __asm  psrad mm3,18 \
+  __asm  movq mm1,[Y+_r7] \
+  __asm  packssdw mm3,mm4 \
+  __asm  movq mm4,[Y+_r0] \
+  __asm  paddw mm3,mm2 \
+}
+
+/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7]; _y[1] and _y[3] are loaded from [Y+_r1] and [Y+_r3].
+  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] (its first row is stored to [Y+_r4]) and
+   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
+#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
+  /*First 4x4 transpose:*/ \
+  /*mm0 = e3 e2 e1 e0 \
+    mm5 = f3 f2 f1 f0 \
+    mm3 = g3 g2 g1 g0 \
+    mm1 = h3 h2 h1 h0*/ \
+  __asm  movq mm2,mm0 \
+  __asm  punpcklwd mm0,mm5 \
+  __asm  punpckhwd mm2,mm5 \
+  __asm  movq mm5,mm3 \
+  __asm  punpcklwd mm3,mm1 \
+  __asm  punpckhwd mm5,mm1 \
+  /*mm0 = f1 e1 f0 e0 \
+    mm2 = f3 e3 f2 e2 \
+    mm3 = h1 g1 h0 g0 \
+    mm5 = h3 g3 h2 g2*/ \
+  __asm  movq mm1,mm0 \
+  __asm  punpckldq mm0,mm3 \
+  __asm  movq [Y+_r4],mm0 \
+  __asm  punpckhdq mm1,mm3 \
+  __asm  movq mm0,[Y+_r1] \
+  __asm  movq mm3,mm2 \
+  __asm  punpckldq mm2,mm5 \
+  __asm  punpckhdq mm3,mm5 \
+  __asm  movq mm5,[Y+_r3] \
+  /*_y[4] = h0 g0 f0 e0 \
+   mm1  = h1 g1 f1 e1 \
+   mm2  = h2 g2 f2 e2 \
+   mm3  = h3 g3 f3 e3*/ \
+  /*Second 4x4 transpose:*/ \
+  /*mm4 = a3 a2 a1 a0 \
+    mm0 = b3 b2 b1 b0 \
+    mm6 = c3 c2 c1 c0 \
+    mm5 = d3 d2 d1 d0*/ \
+  __asm  movq mm7,mm4 \
+  __asm  punpcklwd mm4,mm0 \
+  __asm  punpckhwd mm7,mm0 \
+  __asm  movq mm0,mm6 \
+  __asm  punpcklwd mm6,mm5 \
+  __asm  punpckhwd mm0,mm5 \
+  /*mm4 = b1 a1 b0 a0 \
+    mm7 = b3 a3 b2 a2 \
+    mm6 = d1 c1 d0 c0 \
+    mm0 = d3 c3 d2 c2*/ \
+  __asm  movq mm5,mm4 \
+  __asm  punpckldq mm4,mm6 \
+  __asm  punpckhdq mm5,mm6 \
+  __asm  movq mm6,mm7 \
+  __asm  punpckhdq mm7,mm0 \
+  __asm  punpckldq mm6,mm0 \
+  /*mm4 = d0 c0 b0 a0 \
+    mm5 = d1 c1 b1 a1 \
+    mm6 = d2 c2 b2 a2 \
+    mm7 = d3 c3 b3 a3*/ \
+}
+
+/*MMX implementation of the fDCT: reads the 64 values at _x and writes 64 results to _y. Two 8x4 passes over _x (OC_FDCT8x4+OC_TRANSPOSE8x4) are followed by two passes over the transposed halves held in _y, whose outputs are rounded with (x+2)>>2 to undo the initial <<2.*/
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ptrdiff_t a; /*NOTE(review): never referenced below; the asm uses ecx through the A define instead.*/
+  __asm{
+#define Y eax
+#define A ecx
+#define X edx
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add biases to correct for some systematic error that remains in
+       the full fDCT->iDCT round trip.*/
+    mov X, _x
+    mov Y, _y
+    movq mm0,[0x00+X]
+    movq mm1,[0x10+X]
+    movq mm2,[0x20+X]
+    movq mm3,[0x30+X]
+    pcmpeqb mm4,mm4
+    pxor mm7,mm7
+    movq mm5,mm0
+    psllw mm0,2
+    pcmpeqw mm5,mm7
+    movq mm7,[0x70+X]
+    psllw mm1,2
+    psubw mm5,mm4
+    psllw mm2,2
+    mov A,1
+    pslld mm5,16
+    movd mm6,A
+    psllq mm5,16
+    mov A,0x10001
+    psllw mm3,2
+    movd mm4,A
+    punpckhwd mm5,mm6
+    psubw mm1,mm6
+    movq mm6,[0x60+X]
+    paddw mm0,mm5
+    movq mm5,[0x50+X]
+    paddw mm0,mm4
+    movq mm4,[0x40+X]
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    psllw mm7,2
+    psubw mm0,mm7
+    psllw mm6,2
+    paddw mm7,mm7
+    /*mm1=t6'=t1-t6*/
+    psllw mm5,2
+    psubw mm1,mm6
+    psllw mm4,2
+    paddw mm6,mm6
+    /*mm2=t5'=t2-t5*/
+    psubw mm2,mm5
+    paddw mm5,mm5
+    /*mm3=t4'=t3-t4*/
+    psubw mm3,mm4
+    paddw mm4,mm4
+    /*mm7=t0'=t0+t7*/
+    paddw mm7,mm0
+    /*mm6=t1'=t1+t6*/
+    paddw mm6,mm1
+    /*mm5=t2'=t2+t5*/
+    paddw mm5,mm2
+    /*mm4=t3'=t3+t4*/
+    paddw mm4,mm3
+    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
+    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
+    /*Swap out this 8x4 block for the next one.*/
+    movq mm0,[0x08+X]
+    movq [0x30+Y],mm7
+    movq mm7,[0x78+X]
+    movq [0x50+Y],mm1
+    movq mm1,[0x18+X]
+    movq [0x20+Y],mm6
+    movq mm6,[0x68+X]
+    movq [0x60+Y],mm2
+    movq mm2,[0x28+X]
+    movq [0x10+Y],mm5
+    movq mm5,[0x58+X]
+    movq [0x70+Y],mm3
+    movq mm3,[0x38+X]
+    /*And increase its working precision, too.*/
+    psllw mm0,2
+    movq [0x00+Y],mm4
+    psllw mm7,2
+    movq mm4,[0x48+X]
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    psubw mm0,mm7
+    psllw mm1,2
+    paddw mm7,mm7
+    psllw mm6,2
+    /*mm1=t6'=t1-t6*/
+    psubw mm1,mm6
+    psllw mm2,2
+    paddw mm6,mm6
+    psllw mm5,2
+    /*mm2=t5'=t2-t5*/
+    psubw mm2,mm5
+    psllw mm3,2
+    paddw mm5,mm5
+    psllw mm4,2
+    /*mm3=t4'=t3-t4*/
+    psubw mm3,mm4
+    paddw mm4,mm4
+    /*mm7=t0'=t0+t7*/
+    paddw mm7,mm0
+    /*mm6=t1'=t1+t6*/
+    paddw mm6,mm1
+    /*mm5=t2'=t2+t5*/
+    paddw mm5,mm2
+    /*mm4=t3'=t3+t4*/
+    paddw mm4,mm3
+    OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
+    OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place,
+       so we only have to do half the stores and loads.*/
+    movq mm0,[0x00+Y]
+    movq [0x58+Y],mm1
+    movq mm1,[0x10+Y]
+    movq [0x68+Y],mm2
+    movq mm2,[0x20+Y]
+    movq [0x78+Y],mm3
+    movq mm3,[0x30+Y]
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
+    OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
+    /*mm0={-2}x4*/
+    pcmpeqw mm0,mm0
+    paddw mm0,mm0
+    /*Round the results.*/
+    psubw mm1,mm0
+    psubw mm2,mm0
+    psraw mm1,2
+    psubw mm3,mm0
+    movq [0x18+Y],mm1
+    psraw mm2,2
+    psubw mm4,mm0
+    movq mm1,[0x08+Y]
+    psraw mm3,2
+    psubw mm5,mm0
+    psraw mm4,2
+    psubw mm6,mm0
+    psraw mm5,2
+    psubw mm7,mm0
+    psraw mm6,2
+    psubw mm1,mm0
+    psraw mm7,2
+    movq mm0,[0x40+Y]
+    psraw mm1,2
+    movq [0x30+Y],mm7
+    movq mm7,[0x78+Y]
+    movq [0x08+Y],mm1
+    movq mm1,[0x50+Y]
+    movq [0x20+Y],mm6
+    movq mm6,[0x68+Y]
+    movq [0x28+Y],mm2
+    movq mm2,[0x60+Y]
+    movq [0x10+Y],mm5
+    movq mm5,[0x58+Y]
+    movq [0x38+Y],mm3
+    movq mm3,[0x70+Y]
+    movq [0x00+Y],mm4
+    movq mm4,[0x48+Y]
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
+    OC_TRANSPOSE8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
+    /*mm0={-2}x4*/
+    pcmpeqw mm0,mm0
+    paddw mm0,mm0
+    /*Round the results.*/
+    psubw mm1,mm0
+    psubw mm2,mm0
+    psraw mm1,2
+    psubw mm3,mm0
+    movq [0x58+Y],mm1
+    psraw mm2,2
+    psubw mm4,mm0
+    movq mm1,[0x48+Y]
+    psraw mm3,2
+    psubw mm5,mm0
+    movq [0x68+Y],mm2
+    psraw mm4,2
+    psubw mm6,mm0
+    movq [0x78+Y],mm3
+    psraw mm5,2
+    psubw mm7,mm0
+    movq [0x40+Y],mm4
+    psraw mm6,2
+    psubw mm1,mm0
+    movq [0x50+Y],mm5
+    psraw mm7,2
+    movq [0x60+Y],mm6
+    psraw mm1,2
+    movq [0x70+Y],mm7
+    movq [0x48+Y],mm1
+#undef Y
+#undef A
+#undef X
+  }
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.c b/thirdparty/libtheora/x86_vc/mmxfrag.c
new file mode 100644
index 0000000000..4eb2084dc6
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/mmxfrag.c
@@ -0,0 +1,337 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxfrag.c 16578 2009-09-25 19:50:48Z cristianadam $
+
+ ********************************************************************/
+
+/*MMX acceleration of fragment reconstruction for motion compensation.
+  Originally written by Rudolf Marek.
+  Additional optimization by Nils Pipenbrinck.
+  Note: Loops are unrolled for best performance.
+  The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
+#include "x86int.h"
+#include "mmxfrag.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows. The #defines below name the registers that OC_FRAG_COPY_MMX expands to.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 esi
+  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+}
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){ /*Writes an 8x8 block to _dst (rows _ystride bytes apart): each of the 64 _residue words gets 128 added and is packed to a byte with unsigned saturation.*/
+  __asm{
+#define DST edx
+#define DST4 esi
+#define YSTRIDE eax
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+    mov DST,_dst
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    lea DST4,[DST+YSTRIDE*4]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+    pcmpeqw mm0,mm0
+    /*#0 Load low residue.*/
+    movq mm1,[0*8+RESIDUE]
+    /*#0 Load high residue.*/
+    movq mm2,[1*8+RESIDUE]
+    /*Set mm0 to 0x8000800080008000.*/
+    psllw mm0,15
+    /*#1 Load low residue.*/
+    movq mm3,[2*8+RESIDUE]
+    /*#1 Load high residue.*/
+    movq mm4,[3*8+RESIDUE]
+    /*Set mm0 to 0x0080008000800080.*/
+    psrlw mm0,8
+    /*#2 Load low residue.*/
+    movq mm5,[4*8+RESIDUE]
+    /*#2 Load high residue.*/
+    movq mm6,[5*8+RESIDUE]
+    /*#0 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#0 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#0 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#1 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#1 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#1 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#2 Bias low  residue.*/
+    paddsw mm5,mm0
+    /*#2 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#2 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#0 Write row.*/
+    movq [DST],mm1
+    /*#1 Write row.*/
+    movq [DST+YSTRIDE],mm3
+    /*#2 Write row.*/
+    movq [DST+YSTRIDE*2],mm5
+    /*#3 Load low residue.*/
+    movq mm1,[6*8+RESIDUE]
+    /*#3 Load high residue.*/
+    movq mm2,[7*8+RESIDUE]
+    /*#4 Load low residue.*/
+    movq mm3,[8*8+RESIDUE]
+    /*#4 Load high residue.*/
+    movq mm4,[9*8+RESIDUE]
+    /*#5 Load low residue.*/
+    movq mm5,[10*8+RESIDUE]
+    /*#5 Load high residue.*/
+    movq mm6,[11*8+RESIDUE]
+    /*#3 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#3 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#3 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#4 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#4 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#4 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#5 Bias low  residue.*/
+    paddsw mm5,mm0
+    /*#5 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#5 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#3 Write row.*/
+    movq [DST+YSTRIDE3],mm1
+    /*#4 Write row.*/
+    movq [DST4],mm3
+    /*#5 Write row.*/
+    movq [DST4+YSTRIDE],mm5
+    /*#6 Load low residue.*/
+    movq mm1,[12*8+RESIDUE]
+    /*#6 Load high residue.*/
+    movq mm2,[13*8+RESIDUE]
+    /*#7 Load low residue.*/
+    movq mm3,[14*8+RESIDUE]
+    /*#7 Load high residue.*/
+    movq mm4,[15*8+RESIDUE]
+    /*#6 Bias low  residue.*/
+    paddsw mm1,mm0
+    /*#6 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#6 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#7 Bias low  residue.*/
+    paddsw mm3,mm0
+    /*#7 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#7 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#6 Write row.*/
+    movq [DST4+YSTRIDE*2],mm1
+    /*#7 Write row.*/
+    movq [DST4+YSTRIDE3],mm3
+#undef DST
+#undef DST4
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+  }
+}
+
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){ /*Writes an 8x8 block to _dst: each _src byte is widened to 16 bits, the matching _residue word is added with signed saturation, and the sum is packed back to a byte with unsigned saturation.*/
+  int i;
+  /*Zero mm0; it supplies the zero bytes for the byte->word unpacks below.*/
+  __asm pxor mm0,mm0;
+  for(i=4;i-->0;){ /*4 passes of 2 rows each; every pass reloads the pointers from the parameters and stores the advanced values back.*/
+    __asm{
+#define DST edx
+#define SRC ecx
+#define YSTRIDE edi
+#define RESIDUE eax
+      mov DST,_dst
+      mov SRC,_src
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
+      /*#0 Load source.*/
+      movq mm3,[SRC]
+      /*#1 Load source.*/
+      movq mm7,[SRC+YSTRIDE]
+      /*#0 Get copy of src.*/
+      movq mm4,mm3
+      /*#0 Expand high source.*/
+      punpckhbw mm4,mm0
+      /*#0 Expand low  source.*/
+      punpcklbw mm3,mm0
+      /*#0 Add residue high.*/
+      paddsw mm4,[8+RESIDUE]
+      /*#1 Get copy of src.*/
+      movq mm2,mm7
+      /*#0 Add residue low.*/
+      paddsw  mm3,[RESIDUE]
+      /*#1 Expand high source.*/
+      punpckhbw mm2,mm0
+      /*#0 Pack final row pixels.*/
+      packuswb mm3,mm4
+      /*#1 Expand low  source.*/
+      punpcklbw mm7,mm0
+      /*#1 Add residue low.*/
+      paddsw mm7,[16+RESIDUE]
+      /*#1 Add residue high.*/
+      paddsw mm2,[24+RESIDUE]
+      /*Advance residue.*/
+      lea RESIDUE,[32+RESIDUE]
+      /*#1 Pack final row pixels.*/
+      packuswb mm7,mm2
+      /*Advance src.*/
+      lea SRC,[SRC+YSTRIDE*2]
+      /*#0 Write row.*/
+      movq [DST],mm3
+      /*#1 Write row.*/
+      movq [DST+YSTRIDE],mm7
+      /*Advance dst.*/
+      lea DST,[DST+YSTRIDE*2]
+      mov _residue,RESIDUE
+      mov _dst,DST
+      mov _src,SRC
+#undef DST
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+    }
+  }
+}
+
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){ /*Writes an 8x8 block to _dst: (_src1+_src2)>>1 is computed per pixel in 16 bits, the matching _residue word is added with signed saturation, and the sum is packed to a byte with unsigned saturation.*/
+  int i;
+  /*Zero mm7; it supplies the zero bytes for the byte->word unpacks below.*/
+  __asm pxor mm7,mm7;
+  for(i=4;i-->0;){ /*4 passes of 2 rows each; every pass reloads the pointers from the parameters and stores the advanced values back.*/
+    __asm{
+#define SRC1 ecx
+#define SRC2 edi
+#define YSTRIDE esi
+#define RESIDUE edx
+#define DST eax
+      mov YSTRIDE,_ystride
+      mov DST,_dst
+      mov RESIDUE,_residue
+      mov SRC1,_src1
+      mov SRC2,_src2
+      /*#0 Load src1.*/
+      movq mm0,[SRC1]
+      /*#0 Load src2.*/
+      movq mm2,[SRC2]
+      /*#0 Copy src1.*/
+      movq mm1,mm0
+      /*#0 Copy src2.*/
+      movq mm3,mm2
+      /*#1 Load src1.*/
+      movq mm4,[SRC1+YSTRIDE]
+      /*#0 Unpack lower src1.*/
+      punpcklbw mm0,mm7
+      /*#1 Load src2.*/
+      movq mm5,[SRC2+YSTRIDE]
+      /*#0 Unpack higher src1.*/
+      punpckhbw mm1,mm7
+      /*#0 Unpack lower src2.*/
+      punpcklbw mm2,mm7
+      /*#0 Unpack higher src2.*/
+      punpckhbw mm3,mm7
+      /*Advance src1 ptr.*/
+      lea SRC1,[SRC1+YSTRIDE*2]
+      /*Advance src2 ptr.*/
+      lea SRC2,[SRC2+YSTRIDE*2]
+      /*#0 Lower src1+src2.*/
+      paddsw mm0,mm2
+      /*#0 Higher src1+src2.*/
+      paddsw mm1,mm3
+      /*#1 Copy src1.*/
+      movq mm2,mm4
+      /*#0 Build lo average.*/
+      psraw mm0,1
+      /*#1 Copy src2.*/
+      movq mm3,mm5
+      /*#1 Unpack lower src1.*/
+      punpcklbw mm4,mm7
+      /*#0 Build hi average.*/
+      psraw mm1,1
+      /*#1 Unpack higher src1.*/
+      punpckhbw mm2,mm7
+      /*#0 low+=residue.*/
+      paddsw mm0,[RESIDUE]
+      /*#1 Unpack lower src2.*/
+      punpcklbw mm5,mm7
+      /*#0 high+=residue.*/
+      paddsw mm1,[8+RESIDUE]
+      /*#1 Unpack higher src2.*/
+      punpckhbw mm3,mm7
+      /*#1 Lower src1+src2.*/
+      paddsw mm5,mm4
+      /*#0 Pack and saturate.*/
+      packuswb mm0,mm1
+      /*#1 Higher src1+src2.*/
+      paddsw mm3,mm2
+      /*#0 Write row.*/
+      movq [DST],mm0
+      /*#1 Build lo average.*/
+      psraw mm5,1
+      /*#1 Build hi average.*/
+      psraw mm3,1
+      /*#1 low+=residue.*/
+      paddsw mm5,[16+RESIDUE]
+      /*#1 high+=residue.*/
+      paddsw mm3,[24+RESIDUE]
+      /*#1 Pack and saturate.*/
+      packuswb  mm5,mm3
+      /*#1 Write row.*/
+      movq [DST+YSTRIDE],mm5
+      /*Advance residue ptr.*/
+      add RESIDUE,32
+      /*Advance dest ptr.*/
+      lea DST,[DST+YSTRIDE*2]
+      mov _dst,DST
+      mov _residue,RESIDUE
+      mov _src1,SRC1
+      mov _src2,SRC2
+#undef SRC1
+#undef SRC2
+#undef YSTRIDE
+#undef RESIDUE
+#undef DST
+    }
+  }
+}
+
+void oc_restore_fpu_mmx(void){ /*Executes EMMS, which marks the MMX registers empty so x87 floating-point code can run again; the MMX routines in this file do not issue it themselves.*/
+  __asm emms;
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxfrag.h b/thirdparty/libtheora/x86_vc/mmxfrag.h
new file mode 100644
index 0000000000..45ee93e777
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/mmxfrag.h
@@ -0,0 +1,61 @@
+#if !defined(_x86_vc_mmxfrag_H)
+# define _x86_vc_mmxfrag_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows. The user must #define SRC, DST, YSTRIDE and YSTRIDE3 as register names before expanding this; _ystride is used directly as an __asm operand. Row offsets in the comments below are relative to the original pointers.*/
+#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm  mov SRC,src \
+    __asm  mov DST,dst \
+    __asm  mov YSTRIDE,_ystride \
+    /*src+0*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*ystride3=ystride*3*/ \
+    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*src+2*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*Advance SRC to the next 4 rows.*/ \
+    __asm  lea SRC,[SRC+YSTRIDE*4] \
+    /*dst+2*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+    /*Advance DST to the next 4 rows.*/ \
+    __asm  lea DST,[DST+YSTRIDE*4] \
+    /*src+4*ystride*/ \
+    __asm  movq mm0,[SRC] \
+    /*src+5*ystride*/ \
+    __asm  movq mm1,[SRC+YSTRIDE] \
+    /*src+6*ystride*/ \
+    __asm  movq mm2,[SRC+YSTRIDE*2] \
+    /*src+7*ystride*/ \
+    __asm  movq mm3,[SRC+YSTRIDE3] \
+    /*dst+4*ystride*/ \
+    __asm  movq [DST],mm0 \
+    /*dst+5*ystride*/ \
+    __asm  movq [DST+YSTRIDE],mm1 \
+    /*dst+6*ystride*/ \
+    __asm  movq [DST+YSTRIDE*2],mm2 \
+    /*dst+7*ystride*/ \
+    __asm  movq [DST+YSTRIDE3],mm3 \
+  } \
+  while(0)
+
+# endif
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxidct.c b/thirdparty/libtheora/x86_vc/mmxidct.c
new file mode 100644
index 0000000000..8f5ff6803c
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/mmxidct.c
@@ -0,0 +1,562 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*MMX acceleration of Theora's iDCT.
+  Originally written by Rudolf Marek, based on code from On2's VP3.*/
+#include "x86int.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*These are byte offsets into the table of constants below (8 bytes per row).*/
+/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
+#define OC_COSINE_OFFSET (0)
+/*A row of 8's, placed after the 7 cosine rows (7*8=56).*/
+#define OC_EIGHT_OFFSET  (56)
+
+
+
+/*A table of constants used by the MMX routines; each one is repeated to fill a 4-entry row.*/
+static const __declspec(align(16))ogg_uint16_t
+ OC_IDCT_CONSTS[(7+1)*4]={
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
+      8,    8,    8,    8
+};
+
+/*First stage shared by OC_ROW_IDCT and OC_COLUMN_IDCT (38 cycles); reads OC_I(0..3) and OC_J(4..7), overwrites OC_I(1) and OC_I(2).*/
+#define OC_IDCT_BEGIN __asm{ \
+  __asm movq mm2,OC_I(3) \
+  __asm movq mm6,OC_C(3) \
+  __asm movq mm4,mm2 \
+  __asm movq mm7,OC_J(5) \
+  __asm pmulhw mm4,mm6 \
+  __asm movq mm1,OC_C(5) \
+  __asm pmulhw mm6,mm7 \
+  __asm movq mm5,mm1 \
+  __asm pmulhw mm1,mm2 \
+  __asm movq mm3,OC_I(1) \
+  __asm pmulhw mm5,mm7 \
+  __asm movq mm0,OC_C(1) \
+  __asm paddw mm4,mm2 \
+  __asm paddw mm6,mm7 \
+  __asm paddw mm2,mm1 \
+  __asm movq mm1,OC_J(7) \
+  __asm paddw mm7,mm5 \
+  __asm movq mm5,mm0 \
+  __asm pmulhw mm0,mm3 \
+  __asm paddw mm4,mm7 \
+  __asm pmulhw mm5,mm1 \
+  __asm movq mm7,OC_C(7) \
+  __asm psubw mm6,mm2 \
+  __asm paddw mm0,mm3 \
+  __asm pmulhw mm3,mm7 \
+  __asm movq mm2,OC_I(2) \
+  __asm pmulhw mm7,mm1 \
+  __asm paddw mm5,mm1 \
+  __asm movq mm1,mm2 \
+  __asm pmulhw mm2,OC_C(2) \
+  __asm psubw mm3,mm5 \
+  __asm movq mm5,OC_J(6) \
+  __asm paddw mm0,mm7 \
+  __asm movq mm7,mm5 \
+  __asm psubw mm0,mm4 \
+  __asm pmulhw mm5,OC_C(2) \
+  __asm paddw mm2,mm1 \
+  __asm pmulhw mm1,OC_C(6) \
+  __asm paddw mm4,mm4 \
+  __asm paddw mm4,mm0 \
+  __asm psubw mm3,mm6 \
+  __asm paddw mm5,mm7 \
+  __asm paddw mm6,mm6 \
+  __asm pmulhw mm7,OC_C(6) \
+  __asm paddw mm6,mm3 \
+  __asm movq OC_I(1),mm4 /*Spill mm4 to I(1); it is reloaded into mm0 below.*/ \
+  __asm psubw mm1,mm5 \
+  __asm movq mm4,OC_C(4) \
+  __asm movq mm5,mm3 \
+  __asm pmulhw mm3,mm4 \
+  __asm paddw mm7,mm2 \
+  __asm movq OC_I(2),mm6 /*Spill mm6 to I(2); the macro that follows reloads it.*/ \
+  __asm movq mm2,mm0 \
+  __asm movq mm6,OC_I(0) \
+  __asm pmulhw mm0,mm4 \
+  __asm paddw mm5,mm3 \
+  __asm movq mm3,OC_J(4) \
+  __asm psubw mm5,mm1 \
+  __asm paddw mm2,mm0 \
+  __asm psubw mm6,mm3 \
+  __asm movq mm0,mm6 \
+  __asm pmulhw mm6,mm4 \
+  __asm paddw mm3,mm3 \
+  __asm paddw mm1,mm1 \
+  __asm paddw mm3,mm0 \
+  __asm paddw mm1,mm5 \
+  __asm pmulhw mm4,mm3 \
+  __asm paddw mm6,mm0 \
+  __asm psubw mm6,mm2 \
+  __asm paddw mm2,mm2 \
+  __asm movq mm0,OC_I(1) /*Reload the value spilled to I(1) above.*/ \
+  __asm paddw mm2,mm6 \
+  __asm paddw mm4,mm3 \
+  __asm psubw mm2,mm1 \
+}
+
+/*Row pass: R1 is saved to I(1), the other results stay in mm0 and mm2...mm7 for OC_TRANSPOSE (38+8=46 cycles).*/
+#define OC_ROW_IDCT __asm{ \
+  OC_IDCT_BEGIN \
+  /*r3=D'*/ \
+  __asm  movq mm3,OC_I(2) \
+  /*r4=E'=E-G*/ \
+  __asm  psubw mm4,mm7 \
+  /*r1=H'+H'*/ \
+  __asm  paddw mm1,mm1 \
+  /*r7=G+G*/ \
+  __asm  paddw mm7,mm7 \
+  /*r1=R1=A''+H'*/ \
+  __asm  paddw mm1,mm2 \
+  /*r7=G'=E+G*/ \
+  __asm  paddw mm7,mm4 \
+  /*r4=R4=E'-D'*/ \
+  __asm  psubw mm4,mm3 \
+  __asm  paddw mm3,mm3 \
+  /*r6=R6=F'-B''*/ \
+  __asm  psubw mm6,mm5 \
+  __asm  paddw mm5,mm5 \
+  /*r3=R3=E'+D'*/ \
+  __asm  paddw mm3,mm4 \
+  /*r5=R5=F'+B''*/ \
+  __asm  paddw mm5,mm6 \
+  /*r7=R7=G'-C'*/ \
+  __asm  psubw mm7,mm0 \
+  __asm  paddw mm0,mm0 \
+  /*Save R1.*/ \
+  __asm  movq OC_I(1),mm1 \
+  /*r0=R0=G'+C'*/ \
+  __asm  paddw mm0,mm7 \
+}
+
+/*The following macro does two 4x4 transposes in place.
+  At entry, we assume:
+    r0 = a3 a2 a1 a0
+  I(1) = b3 b2 b1 b0
+    r2 = c3 c2 c1 c0
+    r3 = d3 d2 d1 d0
+
+    r4 = e3 e2 e1 e0
+    r5 = f3 f2 f1 f0
+    r6 = g3 g2 g1 g0
+    r7 = h3 h2 h1 h0
+
+  At exit, we have:
+  I(0) = d0 c0 b0 a0
+  I(1) = d1 c1 b1 a1
+  I(2) = d2 c2 b2 a2
+  I(3) = d3 c3 b3 a3
+
+  J(4) = h0 g0 f0 e0
+  J(5) = h1 g1 f1 e1
+  J(6) = h2 g2 f2 e2
+  J(7) = h3 g3 f3 e3
+
+  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
+  J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7.
+
+  Since r1 is free at entry, we calculate the Js first, parking r0 in I(0).*/
+/*19 cycles.*/
+#define OC_TRANSPOSE __asm{ \
+  __asm movq mm1,mm4 \
+  __asm punpcklwd mm4,mm5 \
+  __asm movq OC_I(0),mm0 /*Park r0 so that mm0 can be used as scratch.*/ \
+  __asm punpckhwd mm1,mm5 \
+  __asm movq mm0,mm6 \
+  __asm punpcklwd mm6,mm7 \
+  __asm movq mm5,mm4 \
+  __asm punpckldq mm4,mm6 \
+  __asm punpckhdq mm5,mm6 \
+  __asm movq mm6,mm1 \
+  __asm movq OC_J(4),mm4 \
+  __asm punpckhwd mm0,mm7 \
+  __asm movq OC_J(5),mm5 \
+  __asm punpckhdq mm6,mm0 \
+  __asm movq mm4,OC_I(0) /*mm4=a3 a2 a1 a0 (the r0 parked above)*/ \
+  __asm punpckldq mm1,mm0 \
+  __asm movq mm5,OC_I(1) /*mm5=b3 b2 b1 b0*/ \
+  __asm movq mm0,mm4 \
+  __asm movq OC_J(7),mm6 \
+  __asm punpcklwd mm0,mm5 \
+  __asm movq OC_J(6),mm1 \
+  __asm punpckhwd mm4,mm5 \
+  __asm movq mm5,mm2 \
+  __asm punpcklwd mm2,mm3 \
+  __asm movq mm1,mm0 \
+  __asm punpckldq mm0,mm2 \
+  __asm punpckhdq mm1,mm2 \
+  __asm movq mm2,mm4 \
+  __asm movq OC_I(0),mm0 \
+  __asm punpckhwd mm5,mm3 \
+  __asm movq OC_I(1),mm1 \
+  __asm punpckhdq mm4,mm5 \
+  __asm punpckldq mm2,mm5 \
+  __asm movq OC_I(3),mm4 \
+  __asm movq OC_I(2),mm2 \
+}
+
+/*Column pass: adds the rounding bias OC_8, shifts right by 4 and stores all 8 results (38+19=57 cycles).*/
+#define OC_COLUMN_IDCT __asm{ \
+  OC_IDCT_BEGIN \
+  __asm paddw mm2,OC_8 /*Bias r2 before it is folded into r1 and shifted.*/ \
+  /*r1=H'+H'*/ \
+  __asm paddw mm1,mm1 \
+  /*r1=R1=A''+H'*/ \
+  __asm paddw mm1,mm2 \
+  /*r2=NR2*/ \
+  __asm psraw mm2,4 \
+  /*r4=E'=E-G*/ \
+  __asm psubw mm4,mm7 \
+  /*r1=NR1*/ \
+  __asm psraw mm1,4 \
+  /*r3=D'*/ \
+  __asm movq mm3,OC_I(2) \
+  /*r7=G+G*/ \
+  __asm paddw mm7,mm7 \
+  /*Store NR2 at I(2).*/ \
+  __asm movq OC_I(2),mm2 \
+  /*r7=G'=E+G*/ \
+  __asm paddw mm7,mm4 \
+  /*Store NR1 at I(1).*/ \
+  __asm movq OC_I(1),mm1 \
+  /*r4=R4=E'-D'*/ \
+  __asm psubw mm4,mm3 \
+  __asm paddw mm4,OC_8 \
+  /*r3=D'+D'*/ \
+  __asm paddw mm3,mm3 \
+  /*r3=R3=E'+D'*/ \
+  __asm paddw mm3,mm4 \
+  /*r4=NR4*/ \
+  __asm psraw mm4,4 \
+  /*r6=R6=F'-B''*/ \
+  __asm psubw mm6,mm5 \
+  /*r3=NR3*/ \
+  __asm psraw mm3,4 \
+  __asm paddw mm6,OC_8 \
+  /*r5=B''+B''*/ \
+  __asm paddw mm5,mm5 \
+  /*r5=R5=F'+B''*/ \
+  __asm paddw mm5,mm6 \
+  /*r6=NR6*/ \
+  __asm psraw mm6,4 \
+  /*Store NR4 at J(4).*/ \
+  __asm movq OC_J(4),mm4 \
+  /*r5=NR5*/ \
+  __asm psraw mm5,4 \
+  /*Store NR3 at I(3).*/ \
+  __asm movq OC_I(3),mm3 \
+  /*r7=R7=G'-C'*/ \
+  __asm psubw mm7,mm0 \
+  __asm paddw mm7,OC_8 \
+  /*r0=C'+C'*/ \
+  __asm paddw mm0,mm0 \
+  /*r0=R0=G'+C'*/ \
+  __asm paddw mm0,mm7 \
+  /*r7=NR7*/ \
+  __asm psraw mm7,4 \
+  /*Store NR6 at J(6).*/ \
+  __asm movq OC_J(6),mm6 \
+  /*r0=NR0*/ \
+  __asm psraw mm0,4 \
+  /*Store NR5 at J(5).*/ \
+  __asm movq OC_J(5),mm5 \
+  /*Store NR7 at J(7).*/ \
+  __asm movq OC_J(7),mm7 \
+  /*Store NR0 at I(0).*/ \
+  __asm movq OC_I(0),mm0 \
+}
+
+#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8] /*8-byte row _i past byte offset _m; CONSTS is a register macro.*/
+#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1) /*Cosine row _i, counted from 1.*/
+#define OC_8          OC_MID(OC_EIGHT_OFFSET,0) /*The row of 8's.*/
+
+static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+  /*This routine accepts an 8x8 matrix, but in partially transposed form.
+    Every 4x4 block is transposed; the result is written back over _y.*/
+  __asm{
+#define CONSTS eax
+#define Y edx
+    mov CONSTS,offset OC_IDCT_CONSTS
+    mov Y,_y
+#define OC_I(_k)      [Y+_k*16] /*Row pass 1, k=0..3: first 8 bytes of 16-byte rows 0-3.*/
+#define OC_J(_k)      [Y+(_k-4)*16+8] /*k=4..7: second 8 bytes of rows 0-3.*/
+    OC_ROW_IDCT
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      [Y+(_k*16)+64] /*Row pass 2: the same layout 64 bytes on (rows 4-7).*/
+#define OC_J(_k)      [Y+(_k-4)*16+72]
+    OC_ROW_IDCT
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      [Y+_k*16] /*Column pass 1, k=0..7: first 8 bytes of every row.*/
+#define OC_J(_k)      OC_I(_k)
+    OC_COLUMN_IDCT
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k)      [Y+_k*16+8] /*Column pass 2: second 8 bytes of every row.*/
+#define OC_J(_k)      OC_I(_k)
+    OC_COLUMN_IDCT
+#undef  OC_I
+#undef  OC_J
+#undef  CONSTS
+#undef  Y
+  }
+}
+
+/*Reduced first stage (25 cycles): loads only OC_I(0)...OC_I(3), no OC_J operand is read; overwrites OC_I(1) and OC_I(2).*/
+#define OC_IDCT_BEGIN_10 __asm{ \
+  __asm movq mm2,OC_I(3) \
+  __asm nop \
+  __asm movq mm6,OC_C(3) \
+  __asm movq mm4,mm2 \
+  __asm movq mm1,OC_C(5) \
+  __asm pmulhw mm4,mm6 \
+  __asm movq mm3,OC_I(1) \
+  __asm pmulhw mm1,mm2 \
+  __asm movq mm0,OC_C(1) \
+  __asm paddw mm4,mm2 \
+  __asm pxor mm6,mm6 \
+  __asm paddw mm2,mm1 \
+  __asm movq mm5,OC_I(2) \
+  __asm pmulhw mm0,mm3 \
+  __asm movq mm1,mm5 \
+  __asm paddw mm0,mm3 \
+  __asm pmulhw mm3,OC_C(7) \
+  __asm psubw mm6,mm2 \
+  __asm pmulhw mm5,OC_C(2) \
+  __asm psubw mm0,mm4 \
+  __asm movq mm7,OC_I(2) \
+  __asm paddw mm4,mm4 \
+  __asm paddw mm7,mm5 \
+  __asm paddw mm4,mm0 \
+  __asm pmulhw mm1,OC_C(6) \
+  __asm psubw mm3,mm6 \
+  __asm movq OC_I(1),mm4 /*Spill mm4 to I(1); it is reloaded into mm0 below.*/ \
+  __asm paddw mm6,mm6 \
+  __asm movq mm4,OC_C(4) \
+  __asm paddw mm6,mm3 \
+  __asm movq mm5,mm3 \
+  __asm pmulhw mm3,mm4 \
+  __asm movq OC_I(2),mm6 /*Spill mm6 to I(2); the macro that follows reloads it.*/ \
+  __asm movq mm2,mm0 \
+  __asm movq mm6,OC_I(0) \
+  __asm pmulhw mm0,mm4 \
+  __asm paddw mm5,mm3 \
+  __asm paddw mm2,mm0 \
+  __asm psubw mm5,mm1 \
+  __asm pmulhw mm6,mm4 \
+  __asm paddw mm6,OC_I(0) \
+  __asm paddw mm1,mm1 \
+  __asm movq mm4,mm6 \
+  __asm paddw mm1,mm5 \
+  __asm psubw mm6,mm2 \
+  __asm paddw mm2,mm2 \
+  __asm movq mm0,OC_I(1) /*Reload the value spilled to I(1) above.*/ \
+  __asm paddw mm2,mm6 \
+  __asm psubw mm2,mm1 \
+  __asm nop \
+}
+
+/*Row pass built on OC_IDCT_BEGIN_10; the tail is the same instruction sequence as in OC_ROW_IDCT (25+8=33 cycles).*/
+#define OC_ROW_IDCT_10 __asm{ \
+  OC_IDCT_BEGIN_10 \
+  /*r3=D'*/ \
+   __asm movq mm3,OC_I(2) \
+  /*r4=E'=E-G*/ \
+   __asm psubw mm4,mm7 \
+  /*r1=H'+H'*/ \
+   __asm paddw mm1,mm1 \
+  /*r7=G+G*/ \
+   __asm paddw mm7,mm7 \
+  /*r1=R1=A''+H'*/ \
+   __asm paddw mm1,mm2 \
+  /*r7=G'=E+G*/ \
+   __asm paddw mm7,mm4 \
+  /*r4=R4=E'-D'*/ \
+   __asm psubw mm4,mm3 \
+   __asm paddw mm3,mm3 \
+  /*r6=R6=F'-B''*/ \
+   __asm psubw mm6,mm5 \
+   __asm paddw mm5,mm5 \
+  /*r3=R3=E'+D'*/ \
+   __asm paddw mm3,mm4 \
+  /*r5=R5=F'+B''*/ \
+   __asm paddw mm5,mm6 \
+  /*r7=R7=G'-C'*/ \
+   __asm psubw mm7,mm0 \
+   __asm paddw mm0,mm0 \
+  /*Save R1 to I(1); the other results stay in registers.*/ \
+   __asm movq OC_I(1),mm1 \
+  /*r0=R0=G'+C'*/ \
+   __asm paddw mm0,mm7 \
+}
+
+/*Column pass built on OC_IDCT_BEGIN_10: adds OC_8, shifts right by 4 and stores all 8 results (25+19=44 cycles).*/
+#define OC_COLUMN_IDCT_10 __asm{ \
+  OC_IDCT_BEGIN_10 \
+  __asm paddw mm2,OC_8 /*Bias r2 before it is folded into r1 and shifted.*/ \
+  /*r1=H'+H'*/ \
+  __asm paddw mm1,mm1 \
+  /*r1=R1=A''+H'*/ \
+  __asm paddw mm1,mm2 \
+  /*r2=NR2*/ \
+  __asm psraw mm2,4 \
+  /*r4=E'=E-G*/ \
+  __asm psubw mm4,mm7 \
+  /*r1=NR1*/ \
+  __asm psraw mm1,4 \
+  /*r3=D'*/ \
+  __asm movq mm3,OC_I(2) \
+  /*r7=G+G*/ \
+  __asm paddw mm7,mm7 \
+  /*Store NR2 at I(2).*/ \
+  __asm movq OC_I(2),mm2 \
+  /*r7=G'=E+G*/ \
+  __asm paddw mm7,mm4 \
+  /*Store NR1 at I(1).*/ \
+  __asm movq OC_I(1),mm1 \
+  /*r4=R4=E'-D'*/ \
+  __asm psubw mm4,mm3 \
+  __asm paddw mm4,OC_8 \
+  /*r3=D'+D'*/ \
+  __asm paddw mm3,mm3 \
+  /*r3=R3=E'+D'*/ \
+  __asm paddw mm3,mm4 \
+  /*r4=NR4*/ \
+  __asm psraw mm4,4 \
+  /*r6=R6=F'-B''*/ \
+  __asm psubw mm6,mm5 \
+  /*r3=NR3*/ \
+  __asm psraw mm3,4 \
+  __asm paddw mm6,OC_8 \
+  /*r5=B''+B''*/ \
+  __asm paddw mm5,mm5 \
+  /*r5=R5=F'+B''*/ \
+  __asm paddw mm5,mm6 \
+  /*r6=NR6*/ \
+  __asm psraw mm6,4 \
+  /*Store NR4 at J(4).*/ \
+  __asm movq OC_J(4),mm4 \
+  /*r5=NR5*/ \
+  __asm psraw mm5,4 \
+  /*Store NR3 at I(3).*/ \
+  __asm movq OC_I(3),mm3 \
+  /*r7=R7=G'-C'*/ \
+  __asm psubw mm7,mm0 \
+  __asm paddw mm7,OC_8 \
+  /*r0=C'+C'*/ \
+  __asm paddw mm0,mm0 \
+  /*r0=R0=G'+C'*/ \
+  __asm paddw mm0,mm7 \
+  /*r7=NR7*/ \
+  __asm psraw mm7,4 \
+  /*Store NR6 at J(6).*/ \
+  __asm movq OC_J(6),mm6 \
+  /*r0=NR0*/ \
+  __asm psraw mm0,4 \
+  /*Store NR5 at J(5).*/ \
+  __asm movq OC_J(5),mm5 \
+  /*Store NR7 at J(7).*/ \
+  __asm movq OC_J(7),mm7 \
+  /*Store NR0 at I(0).*/ \
+  __asm movq OC_I(0),mm0 \
+}
+
+static void oc_idct8x8_10(ogg_int16_t _y[64]){
+  __asm{
+#define CONSTS eax
+#define Y edx
+    mov CONSTS,offset OC_IDCT_CONSTS
+    mov Y,_y
+#define OC_I(_k) [Y+_k*16] /*k=0..3: first 8 bytes of 16-byte rows 0-3.*/
+#define OC_J(_k) [Y+(_k-4)*16+8] /*k=4..7: second 8 bytes of rows 0-3.*/
+    /*Done with dequant, descramble, and partial transpose.
+      Now do the iDCT itself: a single row pass, then both column passes.*/
+    OC_ROW_IDCT_10
+    OC_TRANSPOSE
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) [Y+_k*16] /*Column pass 1, k=0..7: first 8 bytes of every row.*/
+#define OC_J(_k) OC_I(_k)
+    OC_COLUMN_IDCT_10
+#undef  OC_I
+#undef  OC_J
+#define OC_I(_k) [Y+_k*16+8] /*Column pass 2: second 8 bytes of every row.*/
+#define OC_J(_k) OC_I(_k)
+    OC_COLUMN_IDCT_10
+#undef  OC_I
+#undef  OC_J
+#undef  CONSTS
+#undef  Y
+  }
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform in place on _y.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.*/
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+  /*_last_zzi is not quite a count of the coefficients decoded for this
+     block: it is the value zzi had BEFORE the block's final token was
+     decoded.
+    That token is usually an EOB (continuing an EOB run that started in an
+     earlier block counts as one), and then the two numbers are the same.
+    They differ when the final token was NOT an EOB, yet filled the block up
+     to exactly 64 coefficients: _last_zzi is then less than 64.
+    As long as that token was not a pure zero run, _last_zzi is at least 46,
+     which cannot change the branch taken below.
+    If it WAS a pure zero run of length 63, however, _last_zzi is 1 even
+     though 64 coefficients were decoded, so the special case below is taken
+     where a real coefficient count would not have taken it.
+    Likewise a zero run of length 64 gives a _last_zzi of 0, but the DC
+     coefficient is still processed, as DC prediction may have made it
+     non-zero.
+    Convoluted as this is, it is arguably the correct behavior: a block that
+     ends with a long zero run instead of a normal EOB token still gets the
+     smaller transform.
+    It could be smarter... several separate zero runs at the end of a block
+     will fool it, but an encoder that produces those deserves what it gets.
+    Needless to say, this approach was inherited from VP3.*/
+  /*Perform the iDCT, using the reduced transform whenever _last_zzi<10.*/
+  if(_last_zzi>=10){
+    oc_idct8x8_slow(_y);
+    return;
+  }
+  oc_idct8x8_10(_y);
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxloop.h b/thirdparty/libtheora/x86_vc/mmxloop.h
new file mode 100644
index 0000000000..2561fca2ae
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/mmxloop.h
@@ -0,0 +1,219 @@
+#if !defined(_x86_vc_mmxloop_H)
+# define _x86_vc_mmxloop_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7},
+   and [LL] holds the 8 limit bytes; on exit, mm1={b0+lflim(R_0,L),...} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3-mm7 are clobbered.*/
+#define OC_LOOP_FILTER8_MMX __asm{ \
+  /*mm7=0*/ \
+  __asm pxor mm7,mm7 \
+  /*mm6:mm0={a0,...,a7}*/ \
+  __asm movq mm6,mm0 \
+  __asm punpcklbw mm0,mm7 \
+  __asm punpckhbw mm6,mm7 \
+  /*mm5:mm3={d0,...,d7}*/ \
+  __asm movq mm5,mm3 \
+  __asm punpcklbw mm3,mm7 \
+  __asm punpckhbw mm5,mm7 \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  __asm psubw mm0,mm3 \
+  __asm psubw mm6,mm5 \
+  /*mm3:mm1={b0,...,b7}*/ \
+  __asm movq mm3,mm1 \
+  __asm punpcklbw mm1,mm7 \
+  __asm movq mm4,mm2 \
+  __asm punpckhbw mm3,mm7 \
+  /*mm5:mm4={c0,...,c7}*/ \
+  __asm movq mm5,mm2 \
+  __asm punpcklbw mm4,mm7 \
+  __asm punpckhbw mm5,mm7 \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  __asm pcmpeqw mm7,mm7 \
+  __asm psubw mm4,mm1 \
+  __asm psrlw mm7,14 \
+  __asm psubw mm5,mm3 \
+  /*Scale by 3.*/ \
+  __asm pmullw mm4,mm7 \
+  __asm pmullw mm5,mm7 \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  __asm psrlw mm7,1 \
+  __asm paddw mm4,mm0 \
+  __asm psllw mm7,2 \
+  __asm movq mm0,[LL] \
+  __asm paddw mm5,mm6 \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  __asm psubw mm4,mm7 \
+  __asm psubw mm5,mm7 \
+  __asm psraw mm4,3 \
+  __asm psraw mm5,3 \
+  __asm pcmpeqb mm7,mm7 \
+  __asm packsswb mm4,mm5 \
+  __asm pxor mm6,mm6 \
+  __asm pxor mm4,mm7 \
+  __asm packuswb mm1,mm3 \
+  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  __asm pcmpgtb mm6,mm4 \
+  __asm psubb mm7,mm0 \
+  __asm pxor mm4,mm6 \
+  __asm psubb mm7,mm0 \
+  __asm psubb mm4,mm6 \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  __asm paddusb mm7,mm4 \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  __asm paddusb mm4,mm7 \
+  __asm psubusb mm4,mm7 \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  __asm movq mm5,mm4 \
+  __asm pand mm4,mm6 \
+  __asm pandn mm6,mm5 \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  __asm paddusb mm1,mm4 \
+  __asm psubusb mm2,mm4 \
+  __asm psubusb mm1,mm6 \
+  __asm paddusb mm2,mm6 \
+}
+
+#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+  do{ \
+    /*Filters 8 pixels across the edge just above row _pix; local pix__ fixes errors like: \
+       "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \
+    unsigned char *pix__; \
+    unsigned char *ll__; \
+    ll__=(_ll); \
+    pix__=(_pix); \
+    __asm mov YSTRIDE,_ystride \
+    __asm mov LL,ll__ \
+    __asm mov PIX,pix__ \
+    __asm sub PIX,YSTRIDE \
+    __asm sub PIX,YSTRIDE /*PIX=_pix-2*_ystride, the first of the four rows.*/ \
+    /*mm0={a0,...,a7}*/ \
+    __asm movq mm0,[PIX] \
+    /*ystride3=_ystride*3*/ \
+    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*mm3={d0,...,d7}*/ \
+    __asm movq mm3,[PIX+YSTRIDE3] \
+    /*mm1={b0,...,b7}*/ \
+    __asm movq mm1,[PIX+YSTRIDE] \
+    /*mm2={c0,...,c7}*/ \
+    __asm movq mm2,[PIX+YSTRIDE*2] \
+    OC_LOOP_FILTER8_MMX \
+    /*Write it back out: only rows b and c are modified.*/ \
+    __asm movq [PIX+YSTRIDE],mm1 \
+    __asm movq [PIX+YSTRIDE*2],mm2 \
+  } \
+  while(0)
+
+#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+  do{ \
+    /*Filters 8 rows across the edge just left of _pix; local ll__ fixes errors like: \
+       "error C2443: operand size conflict".*/ \
+    unsigned char *ll__; \
+    unsigned char *pix__; \
+    ll__=(_ll); \
+    pix__=(_pix)-2; /*Start 2 pixels left of _pix, at column a.*/ \
+    __asm mov PIX,pix__ \
+    __asm mov YSTRIDE,_ystride \
+    __asm mov LL,ll__ \
+    /*x x x x d0 c0 b0 a0*/ \
+    __asm movd mm0,[PIX] \
+    /*x x x x d1 c1 b1 a1*/ \
+    __asm movd mm1,[PIX+YSTRIDE] \
+    /*ystride3=_ystride*3*/ \
+    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*x x x x d2 c2 b2 a2*/ \
+    __asm movd mm2,[PIX+YSTRIDE*2] \
+    /*x x x x d3 c3 b3 a3*/ \
+    __asm lea D,[PIX+YSTRIDE*4] \
+    __asm movd mm3,[PIX+YSTRIDE3] \
+    /*x x x x d4 c4 b4 a4*/ \
+    __asm movd mm4,[D] \
+    /*x x x x d5 c5 b5 a5*/ \
+    __asm movd mm5,[D+YSTRIDE] \
+    /*x x x x d6 c6 b6 a6*/ \
+    __asm movd mm6,[D+YSTRIDE*2] \
+    /*x x x x d7 c7 b7 a7*/ \
+    __asm movd mm7,[D+YSTRIDE3] \
+    /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+    __asm punpcklbw mm0,mm1 \
+    /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
+    __asm punpcklbw mm2,mm3 \
+    /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+    __asm movq mm3,mm0 \
+    /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+    __asm punpcklwd mm0,mm2 \
+    /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+    __asm punpckhwd mm3,mm2 \
+    /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+    __asm movq mm1,mm0 \
+    /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+    __asm punpcklbw mm4,mm5 \
+    /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
+    __asm punpcklbw mm6,mm7 \
+    /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+    __asm movq mm5,mm4 \
+    /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
+    __asm punpcklwd mm4,mm6 \
+    /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
+    __asm punpckhwd mm5,mm6 \
+    /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+    __asm movq mm2,mm3 \
+    /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
+    __asm punpckldq mm0,mm4 \
+    /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
+    __asm punpckhdq mm1,mm4 \
+    /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
+    __asm punpckldq mm2,mm5 \
+    /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
+    __asm punpckhdq mm3,mm5 \
+    OC_LOOP_FILTER8_MMX \
+    /*mm0={b0+R_0'',...,b7+R_7''}*/ \
+    __asm movq mm0,mm1 \
+    /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
+    __asm punpcklbw mm1,mm2 \
+    /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
+    __asm punpckhbw mm0,mm2 \
+    /*[d]=c1 b1 c0 b0*/ \
+    __asm movd D,mm1 \
+    __asm mov [PIX+1],D_WORD /*PIX+1 is column b; each 16-bit store writes a b,c pair.*/ \
+    __asm psrlq mm1,32 \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE+1],D_WORD \
+    /*[d]=c3 b3 c2 b2*/ \
+    __asm movd D,mm1 \
+    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE3+1],D_WORD \
+    __asm lea PIX,[PIX+YSTRIDE*4] \
+    /*[d]=c5 b5 c4 b4*/ \
+    __asm movd D,mm0 \
+    __asm mov [PIX+1],D_WORD \
+    __asm psrlq mm0,32 \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE+1],D_WORD \
+    /*[d]=c7 b7 c6 b6*/ \
+    __asm movd D,mm0 \
+    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
+    __asm shr D,16 \
+    __asm mov [PIX+YSTRIDE3+1],D_WORD \
+  } \
+  while(0)
+
+# endif
+#endif
diff --git a/thirdparty/libtheora/x86_vc/mmxstate.c b/thirdparty/libtheora/x86_vc/mmxstate.c
new file mode 100644
index 0000000000..73bd1981cf
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/mmxstate.c
@@ -0,0 +1,211 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
+
+ ********************************************************************/
+
+/*MMX acceleration of complete fragment reconstruction algorithm.
+  Originally written by Rudolf Marek.*/
+#include <string.h>
+#include "x86int.h"
+#include "mmxfrag.h"
+#include "mmxloop.h"
+
+#if defined(OC_X86_ASM)
+
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            mb_mode;
+  /*Apply the inverse transform; _dct_coeffs is overwritten in place.*/
+  /*Special case only having a DC component: all 64 entries get the same value.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, to keep the __asm block from
+       sign-extending it when it puts it in a register.*/
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding; the expression parses as (product+15)>>5.*/
+    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill _dct_coeffs with p: 16 stores of 8 bytes cover all 64 entries.*/
+    __asm{
+#define Y eax
+#define P ecx
+      mov Y,_dct_coeffs
+      movzx P,p
+      /*mm0=0000 0000 0000 AAAA*/
+      movd mm0,P
+      /*mm0=0000 0000 AAAA AAAA*/
+      punpcklwd mm0,mm0
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      punpckldq mm0,mm0
+      movq [Y],mm0
+      movq [8+Y],mm0
+      movq [16+Y],mm0
+      movq [24+Y],mm0
+      movq [32+Y],mm0
+      movq [40+Y],mm0
+      movq [48+Y],mm0
+      movq [56+Y],mm0
+      movq [64+Y],mm0
+      movq [72+Y],mm0
+      movq [80+Y],mm0
+      movq [88+Y],mm0
+      movq [96+Y],mm0
+      movq [104+Y],mm0
+      movq [112+Y],mm0
+      movq [120+Y],mm0
+#undef Y
+#undef P
+    }
+  }
+  else{
+    /*Dequantize the DC coefficient (no rounding here), then run the full iDCT.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+  }
+  /*Fill in the target buffer: intra, one-reference inter, or two-reference inter.*/
+  frag_buf_off=_state->frag_buf_offs[_fragi];
+  mb_mode=_state->frags[_fragi].mb_mode;
+  ystride=_state->ref_ystride[_pli];
+  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  else{
+    const unsigned char *ref;
+    int                  mvoffsets[2];
+    ref=
+     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+     +frag_buf_off;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){ /*>1: both mvoffsets entries are used.*/
+      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+       _dct_coeffs);
+    }
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+  }
+}
+
+/*We copy these entire functions to inline the actual MMX routines so that we
+   use only a single indirect call.*/
+
+/*Copies the 8x8 fragments named by a list of fragment indices from one
+   reference frame of _state to another.
+  _fragis:    A pointer to a list of fragment indices.
+  _nfragis:   The number of fragment indices to copy.
+  _dst_frame: The ref_frame_idx slot of the frame to copy to.
+  _src_frame: The ref_frame_idx slot of the frame to copy from.
+  _pli:       The color plane the fragments lie in (selects ref_ystride).*/
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli){
+  const ptrdiff_t     *frag_buf_offs;
+  const unsigned char *src_frame_data;
+  unsigned char       *dst_frame_data;
+  ptrdiff_t            fragii;
+  int                  ystride;
+  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
+  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
+  ystride=_state->ref_ystride[_pli];
+  frag_buf_offs=_state->frag_buf_offs;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=frag_buf_offs[_fragis[fragii]]; /*The same byte offset is used in both frames.*/
+#define SRC edx /*Registers that OC_FRAG_COPY_MMX expects to find defined.*/
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
+     src_frame_data+frag_buf_off,ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array (not referenced by this version).
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  OC_ALIGN8(unsigned char  ll[8]);
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); /*8 copies of the limit, read via [LL].*/
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+#define PIX eax
+#define YSTRIDE3 edi
+#define YSTRIDE ecx
+#define LL edx
+#define D esi
+#define D_WORD si
+        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll); /*Left edge, unless first in its row.*/
+        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll); /*Top edge, unless in the plane's top row.*/
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){ /*Right edge, only when the right neighbor is uncoded.*/
+          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ /*Bottom edge, only when the fragment below is uncoded.*/
+          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+        }
+#undef PIX
+#undef YSTRIDE3
+#undef YSTRIDE
+#undef LL
+#undef D
+#undef D_WORD
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/x86enc.c b/thirdparty/libtheora/x86_vc/x86enc.c
new file mode 100644
index 0000000000..e1960e1f0b
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86enc.c
@@ -0,0 +1,49 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+#include "../cpu.c"
+
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+  /*oc_enc_vtable_init_c() fills the table first; CPU-specific entries follow.*/
+  ogg_uint32_t caps;
+  caps=oc_cpu_flags_get();
+  oc_enc_vtable_init_c(_enc);
+  if((caps&OC_CPU_X86_MMX)!=0){
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
+    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
+    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
+  }
+  if((caps&OC_CPU_X86_MMXEXT)!=0){
+    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
+    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
+    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
+    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
+    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
+  }
+# if defined(OC_X86_64_ASM)
+  /*Tested last so that it replaces the MMX fdct8x8 entry set above.*/
+  if((caps&OC_CPU_X86_SSE2)!=0)_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+# endif
+}
+#endif
diff --git a/thirdparty/libtheora/x86_vc/x86enc.h b/thirdparty/libtheora/x86_vc/x86enc.h
new file mode 100644
index 0000000000..581484641f
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86enc.h
@@ -0,0 +1,47 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86enc_H)
+# define _x86_vc_x86enc_H (1)
+# include "../encint.h"
+# include "x86int.h"
+
+void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+/*x86 routines that oc_enc_vtable_init_x86() may install, per CPU flags.*/
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/x86int.h b/thirdparty/libtheora/x86_vc/x86int.h
new file mode 100644
index 0000000000..4cca485311
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86int.h
@@ -0,0 +1,42 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86int_H)
+# define _x86_vc_x86int_H (1)
+# include "../internal.h"
+
+void oc_state_vtable_init_x86(oc_theora_state *_state);
+/*MMX routines that oc_state_vtable_init_x86() installs when MMX is present.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
+ int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
+ int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_restore_fpu_mmx(void);
+
+#endif
diff --git a/thirdparty/libtheora/x86_vc/x86state.c b/thirdparty/libtheora/x86_vc/x86state.c
new file mode 100644
index 0000000000..a786bec284
--- /dev/null
+++ b/thirdparty/libtheora/x86_vc/x86state.c
@@ -0,0 +1,62 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+#include "../cpu.c"
+
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination; oc_state_vtable_init_x86() installs it.*/
+static const unsigned char OC_FZIG_ZAG_MMX[128]={
+   0, 8, 1, 2, 9,16,24,17, /*Entries 0...63: a permutation of 0...63.*/
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64, /*Entries 64...127: every one is 64.*/
+  64,64,64,64,64,64,64,64, /*NOTE(review): likely a sink for out-of-range*/
+  64,64,64,64,64,64,64,64, /* indices; confirm against OC_FZIG_ZAG.*/
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+};
+/*Installs the MMX routines, or defers to oc_state_vtable_init_c() without MMX.*/
+void oc_state_vtable_init_x86(oc_theora_state *_state){
+  _state->cpu_flags=oc_cpu_flags_get();
+  if(!(_state->cpu_flags&OC_CPU_X86_MMX))oc_state_vtable_init_c(_state);
+  else{
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
+    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
+    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmx;
+  }
+}
+#endif