19 files changed, 2785 insertions, 687 deletions
diff --git a/thirdparty/libtheora/x86/mmxencfrag.c b/thirdparty/libtheora/x86/mmxencfrag.c
index c79ff01fcc..cc9be8d867 100644
--- a/thirdparty/libtheora/x86/mmxencfrag.c
+++ b/thirdparty/libtheora/x86/mmxencfrag.c
@@ -65,7 +65,7 @@ unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
     "paddw %%mm6,%%mm0\n\t"
     "paddw %%mm2,%%mm0\n\t"
     "movd %%mm0,%[ret]\n\t"
-    :[ret]"=a"(ret),[src]"+%r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
+    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
     :[ystride]"r"((ptrdiff_t)_ystride)
   );
   return (unsigned)ret;
@@ -87,7 +87,9 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
    The latter is exactly 1 too large when the low bit of two corresponding \
     bytes is only set in one of them. \
    Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
-    correct the output of pavgb.*/ \
+    correct the output of pavgb. \
+   TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
+    schedules better; currently, however, this function is unused.*/ \
  "movq %%mm0,%%mm6\n\t" \
  "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
  "pxor %%mm1,%%mm0\n\t" \
@@ -153,7 +155,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
     OC_SAD2_LOOP
     OC_SAD2_LOOP
     OC_SAD2_TAIL
-    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+%r"(_ref1),[ref2]"+r"(_ref2)
+    :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
     :[ystride]"r"((ptrdiff_t)_ystride)
   );
   return (unsigned)ret;
@@ -163,54 +165,54 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    16-bit difference in %%mm0...%%mm7.*/
 #define OC_LOAD_SUB_8x4(_off) \
  "#OC_LOAD_SUB_8x4\n\t" \
- "movd "_off"(%[src]),%%mm0\n\t" \
- "movd "_off"(%[ref]),%%mm4\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[ref]),%%mm4\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
- "movd "_off"(%[src]),%%mm2\n\t" \
- "movd "_off"(%[ref]),%%mm7\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
+ "movd "#_off"(%[src]),%%mm2\n\t" \
+ "movd "#_off"(%[ref]),%%mm7\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
  "punpcklbw %%mm4,%%mm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%mm4,%%mm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
- "movd "_off"(%[src]),%%mm4\n\t" \
- "movq %%mm0,"_off"*2(%[buf])\n\t" \
- "movd "_off"(%[ref]),%%mm0\n\t" \
+ "movd "#_off"(%[src]),%%mm4\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
  "punpcklbw %%mm5,%%mm1\n\t" \
  "punpcklbw %%mm5,%%mm5\n\t" \
  "psubw %%mm5,%%mm1\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm2\n\t" \
  "punpcklbw %%mm7,%%mm7\n\t" \
  "psubw %%mm7,%%mm2\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
  "punpcklbw %%mm6,%%mm3\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "punpcklbw %%mm6,%%mm6\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
- "movd "_off"(%[src]),%%mm6\n\t" \
+ "movd "#_off"(%[src]),%%mm6\n\t" \
  "punpcklbw %%mm0,%%mm4\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%mm0,%%mm0\n\t" \
  "lea (%[src],%[src_ystride],2),%[src]\n\t" \
  "psubw %%mm0,%%mm4\n\t" \
- "movd "_off"(%[ref]),%%mm0\n\t" \
+ "movd "#_off"(%[ref]),%%mm0\n\t" \
  "punpcklbw %%mm7,%%mm5\n\t" \
  "neg %[src_ystride]\n\t" \
  "punpcklbw %%mm7,%%mm7\n\t" \
  "psubw %%mm7,%%mm5\n\t" \
- "movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \
+ "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
  "punpcklbw %%mm0,%%mm6\n\t" \
  "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
  "punpcklbw %%mm0,%%mm0\n\t" \
  "neg %[ref_ystride]\n\t" \
  "psubw %%mm0,%%mm6\n\t" \
- "movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
+ "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
  "lea (%[src],%[src_ystride],8),%[src]\n\t" \
  "punpcklbw %%mm0,%%mm7\n\t" \
  "neg %[src_ystride]\n\t" \
@@ -218,24 +220,24 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
  "psubw %%mm0,%%mm7\n\t" \
  "neg %[ref_ystride]\n\t" \
- "movq "_off"*2(%[buf]),%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
 
 /*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
 #define OC_LOAD_8x4(_off) \
  "#OC_LOAD_8x4\n\t" \
- "movd "_off"(%[src]),%%mm0\n\t" \
- "movd "_off"(%[src],%[ystride]),%%mm1\n\t" \
- "movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \
+ "movd "#_off"(%[src]),%%mm0\n\t" \
+ "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
+ "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
  "pxor %%mm7,%%mm7\n\t" \
- "movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \
+ "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
  "punpcklbw %%mm7,%%mm0\n\t" \
- "movd "_off"(%[src4]),%%mm4\n\t" \
+ "movd "#_off"(%[src4]),%%mm4\n\t" \
  "punpcklbw %%mm7,%%mm1\n\t" \
- "movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \
+ "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
  "punpcklbw %%mm7,%%mm2\n\t" \
- "movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \
+ "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
  "punpcklbw %%mm7,%%mm3\n\t" \
- "movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \
+ "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
  "punpcklbw %%mm4,%%mm4\n\t" \
  "punpcklbw %%mm5,%%mm5\n\t" \
  "psrlw $8,%%mm4\n\t" \
@@ -248,7 +250,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 /*Performs the first two stages of an 8-point 1-D Hadamard transform.
   The transform is performed in place, except that outputs 0-3 are swapped with
    outputs 4-7.
-  Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to
+  Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
    perform this stage in place with no temporary registers).*/
 #define OC_HADAMARD_AB_8x4 \
  "#OC_HADAMARD_AB_8x4\n\t" \
@@ -281,7 +283,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "psubw %%mm5,%%mm7\n\t" \
 
 /*Performs the last stage of an 8-point 1-D Hadamard transform in place.
-  Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
    place with no temporary registers).*/
 #define OC_HADAMARD_C_8x4 \
  "#OC_HADAMARD_C_8x4\n\t" \
@@ -324,8 +326,8 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
    This implementation is only 26 (+4 for spilling registers).*/ \
  "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
- "movq %%mm7,"_r7"(%[buf])\n\t" \
- "movq %%mm6,"_r6"(%[buf])\n\t" \
+ "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
+ "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
  /*mm7={0x7FFF}x4 \
    mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
  "pcmpeqb %%mm7,%%mm7\n\t" \
@@ -343,14 +345,14 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "pmaxsw %%mm5,%%mm4\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "paddw %%mm5,%%mm1\n\t" \
- "movq "_r7"(%[buf]),%%mm3\n\t" \
+ "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
 
 /*Performs the second part of the final stage of the Hadamard transform and
    summing of absolute values.*/
 #define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
  "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
  "paddsw %%mm7,%%mm6\n\t" \
- "movq "_r6"(%[buf]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
  "paddsw %%mm7,%%mm1\n\t" \
  "psubw %%mm6,%%mm2\n\t" \
  "psubw %%mm1,%%mm4\n\t" \
@@ -391,7 +393,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
 #define OC_TRANSPOSE_4x4x2(_off) \
  "#OC_TRANSPOSE_4x4x2\n\t" \
  /*First 4x4 transpose:*/ \
- "movq %%mm5,0x10+"_off"(%[buf])\n\t" \
+ "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
  /*mm0 = e3 e2 e1 e0 \
    mm1 = f3 f2 f1 f0 \
    mm2 = g3 g2 g1 g0 \
@@ -411,13 +413,13 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  "punpckhdq %%mm2,%%mm1\n\t" \
  "movq %%mm3,%%mm2\n\t" \
  "punpckhdq %%mm5,%%mm3\n\t" \
- "movq %%mm0,0x40+"_off"(%[buf])\n\t" \
+ "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
  "punpckldq %%mm5,%%mm2\n\t" \
  /*mm0 = h0 g0 f0 e0 \
    mm1 = h1 g1 f1 e1 \
    mm2 = h2 g2 f2 e2 \
    mm3 = h3 g3 f3 e3*/ \
- "movq 0x10+"_off"(%[buf]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
  /*Second 4x4 transpose:*/ \
  /*mm4 = a3 a2 a1 a0 \
    mm5 = b3 b2 b1 b0 \
@@ -425,11 +427,11 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    mm7 = d3 d2 d1 d0*/ \
  "movq %%mm6,%%mm0\n\t" \
  "punpcklwd %%mm7,%%mm6\n\t" \
- "movq %%mm1,0x50+"_off"(%[buf])\n\t" \
+ "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
  "punpckhwd %%mm7,%%mm0\n\t" \
  "movq %%mm4,%%mm7\n\t" \
  "punpcklwd %%mm5,%%mm4\n\t" \
- "movq %%mm2,0x60+"_off"(%[buf])\n\t" \
+ "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
  "punpckhwd %%mm5,%%mm7\n\t" \
  /*mm4 = b1 a1 b0 a0 \
    mm7 = b3 a3 b2 a2 \
@@ -437,7 +439,7 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    mm0 = d3 c3 d2 c2*/ \
  "movq %%mm4,%%mm5\n\t" \
  "punpckldq %%mm6,%%mm4\n\t" \
- "movq %%mm3,0x70+"_off"(%[buf])\n\t" \
+ "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
  "punpckhdq %%mm6,%%mm5\n\t" \
  "movq %%mm7,%%mm6\n\t" \
  "punpckhdq %%mm0,%%mm7\n\t" \
@@ -447,100 +449,102 @@ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
    mm6 = d2 c2 b2 a2 \
    mm7 = d3 c3 b3 a3*/ \
 
-static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src,
- int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     ret2;
-  bufp=buf;
+static unsigned oc_int_frag_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  unsigned ret;
+  unsigned ret2;
+  int      dc;
   __asm__ __volatile__(
-    OC_LOAD_SUB_8x4("0x00")
+    OC_LOAD_SUB_8x4(0x00)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x00")
+    OC_TRANSPOSE_4x4x2(0x00)
     /*Finish swapping out this 8x4 block to make room for the next one.
       mm0...mm3 have been swapped out already.*/
-    "movq %%mm4,0x00(%[buf])\n\t"
-    "movq %%mm5,0x10(%[buf])\n\t"
-    "movq %%mm6,0x20(%[buf])\n\t"
-    "movq %%mm7,0x30(%[buf])\n\t"
-    OC_LOAD_SUB_8x4("0x04")
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_SUB_8x4(0x04)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x08")
+    OC_TRANSPOSE_4x4x2(0x08)
     /*Here the first 4x4 block of output from the last transpose is the second
        4x4 block of input for the next transform.
       We have cleverly arranged that it already be in the appropriate place, so
        we only have to do half the loads.*/
-    "movq 0x10(%[buf]),%%mm1\n\t"
-    "movq 0x20(%[buf]),%%mm2\n\t"
-    "movq 0x30(%[buf]),%%mm3\n\t"
-    "movq 0x00(%[buf]),%%mm0\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x28","0x38")
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x4
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    "movd %%mm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
       Now we finally have to promote things to dwords.
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
-    "mov %[thresh],%[ret2]\n\t"
     "pmaddwd %%mm7,%%mm0\n\t"
-    "movq 0x50(%[buf]),%%mm1\n\t"
-    "movq 0x58(%[buf]),%%mm5\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
     "movq %%mm0,%%mm4\n\t"
-    "movq 0x60(%[buf]),%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
-    "movq 0x68(%[buf]),%%mm6\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movq 0x70(%[buf]),%%mm3\n\t"
-    "movd %%mm4,%[ret]\n\t"
-    "movq 0x78(%[buf]),%%mm7\n\t"
-    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
-       added to them, and a factor of two removed; correct the final sum here.*/
-    "lea -32(%[ret],%[ret]),%[ret]\n\t"
-    "movq 0x40(%[buf]),%%mm0\n\t"
-    "cmp %[ret2],%[ret]\n\t"
-    "movq 0x48(%[buf]),%%mm4\n\t"
-    "jae 1f\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
+    "movd %%mm4,%[ret2]\n\t"
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     "pmaddwd %%mm7,%%mm0\n\t"
-    /*There isn't much to stick in here to hide the latency this time, but the
-       alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose
-       latency is even worse.*/
-    "sub $32,%[ret]\n\t"
+    /*Subtract abs(dc) from 2*ret2 (cdq puts dc's sign mask m in %[ret]; abs(dc)==(dc^m)-m).*/
+    "movsx %w[dc],%[dc]\n\t"
+    "cdq\n\t"
+    "lea (%[ret],%[ret2],2),%[ret2]\n\t"
     "movq %%mm0,%%mm4\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
+    "xor %[dc],%[ret]\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movd %%mm4,%[ret2]\n\t"
-    "lea (%[ret],%[ret2],2),%[ret]\n\t"
-    ".p2align 4,,15\n\t"
-    "1:\n\t"
-    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    "sub %[ret],%[ret2]\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
        and %[ret2] with some of the inputs, since for once we don't write to
-       them until after we're done using everything but %[buf] (which is also
-       listed as an output to ensure gcc _doesn't_ alias them against it).*/
+       them until after we're done using everything but %[buf].*/
     /*Note that _src_ystride and _ref_ystride must be given non-overlapping
        constraints, otherewise if gcc can prove they're equal it will allocate
        them to the same register (which is bad); _src and _ref face a similar
        problem, though those are never actually the same.*/
-    :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
+    :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
-     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride),
-     [thresh]"m"(_thresh)
+     [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
     /*We have to use neg, so we actually clobber the condition codes for once
        (not to mention cmp, sub, and add).*/
     :"cc"
   );
+  *_dc=dc;
   return ret;
 }
 
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh){
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
 }
 
 /*Our internal implementation of frag_copy2 takes an extra stride parameter so
-   we can share code with oc_enc_frag_satd2_thresh_mmxext().*/
-static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+   we can share code with oc_enc_frag_satd2_mmxext().*/
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
   __asm__ __volatile__(
     /*Load the first 3 rows.*/
@@ -649,55 +653,53 @@ static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
     "psubb %%mm4,%%mm2\n\t"
     /*%%mm2 (row 7) is done, write it out.*/
     "movq %%mm2,(%[dst],%[dst_ystride])\n\t"
-    :[dst]"+r"(_dst),[src1]"+%r"(_src1),[src2]"+r"(_src2)
+    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
     :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
      [src_ystride]"r"((ptrdiff_t)_src_ystride)
     :"memory"
   );
 }
 
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh){
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
   OC_ALIGN8(unsigned char ref[64]);
   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
-  return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh);
+  return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
 }
 
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
- int _ystride){
-  OC_ALIGN8(ogg_int16_t  buf[64]);
-  ogg_int16_t *bufp;
-  unsigned     ret;
-  unsigned     ret2;
-  bufp=buf;
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _ystride){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  unsigned ret;
+  unsigned ret2;
+  int      dc;
   __asm__ __volatile__(
-    OC_LOAD_8x4("0x00")
+    OC_LOAD_8x4(0x00)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x00")
+    OC_TRANSPOSE_4x4x2(0x00)
     /*Finish swapping out this 8x4 block to make room for the next one.
       mm0...mm3 have been swapped out already.*/
-    "movq %%mm4,0x00(%[buf])\n\t"
-    "movq %%mm5,0x10(%[buf])\n\t"
-    "movq %%mm6,0x20(%[buf])\n\t"
-    "movq %%mm7,0x30(%[buf])\n\t"
-    OC_LOAD_8x4("0x04")
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    OC_LOAD_8x4(0x04)
     OC_HADAMARD_8x4
-    OC_TRANSPOSE_4x4x2("0x08")
+    OC_TRANSPOSE_4x4x2(0x08)
     /*Here the first 4x4 block of output from the last transpose is the second
        4x4 block of input for the next transform.
       We have cleverly arranged that it already be in the appropriate place, so
        we only have to do half the loads.*/
-    "movq 0x10(%[buf]),%%mm1\n\t"
-    "movq 0x20(%[buf]),%%mm2\n\t"
-    "movq 0x30(%[buf]),%%mm3\n\t"
-    "movq 0x00(%[buf]),%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
     /*We split out the stages here so we can save the DC coefficient in the
        middle.*/
     OC_HADAMARD_AB_8x4
-    OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38")
-    "movd %%mm1,%[ret]\n\t"
-    OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38")
+    OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
+    "movd %%mm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
     /*Up to this point, everything fit in 16 bits (8 input + 1 for the
        difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
        for the factor of two we dropped + 3 for the vertical accumulation).
@@ -705,41 +707,43 @@ unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,
       We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
        latency of pmaddwd by starting the next series of loads now.*/
     "pmaddwd %%mm7,%%mm0\n\t"
-    "movq 0x50(%[buf]),%%mm1\n\t"
-    "movq 0x58(%[buf]),%%mm5\n\t"
-    "movq 0x60(%[buf]),%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
+    "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
     "movq %%mm0,%%mm4\n\t"
-    "movq 0x68(%[buf]),%%mm6\n\t"
+    "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
-    "movq 0x70(%[buf]),%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movq 0x78(%[buf]),%%mm7\n\t"
-    "movd %%mm4,%[ret2]\n\t"
-    "movq 0x40(%[buf]),%%mm0\n\t"
-    "movq 0x48(%[buf]),%%mm4\n\t"
-    OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78")
+    "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
+    "movd %%mm4,%[ret]\n\t"
+    "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
+    "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
+    OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
     "pmaddwd %%mm7,%%mm0\n\t"
     /*We assume that the DC coefficient is always positive (which is true,
        because the input to the INTRA transform was not a difference).*/
-    "movzx %w[ret],%[ret]\n\t"
-    "add %[ret2],%[ret2]\n\t"
-    "sub %[ret],%[ret2]\n\t"
+    "movzx %w[dc],%[dc]\n\t"
+    "add %[ret],%[ret]\n\t"
+    "sub %[dc],%[ret]\n\t"
     "movq %%mm0,%%mm4\n\t"
     "punpckhdq %%mm0,%%mm0\n\t"
     "paddd %%mm0,%%mm4\n\t"
-    "movd %%mm4,%[ret]\n\t"
-    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
-    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+    "movd %%mm4,%[ret2]\n\t"
+    "lea -64(%[ret],%[ret2],2),%[ret]\n\t"
+    /*Although it looks like we're using 8 registers here, gcc can alias %[ret]
        and %[ret2] with some of the inputs, since for once we don't write to
        them until after we're done using everything but %[buf] (which is also
        listed as an output to ensure gcc _doesn't_ alias them against it).*/
-    :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp)
+    :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
      [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
     /*We have to use sub, so we actually clobber the condition codes for once
        (not to mention add).*/
     :"cc"
   );
+  *_dc=dc;
   return ret;
 }
 
diff --git a/thirdparty/libtheora/x86/mmxfdct.c b/thirdparty/libtheora/x86/mmxfdct.c
index 211875255e..17668358b8 100644
--- a/thirdparty/libtheora/x86/mmxfdct.c
+++ b/thirdparty/libtheora/x86/mmxfdct.c
@@ -12,6 +12,7 @@
 /*MMX fDCT implementation for x86_32*/
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include "x86enc.h"
+#include "x86zigzag.h"
 
 #if defined(OC_X86_ASM)
 
@@ -462,8 +463,9 @@
    mm7 = d3 c3 b3 a3*/ \
 
 /*MMX implementation of the fDCT.*/
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
-  ptrdiff_t a;
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  OC_ALIGN8(ogg_int16_t buf[64]);
+  ptrdiff_t   a;
   __asm__ __volatile__(
     /*Add two extra bits of working precision to improve accuracy; any more and
        we could overflow.*/
@@ -586,77 +588,88 @@ void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     "movq 0x30(%[y]),%%mm3\n\t"
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
-    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
-    /*mm0={-2}x4*/
-    "pcmpeqw %%mm0,%%mm0\n\t"
-    "paddw %%mm0,%%mm0\n\t"
-    /*Round the results.*/
-    "psubw %%mm0,%%mm1\n\t"
-    "psubw %%mm0,%%mm2\n\t"
-    "psraw $2,%%mm1\n\t"
-    "psubw %%mm0,%%mm3\n\t"
-    "movq %%mm1,0x18(%[y])\n\t"
-    "psraw $2,%%mm2\n\t"
-    "psubw %%mm0,%%mm4\n\t"
-    "movq 0x08(%[y]),%%mm1\n\t"
-    "psraw $2,%%mm3\n\t"
-    "psubw %%mm0,%%mm5\n\t"
+    /*mm2={-2}x4*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
+    "paddw %%mm2,%%mm2\n\t"
+    /*Round and store the results (no transpose).*/
+    "movq 0x10(%[y]),%%mm7\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    "psraw $2,%%mm5\n\t"
-    "psubw %%mm0,%%mm7\n\t"
+    "psubw %%mm2,%%mm0\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
+    "movq 0x30(%[y]),%%mm4\n\t"
     "psraw $2,%%mm6\n\t"
-    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm2,%%mm5\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
+    "psraw $2,%%mm0\n\t"
+    "psubw %%mm2,%%mm3\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm2,%%mm1\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm2,%%mm7\n\t"
+    "movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
     "psraw $2,%%mm7\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
+    "psraw $2,%%mm4\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
+    /*Load the next block.*/
     "movq 0x40(%[y]),%%mm0\n\t"
-    "psraw $2,%%mm1\n\t"
-    "movq %%mm7,0x30(%[y])\n\t"
     "movq 0x78(%[y]),%%mm7\n\t"
-    "movq %%mm1,0x08(%[y])\n\t"
     "movq 0x50(%[y]),%%mm1\n\t"
-    "movq %%mm6,0x20(%[y])\n\t"
     "movq 0x68(%[y]),%%mm6\n\t"
-    "movq %%mm2,0x28(%[y])\n\t"
     "movq 0x60(%[y]),%%mm2\n\t"
-    "movq %%mm5,0x10(%[y])\n\t"
     "movq 0x58(%[y]),%%mm5\n\t"
-    "movq %%mm3,0x38(%[y])\n\t"
     "movq 0x70(%[y]),%%mm3\n\t"
-    "movq %%mm4,0x00(%[y])\n\t"
     "movq 0x48(%[y]),%%mm4\n\t"
     OC_FDCT_STAGE1_8x4
     OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
-    OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
-    /*mm0={-2}x4*/
-    "pcmpeqw %%mm0,%%mm0\n\t"
-    "paddw %%mm0,%%mm0\n\t"
-    /*Round the results.*/
-    "psubw %%mm0,%%mm1\n\t"
-    "psubw %%mm0,%%mm2\n\t"
-    "psraw $2,%%mm1\n\t"
-    "psubw %%mm0,%%mm3\n\t"
-    "movq %%mm1,0x58(%[y])\n\t"
-    "psraw $2,%%mm2\n\t"
-    "psubw %%mm0,%%mm4\n\t"
-    "movq 0x48(%[y]),%%mm1\n\t"
-    "psraw $2,%%mm3\n\t"
-    "psubw %%mm0,%%mm5\n\t"
-    "movq %%mm2,0x68(%[y])\n\t"
+    /*mm2={-2}x4*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
+    "paddw %%mm2,%%mm2\n\t"
+    /*Round and store the results (no transpose).*/
+    "movq 0x50(%[y]),%%mm7\n\t"
+    "psubw %%mm2,%%mm4\n\t"
+    "psubw %%mm2,%%mm6\n\t"
     "psraw $2,%%mm4\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    "movq %%mm3,0x78(%[y])\n\t"
-    "psraw $2,%%mm5\n\t"
-    "psubw %%mm0,%%mm7\n\t"
-    "movq %%mm4,0x40(%[y])\n\t"
+    "psubw %%mm2,%%mm0\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
+    "movq 0x70(%[y]),%%mm4\n\t"
     "psraw $2,%%mm6\n\t"
-    "psubw %%mm0,%%mm1\n\t"
-    "movq %%mm5,0x50(%[y])\n\t"
-    "psraw $2,%%mm7\n\t"
-    "movq %%mm6,0x60(%[y])\n\t"
+    "psubw %%mm2,%%mm5\n\t"
+    "movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
+    "psraw $2,%%mm0\n\t"
+    "psubw %%mm2,%%mm3\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm2,%%mm1\n\t"
+    "movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm2,%%mm7\n\t"
+    "movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
     "psraw $2,%%mm1\n\t"
-    "movq %%mm7,0x70(%[y])\n\t"
-    "movq %%mm1,0x48(%[y])\n\t"
-    :[a]"=&r"(a)
+    "psubw %%mm2,%%mm4\n\t"
+    "movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
+    "psraw $2,%%mm4\n\t"
+    "movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
+    /*Final transpose and zig-zag.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    "movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    "movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
+    :[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
     :[y]"r"(_y),[x]"r"(_x)
     :"memory"
   );
diff --git a/thirdparty/libtheora/x86/mmxfrag.c b/thirdparty/libtheora/x86/mmxfrag.c
index 2c732939c3..b3ec508956 100644
--- a/thirdparty/libtheora/x86/mmxfrag.c
+++ b/thirdparty/libtheora/x86/mmxfrag.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -22,17 +22,92 @@
   The iteration each instruction belongs to is marked in the comments as #i.*/
 #include <stddef.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 
 #if defined(OC_X86_ASM)
 
 /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
    between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    ptrdiff_t            ystride3; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm__ __volatile__( \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*ystride3=ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[src],%[ystride],4),%[src]\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.*/
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride){
   OC_FRAG_COPY_MMX(_dst,_src,_ystride);
 }
 
+/*Copies the fragments specified by a list of fragment indices from one frame
+   to another.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+     _src_frame+frag_buf_off,_ystride);
+  }
+}
+
+
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue){
   __asm__ __volatile__(
@@ -280,7 +355,7 @@ void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
       /*Advance dest ptr.*/
       "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
-      [src1]"+%r"(_src1),[src2]"+r"(_src2)
+      [src1]"+r"(_src1),[src2]"+r"(_src2)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
     );
diff --git a/thirdparty/libtheora/x86/mmxfrag.h b/thirdparty/libtheora/x86/mmxfrag.h
deleted file mode 100644
index a398427629..0000000000
--- a/thirdparty/libtheora/x86/mmxfrag.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#if !defined(_x86_mmxfrag_H)
-# define _x86_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
-   between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
-  do{ \
-    const unsigned char *src; \
-    unsigned char       *dst; \
-    ptrdiff_t            ystride3; \
-    src=(_src); \
-    dst=(_dst); \
-    __asm__ __volatile__( \
-      /*src+0*ystride*/ \
-      "movq (%[src]),%%mm0\n\t" \
-      /*src+1*ystride*/ \
-      "movq (%[src],%[ystride]),%%mm1\n\t" \
-      /*ystride3=ystride*3*/ \
-      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
-      /*src+2*ystride*/ \
-      "movq (%[src],%[ystride],2),%%mm2\n\t" \
-      /*src+3*ystride*/ \
-      "movq (%[src],%[ystride3]),%%mm3\n\t" \
-      /*dst+0*ystride*/ \
-      "movq %%mm0,(%[dst])\n\t" \
-      /*dst+1*ystride*/ \
-      "movq %%mm1,(%[dst],%[ystride])\n\t" \
-      /*Pointer to next 4.*/ \
-      "lea (%[src],%[ystride],4),%[src]\n\t" \
-      /*dst+2*ystride*/ \
-      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
-      /*dst+3*ystride*/ \
-      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
-      /*Pointer to next 4.*/ \
-      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
-      /*src+0*ystride*/ \
-      "movq (%[src]),%%mm0\n\t" \
-      /*src+1*ystride*/ \
-      "movq (%[src],%[ystride]),%%mm1\n\t" \
-      /*src+2*ystride*/ \
-      "movq (%[src],%[ystride],2),%%mm2\n\t" \
-      /*src+3*ystride*/ \
-      "movq (%[src],%[ystride3]),%%mm3\n\t" \
-      /*dst+0*ystride*/ \
-      "movq %%mm0,(%[dst])\n\t" \
-      /*dst+1*ystride*/ \
-      "movq %%mm1,(%[dst],%[ystride])\n\t" \
-      /*dst+2*ystride*/ \
-      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
-      /*dst+3*ystride*/ \
-      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
-      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
-      :[ystride]"r"((ptrdiff_t)(_ystride)) \
-      :"memory" \
-    ); \
-  } \
-  while(0)
-
-# endif
-#endif
diff --git a/thirdparty/libtheora/x86/mmxidct.c b/thirdparty/libtheora/x86/mmxidct.c
index 76424e6364..b8e3077066 100644
--- a/thirdparty/libtheora/x86/mmxidct.c
+++ b/thirdparty/libtheora/x86/mmxidct.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -30,89 +30,66 @@
 
 
 
-/*A table of constants used by the MMX routines.*/
-static const ogg_uint16_t __attribute__((aligned(8),used))
- OC_IDCT_CONSTS[(7+1)*4]={
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-  (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
-      8,    8,    8,    8
-};
-
-/*Converts the expression in the argument to a string.*/
-#define OC_M2STR(_s) #_s
-
 /*38 cycles*/
-#define OC_IDCT_BEGIN \
+#define OC_IDCT_BEGIN(_y,_x) \
   "#OC_IDCT_BEGIN\n\t" \
-  "movq "OC_I(3)",%%mm2\n\t" \
-  "movq "OC_C(3)",%%mm6\n\t" \
+  "movq "OC_I(3,_x)",%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
   "movq %%mm2,%%mm4\n\t" \
-  "movq "OC_J(5)",%%mm7\n\t" \
+  "movq "OC_J(5,_x)",%%mm7\n\t" \
   "pmulhw %%mm6,%%mm4\n\t" \
-  "movq "OC_C(5)",%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
   "pmulhw %%mm7,%%mm6\n\t" \
   "movq %%mm1,%%mm5\n\t" \
   "pmulhw %%mm2,%%mm1\n\t" \
-  "movq "OC_I(1)",%%mm3\n\t" \
+  "movq "OC_I(1,_x)",%%mm3\n\t" \
   "pmulhw %%mm7,%%mm5\n\t" \
-  "movq "OC_C(1)",%%mm0\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
   "paddw %%mm2,%%mm4\n\t" \
   "paddw %%mm7,%%mm6\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
-  "movq "OC_J(7)",%%mm1\n\t" \
+  "movq "OC_J(7,_x)",%%mm1\n\t" \
   "paddw %%mm5,%%mm7\n\t" \
   "movq %%mm0,%%mm5\n\t" \
   "pmulhw %%mm3,%%mm0\n\t" \
   "paddw %%mm7,%%mm4\n\t" \
   "pmulhw %%mm1,%%mm5\n\t" \
-  "movq "OC_C(7)",%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
   "psubw %%mm2,%%mm6\n\t" \
   "paddw %%mm3,%%mm0\n\t" \
   "pmulhw %%mm7,%%mm3\n\t" \
-  "movq "OC_I(2)",%%mm2\n\t" \
+  "movq "OC_I(2,_x)",%%mm2\n\t" \
   "pmulhw %%mm1,%%mm7\n\t" \
   "paddw %%mm1,%%mm5\n\t" \
   "movq %%mm2,%%mm1\n\t" \
-  "pmulhw "OC_C(2)",%%mm2\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
   "psubw %%mm5,%%mm3\n\t" \
-  "movq "OC_J(6)",%%mm5\n\t" \
+  "movq "OC_J(6,_x)",%%mm5\n\t" \
   "paddw %%mm7,%%mm0\n\t" \
   "movq %%mm5,%%mm7\n\t" \
   "psubw %%mm4,%%mm0\n\t" \
-  "pmulhw "OC_C(2)",%%mm5\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
   "paddw %%mm1,%%mm2\n\t" \
-  "pmulhw "OC_C(6)",%%mm1\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
   "paddw %%mm4,%%mm4\n\t" \
   "paddw %%mm0,%%mm4\n\t" \
   "psubw %%mm6,%%mm3\n\t" \
   "paddw %%mm7,%%mm5\n\t" \
   "paddw %%mm6,%%mm6\n\t" \
-  "pmulhw "OC_C(6)",%%mm7\n\t" \
+  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
   "paddw %%mm3,%%mm6\n\t" \
-  "movq %%mm4,"OC_I(1)"\n\t" \
+  "movq %%mm4,"OC_I(1,_y)"\n\t" \
   "psubw %%mm5,%%mm1\n\t" \
-  "movq "OC_C(4)",%%mm4\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
   "movq %%mm3,%%mm5\n\t" \
   "pmulhw %%mm4,%%mm3\n\t" \
   "paddw %%mm2,%%mm7\n\t" \
-  "movq %%mm6,"OC_I(2)"\n\t" \
+  "movq %%mm6,"OC_I(2,_y)"\n\t" \
   "movq %%mm0,%%mm2\n\t" \
-  "movq "OC_I(0)",%%mm6\n\t" \
+  "movq "OC_I(0,_x)",%%mm6\n\t" \
   "pmulhw %%mm4,%%mm0\n\t" \
   "paddw %%mm3,%%mm5\n\t" \
-  "movq "OC_J(4)",%%mm3\n\t" \
+  "movq "OC_J(4,_x)",%%mm3\n\t" \
   "psubw %%mm1,%%mm5\n\t" \
   "paddw %%mm0,%%mm2\n\t" \
   "psubw %%mm3,%%mm6\n\t" \
@@ -126,18 +103,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "paddw %%mm0,%%mm6\n\t" \
   "psubw %%mm2,%%mm6\n\t" \
   "paddw %%mm2,%%mm2\n\t" \
-  "movq "OC_I(1)",%%mm0\n\t" \
+  "movq "OC_I(1,_y)",%%mm0\n\t" \
   "paddw %%mm6,%%mm2\n\t" \
   "paddw %%mm3,%%mm4\n\t" \
   "psubw %%mm1,%%mm2\n\t" \
   "#end OC_IDCT_BEGIN\n\t" \
 
 /*38+8=46 cycles.*/
-#define OC_ROW_IDCT \
+#define OC_ROW_IDCT(_y,_x) \
   "#OC_ROW_IDCT\n" \
-  OC_IDCT_BEGIN \
+  OC_IDCT_BEGIN(_y,_x) \
   /*r3=D'*/ \
-  "movq "OC_I(2)",%%mm3\n\t" \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
   /*r4=E'=E-G*/ \
   "psubw %%mm7,%%mm4\n\t" \
   /*r1=H'+H'*/ \
@@ -162,7 +139,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "psubw %%mm0,%%mm7\n\t" \
   "paddw %%mm0,%%mm0\n\t" \
   /*Save R1.*/ \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
   /*r0=R0=G.+C.*/ \
   "paddw %%mm7,%%mm0\n\t" \
   "#end OC_ROW_IDCT\n\t" \
@@ -195,11 +172,11 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
 
   Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
-#define OC_TRANSPOSE \
+#define OC_TRANSPOSE(_y) \
   "#OC_TRANSPOSE\n\t" \
   "movq %%mm4,%%mm1\n\t" \
   "punpcklwd %%mm5,%%mm4\n\t" \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
   "punpckhwd %%mm5,%%mm1\n\t" \
   "movq %%mm6,%%mm0\n\t" \
   "punpcklwd %%mm7,%%mm6\n\t" \
@@ -207,17 +184,17 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "punpckldq %%mm6,%%mm4\n\t" \
   "punpckhdq %%mm6,%%mm5\n\t" \
   "movq %%mm1,%%mm6\n\t" \
-  "movq %%mm4,"OC_J(4)"\n\t" \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
   "punpckhwd %%mm7,%%mm0\n\t" \
-  "movq %%mm5,"OC_J(5)"\n\t" \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
   "punpckhdq %%mm0,%%mm6\n\t" \
-  "movq "OC_I(0)",%%mm4\n\t" \
+  "movq "OC_I(0,_y)",%%mm4\n\t" \
   "punpckldq %%mm0,%%mm1\n\t" \
-  "movq "OC_I(1)",%%mm5\n\t" \
+  "movq "OC_I(1,_y)",%%mm5\n\t" \
   "movq %%mm4,%%mm0\n\t" \
-  "movq %%mm6,"OC_J(7)"\n\t" \
+  "movq %%mm6,"OC_J(7,_y)"\n\t" \
   "punpcklwd %%mm5,%%mm0\n\t" \
-  "movq %%mm1,"OC_J(6)"\n\t" \
+  "movq %%mm1,"OC_J(6,_y)"\n\t" \
   "punpckhwd %%mm5,%%mm4\n\t" \
   "movq %%mm2,%%mm5\n\t" \
   "punpcklwd %%mm3,%%mm2\n\t" \
@@ -225,20 +202,20 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "punpckldq %%mm2,%%mm0\n\t" \
   "punpckhdq %%mm2,%%mm1\n\t" \
   "movq %%mm4,%%mm2\n\t" \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
   "punpckhwd %%mm3,%%mm5\n\t" \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
   "punpckhdq %%mm5,%%mm4\n\t" \
   "punpckldq %%mm5,%%mm2\n\t" \
-  "movq %%mm4,"OC_I(3)"\n\t" \
-  "movq %%mm2,"OC_I(2)"\n\t" \
+  "movq %%mm4,"OC_I(3,_y)"\n\t" \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
   "#end OC_TRANSPOSE\n\t" \
 
 /*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT \
+#define OC_COLUMN_IDCT(_y) \
   "#OC_COLUMN_IDCT\n" \
-  OC_IDCT_BEGIN \
-  "paddw "OC_8",%%mm2\n\t" \
+  OC_IDCT_BEGIN(_y,_y) \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
   /*r1=H'+H'*/ \
   "paddw %%mm1,%%mm1\n\t" \
   /*r1=R1=A''+H'*/ \
@@ -250,18 +227,18 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   /*r1=NR1*/ \
   "psraw $4,%%mm1\n\t" \
   /*r3=D'*/ \
-  "movq "OC_I(2)",%%mm3\n\t" \
+  "movq "OC_I(2,_y)",%%mm3\n\t" \
   /*r7=G+G*/ \
   "paddw %%mm7,%%mm7\n\t" \
   /*Store NR2 at I(2).*/ \
-  "movq %%mm2,"OC_I(2)"\n\t" \
+  "movq %%mm2,"OC_I(2,_y)"\n\t" \
   /*r7=G'=E+G*/ \
   "paddw %%mm4,%%mm7\n\t" \
   /*Store NR1 at I(1).*/ \
-  "movq %%mm1,"OC_I(1)"\n\t" \
+  "movq %%mm1,"OC_I(1,_y)"\n\t" \
   /*r4=R4=E'-D'*/ \
   "psubw %%mm3,%%mm4\n\t" \
-  "paddw "OC_8",%%mm4\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
   /*r3=D'+D'*/ \
   "paddw %%mm3,%%mm3\n\t" \
   /*r3=R3=E'+D'*/ \
@@ -272,7 +249,7 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   "psubw %%mm5,%%mm6\n\t" \
   /*r3=NR3*/ \
   "psraw $4,%%mm3\n\t" \
-  "paddw "OC_8",%%mm6\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
   /*r5=B''+B''*/ \
   "paddw %%mm5,%%mm5\n\t" \
   /*r5=R5=F'+B''*/ \
@@ -280,14 +257,14 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   /*r6=NR6*/ \
   "psraw $4,%%mm6\n\t" \
   /*Store NR4 at J(4).*/ \
-  "movq %%mm4,"OC_J(4)"\n\t" \
+  "movq %%mm4,"OC_J(4,_y)"\n\t" \
   /*r5=NR5*/ \
   "psraw $4,%%mm5\n\t" \
   /*Store NR3 at I(3).*/ \
-  "movq %%mm3,"OC_I(3)"\n\t" \
+  "movq %%mm3,"OC_I(3,_y)"\n\t" \
   /*r7=R7=G'-C'*/ \
   "psubw %%mm0,%%mm7\n\t" \
-  "paddw "OC_8",%%mm7\n\t" \
+  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
   /*r0=C'+C'*/ \
   "paddw %%mm0,%%mm0\n\t" \
   /*r0=R0=G'+C'*/ \
@@ -295,113 +272,121 @@ static const ogg_uint16_t __attribute__((aligned(8),used))
   /*r7=NR7*/ \
   "psraw $4,%%mm7\n\t" \
   /*Store NR6 at J(6).*/ \
-  "movq %%mm6,"OC_J(6)"\n\t" \
+  "movq %%mm6,"OC_J(6,_y)"\n\t" \
   /*r0=NR0*/ \
   "psraw $4,%%mm0\n\t" \
   /*Store NR5 at J(5).*/ \
-  "movq %%mm5,"OC_J(5)"\n\t" \
+  "movq %%mm5,"OC_J(5,_y)"\n\t" \
   /*Store NR7 at J(7).*/ \
-  "movq %%mm7,"OC_J(7)"\n\t" \
+  "movq %%mm7,"OC_J(7,_y)"\n\t" \
   /*Store NR0 at I(0).*/ \
-  "movq %%mm0,"OC_I(0)"\n\t" \
+  "movq %%mm0,"OC_I(0,_y)"\n\t" \
   "#end OC_COLUMN_IDCT\n\t" \
 
-#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
-#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
-#define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
-
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  int i;
   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     Every 4x4 block is transposed.*/
   __asm__ __volatile__(
-#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"(%[y])"
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+8,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+64)"(%[y])"
-#define OC_J(_k)      OC_M2STR(((_k-4)*16)+72)"(%[y])"
-    OC_ROW_IDCT
-    OC_TRANSPOSE
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+64,_y)
+#define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+72,_y)
+    OC_ROW_IDCT(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k)      OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k)      OC_I(_k)
-    OC_COLUMN_IDCT
+#define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y)   OC_I(_k,_y)
+    OC_COLUMN_IDCT(y)
 #undef  OC_I
 #undef  OC_J
-    :
-    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
   );
+  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+  for(i=0;i<4;i++){
+    __asm__ __volatile__(
+      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+      :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+    );
+  }
 }
 
 /*25 cycles.*/
-#define OC_IDCT_BEGIN_10 \
+#define OC_IDCT_BEGIN_10(_y,_x) \
  "#OC_IDCT_BEGIN_10\n\t" \
- "movq "OC_I(3)",%%mm2\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
  "nop\n\t" \
- "movq "OC_C(3)",%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
  "movq %%mm2,%%mm4\n\t" \
- "movq "OC_C(5)",%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
  "pmulhw %%mm6,%%mm4\n\t" \
- "movq "OC_I(1)",%%mm3\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
  "pmulhw %%mm2,%%mm1\n\t" \
- "movq "OC_C(1)",%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
  "paddw %%mm2,%%mm4\n\t" \
  "pxor %%mm6,%%mm6\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
- "movq "OC_I(2)",%%mm5\n\t" \
+ "movq "OC_I(2,_x)",%%mm5\n\t" \
  "pmulhw %%mm3,%%mm0\n\t" \
  "movq %%mm5,%%mm1\n\t" \
  "paddw %%mm3,%%mm0\n\t" \
- "pmulhw "OC_C(7)",%%mm3\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
- "pmulhw "OC_C(2)",%%mm5\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
- "movq "OC_I(2)",%%mm7\n\t" \
+ "movq "OC_I(2,_x)",%%mm7\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm5,%%mm7\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
- "pmulhw "OC_C(6)",%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
- "movq %%mm4,"OC_I(1)"\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
  "paddw %%mm6,%%mm6\n\t" \
- "movq "OC_C(4)",%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "movq %%mm3,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm3\n\t" \
- "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
  "movq %%mm0,%%mm2\n\t" \
- "movq "OC_I(0)",%%mm6\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
  "pmulhw %%mm4,%%mm0\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
  "paddw %%mm0,%%mm2\n\t" \
  "psubw %%mm1,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm6\n\t" \
- "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw "OC_I(0,_x)",%%mm6\n\t" \
  "paddw %%mm1,%%mm1\n\t" \
  "movq %%mm6,%%mm4\n\t" \
  "paddw %%mm5,%%mm1\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
- "movq "OC_I(1)",%%mm0\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "nop\n\t" \
  "#end OC_IDCT_BEGIN_10\n\t" \
 
 /*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 \
+#define OC_ROW_IDCT_10(_y,_x) \
  "#OC_ROW_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
+ OC_IDCT_BEGIN_10(_y,_x) \
  /*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r4=E'=E-G*/ \
  "psubw %%mm7,%%mm4\n\t" \
  /*r1=H'+H'*/ \
@@ -426,16 +411,16 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  "psubw %%mm0,%%mm7\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  /*Save R1.*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r0=R0=G'+C'*/ \
  "paddw %%mm7,%%mm0\n\t" \
  "#end OC_ROW_IDCT_10\n\t" \
 
 /*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 \
+#define OC_COLUMN_IDCT_10(_y) \
  "#OC_COLUMN_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
- "paddw "OC_8",%%mm2\n\t" \
+ OC_IDCT_BEGIN_10(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r1=R1=A''+H'*/ \
@@ -447,18 +432,18 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r1=NR1*/ \
  "psraw $4,%%mm1\n\t" \
  /*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r7=G+G*/ \
  "paddw %%mm7,%%mm7\n\t" \
  /*Store NR2 at I(2).*/ \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
  /*r7=G'=E+G*/ \
  "paddw %%mm4,%%mm7\n\t" \
  /*Store NR1 at I(1).*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
- "paddw "OC_8",%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
  /*r3=D'+D'*/ \
  "paddw %%mm3,%%mm3\n\t" \
  /*r3=R3=E'+D'*/ \
@@ -469,7 +454,7 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  "psubw %%mm5,%%mm6\n\t" \
  /*r3=NR3*/ \
  "psraw $4,%%mm3\n\t" \
- "paddw "OC_8",%%mm6\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
  /*r5=B''+B''*/ \
  "paddw %%mm5,%%mm5\n\t" \
  /*r5=R5=F'+B''*/ \
@@ -477,14 +462,14 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r6=NR6*/ \
  "psraw $4,%%mm6\n\t" \
  /*Store NR4 at J(4).*/ \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
  /*r5=NR5*/ \
  "psraw $4,%%mm5\n\t" \
  /*Store NR3 at I(3).*/ \
- "movq %%mm3,"OC_I(3)"\n\t" \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
- "paddw "OC_8",%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
  /*r0=C'+C'*/ \
  "paddw %%mm0,%%mm0\n\t" \
  /*r0=R0=G'+C'*/ \
@@ -492,46 +477,55 @@ static void oc_idct8x8_slow(ogg_int16_t _y[64]){
  /*r7=NR7*/ \
  "psraw $4,%%mm7\n\t" \
  /*Store NR6 at J(6).*/ \
- "movq %%mm6,"OC_J(6)"\n\t" \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
  /*r0=NR0*/ \
  "psraw $4,%%mm0\n\t" \
  /*Store NR5 at J(5).*/ \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
  /*Store NR7 at J(7).*/ \
- "movq %%mm7,"OC_J(7)"\n\t" \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
  /*Store NR0 at I(0).*/ \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "#end OC_COLUMN_IDCT_10\n\t" \
 
-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   __asm__ __volatile__(
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
     /*Done with dequant, descramble, and partial transpose.
       Now do the iDCT itself.*/
-    OC_ROW_IDCT_10
-    OC_TRANSPOSE
+    OC_ROW_IDCT_10(y,x)
+    OC_TRANSPOSE(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
 #undef  OC_I
 #undef  OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k) OC_I(_k)
-    OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+    OC_COLUMN_IDCT_10(y)
 #undef  OC_I
 #undef  OC_J
-    :
-    :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
+  );
+  __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+    :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
   );
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was
@@ -557,8 +551,8 @@ void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
      gets.
     Needless to say we inherited this approach from VP3.*/
   /*Then perform the iDCT.*/
-  if(_last_zzi<10)oc_idct8x8_10(_y);
-  else oc_idct8x8_slow(_y);
+  if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
+  else oc_idct8x8_slow_mmx(_y,_x);
 }
 
 #endif
diff --git a/thirdparty/libtheora/x86/mmxloop.h b/thirdparty/libtheora/x86/mmxloop.h
index 2e870c795d..1f6090b567 100644
--- a/thirdparty/libtheora/x86/mmxloop.h
+++ b/thirdparty/libtheora/x86/mmxloop.h
@@ -9,88 +9,191 @@
   On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
    mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
 #define OC_LOOP_FILTER8_MMX \
- "#OC_LOOP_FILTER8_MMX\n\t" \
- /*mm7=0*/ \
- "pxor %%mm7,%%mm7\n\t" \
- /*mm6:mm0={a0,...,a7}*/ \
- "movq %%mm0,%%mm6\n\t" \
- "punpcklbw %%mm7,%%mm0\n\t" \
- "punpckhbw %%mm7,%%mm6\n\t" \
- /*mm3:mm5={d0,...,d7}*/ \
- "movq %%mm3,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm3\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm6:mm0={a0-d0,...,a7-d7}*/ \
- "psubw %%mm3,%%mm0\n\t" \
- "psubw %%mm5,%%mm6\n\t" \
- /*mm3:mm1={b0,...,b7}*/ \
- "movq %%mm1,%%mm3\n\t" \
- "punpcklbw %%mm7,%%mm1\n\t" \
- "movq %%mm2,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm3\n\t" \
- /*mm5:mm4={c0,...,c7}*/ \
- "movq %%mm2,%%mm5\n\t" \
- "punpcklbw %%mm7,%%mm4\n\t" \
- "punpckhbw %%mm7,%%mm5\n\t" \
- /*mm7={3}x4 \
-   mm5:mm4={c0-b0,...,c7-b7}*/ \
- "pcmpeqw %%mm7,%%mm7\n\t" \
- "psubw %%mm1,%%mm4\n\t" \
- "psrlw $14,%%mm7\n\t" \
- "psubw %%mm3,%%mm5\n\t" \
- /*Scale by 3.*/ \
- "pmullw %%mm7,%%mm4\n\t" \
- "pmullw %%mm7,%%mm5\n\t" \
- /*mm7={4}x4 \
-   mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
- "psrlw $1,%%mm7\n\t" \
- "paddw %%mm0,%%mm4\n\t" \
- "psllw $2,%%mm7\n\t" \
- "movq (%[ll]),%%mm0\n\t" \
- "paddw %%mm6,%%mm5\n\t" \
- /*R_i has the range [-127,128], so we compute -R_i instead. \
-   mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
- "psubw %%mm7,%%mm4\n\t" \
- "psubw %%mm7,%%mm5\n\t" \
- "psraw $3,%%mm4\n\t" \
- "psraw $3,%%mm5\n\t" \
- "pcmpeqb %%mm7,%%mm7\n\t" \
- "packsswb %%mm5,%%mm4\n\t" \
- "pxor %%mm6,%%mm6\n\t" \
- "pxor %%mm7,%%mm4\n\t" \
- "packuswb %%mm3,%%mm1\n\t" \
- /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \
- /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
-    we have to split things by sign (the other option is to work in 16 bits, \
-    but working in 8 bits gives much better parallelism). \
-   We compute abs(R_i), but save a mask of which terms were negative in mm6. \
-   Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
-   Finally, we split mm4 into positive and negative pieces using the mask in \
-    mm6, and add and subtract them as appropriate.*/ \
- /*mm4=abs(-R_i)*/ \
- /*mm7=255-2*L*/ \
- "pcmpgtb %%mm4,%%mm6\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "pxor %%mm6,%%mm4\n\t" \
- "psubb %%mm0,%%mm7\n\t" \
- "psubb %%mm6,%%mm4\n\t" \
- /*mm7=255-max(2*L-abs(R_i),0)*/ \
- "paddusb %%mm4,%%mm7\n\t" \
- /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
- "paddusb %%mm7,%%mm4\n\t" \
- "psubusb %%mm7,%%mm4\n\t" \
- /*Now split mm4 by the original sign of -R_i.*/ \
- "movq %%mm4,%%mm5\n\t" \
- "pand %%mm6,%%mm4\n\t" \
- "pandn %%mm5,%%mm6\n\t" \
- /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
- /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
- "paddusb %%mm4,%%mm1\n\t" \
- "psubusb %%mm4,%%mm2\n\t" \
- "psubusb %%mm6,%%mm1\n\t" \
- "paddusb %%mm6,%%mm2\n\t" \
+  "#OC_LOOP_FILTER8_MMX\n\t" \
+  /*mm7=0*/ \
+  "pxor %%mm7,%%mm7\n\t" \
+  /*mm6:mm0={a0,...,a7}*/ \
+  "movq %%mm0,%%mm6\n\t" \
+  "punpcklbw %%mm7,%%mm0\n\t" \
+  "punpckhbw %%mm7,%%mm6\n\t" \
+  /*mm3:mm5={d0,...,d7}*/ \
+  "movq %%mm3,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm3\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+  "psubw %%mm3,%%mm0\n\t" \
+  "psubw %%mm5,%%mm6\n\t" \
+  /*mm3:mm1={b0,...,b7}*/ \
+  "movq %%mm1,%%mm3\n\t" \
+  "punpcklbw %%mm7,%%mm1\n\t" \
+  "movq %%mm2,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm3\n\t" \
+  /*mm5:mm4={c0,...,c7}*/ \
+  "movq %%mm2,%%mm5\n\t" \
+  "punpcklbw %%mm7,%%mm4\n\t" \
+  "punpckhbw %%mm7,%%mm5\n\t" \
+  /*mm7={3}x4 \
+    mm5:mm4={c0-b0,...,c7-b7}*/ \
+  "pcmpeqw %%mm7,%%mm7\n\t" \
+  "psubw %%mm1,%%mm4\n\t" \
+  "psrlw $14,%%mm7\n\t" \
+  "psubw %%mm3,%%mm5\n\t" \
+  /*Scale by 3.*/ \
+  "pmullw %%mm7,%%mm4\n\t" \
+  "pmullw %%mm7,%%mm5\n\t" \
+  /*mm7={4}x4 \
+    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+  "psrlw $1,%%mm7\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "psllw $2,%%mm7\n\t" \
+  "movq (%[ll]),%%mm0\n\t" \
+  "paddw %%mm6,%%mm5\n\t" \
+  /*R_i has the range [-127,128], so we compute -R_i instead. \
+    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm7,%%mm5\n\t" \
+  "psraw $3,%%mm4\n\t" \
+  "psraw $3,%%mm5\n\t" \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "packsswb %%mm5,%%mm4\n\t" \
+  "pxor %%mm6,%%mm6\n\t" \
+  "pxor %%mm7,%%mm4\n\t" \
+  "packuswb %%mm3,%%mm1\n\t" \
+  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but working in 8 bits gives much better parallelism). \
+    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+    Finally, we split mm4 into positive and negative pieces using the mask in \
+     mm6, and add and subtract them as appropriate.*/ \
+  /*mm4=abs(-R_i)*/ \
+  /*mm7=255-2*L*/ \
+  "pcmpgtb %%mm4,%%mm6\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "pxor %%mm6,%%mm4\n\t" \
+  "psubb %%mm0,%%mm7\n\t" \
+  "psubb %%mm6,%%mm4\n\t" \
+  /*mm7=255-max(2*L-abs(R_i),0)*/ \
+  "paddusb %%mm4,%%mm7\n\t" \
+  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+  "paddusb %%mm7,%%mm4\n\t" \
+  "psubusb %%mm7,%%mm4\n\t" \
+  /*Now split mm4 by the original sign of -R_i.*/ \
+  "movq %%mm4,%%mm5\n\t" \
+  "pand %%mm6,%%mm4\n\t" \
+  "pandn %%mm5,%%mm6\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm4,%%mm1\n\t" \
+  "psubusb %%mm4,%%mm2\n\t" \
+  "psubusb %%mm6,%%mm1\n\t" \
+  "paddusb %%mm6,%%mm2\n\t" \
 
-#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
+  All other MMX registers are clobbered.*/
+#define OC_LOOP_FILTER8_MMXEXT \
+  "#OC_LOOP_FILTER8_MMXEXT\n\t" \
+  /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
+     -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
+  /*This first part is based on the transformation \
+      f = -(3*(c-b)+a-d+4>>3) \
+        = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
+        = -(3*(c+~b)+(a+~d)-1016>>3) \
+        = 127-(3*(c+~b)+(a+~d)>>3) \
+        = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
+    Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
+     fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
+    Using this, the last expression above can be computed in 8 bits of working \
+     precision via: \
+      u = ~pavgb(~b,c); \
+      v = pavgb(b,~c); \
+      This mask is 0 or 0xFF, and controls whether t is biased up or down: \
+      m = u-v; \
+      t = m^pavgb(m^~a,m^d); \
+      f = 128+pavgb(pavgb(t,u),v); \
+    This required some careful analysis to ensure that carries are propagated \
+     correctly in all cases, but has been checked exhaustively.*/ \
+  /*input (a, b, c, d, ., ., ., .)*/ \
+  /*ff=0xFF; \
+    u=b; \
+    v=c; \
+    ll=255-2*L;*/ \
+  "pcmpeqb %%mm7,%%mm7\n\t" \
+  "movq %%mm1,%%mm4\n\t" \
+  "movq %%mm2,%%mm5\n\t" \
+  "movq (%[ll]),%%mm6\n\t" \
+  /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u^=ff; \
+    v^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm5\n\t" \
+  /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
+  /*u=pavgb(u,c); \
+    v=pavgb(v,b);*/ \
+  "pavgb %%mm2,%%mm4\n\t" \
+  "pavgb %%mm1,%%mm5\n\t" \
+  /*u^=ff; \
+    a^=ff;*/ \
+  "pxor %%mm7,%%mm4\n\t" \
+  "pxor %%mm7,%%mm0\n\t" \
+  /*m=u-v;*/ \
+  "psubb %%mm5,%%mm4\n\t" \
+  /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
+  /*a^=m; \
+    d^=m;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "pxor %%mm4,%%mm3\n\t" \
+  /*t=pavgb(a,d);*/ \
+  "pavgb %%mm3,%%mm0\n\t" \
+  "psllw $7,%%mm7\n\t" \
+  /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
+  /*t^=m; \
+    u=m+v;*/ \
+  "pxor %%mm4,%%mm0\n\t" \
+  "paddb %%mm5,%%mm4\n\t" \
+  /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
+  /*f=pavgb(f,u); \
+    of=128;*/ \
+  "pavgb %%mm4,%%mm0\n\t" \
+  "packsswb %%mm7,%%mm7\n\t" \
+  /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
+  /*f=pavgb(f,v);*/ \
+  "pavgb %%mm5,%%mm0\n\t" \
+  "movq %%mm7,%%mm3\n\t" \
+  "movq %%mm6,%%mm4\n\t" \
+  /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
+  /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
+  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+     we have to split things by sign (the other option is to work in 16 bits, \
+     but staying in 8 bits gives much better parallelism).*/ \
+  /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
+    This is the same number of instructions as computing a mask and splitting \
+     after the lflim computation, but has shorter dependency chains.*/ \
+  /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0)) \
+    mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
+  "psubusb %%mm0,%%mm3\n\t" \
+  "psubusb %%mm7,%%mm0\n\t" \
+  /*mm6=255-max(2*L-abs(R_i<0),0) \
+    mm4=255-max(2*L-abs(R_i>0),0)*/ \
+  "paddusb %%mm3,%%mm4\n\t" \
+  "paddusb %%mm0,%%mm6\n\t" \
+  /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
+    mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
+  "paddusb %%mm4,%%mm3\n\t" \
+  "paddusb %%mm6,%%mm0\n\t" \
+  "psubusb %%mm4,%%mm3\n\t" \
+  "psubusb %%mm6,%%mm0\n\t" \
+  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+  "paddusb %%mm3,%%mm1\n\t" \
+  "psubusb %%mm3,%%mm2\n\t" \
+  "psubusb %%mm0,%%mm1\n\t" \
+  "paddusb %%mm0,%%mm2\n\t" \
+
+#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
   do{ \
     ptrdiff_t ystride3__; \
     __asm__ __volatile__( \
@@ -104,7 +207,7 @@
       "movq (%[pix],%[ystride]),%%mm1\n\t" \
       /*mm2={c0,...,c7}*/ \
       "movq (%[pix],%[ystride],2),%%mm2\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
       /*Write it back out.*/ \
       "movq %%mm1,(%[pix],%[ystride])\n\t" \
       "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
@@ -116,7 +219,7 @@
   } \
   while(0)
 
-#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
   do{ \
     unsigned char *pix__; \
     ptrdiff_t      ystride3__; \
@@ -174,7 +277,7 @@
       "punpckldq %%mm5,%%mm2\n\t" \
       /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
       "punpckhdq %%mm5,%%mm3\n\t" \
-      OC_LOOP_FILTER8_MMX \
+      _filter \
       /*mm2={b0+R_0'',...,b7+R_7''}*/ \
       "movq %%mm1,%%mm0\n\t" \
       /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
diff --git a/thirdparty/libtheora/x86/mmxstate.c b/thirdparty/libtheora/x86/mmxstate.c
index 808b0a789b..eebea14fba 100644
--- a/thirdparty/libtheora/x86/mmxstate.c
+++ b/thirdparty/libtheora/x86/mmxstate.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,23 +19,23 @@
   Originally written by Rudolf Marek.*/
 #include <string.h>
 #include "x86int.h"
-#include "mmxfrag.h"
 #include "mmxloop.h"
 
 #if defined(OC_X86_ASM)
 
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
-  int            mb_mode;
+  int            refi;
   /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
     /*Note that this value must be unsigned, to keep the __asm__ block from
        sign-extending it when it puts it in a register.*/
     ogg_uint16_t p;
+    int          i;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
     p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
@@ -47,81 +47,48 @@ void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
       "punpcklwd %%mm0,%%mm0\n\t"
       /*mm0=AAAA AAAA AAAA AAAA*/
       "punpckldq %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
       :
-      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
-      :"memory"
+      :[p]"r"((unsigned)p)
     );
+    for(i=0;i<4;i++){
+      __asm__ __volatile__(
+        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
+        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
+        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
+      );
+    }
   }
   else{
     /*Dequantize the DC coefficient.*/
     _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
-  mb_mode=_state->frags[_fragi].mb_mode;
+  refi=_state->frags[_fragi].refi;
   ystride=_state->ref_ystride[_pli];
-  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
+  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
-    ref=
-     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
-     +frag_buf_off;
+    ref=_state->ref_frame_data[refi]+frag_buf_off;
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
-     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
+     _state->frag_mvs[_fragi])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       _dct_coeffs);
+       _dct_coeffs+64);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
   }
 }
 
 /*We copy these entire function to inline the actual MMX routines so that we
    use only a single indirect call.*/
 
-/*Copies the fragments specified by the lists of fragment indices from one
-   frame to another.
-  _fragis:    A pointer to a list of fragment indices.
-  _nfragis:   The number of fragment indices to copy.
-  _dst_frame: The reference frame to copy to.
-  _src_frame: The reference frame to copy from.
-  _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
-  const ptrdiff_t     *frag_buf_offs;
-  const unsigned char *src_frame_data;
-  unsigned char       *dst_frame_data;
-  ptrdiff_t            fragii;
-  int                  ystride;
-  dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
-  src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
-  ystride=_state->ref_ystride[_pli];
-  frag_buf_offs=_state->frag_buf_offs;
-  for(fragii=0;fragii<_nfragis;fragii++){
-    ptrdiff_t frag_buf_off;
-    frag_buf_off=frag_buf_offs[_fragis[fragii]];
-    OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
-     src_frame_data+frag_buf_off,ystride);
-  }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+  memset(_bv,_flimit,8);
 }
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -133,7 +100,7 @@ void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   OC_ALIGN8(unsigned char   ll[8]);
   const oc_fragment_plane *fplane;
   const oc_fragment       *frags;
@@ -170,13 +137,84 @@ void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
       if(frags[fragi].coded){
         unsigned char *ref;
         ref=ref_frame_data+frag_buf_offs[fragi];
-        if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
-        if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
+        }
+        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
+        }
+        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
+        }
+      }
+      fragi++;
+    }
+    fragi0+=nhfrags;
+  }
+}
+
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
+  memset(_bv,~(_flimit<<1),8);
+}
+
+/*Apply the loop filter to a given set of fragment rows in the given plane.
+  The filter may be run on the bottom edge, affecting pixels in the next row of
+   fragments, so this row also needs to be available.
+  _bv:        The bounding values array.
+  _refi:      The index of the frame buffer to filter.
+  _pli:       The color plane to filter.
+  _fragy0:    The Y coordinate of the first fragment row to filter.
+  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+  const oc_fragment_plane *fplane;
+  const oc_fragment       *frags;
+  const ptrdiff_t         *frag_buf_offs;
+  unsigned char           *ref_frame_data;
+  ptrdiff_t                fragi_top;
+  ptrdiff_t                fragi_bot;
+  ptrdiff_t                fragi0;
+  ptrdiff_t                fragi0_end;
+  int                      ystride;
+  int                      nhfrags;
+  fplane=_state->fplanes+_pli;
+  nhfrags=fplane->nhfrags;
+  fragi_top=fplane->froffset;
+  fragi_bot=fragi_top+fplane->nfrags;
+  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
+  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
+  ystride=_state->ref_ystride[_pli];
+  frags=_state->frags;
+  frag_buf_offs=_state->frag_buf_offs;
+  ref_frame_data=_state->ref_frame_data[_refi];
+  /*The following loops are constructed somewhat non-intuitively on purpose.
+    The main idea is: if a block boundary has at least one coded fragment on
+     it, the filter is applied to it.
+    However, the order that the filters are applied in matters, and VP3 chose
+     the somewhat strange ordering used below.*/
+  while(fragi0<fragi0_end){
+    ptrdiff_t fragi;
+    ptrdiff_t fragi_end;
+    fragi=fragi0;
+    fragi_end=fragi+nhfrags;
+    while(fragi<fragi_end){
+      if(frags[fragi].coded){
+        unsigned char *ref;
+        ref=ref_frame_data+frag_buf_offs[fragi];
+        if(fragi>fragi0){
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
+        if(fragi0>fragi_top){
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
+        }
         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
-          OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
         }
         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
-          OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
         }
       }
       fragi++;
diff --git a/thirdparty/libtheora/x86/sse2encfrag.c b/thirdparty/libtheora/x86/sse2encfrag.c
new file mode 100644
index 0000000000..43aeb17711
--- /dev/null
+++ b/thirdparty/libtheora/x86/sse2encfrag.c
@@ -0,0 +1,501 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+#include "sse2trans.h"
+
+#if defined(OC_X86_ASM)
+
+/*Load a 4x8 array of pixels values from %[src] and %[ref] and compute their
+   16-bit differences.
+  On output, these are stored in _m0, xmm1, xmm2, and xmm3.
+  xmm4 and xmm5 are clobbered.*/
+#define OC_LOAD_SUB_4x8(_m0) \
+ "#OC_LOAD_SUB_4x8\n\t" \
+ /*Load the first three rows.*/ \
+ "movq (%[src]),"_m0"\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
+ /*Unpack and subtract.*/ \
+ "punpcklbw %%xmm4,"_m0"\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm4,"_m0"\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+ /*Load the last row.*/ \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
+ /*Unpack, subtract, and advance the pointers.*/ \
+ "punpcklbw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ "psubw %%xmm5,%%xmm2\n\t" \
+ "punpcklbw %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ystride],4),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm3\n\t" \
+
+/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
+  On output, xmm0 contains the sum of two of the rows, and the other two are
+   added to xmm7.*/
+#define OC_SSD_4x8(_m0) \
+ "pmaddwd "_m0","_m0"\n\t" \
+ "pmaddwd %%xmm1,%%xmm1\n\t" \
+ "pmaddwd %%xmm2,%%xmm2\n\t" \
+ "pmaddwd %%xmm3,%%xmm3\n\t" \
+ "paddd %%xmm1,"_m0"\n\t" \
+ "paddd %%xmm3,%%xmm2\n\t" \
+ "paddd %%xmm2,%%xmm7\n\t" \
+
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned ret;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_4x8("%%xmm7")
+    OC_SSD_4x8("%%xmm7")
+    OC_LOAD_SUB_4x8("%%xmm0")
+    OC_SSD_4x8("%%xmm0")
+    "paddd %%xmm0,%%xmm7\n\t"
+    "movdqa %%xmm7,%%xmm6\n\t"
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    /*OC_LOAD_SUB_4x8 advances %[src] and %[ref], so they must be in/out.*/
+    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
+    :[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)_ystride*3)
+  );
+  return ret;
+}
+
+static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
+  0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
+};
+
+/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
+   horizontal sums as well as their 16-bit differences subject to a mask.
+  %%xmm6 must contain OC_MASK_CONSTS[0...7] (used to expand the mask bits).*/
+#define OC_LOAD_SUB_MASK_2x8 \
+ "#OC_LOAD_SUB_MASK_2x8\n\t" \
+ /*Start the loads and expand the next 8 bits of the mask.*/ \
+ "shl $8,%[m]\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "movq (%[ref]),%%xmm2\n\t" \
+ "movd %[m],%%xmm4\n\t" \
+ "shr $8,%[m]\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "mov %h[m],%b[m]\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ /*Perform the masking.*/ \
+ "pand %%xmm4,%%xmm0\n\t" \
+ "pand %%xmm4,%%xmm2\n\t" \
+ /*Finish the loads while unpacking the first set of rows, and expand the next
+    8 bits of the mask.*/ \
+ "movd %[m],%%xmm4\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
+ "movq (%[ref],%[ystride]),%%xmm3\n\t" \
+ "pand %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm0\n\t" \
+ "pcmpeqb %%xmm6,%%xmm4\n\t" \
+ "punpcklbw %%xmm2,%%xmm2\n\t" \
+ /*Mask and unpack the second set of rows.*/ \
+ "pand %%xmm4,%%xmm1\n\t" \
+ "pand %%xmm4,%%xmm3\n\t" \
+ "punpcklbw %%xmm3,%%xmm1\n\t" \
+ "punpcklbw %%xmm3,%%xmm3\n\t" \
+ "psubw %%xmm2,%%xmm0\n\t" \
+ "psubw %%xmm3,%%xmm1\n\t" \
+
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+  ptrdiff_t ystride;
+  unsigned  ret;
+  int       i;
+  ystride=_ystride;
+  __asm__ __volatile__(
+    "pxor %%xmm7,%%xmm7\n\t"
+    "movq %[c],%%xmm6\n\t"
+    :
+    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
+  );
+  for(i=0;i<4;i++){
+    unsigned m;
+    m=_mask&0xFFFF;
+    _mask>>=16;
+    if(m){
+      __asm__ __volatile__(
+        OC_LOAD_SUB_MASK_2x8
+        "pmaddwd %%xmm0,%%xmm0\n\t"
+        "pmaddwd %%xmm1,%%xmm1\n\t"
+        "paddd %%xmm0,%%xmm7\n\t"
+        "paddd %%xmm1,%%xmm7\n\t"
+        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
+      );
+    }
+    _src+=2*ystride;
+    _ref+=2*ystride;
+  }
+  __asm__ __volatile__(
+    "movdqa %%xmm7,%%xmm6\n\t"
+    "punpckhqdq %%xmm7,%%xmm7\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "pshufd $1,%%xmm7,%%xmm6\n\t"
+    "paddd %%xmm6,%%xmm7\n\t"
+    "movd %%xmm7,%[ret]\n\t"
+    :[ret]"=a"(ret)
+  );
+  return ret;
+}
+
+
+/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
+   16-bit difference in %%xmm0...%%xmm7.*/
+#define OC_LOAD_SUB_8x8 \
+ "#OC_LOAD_SUB_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[ref]),%%xmm4\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm1\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "movq (%[src]),%%xmm2\n\t" \
+ "movq (%[ref]),%%xmm7\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm3\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
+ "punpcklbw %%xmm4,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "psubw %%xmm4,%%xmm0\n\t" \
+ "movq (%[src]),%%xmm4\n\t" \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm5,%%xmm1\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psubw %%xmm5,%%xmm1\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm2\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm6,%%xmm3\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm3\n\t" \
+ "movq (%[src]),%%xmm6\n\t" \
+ "punpcklbw %%xmm0,%%xmm4\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "lea (%[src],%[src_ystride],2),%[src]\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ "movq (%[ref]),%%xmm0\n\t" \
+ "punpcklbw %%xmm7,%%xmm5\n\t" \
+ "neg %[src_ystride]\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm7,%%xmm5\n\t" \
+ "movq (%[src],%[src_ystride]),%%xmm7\n\t" \
+ "punpcklbw %%xmm0,%%xmm6\n\t" \
+ "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "neg %[ref_ystride]\n\t" \
+ "psubw %%xmm0,%%xmm6\n\t" \
+ "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
+ "punpcklbw %%xmm0,%%xmm7\n\t" \
+ "punpcklbw %%xmm0,%%xmm0\n\t" \
+ "psubw %%xmm0,%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+
+/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
+#define OC_LOAD_8x8 \
+ "#OC_LOAD_8x8\n\t" \
+ "movq (%[src]),%%xmm0\n\t" \
+ "movq (%[src],%[ystride]),%%xmm1\n\t" \
+ "movq (%[src],%[ystride],2),%%xmm2\n\t" \
+ "pxor %%xmm7,%%xmm7\n\t" \
+ "movq (%[src],%[ystride3]),%%xmm3\n\t" \
+ "punpcklbw %%xmm7,%%xmm0\n\t" \
+ "movq (%[src4]),%%xmm4\n\t" \
+ "punpcklbw %%xmm7,%%xmm1\n\t" \
+ "movq (%[src4],%[ystride]),%%xmm5\n\t" \
+ "punpcklbw %%xmm7,%%xmm2\n\t" \
+ "movq (%[src4],%[ystride],2),%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm3\n\t" \
+ "movq (%[src4],%[ystride3]),%%xmm7\n\t" \
+ "punpcklbw %%xmm4,%%xmm4\n\t" \
+ "punpcklbw %%xmm5,%%xmm5\n\t" \
+ "psrlw $8,%%xmm4\n\t" \
+ "psrlw $8,%%xmm5\n\t" \
+ "punpcklbw %%xmm6,%%xmm6\n\t" \
+ "punpcklbw %%xmm7,%%xmm7\n\t" \
+ "psrlw $8,%%xmm6\n\t" \
+ "psrlw $8,%%xmm7\n\t" \
+
+/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
+   perform this stage in place with no temporary registers).*/
+#define OC_HADAMARD_AB_8x8 \
+ "#OC_HADAMARD_AB_8x8\n\t" \
+ /*Stage A:*/ \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "psubw %%xmm1,%%xmm5\n\t" \
+ "psubw %%xmm2,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm3\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "paddw %%xmm4,%%xmm4\n\t" \
+ "psubw %%xmm3,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ /*Stage B:*/ \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm1\n\t" \
+ "paddw %%xmm6,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm5\n\t" \
+ "paddw %%xmm2,%%xmm2\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm6,%%xmm6\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm6\n\t" \
+ "psubw %%xmm5,%%xmm7\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
+   place with no temporary registers).*/
+#define OC_HADAMARD_C_8x8 \
+ "#OC_HADAMARD_C_8x8\n\t" \
+ /*Stage C:*/ \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm2\n\t" \
+ "paddw %%xmm5,%%xmm4\n\t" \
+ "paddw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm1,%%xmm1\n\t" \
+ "paddw %%xmm3,%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm5\n\t" \
+ "paddw %%xmm7,%%xmm7\n\t" \
+ "psubw %%xmm0,%%xmm1\n\t" \
+ "psubw %%xmm2,%%xmm3\n\t" \
+ "psubw %%xmm4,%%xmm5\n\t" \
+ "psubw %%xmm6,%%xmm7\n\t" \
+
+/*Performs an 8-point 1-D Hadamard transform in place.
+  Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
+   in place with no temporary registers).*/
+#define OC_HADAMARD_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_8x8 \
+
+/*Performs the first part of the final stage of the Hadamard transform and
+   summing of absolute values.
+  At the end of this part, %%xmm1 will contain the DC coefficient of the
+   transform.*/
+#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ /*We use the fact that \
+     (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
+    to merge the final butterfly with the abs and the first stage of \
+    accumulation. \
+   Thus we can avoid using pabsw, which is not available until SSSE3. \
+   Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
+    implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
+    registers). \
+   Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
+   This implementation is only 26 (+4 for spilling registers).*/ \
+ "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm7={0x7FFF}x8 \
+   xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
+ "pcmpeqb %%xmm7,%%xmm7\n\t" \
+ "movdqa %%xmm4,%%xmm6\n\t" \
+ "psrlw $1,%%xmm7\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm4\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "psubw %%xmm6,%%xmm4\n\t" \
+ /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
+   xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
+ "movdqa %%xmm2,%%xmm6\n\t" \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ "pmaxsw %%xmm3,%%xmm2\n\t" \
+ "pmaxsw %%xmm1,%%xmm0\n\t" \
+ "paddw %%xmm3,%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
+
+/*Performs the second part of the final stage of the Hadamard transform and
+   summing of absolute values.*/
+#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+ "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "paddsw %%xmm7,%%xmm1\n\t" \
+ "psubw %%xmm6,%%xmm2\n\t" \
+ "psubw %%xmm1,%%xmm0\n\t" \
+ /*xmm7={1}x8 (needed for the horizontal add that follows) \
+   xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "pmaxsw %%xmm5,%%xmm3\n\t" \
+ "paddw %%xmm2,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm6\n\t" \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "paddsw %%xmm7,%%xmm6\n\t" \
+ "paddw %%xmm3,%%xmm0\n\t" \
+ "psrlw $14,%%xmm7\n\t" \
+ "psubw %%xmm6,%%xmm0\n\t" \
+
+/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
+   absolute value of each component, and accumulates everything into xmm0.*/
+#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
+
+/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
+   component, and accumulates everything into xmm0.
+  Note that xmm0 will have an extra 4 added to each column, and that after
+   removing this value, the remainder will be half the conventional value.*/
+#define OC_HADAMARD_ABS_ACCUM_8x8 \
+ OC_HADAMARD_AB_8x8 \
+ OC_HADAMARD_C_ABS_ACCUM_8x8
+
+static unsigned oc_int_frag_satd_sse2(int *_dc,
+ const unsigned char *_src,int _src_ystride,
+ const unsigned char *_ref,int _ref_ystride){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  unsigned ret;
+  unsigned ret2;
+  int      dc;
+  __asm__ __volatile__(
+    OC_LOAD_SUB_8x8
+    OC_HADAMARD_8x8
+    OC_TRANSPOSE_8x8
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x8
+    OC_HADAMARD_C_ABS_ACCUM_A_8x8
+    "movd %%xmm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x8
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.
+      We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
+       latency of pmaddwd by starting to compute abs(dc) here.*/
+    "pmaddwd %%xmm7,%%xmm0\n\t"
+    "movsx %w[dc],%[dc]\n\t"
+    "cdq\n\t"
+    "movdqa %%xmm0,%%xmm1\n\t"
+    "punpckhqdq %%xmm0,%%xmm0\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "movd %%xmm0,%[ret]\n\t"
+    /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
+       added to them, a factor of two removed, and the DC value included;
+       correct the final sum here.*/
+    "lea -64(%[ret2],%[ret],2),%[ret]\n\t"
+    "xor %[dc],%[ret2]\n\t"
+    "sub %[ret2],%[ret]\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[dc] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf].*/
+    /*Note that _src_ystride and _ref_ystride must be given non-overlapping
+       constraints, otherwise if gcc can prove they're equal it will allocate
+       them to the same register (which is bad); _src and _ref face a similar
+       problem.
+      All four are destructively modified, but if we list them as output
+       constraints, gcc can't alias them with other outputs.*/
+    :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
+    :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
+     [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
+    /*We have to use neg, so we actually clobber the condition codes for once
+       (not to mention sub, and add).*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
+}
+
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+  OC_ALIGN8(unsigned char ref[64]);
+  oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+  return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
+}
+
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+ const unsigned char *_src,int _ystride){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  unsigned ret;
+  int      dc;
+  __asm__ __volatile__(
+    OC_LOAD_8x8
+    OC_HADAMARD_8x8
+    OC_TRANSPOSE_8x8
+    /*We split out the stages here so we can save the DC coefficient in the
+       middle.*/
+    OC_HADAMARD_AB_8x8
+    OC_HADAMARD_C_ABS_ACCUM_A_8x8
+    "movd %%xmm1,%[dc]\n\t"
+    OC_HADAMARD_C_ABS_ACCUM_B_8x8
+    /*Up to this point, everything fit in 16 bits (8 input + 1 for the
+       difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
+       for the factor of two we dropped + 3 for the vertical accumulation).
+      Now we finally have to promote things to dwords.*/
+    "pmaddwd %%xmm7,%%xmm0\n\t"
+    /*We assume that the DC coefficient is always positive (which is true,
+       because the input to the INTRA transform was not a difference).*/
+    "movzx %w[dc],%[dc]\n\t"
+    "movdqa %%xmm0,%%xmm1\n\t"
+    "punpckhqdq %%xmm0,%%xmm0\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "pshuflw $0xE,%%xmm0,%%xmm1\n\t"
+    "paddd %%xmm1,%%xmm0\n\t"
+    "movd %%xmm0,%[ret]\n\t"
+    "lea -64(%[ret],%[ret]),%[ret]\n\t"
+    "sub %[dc],%[ret]\n\t"
+    /*Although it looks like we're using 7 registers here, gcc can alias %[ret]
+       and %[dc] with some of the inputs, since for once we don't write to
+       them until after we're done using everything but %[buf].*/
+    :[ret]"=a"(ret),[dc]"=r"(dc),
+     [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
+    :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
+     [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
+    /*We have to use sub, so we actually clobber the condition codes for once.*/
+    :"cc"
+  );
+  *_dc=dc;
+  return ret;
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86/sse2fdct.c b/thirdparty/libtheora/x86/sse2fdct.c
index 86c17d68b1..64c1d27372 100644
--- a/thirdparty/libtheora/x86/sse2fdct.c
+++ b/thirdparty/libtheora/x86/sse2fdct.c
@@ -13,12 +13,14 @@
 /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
 #include <stddef.h>
 #include "x86enc.h"
+#include "x86zigzag.h"
+#include "sse2trans.h"
 
 #if defined(OC_X86_64_ASM)
 
-# define OC_FDCT8x8 \
+# define OC_FDCT_8x8 \
  /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
- "#OC_FDCT8x8\n\t" \
+ "#OC_FDCT_8x8\n\t" \
  /*Stage 1:*/ \
  "movdqa %%xmm0,%%xmm11\n\t" \
  "movdqa %%xmm1,%%xmm10\n\t" \
@@ -349,81 +351,6 @@
  "psubw %%xmm14,%%xmm10\n\t" \
  "paddw %%xmm10,%%xmm7\n\t " \
 
-# define OC_TRANSPOSE8x8 \
- "#OC_TRANSPOSE8x8\n\t" \
- "movdqa %%xmm4,%%xmm8\n\t" \
- /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
- "punpcklwd %%xmm5,%%xmm4\n\t" \
- /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
- "punpckhwd %%xmm5,%%xmm8\n\t" \
- /*xmm5 is free.*/ \
- "movdqa %%xmm0,%%xmm5\n\t" \
- /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
- "punpcklwd %%xmm1,%%xmm0\n\t" \
- /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
- "punpckhwd %%xmm1,%%xmm5\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm6,%%xmm1\n\t" \
- /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
- "punpcklwd %%xmm7,%%xmm6\n\t" \
- /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
- "punpckhwd %%xmm7,%%xmm1\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm2,%%xmm7\n\t" \
- /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
- "punpcklwd %%xmm3,%%xmm7\n\t" \
- /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
- "punpckhwd %%xmm3,%%xmm2\n\t" \
- /*xmm3 is free.*/ \
- "movdqa %%xmm0,%%xmm3\n\t" \
- /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
- "punpckldq %%xmm7,%%xmm0\n\t" \
- /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
- "punpckhdq %%xmm7,%%xmm3\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm5,%%xmm7\n\t" \
- /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
- "punpckldq %%xmm2,%%xmm5\n\t" \
- /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
- "punpckhdq %%xmm2,%%xmm7\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm4,%%xmm2\n\t" \
- /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
- "punpckldq %%xmm6,%%xmm2\n\t" \
- /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
- "punpckhdq %%xmm6,%%xmm4\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm8,%%xmm6\n\t" \
- /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
- "punpckldq %%xmm1,%%xmm6\n\t" \
- /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
- "punpckhdq %%xmm1,%%xmm8\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm0,%%xmm1\n\t" \
- /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
- "punpcklqdq %%xmm2,%%xmm0\n\t" \
- /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
- "punpckhqdq %%xmm2,%%xmm1\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm3,%%xmm2\n\t" \
- /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
- "punpcklqdq %%xmm4,%%xmm2\n\t" \
- /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
- "punpckhqdq %%xmm4,%%xmm3\n\t" \
- /*xmm4 is free.*/ \
- "movdqa %%xmm5,%%xmm4\n\t" \
- /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
- "punpcklqdq %%xmm6,%%xmm4\n\t" \
- /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
- "punpckhqdq %%xmm6,%%xmm5\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm7,%%xmm6\n\t" \
- /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
- "punpcklqdq %%xmm8,%%xmm6\n\t" \
- /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
- "punpckhqdq %%xmm8,%%xmm7\n\t" \
- /*xmm8 is free.*/ \
-
 /*SSE2 implementation of the fDCT for x86-64 only.
   Because of the 8 extra XMM registers on x86-64, this version can operate
    without any temporary stack access at all.*/
@@ -482,12 +409,10 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
     "psubw %%xmm9,%%xmm1\n\t"
     /*Transform columns.*/
-    OC_FDCT8x8
+    OC_FDCT_8x8
     /*Transform rows.*/
-    OC_TRANSPOSE8x8
-    OC_FDCT8x8
-    /*TODO: zig-zag ordering?*/
-    OC_TRANSPOSE8x8
+    OC_TRANSPOSE_8x8
+    OC_FDCT_8x8
     /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
     "paddw %%xmm14,%%xmm14\n\t"
     "psubw %%xmm14,%%xmm0\n\t"
@@ -506,15 +431,19 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
     "psubw %%xmm14,%%xmm7\n\t"
     "psraw $2,%%xmm6\n\t"
     "psraw $2,%%xmm7\n\t"
-    /*Store the result.*/
-    "movdqa %%xmm0,0x00(%[y])\n\t"
-    "movdqa %%xmm1,0x10(%[y])\n\t"
-    "movdqa %%xmm2,0x20(%[y])\n\t"
-    "movdqa %%xmm3,0x30(%[y])\n\t"
-    "movdqa %%xmm4,0x40(%[y])\n\t"
-    "movdqa %%xmm5,0x50(%[y])\n\t"
-    "movdqa %%xmm6,0x60(%[y])\n\t"
-    "movdqa %%xmm7,0x70(%[y])\n\t"
+    /*Transpose, zig-zag, and store the result.*/
+    /*We could probably do better using SSSE3's palignr, but re-using the
+       MMXEXT version will do for now.*/
+#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
+    "movdq2q %%xmm"#_row","_reg"\n\t" \
+
+#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
+    "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
+    "movdq2q %%xmm"#_row","_reg"\n\t" \
+
+    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
+#undef OC_ZZ_LOAD_ROW_LO
+#undef OC_ZZ_LOAD_ROW_HI
     :[a]"=&r"(a)
     :[y]"r"(_y),[x]"r"(_x)
     :"memory"
diff --git a/thirdparty/libtheora/x86/sse2idct.c b/thirdparty/libtheora/x86/sse2idct.c
new file mode 100644
index 0000000000..4597ab074f
--- /dev/null
+++ b/thirdparty/libtheora/x86/sse2idct.c
@@ -0,0 +1,456 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
+
+ ********************************************************************/
+
+/*SSE2 acceleration of Theora's iDCT.*/
+#include "x86int.h"
+#include "sse2trans.h"
+#include "../dct.h"
+
+#if defined(OC_X86_ASM)
+
+/*iDCT constants, 8 copies each: the rounding bias 8, then C1S7 through C7S1.*/
+const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
+        8,      8,      8,      8,      8,      8,      8,      8,
+  OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
+  OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
+  OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
+  OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
+  OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
+  OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
+  OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
+};
+
+
+/*Performs the first three stages of the iDCT.
+  xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
+   (accessed in that order).
+  The remaining rows must be in _x at their corresponding locations.
+  On output, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
+   contain t[7] down to t[4] (the inputs to the stage 4 butterflies).*/
+#define OC_IDCT_8x8_ABC(_x) \
+  "#OC_IDCT_8x8_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
+  "movdqa %%xmm1,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm1\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm6,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  "paddw %%xmm4,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
+  "movdqa %%xmm4,%%xmm2\n\t" \
+  "movdqa %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm4\n\t" \
+  "pmulhw %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm3,%%xmm6\n\t" \
+  "pmulhw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm6,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  "psubw %%xmm4,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
+  "movdqa %%xmm3,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm7\n\t" \
+  "pmulhw %%xmm5,%%xmm3\n\t" \
+  "pmulhw %%xmm5,%%xmm7\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "pmulhw %%xmm6,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
+  "paddw %%xmm5,%%xmm7\n\t" \
+  "psubw %%xmm4,%%xmm3\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
+  "paddw %%xmm7,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "pmulhw %%xmm6,%%xmm4\n\t" \
+  "paddw %%xmm7,%%xmm7\n\t" \
+  "psubw %%xmm6,%%xmm7\n\t" \
+  "paddw %%xmm6,%%xmm4\n\t" \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm3\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "movdqa %%xmm5,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm5\n\t" \
+  "paddw %%xmm7,%%xmm5\n\t" \
+  "movdqa %%xmm0,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+/*Performs the last stage of the iDCT (four butterflies, no rounding or shift).
+  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
+   contain t[7] down to t[4].
+  On output, xmm0 through xmm7 contain rows 0 through 7, in that order.*/
+#define OC_IDCT_8x8_D \
+  "#OC_IDCT_8x8_D\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+
+/*Performs the last stage of the iDCT, then rounds, scales, and stores it.
+  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
+   contain t[7] down to t[4].
+  Each row has 8 (from c) added, is shifted right by 4, and is stored in y.*/
+#define OC_IDCT_8x8_D_STORE \
+  "#OC_IDCT_8x8_D_STORE\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
+    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
+    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
+    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
+  "psubw %%xmm3,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
+  "psubw %%xmm0,%%xmm7\n\t" \
+  "psubw %%xmm1,%%xmm6\n\t" \
+  "psubw %%xmm2,%%xmm5\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm4,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm5\n\t" \
+  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm0\n\t" \
+  "paddw %%xmm1,%%xmm1\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm3,%%xmm3\n\t" \
+  "paddw %%xmm7,%%xmm0\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "psraw $4,%%xmm0\n\t" \
+  "paddw %%xmm5,%%xmm2\n\t" \
+  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
+  "psraw $4,%%xmm1\n\t" \
+  "paddw %%xmm4,%%xmm3\n\t" \
+  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
+  "psraw $4,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
+  "psraw $4,%%xmm3\n\t" \
+  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
+  "psraw $4,%%xmm4\n\t" \
+  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+  "psraw $4,%%xmm5\n\t" \
+  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
+  "psraw $4,%%xmm6\n\t" \
+  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
+  "psraw $4,%%xmm7\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
+
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  int i;
+  /*Full 8x8 iDCT of the pre-transposed _x into _y; _x is zeroed afterwards.*/
+  __asm__ __volatile__(
+    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
+    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
+    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
+    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
+    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
+    OC_IDCT_8x8_ABC(x)
+    OC_IDCT_8x8_D
+    OC_TRANSPOSE_8x8
+    /*Store rows 0, 1, 4, and 7 in y, where the second pass loads them.*/
+    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
+    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
+    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
+    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+    OC_IDCT_8x8_ABC(y)
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+  );
+  /*Clear input for the next block (decoder only), zeroing xmm0 in each asm.*/
+  for(i=0;i<2;i++){
+    __asm__ __volatile__(
+      "pxor %%xmm0,%%xmm0\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+      "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+      :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+    );
+  }
+}
+
+/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
+   need to work with four columns at a time.
+  Doing this in MMX is faster on processors with a 64-bit data path.*/
+#define OC_IDCT_8x8_10_MMX \
+  "#OC_IDCT_8x8_10_MMX\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
+  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm6\n\t" \
+  "pmulhw %%mm2,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
+  "paddw %%mm6,%%mm2\n\t" \
+  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
+  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
+  "pmulhw %%mm3,%%mm5\n\t" \
+  "pmulhw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
+  "paddw %%mm3,%%mm5\n\t" \
+  "paddw %%mm3,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
+  "pmulhw %%mm1,%%mm3\n\t" \
+  "pmulhw %%mm1,%%mm7\n\t" \
+  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
+  "movq %%mm3,%%mm6\n\t" \
+  "paddw %%mm1,%%mm7\n\t" \
+  /*0-1 butterfly. \
+    mm4=C4, mm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
+    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
+  "psubw %%mm5,%%mm3\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "movq %%mm4,%%mm1\n\t" \
+  "pmulhw %%mm0,%%mm4\n\t" \
+  "paddw %%mm0,%%mm4\n\t" \
+  "movq %%mm7,%%mm0\n\t" \
+  "movq %%mm4,%%mm5\n\t" \
+  "paddw %%mm2,%%mm0\n\t" \
+  "psubw %%mm2,%%mm7\n\t" \
+  "movq %%mm1,%%mm2\n\t" \
+  "pmulhw %%mm6,%%mm1\n\t" \
+  "pmulhw %%mm7,%%mm2\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
+  "paddw %%mm7,%%mm2\n\t" \
+  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
+    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
+    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
+  "paddw %%mm2,%%mm1\n\t" \
+  "paddw %%mm5,%%mm6\n\t" \
+  "paddw %%mm4,%%mm7\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm4,%%mm4\n\t" \
+  "paddw %%mm5,%%mm5\n\t" \
+  "psubw %%mm1,%%mm2\n\t" \
+  "psubw %%mm7,%%mm4\n\t" \
+  "psubw %%mm6,%%mm5\n\t" \
+  /*Stage 4: \
+    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
+    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
+    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
+    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
+  "psubw %%mm0,%%mm7\n\t" \
+  "psubw %%mm1,%%mm6\n\t" \
+  "psubw %%mm2,%%mm5\n\t" \
+  "psubw %%mm3,%%mm4\n\t" \
+  "paddw %%mm0,%%mm0\n\t" \
+  "paddw %%mm1,%%mm1\n\t" \
+  "paddw %%mm2,%%mm2\n\t" \
+  "paddw %%mm3,%%mm3\n\t" \
+  "paddw %%mm7,%%mm0\n\t" \
+  "paddw %%mm6,%%mm1\n\t" \
+  "paddw %%mm5,%%mm2\n\t" \
+  "paddw %%mm4,%%mm3\n\t" \
+
+#define OC_IDCT_8x8_10_ABC \
+  "#OC_IDCT_8x8_10_ABC\n\t" \
+  /*Stage 1:*/ \
+  /*2-3 rotation by 6pi/16. \
+    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
+  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm6\n\t" \
+  "pmulhw %%xmm2,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
+  "paddw %%xmm6,%%xmm2\n\t" \
+  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
+  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+  /*5-6 rotation by 3pi/16. \
+    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
+  "pmulhw %%xmm3,%%xmm5\n\t" \
+  "pmulhw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
+  "paddw %%xmm3,%%xmm5\n\t" \
+  "paddw %%xmm3,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
+  /*4-7 rotation by 7pi/16. \
+    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
+  "pmulhw %%xmm1,%%xmm3\n\t" \
+  "pmulhw %%xmm1,%%xmm7\n\t" \
+  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
+  "movdqa %%xmm3,%%xmm6\n\t" \
+  "paddw %%xmm1,%%xmm7\n\t" \
+  /*0-1 butterfly. \
+    xmm4=C4, xmm0=X0, X4=0.*/ \
+  /*Stage 2:*/ \
+  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
+    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
+  "psubw %%xmm5,%%xmm3\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "movdqa %%xmm4,%%xmm1\n\t" \
+  "pmulhw %%xmm0,%%xmm4\n\t" \
+  "paddw %%xmm0,%%xmm4\n\t" \
+  "movdqa %%xmm7,%%xmm0\n\t" \
+  "movdqa %%xmm4,%%xmm5\n\t" \
+  "paddw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm2,%%xmm7\n\t" \
+  "movdqa %%xmm1,%%xmm2\n\t" \
+  "pmulhw %%xmm6,%%xmm1\n\t" \
+  "pmulhw %%xmm7,%%xmm2\n\t" \
+  "paddw %%xmm6,%%xmm1\n\t" \
+  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
+  "paddw %%xmm7,%%xmm2\n\t" \
+  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
+  /*Stage 3: \
+    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
+    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
+    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
+  "paddw %%xmm2,%%xmm1\n\t" \
+  "paddw %%xmm5,%%xmm6\n\t" \
+  "paddw %%xmm4,%%xmm7\n\t" \
+  "paddw %%xmm2,%%xmm2\n\t" \
+  "paddw %%xmm4,%%xmm4\n\t" \
+  "paddw %%xmm5,%%xmm5\n\t" \
+  "psubw %%xmm1,%%xmm2\n\t" \
+  "psubw %%xmm7,%%xmm4\n\t" \
+  "psubw %%xmm6,%%xmm5\n\t" \
+
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+  OC_ALIGN16(ogg_int16_t buf[16]);
+  /*Accepts a pre-transposed 8x8 matrix; reads only 4 values from rows 0-3.*/
+  __asm__ __volatile__(
+    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
+    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
+    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
+    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
+    OC_IDCT_8x8_10_MMX
+    OC_TRANSPOSE_8x4_MMX2SSE
+    OC_IDCT_8x8_10_ABC
+    OC_IDCT_8x8_D_STORE
+    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+  );
+  /*Clear input data for next block (decoder only).*/
+  __asm__ __volatile__(
+    "pxor %%mm0,%%mm0\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+    :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+  );
+}
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to orthonormal
+   version of the transform.*/
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     use a smaller transform when the block ends with a long zero run instead
+     of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Then perform the iDCT.*/
+  if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
+  else oc_idct8x8_slow_sse2(_y,_x);
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86/sse2trans.h b/thirdparty/libtheora/x86/sse2trans.h
new file mode 100644
index 0000000000..e76da5140b
--- /dev/null
+++ b/thirdparty/libtheora/x86/sse2trans.h
@@ -0,0 +1,242 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_sse2trans_H)
+# define _x86_sse2trans_H (1)
+# include "x86int.h"
+
+# if defined(OC_X86_64_ASM)
+/*On x86-64 we can transpose in-place without spilling registers.
+  By clever choices of the order to apply the butterflies and the order of
+   their outputs, we can take the rows in order and output the columns in order
+   without any extra operations and using just one temporary register.*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm8 is free.*/ \
+
+# else
+/*Otherwise, we need to spill some values to %[buf] temporarily.
+  Again, the butterflies are carefully arranged to get the columns to come out
+   in order, minimizing register spills and maximizing the delay between a load
+   and when the value loaded is actually used.*/
+#  define OC_TRANSPOSE_8x8 \
+ "#OC_TRANSPOSE_8x8\n\t" \
+ /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm2,%%xmm0\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm0\n\t" \
+ /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
+ /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm6,%%xmm2\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm2\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm4,%%xmm7\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm7\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm3,%%xmm5\n\t" \
+ /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm3\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm2,%%xmm7\n\t" \
+ /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
+ /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm3,%%xmm1\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm0,%%xmm3\n\t" \
+ /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm0,%%xmm1\n\t" \
+ /*xmm0 is free.*/ \
+ "movdqa %%xmm4,%%xmm0\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm0\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm5,%%xmm6\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm6\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm0,%%xmm1\n\t" \
+ /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
+ /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm7,%%xmm5\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm7,%%xmm4\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm0,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm0,%%xmm7\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
+
+# endif
+
+/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
+   four SSE registers.
+  No need to be clever here; we have plenty of room.*/
+#  define OC_TRANSPOSE_8x4_MMX2SSE \
+ "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
+ "movq2dq %%mm0,%%xmm0\n\t" \
+ "movq2dq %%mm1,%%xmm1\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ "movq2dq %%mm2,%%xmm3\n\t" \
+ "movq2dq %%mm3,%%xmm2\n\t" \
+ /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm2,%%xmm3\n\t" \
+ "movq2dq %%mm4,%%xmm4\n\t" \
+ "movq2dq %%mm5,%%xmm5\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ "movq2dq %%mm6,%%xmm7\n\t" \
+ "movq2dq %%mm7,%%xmm6\n\t" \
+ /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm0,%%xmm2\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm3,%%xmm0\n\t" \
+ /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm3,%%xmm2\n\t" \
+ "movdqa %%xmm4,%%xmm5\n\t" \
+ /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm7,%%xmm4\n\t" \
+ /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm7,%%xmm5\n\t" \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm4,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm2,%%xmm3\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm5,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm5,%%xmm3\n\t" \
+
+#endif
diff --git a/thirdparty/libtheora/x86/x86cpu.c b/thirdparty/libtheora/x86/x86cpu.c
new file mode 100644
index 0000000000..49fd76d0ac
--- /dev/null
+++ b/thirdparty/libtheora/x86/x86cpu.c
@@ -0,0 +1,182 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+  Originally written by Rudolf Marek.
+
+ function:
+  last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+  return 0; /*OC_X86_ASM is not defined: report no capabilities.*/
+}
+#else
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+   compiling with -fPIC.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "cpuid\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# else
+/*On x86-32, not so much; we swap %ebx with another register around cpuid.*/
+#  define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+  __asm__ __volatile__( \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   "cpuid\n\t" \
+   "xchgl %%ebx,%[ebx]\n\t" \
+   :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+   :"a"(_op) \
+   :"cc" \
+  )
+# endif
+/*Maps the cpuid function 1 feature bits in _edx/_ecx to OC_CPU_X86_* flags.*/
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*If there isn't even MMX, give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3; /*Bit 9; bit 8 is TM2.*/
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t caps;
+  /*MMX is the baseline: without it, report no capabilities at all.*/
+  if((_edx&0x00800000)==0)return 0;
+  caps=OC_CPU_X86_MMX;
+  caps|=(_edx&0x00400000)?OC_CPU_X86_MMXEXT:0;
+  caps|=(_edx&0x80000000)?OC_CPU_X86_3DNOW:0;
+  caps|=(_edx&0x40000000)?OC_CPU_X86_3DNOWEXT:0;
+  caps|=(_ecx&0x00000040)?OC_CPU_X86_SSE4A:0;
+  caps|=(_ecx&0x00000800)?OC_CPU_X86_SSE5:0;
+  return caps;
+}
+/*Detects the host processor's capabilities as a mask of OC_CPU_X86_* flags.*/
+ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
+  ogg_uint32_t eax;
+  ogg_uint32_t ebx;
+  ogg_uint32_t ecx;
+  ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+  /*Not all x86-32 chips have cpuid: see if EFLAGS bit 21 can be toggled.*/
+  __asm__ __volatile__(
+   "pushfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "movl %[a],%[b]\n\t"
+   "xorl $0x200000,%[a]\n\t"
+   "pushl %[a]\n\t"
+   "popfl\n\t"
+   "pushfl\n\t"
+   "popl %[a]\n\t"
+   "popfl\n\t"
+   :[a]"=r"(eax),[b]"=r"(ebx)
+   :
+   :"cc"
+  );
+  /*No cpuid.*/
+  if(eax==ebx)return 0;
+# endif
+  cpuid(0,eax,ebx,ecx,edx);
+  /*         l e t n          I e n i          u n e G*/
+  if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+   /*      6 8 x M          T e n i          u n e G*/
+   ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+    int family;
+    int model;
+    /*Intel, Transmeta (tested with Crusoe TM5800):*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    family=(eax>>8)&0xF;
+    model=(eax>>4)&0xF;
+    /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+       unit, so don't use it.*/
+    if(family==6&&(model==9||model==13||model==14)){
+      flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+    }
+  }
+  /*              D M A c          i t n e          h t u A*/
+  else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+   /*      C S N            y b   e          d o e G*/
+   ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+    /*AMD, Geode:*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax<0x80000001)flags=0;
+    else{
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      flags=oc_parse_amd_flags(edx,ecx);
+    }
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
+  }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    A special detection method can identify such processors, but in my opinion,
+     if the user really wants to change it, they deserve what they get.*/
+  /*              s l u a          H r u a          t n e C*/
+  else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+    /*VIA:*/
+    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+       chips (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags=oc_parse_intel_flags(edx,ecx);
+    /*eax holds the processor signature; fetch the highest extended function.*/
+    cpuid(0x80000000,eax,ebx,ecx,edx);
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us they will all be zero (at least on C7 and Isaiah).
+        In the (unlikely) event a future processor uses bits 22, 23, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
+    }
+  }
+  else{
+    /*Implement me.*/
+    flags=0;
+  }
+  return flags;
+}
+#endif
diff --git a/thirdparty/libtheora/x86/x86cpu.h b/thirdparty/libtheora/x86/x86cpu.h
new file mode 100644
index 0000000000..e0192d52e2
--- /dev/null
+++ b/thirdparty/libtheora/x86/x86cpu.h
@@ -0,0 +1,36 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+ function:
+    last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86cpu_H)
+# define _x86_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
diff --git a/thirdparty/libtheora/x86/x86enc.c b/thirdparty/libtheora/x86/x86enc.c
index 43b7be3ea3..ffa9c14a42 100644
--- a/thirdparty/libtheora/x86/x86enc.c
+++ b/thirdparty/libtheora/x86/x86enc.c
@@ -18,32 +18,46 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
   ogg_uint32_t cpu_flags;
-  cpu_flags=oc_cpu_flags_get();
-  oc_enc_vtable_init_c(_enc);
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
   if(cpu_flags&OC_CPU_X86_MMX){
     _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
     _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
     _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
-    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
   }
   if(cpu_flags&OC_CPU_X86_MMXEXT){
     _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
     _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
     _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
-    _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext;
-    _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
     _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
     _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
   }
   if(cpu_flags&OC_CPU_X86_SSE2){
-# if defined(OC_X86_64_ASM)
-    /*_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/
+#  if defined(OC_X86_64_ASM)
+    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+#  endif
+    _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
+    _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
+    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
+    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
+    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
+    _enc->opt_vtable.quantize=oc_enc_quantize_sse2;
+# else
+    (void) cpu_flags;
 # endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+# if defined(OC_ENC_USE_VTABLE)
   }
+# endif
 }
 #endif
diff --git a/thirdparty/libtheora/x86/x86enc.h b/thirdparty/libtheora/x86/x86enc.h
index 06c3908bcd..c258247d67 100644
--- a/thirdparty/libtheora/x86/x86enc.h
+++ b/thirdparty/libtheora/x86/x86enc.h
@@ -17,11 +17,62 @@
 
 #if !defined(_x86_x86enc_H)
 # define _x86_x86enc_H (1)
-# include "../encint.h"
 # include "x86int.h"
 
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+# if defined(OC_X86_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_x86
+#  if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+  If the best routine we have available only needs SSE2 (which at the moment
+   covers all of them), then we can avoid runtime detection and the indirect
+   call.*/
+#   define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \
+  oc_enc_frag_sub_mmx(_diff,_x,_y,_stride)
+#   define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \
+  oc_enc_frag_sub_128_mmx(_diff,_x,_stride)
+#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_sad_mmxext(_src,_ref,_ystride)
+#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
+  oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh)
+#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
+  oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh)
+#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
+  oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride)
+#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
+  oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride)
+#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
+  oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride)
+#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
+  oc_enc_frag_ssd_sse2(_src,_ref,_ystride)
+#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
+  oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask)
+#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
+  oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride)
+#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
+  oc_enc_enquant_table_init_x86(_enquant,_dequant)
+#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
+  oc_enc_enquant_table_fixup_x86(_enquant,_nqis)
+#  define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
+  oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant)
+#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+#   define oc_enc_fdct8x8(_enc,_y,_x) \
+  oc_enc_fdct8x8_x86_64sse2(_y,_x)
+#  else
+#   define OC_ENC_USE_VTABLE (1)
+#  endif
+# endif
+
+# include "../encint.h"
 
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
+
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
 unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
  const unsigned char *_ref,int _ystride);
 unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
@@ -29,19 +80,35 @@ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
  unsigned _thresh);
-unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref,int _ystride,unsigned _thresh);
-unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src,
- const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
- unsigned _thresh);
-unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride);
-void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
- const unsigned char *_x,const unsigned char *_y,int _stride);
-void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
- const unsigned char *_x,int _stride);
+unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
+unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
+void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
+ const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
 void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
  const unsigned char *_src1,const unsigned char *_src2,int _ystride);
-void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+
+# if defined(OC_X86_64_ASM)
 void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+# endif
 
 #endif
diff --git a/thirdparty/libtheora/x86/x86enquant.c b/thirdparty/libtheora/x86/x86enquant.c
new file mode 100644
index 0000000000..39477ecc21
--- /dev/null
+++ b/thirdparty/libtheora/x86/x86enquant.c
@@ -0,0 +1,149 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+
+ ********************************************************************/
+
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+
+
+/*The default enquant table is not quite suitable for SIMD purposes.
+  First, the m and l parameters need to be separated so that an entire row full
+   of m's or l's can be loaded at a time.
+  Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
+   emulate one with a multiply.
+  Therefore we translate the shift count l into the scale factor 1<<(16-l).*/
+void oc_enc_enquant_table_init_x86(void *_enquant,
+ const ogg_uint16_t _dequant[64]){
+  ogg_int16_t *m;
+  ogg_int16_t *l;
+  int          zzi;
+  m=(ogg_int16_t *)_enquant;
+  l=m+64;
+  for(zzi=0;zzi<64;zzi++){
+    oc_iquant q;
+    oc_iquant_init(&q,_dequant[zzi]);
+    m[zzi]=q.m;
+    /*q.l must be at least 2 for this to work; fortunately, once all the scale
+       factors are baked in, the minimum quantizer is much larger than that.*/
+    l[zzi]=1<<(16-q.l);
+  }
+}
+
+void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
+  int pli;
+  int qii;
+  int qti;
+  /*Copy entries 0 and 64 of each qii 0 table into the tables for qii>0.*/
+  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
+    ogg_int16_t *dst=(ogg_int16_t *)_enquant[pli][qii][qti];
+    dst[0]=((const ogg_int16_t *)_enquant[pli][0][qti])[0];
+    dst[64]=((const ogg_int16_t *)_enquant[pli][0][qti])[64];
+  }
+}
+/*Quantizes _dct into _qdct; returns the index of its last non-zero value.*/
+int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant){
+  ptrdiff_t r;
+  __asm__ __volatile__(
+    "xor %[r],%[r]\n\t"
+    /*Loop through two rows (32 bytes) at a time, until all 128 bytes are done.*/
+    ".p2align 4\n\t"
+    "0:\n\t"
+    /*Load the first two rows of the data and the quant matrices.*/
+    "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
+    "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
+    "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
+    "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
+    "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
+    "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
+    /*Double the input and propagate its sign to the rounding factor.
+      Using SSSE3's psignw would help here, but we need the mask later anyway.*/
+    "movdqa %%xmm0,%%xmm6\n\t"
+    "psraw $15,%%xmm0\n\t"
+    "movdqa %%xmm1,%%xmm7\n\t"
+    "paddw %%xmm6,%%xmm6\n\t"
+    "psraw $15,%%xmm1\n\t"
+    "paddw %%xmm7,%%xmm7\n\t"
+    "paddw %%xmm0,%%xmm2\n\t"
+    "paddw %%xmm1,%%xmm3\n\t"
+    "pxor %%xmm0,%%xmm2\n\t"
+    "pxor %%xmm1,%%xmm3\n\t"
+    /*Add the rounding factor and perform the first multiply.*/
+    "paddw %%xmm2,%%xmm6\n\t"
+    "paddw %%xmm3,%%xmm7\n\t"
+    "pmulhw %%xmm6,%%xmm4\n\t"
+    "pmulhw %%xmm7,%%xmm5\n\t"
+    "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
+    "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
+    "paddw %%xmm4,%%xmm6\n\t"
+    "paddw %%xmm5,%%xmm7\n\t"
+    /*Emulate an element-wise right-shift via a second multiply.*/
+    "pmulhw %%xmm2,%%xmm6\n\t"
+    "pmulhw %%xmm3,%%xmm7\n\t"
+    "add $32,%[r]\n\t"
+    "cmp $96,%[r]\n\t"
+    /*Correct for the sign.*/
+    "psubw %%xmm0,%%xmm6\n\t"
+    "psubw %%xmm1,%%xmm7\n\t"
+    /*Save the result.*/
+    "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
+    "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
+    "jle 0b\n\t"
+    /*Now find the location of the last non-zero value; try rows 4-7 first.*/
+    "movdqa 0x50(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x40(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pxor %%xmm0,%%xmm0\n\t"
+    "mov $-1,%k[dq]\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
+    "mov $32,%[r]\n\t"
+    /*We have to use xor here instead of not in order to set the flags.*/
+    "xor %k[dq],%k[q]\n\t"
+    "jnz 1f\n\t"
+    "movdqa 0x30(%[qdct]),%%xmm7\n\t"
+    "movdqa 0x20(%[qdct]),%%xmm6\n\t"
+    "movdqa 0x10(%[qdct]),%%xmm5\n\t"
+    "movdqa 0x00(%[qdct]),%%xmm4\n\t"
+    "packsswb %%xmm7,%%xmm6\n\t"
+    "packsswb %%xmm5,%%xmm4\n\t"
+    "pcmpeqb %%xmm0,%%xmm6\n\t"
+    "pcmpeqb %%xmm0,%%xmm4\n\t"
+    "pmovmskb %%xmm6,%k[q]\n\t"
+    "pmovmskb %%xmm4,%k[r]\n\t"
+    "shl $16,%k[q]\n\t"
+    "or %k[r],%k[q]\n\t"
+    "xor %[r],%[r]\n\t"
+    "not %k[q]\n\t"
+    "or $1,%k[q]\n\t" /*Keeps bsr's input non-zero.*/
+    "1:\n\t"
+    "bsr %k[q],%k[q]\n\t"
+    "add %k[q],%k[r]\n\t"
+    :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
+    :[dct]"r"(_dct),[qdct]"r"(_qdct)
+    :"cc","memory"
+  );
+  return (int)r;
+}
+
+#endif
diff --git a/thirdparty/libtheora/x86/x86int.h b/thirdparty/libtheora/x86/x86int.h
index ede724f5aa..ceb2dbb0ec 100644
--- a/thirdparty/libtheora/x86/x86int.h
+++ b/thirdparty/libtheora/x86/x86int.h
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,24 +19,104 @@
 # define _x86_x86int_H (1)
 # include "../internal.h"
 
-void oc_state_vtable_init_x86(oc_theora_state *_state);
+# if defined(OC_X86_ASM)
+#  define oc_state_accel_init oc_state_accel_init_x86
+#  if defined(OC_X86_64_ASM)
+/*x86-64 guarantees SIMD support up through at least SSE2.
+  If the best routine we have available only needs SSE2 (which at the moment
+   covers all of them), then we can avoid runtime detection and the indirect
+   call.*/
+#   define oc_frag_copy(_state,_dst,_src,_ystride) \
+  oc_frag_copy_mmx(_dst,_src,_ystride)
+#   define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+  oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
+   _fragis,_nfragis,_frag_buf_offs)
+#   define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
+  oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
+#   define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
+  oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
+#   define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
+  oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
+#   define oc_idct8x8(_state,_y,_x,_last_zzi) \
+  oc_idct8x8_sse2(_y,_x,_last_zzi)
+#   define oc_state_frag_recon oc_state_frag_recon_mmx
+#   define oc_loop_filter_init(_state,_bv,_flimit) \
+  oc_loop_filter_init_mmxext(_bv,_flimit)
+#   define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
+#   define oc_restore_fpu(_state) \
+  oc_restore_fpu_mmx()
+#  else
+#   define OC_STATE_USE_VTABLE (1)
+#  endif
+# endif
+
+# include "../state.h"
+# include "x86cpu.h"
+
+/*Converts the expression in the argument to a string.*/
+#define OC_M2STR(_s) #_s
+
+/*Memory operands do not always include an offset.
+  To avoid warnings, we force an offset with %H (which adds 8).*/
+# if __GNUC_PREREQ(4,0)
+#  define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs-8+%H[_name])
+# endif
+/*If your gcc version doesn't support %H, then you get to suffer the warnings.
+  Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
+   whole offset, instead of substituting in 0 for the missing operand to +.*/
+# if !defined(OC_MEM_OFFS)
+#  define OC_MEM_OFFS(_offs,_name) \
+  OC_M2STR(_offs+%[_name])
+# endif
+
+/*Declare an array operand with an exact size.
+  This tells gcc we're going to clobber this memory region, without having to
+   clobber all of "memory" and lets us access local buffers directly using the
+   stack pointer, without allocating a separate register to point to them.*/
+#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
+    array_addr__; \
+  }))
+
+/*Declare a read-only array operand with an exact size.
+  This tells gcc we're only going to read this memory region, without having to
+   clobber all of "memory" and lets us access local buffers directly using the
+   stack pointer, without allocating a separate register to point to them.*/
+#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
+  (*({ \
+    const struct{_type array_value__[(_size)];} *array_addr__= \
+     (const void *)(_ptr); \
+    array_addr__; \
+  }))
+
+extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
+
+void oc_state_accel_init_x86(oc_theora_state *_state);
 
 void oc_frag_copy_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue);
 void oc_frag_recon_inter_mmx(unsigned char *_dst,
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);
 
 #endif
diff --git a/thirdparty/libtheora/x86/x86state.c b/thirdparty/libtheora/x86/x86state.c
index a786bec284..9f8bceb534 100644
--- a/thirdparty/libtheora/x86/x86state.c
+++ b/thirdparty/libtheora/x86/x86state.c
@@ -11,7 +11,7 @@
  ********************************************************************
 
   function:
-    last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $
+    last mod: $Id$
 
  ********************************************************************/
 
@@ -19,8 +19,7 @@
 
 #if defined(OC_X86_ASM)
 
-#include "../cpu.c"
-
+#if defined(OC_STATE_USE_VTABLE)
 /*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
    each quadrant of the destination.*/
 static const unsigned char OC_FZIG_ZAG_MMX[128]={
@@ -39,24 +38,60 @@ static const unsigned char OC_FZIG_ZAG_MMX[128]={
   64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
+};
+#endif
+
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+   the destination; entries 64...127 all map to index 64.*/
+static const unsigned char OC_FZIG_ZAG_SSE2[128]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3, 4,11,18,25,32,40,
+  33,26,19,12, 5, 6,13,20,
+  27,34,41,48,56,49,42,35,
+  28,21,14, 7,15,22,29,36,
+  43,50,57,58,51,44,37,30,
+  23,31,38,45,52,59,60,53,
+  46,39,47,54,61,62,55,63,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
   64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64,
+  64,64,64,64,64,64,64,64
 };
 
-void oc_state_vtable_init_x86(oc_theora_state *_state){
+void oc_state_accel_init_x86(oc_theora_state *_state){
+  oc_state_accel_init_c(_state);
   _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
   if(_state->cpu_flags&OC_CPU_X86_MMX){
     _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
     _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
-    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
     _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
   }
-  else oc_state_vtable_init_c(_state);
+  if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
+    _state->opt_vtable.state_loop_filter_frag_rows=
+     oc_state_loop_filter_frag_rows_mmxext;
+  }
+  if(_state->cpu_flags&OC_CPU_X86_SSE2){
+    _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
+# endif
+    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
+# if defined(OC_STATE_USE_VTABLE)
+  }
+# endif
 }
 #endif
diff --git a/thirdparty/libtheora/x86/x86zigzag.h b/thirdparty/libtheora/x86/x86zigzag.h
new file mode 100644
index 0000000000..fb21e0bb43
--- /dev/null
+++ b/thirdparty/libtheora/x86/x86zigzag.h
@@ -0,0 +1,244 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86zigzag_H)
+# define _x86_x86zigzag_H (1)
+# include "x86enc.h"
+
+
+/*Converts DCT coefficients from transposed order into zig-zag scan order and
+   stores them in %[y].
+  This relies on two macros to load the contents of each row:
+   OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
+   the first four and second four entries of each row into the specified
+   register, respectively.
+  OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
+   (because when the rows are already in SSE2 registers, loading the high half
+   destructively modifies the register).
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  8  1  2   9 16 24 17 B
+    C 10  3  4 11  18 25 32 40 E
+    F 33 26 19 12   5  6 13 20 D
+    G 27 34 41 48  56 49 42 35 I
+    L 28 21 14  7  15 22 29 36 M
+    H 43 50 57 58  51 44 37 30 O
+    N 23 31 38 45  52 59 60 53 J
+    P 46 39 47 54  61 62 55 63 K
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation; mm0...mm7 are overwritten.*/
+#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
+  OC_ZZ_LOAD_ROW_LO(0,"%%mm0")   /*mm0=03 02 01 00*/ \
+  OC_ZZ_LOAD_ROW_LO(1,"%%mm1")   /*mm1=11 10 09 08*/ \
+  OC_ZZ_LOAD_ROW_LO(2,"%%mm2")   /*mm2=19 18 17 16*/ \
+  OC_ZZ_LOAD_ROW_LO(3,"%%mm3")   /*mm3=27 26 25 24*/ \
+  OC_ZZ_LOAD_ROW_HI(0,"%%mm4")   /*mm4=07 06 05 04*/ \
+  OC_ZZ_LOAD_ROW_HI(1,"%%mm5")   /*mm5=15 14 13 12*/ \
+  OC_ZZ_LOAD_ROW_HI(2,"%%mm6")   /*mm6=23 22 21 20*/ \
+  "movq %%mm0,%%mm7\n\t"         /*mm7=03 02 01 00*/ \
+  "punpckhdq %%mm1,%%mm0\n\t"    /*mm0=11 10 03 02*/ \
+  "pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
+  "punpcklwd %%mm0,%%mm1\n\t"    /*mm1=03 09 02 08*/ \
+  "pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
+  "punpcklwd %%mm1,%%mm7\n\t"    /*mm7=02 01 08 00 *A*/ \
+  "movq %%mm7,0x00(%[y])\n\t" \
+  "punpckhwd %%mm4,%%mm1\n\t"    /*mm1=04 03 07 09*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=19 18 17 16*/ \
+  "punpckhdq %%mm1,%%mm0\n\t"    /*mm0=04 03 11 10*/ \
+  "punpckhwd %%mm5,%%mm7\n\t"    /*mm7=12 19 15 18*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=25 07 24 09*/ \
+  "punpcklwd %%mm6,%%mm5\n\t"    /*mm5=21 14 20 13*/ \
+  "punpcklwd %%mm2,%%mm1\n\t"    /*mm1=17 24 16 09 *B*/ \
+  OC_ZZ_LOAD_ROW_LO(4,"%%mm2")   /*mm2=35 34 33 32*/ \
+  "movq %%mm1,0x08(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_LO(5,"%%mm1")   /*mm1=43 42 41 40*/ \
+  "pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
+  "movq %%mm0,0x10(%[y])\n\t" \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=?? 07 23 22*/ \
+  "punpckldq %%mm5,%%mm4\n\t"    /*mm4=20 13 06 05 *D*/ \
+  "movq %%mm4,0x28(%[y])\n\t" \
+  "psrlq $16,%%mm3\n\t"          /*mm3=.. 27 26 25*/ \
+  "pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
+  "movq %%mm7,%%mm4\n\t"         /*mm4=12 19 15 18*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=26 33 25 32*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=41 15 40 18*/ \
+  "punpckhwd %%mm1,%%mm3\n\t"    /*mm3=43 .. 42 27*/ \
+  "punpckldq %%mm2,%%mm4\n\t"    /*mm4=25 32 40 18*/ \
+  "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=35 42 34 27*/ \
+  OC_ZZ_LOAD_ROW_LO(6,"%%mm0")   /*mm0=51 50 49 48*/ \
+  "pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
+  "movq %%mm4,0x18(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_LO(7,"%%mm4")   /*mm4=59 58 57 56*/ \
+  "punpckhdq %%mm7,%%mm2\n\t"    /*mm2=12 19 26 33 *F*/ \
+  "movq %%mm2,0x20(%[y])\n\t" \
+  "pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
+  "pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
+  "movq %%mm3,%%mm2\n\t"         /*mm2=35 42 34 27*/ \
+  "punpckhwd %%mm0,%%mm1\n\t"    /*mm1=50 43 48 41*/ \
+  "pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=48 41 34 27 *G*/ \
+  "movq %%mm3,0x30(%[y])\n\t" \
+  "punpckhdq %%mm4,%%mm1\n\t"    /*mm1=58 57 50 43 *H*/ \
+  "movq %%mm1,0x50(%[y])\n\t" \
+  OC_ZZ_LOAD_ROW_HI(7,"%%mm1")   /*mm1=63 62 61 60*/ \
+  "punpcklwd %%mm0,%%mm4\n\t"    /*mm4=49 56 51 59*/ \
+  OC_ZZ_LOAD_ROW_HI(6,"%%mm0")   /*mm0=55 54 53 52*/ \
+  "psllq $16,%%mm6\n\t"          /*mm6=07 23 22 ..*/ \
+  "movq %%mm4,%%mm3\n\t"         /*mm3=49 56 51 59*/ \
+  "punpckhdq %%mm2,%%mm4\n\t"    /*mm4=35 42 49 56 *I*/ \
+  OC_ZZ_LOAD_ROW_HI(3,"%%mm2")   /*mm2=31 30 29 28*/ \
+  "movq %%mm4,0x38(%[y])\n\t" \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=61 51 60 59*/ \
+  "punpcklwd %%mm6,%%mm7\n\t"    /*mm7=22 15 .. ??*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=61 51 60 59*/ \
+  "punpcklwd %%mm0,%%mm3\n\t"    /*mm3=53 60 52 59*/ \
+  "punpckhwd %%mm0,%%mm4\n\t"    /*mm4=55 61 54 51*/ \
+  OC_ZZ_LOAD_ROW_HI(4,"%%mm0")   /*mm0=39 38 37 36*/ \
+  "pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
+  "movq %%mm3,0x68(%[y])\n\t" \
+  "movq %%mm4,%%mm3\n\t"         /*mm3=?? ?? 54 51*/ \
+  "pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
+  "punpckhwd %%mm1,%%mm4\n\t"    /*mm4=63 55 62 61 *K*/ \
+  OC_ZZ_LOAD_ROW_HI(5,"%%mm1")   /*mm1=47 46 45 44*/ \
+  "movq %%mm4,0x78(%[y])\n\t" \
+  "punpckhwd %%mm2,%%mm6\n\t"    /*mm6=28 07 31 23*/ \
+  "punpcklwd %%mm0,%%mm2\n\t"    /*mm2=37 30 36 29*/ \
+  "punpckhdq %%mm6,%%mm5\n\t"    /*mm5=28 07 21 14*/ \
+  "pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
+  "pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
+  "movq %%mm5,0x40(%[y])\n\t" \
+  "punpckhdq %%mm2,%%mm7\n\t"    /*mm7=36 29 22 15 *M*/ \
+  "movq %%mm7,0x48(%[y])\n\t" \
+  "pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
+  "punpckhwd %%mm1,%%mm0\n\t"    /*mm0=46 39 45 38*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=47 54 44 51*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=45 38 31 23 *N*/ \
+  "movq %%mm6,0x60(%[y])\n\t" \
+  "punpckhdq %%mm3,%%mm0\n\t"    /*mm0=47 54 46 39*/ \
+  "punpckldq %%mm2,%%mm3\n\t"    /*mm3=30 37 44 51 *O*/ \
+  "movq %%mm3,0x58(%[y])\n\t" \
+  "pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
+  "movq %%mm0,0x70(%[y])\n\t" \
+
+/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
+   order and stores them in %[qdct].
+  The index of each output element in the original 64-element array should wind
+   up in the following 8x8 matrix (the letters indicate the order we compute
+   each 4-tuple below):
+    A  0  1  8 16   9  2  3 10 B
+    D 17 24 32 25  18 11  4  5 C
+    E 12 19 26 33  40 48 41 34 I
+    H 27 20 13  6   7 14 21 28 G
+    K 35 42 49 56  57 50 43 36 J
+    F 29 22 15 23  30 37 44 51 M
+    P 58 59 52 45  38 31 39 46 L
+    N 53 60 61 54  47 55 62 63 O
+  The order of the coefficients within each tuple is reversed in the comments
+   below to reflect the usual MSB to LSB notation; mm0...mm7 are overwritten.*/
+#define OC_ZIG_ZAG_MMXEXT \
+  "movq 0x00(%[dct]),%%mm0\n\t"  /*mm0=03 02 01 00*/ \
+  "movq 0x08(%[dct]),%%mm1\n\t"  /*mm1=07 06 05 04*/ \
+  "movq 0x10(%[dct]),%%mm2\n\t"  /*mm2=11 10 09 08*/ \
+  "movq 0x20(%[dct]),%%mm3\n\t"  /*mm3=19 18 17 16*/ \
+  "movq 0x30(%[dct]),%%mm4\n\t"  /*mm4=27 26 25 24*/ \
+  "movq 0x40(%[dct]),%%mm5\n\t"  /*mm5=35 34 33 32*/ \
+  "movq %%mm2,%%mm7\n\t"         /*mm7=11 10 09 08*/ \
+  "punpcklwd %%mm3,%%mm2\n\t"    /*mm2=17 09 16 08*/ \
+  "movq %%mm0,%%mm6\n\t"         /*mm6=03 02 01 00*/ \
+  "punpckldq %%mm2,%%mm0\n\t"    /*mm0=16 08 01 00 *A*/ \
+  "movq %%mm0,0x00(%[qdct])\n\t" \
+  "movq 0x18(%[dct]),%%mm0\n\t"  /*mm0=15 14 13 12*/ \
+  "punpckhdq %%mm6,%%mm6\n\t"    /*mm6=03 02 03 02*/ \
+  "psrlq $16,%%mm7\n\t"          /*mm7=.. 11 10 09*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=10 09 03 02*/ \
+  "punpckhwd %%mm7,%%mm3\n\t"    /*mm3=.. 19 11 18*/ \
+  "pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
+  "movq %%mm6,0x08(%[qdct])\n\t" \
+  "psrlq $48,%%mm2\n\t"          /*mm2=.. .. .. 17*/ \
+  "movq %%mm1,%%mm6\n\t"         /*mm6=07 06 05 04*/ \
+  "punpcklwd %%mm5,%%mm2\n\t"    /*mm2=33 .. 32 17*/ \
+  "movq %%mm3,%%mm7\n\t"         /*mm7=.. 19 11 18*/ \
+  "punpckldq %%mm1,%%mm3\n\t"    /*mm3=05 04 11 18 *C*/ \
+  "por %%mm2,%%mm7\n\t"          /*mm7=33 19 ?? ??*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=25 32 24 17 *D*/ \
+  "movq %%mm2,0x10(%[qdct])\n\t" \
+  "movq %%mm3,0x18(%[qdct])\n\t" \
+  "movq 0x28(%[dct]),%%mm2\n\t"  /*mm2=23 22 21 20*/ \
+  "movq 0x38(%[dct]),%%mm1\n\t"  /*mm1=31 30 29 28*/ \
+  "pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
+  "punpckhdq %%mm7,%%mm7\n\t"    /*mm7=33 19 33 19*/ \
+  "punpckhwd %%mm3,%%mm6\n\t"    /*mm6=14 07 13 06*/ \
+  "punpckldq %%mm0,%%mm0\n\t"    /*mm0=13 12 13 12*/ \
+  "punpcklwd %%mm1,%%mm3\n\t"    /*mm3=29 15 28 12*/ \
+  "punpckhwd %%mm4,%%mm0\n\t"    /*mm0=27 13 26 12*/ \
+  "pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
+  "psrlq $48,%%mm4\n\t"          /*mm4=.. .. .. 27*/ \
+  "punpcklwd %%mm7,%%mm0\n\t"    /*mm0=33 26 19 12 *E*/ \
+  "punpcklwd %%mm1,%%mm4\n\t"    /*mm4=29 .. 28 27*/ \
+  "punpckhwd %%mm2,%%mm3\n\t"    /*mm3=23 15 22 29 *F*/ \
+  "movq %%mm0,0x20(%[qdct])\n\t" \
+  "movq %%mm3,0x50(%[qdct])\n\t" \
+  "movq 0x60(%[dct]),%%mm3\n\t"  /*mm3=51 50 49 48*/ \
+  "movq 0x70(%[dct]),%%mm7\n\t"  /*mm7=59 58 57 56*/ \
+  "movq 0x50(%[dct]),%%mm0\n\t"  /*mm0=43 42 41 40*/ \
+  "punpcklwd %%mm4,%%mm2\n\t"    /*mm2=28 21 27 20*/ \
+  "psrlq $32,%%mm5\n\t"          /*mm5=.. .. 35 34*/ \
+  "movq %%mm2,%%mm4\n\t"         /*mm4=28 21 27 20*/ \
+  "punpckldq %%mm6,%%mm2\n\t"    /*mm2=13 06 27 20*/ \
+  "punpckhdq %%mm4,%%mm6\n\t"    /*mm6=28 21 14 07 *G*/ \
+  "movq %%mm3,%%mm4\n\t"         /*mm4=51 50 49 48*/ \
+  "pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
+  "movq %%mm2,0x30(%[qdct])\n\t" \
+  "movq %%mm6,0x38(%[qdct])\n\t" \
+  "movq 0x48(%[dct]),%%mm2\n\t"  /*mm2=39 38 37 36*/ \
+  "punpcklwd %%mm5,%%mm4\n\t"    /*mm4=35 49 34 48*/ \
+  "movq 0x58(%[dct]),%%mm5\n\t"  /*mm5=47 46 45 44*/ \
+  "punpckldq %%mm7,%%mm6\n\t"    /*mm6=57 56 14 07*/ \
+  "psrlq $32,%%mm3\n\t"          /*mm3=.. .. 51 50*/ \
+  "punpckhwd %%mm0,%%mm6\n\t"    /*mm6=43 57 42 56*/ \
+  "punpcklwd %%mm4,%%mm0\n\t"    /*mm0=34 41 48 40 *I*/ \
+  "pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
+  "movq %%mm0,0x28(%[qdct])\n\t" \
+  "punpcklwd %%mm2,%%mm3\n\t"    /*mm3=37 51 36 50*/ \
+  "punpckhwd %%mm6,%%mm4\n\t"    /*mm4=42 35 56 49*/ \
+  "punpcklwd %%mm3,%%mm6\n\t"    /*mm6=36 43 50 57 *J*/ \
+  "pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
+  "movq %%mm4,0x40(%[qdct])\n\t" \
+  "movq %%mm6,0x48(%[qdct])\n\t" \
+  "movq 0x68(%[dct]),%%mm6\n\t"  /*mm6=55 54 53 52*/ \
+  "movq 0x78(%[dct]),%%mm0\n\t"  /*mm0=63 62 61 60*/ \
+  "psrlq $32,%%mm1\n\t"          /*mm1=.. .. 31 30*/ \
+  "pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
+  "pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
+  "punpcklwd %%mm5,%%mm1\n\t"    /*mm1=46 31 44 30*/ \
+  "pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
+  "punpckhwd %%mm1,%%mm2\n\t"    /*mm2=46 39 31 38 *L*/ \
+  "punpcklwd %%mm3,%%mm1\n\t"    /*mm1=51 44 37 30 *M*/ \
+  "movq %%mm2,0x68(%[qdct])\n\t" \
+  "movq %%mm1,0x58(%[qdct])\n\t" \
+  "punpckhwd %%mm6,%%mm5\n\t"    /*mm5=55 47 52 45*/ \
+  "punpckldq %%mm0,%%mm6\n\t"    /*mm6=61 60 54 53*/ \
+  "pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
+  "pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=54 61 60 53 *N*/ \
+  "punpckhdq %%mm0,%%mm5\n\t"    /*mm5=63 62 55 47 *O*/ \
+  "punpckhdq %%mm4,%%mm7\n\t"    /*mm7=45 52 59 58 *P*/ \
+  "movq %%mm6,0x70(%[qdct])\n\t" \
+  "movq %%mm5,0x78(%[qdct])\n\t" \
+  "movq %%mm7,0x60(%[qdct])\n\t" \
+
+#endif