Diffstat (limited to 'drivers/theora/x86')
-rw-r--r-- | drivers/theora/x86/mmxencfrag.c | 900
-rw-r--r-- | drivers/theora/x86/mmxfdct.c    | 665
-rw-r--r-- | drivers/theora/x86/mmxfrag.c    | 293
-rw-r--r-- | drivers/theora/x86/mmxfrag.h    |  64
-rw-r--r-- | drivers/theora/x86/mmxidct.c    | 564
-rw-r--r-- | drivers/theora/x86/mmxloop.h    | 215
-rw-r--r-- | drivers/theora/x86/mmxstate.c   | 188
-rw-r--r-- | drivers/theora/x86/sse2fdct.c   | 523
-rw-r--r-- | drivers/theora/x86/x86enc.c     |  49
-rw-r--r-- | drivers/theora/x86/x86enc.h     |  47
-rw-r--r-- | drivers/theora/x86/x86int.h     |  42
-rw-r--r-- | drivers/theora/x86/x86state.c   |  62
12 files changed, 0 insertions, 3612 deletions
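
The files removed below are hand-scheduled GCC inline MMX/MMXEXT assembly whose comments lean on a few scalar identities: the pavgb rounding correction (pavgb returns (a+b+1)>>1, so ((a^b)&1) is subtracted to recover the floor average), the SATD merge (|a+b|+|a-b|)/2 = max(|a|,|b|), and the overflow-safe average u=(r&s)+((r^s)>>1) used in the fDCT. The following is a minimal plain-C sketch of those identities only; the helper names are illustrative and do not appear in the deleted sources.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static uint8_t avg_floor(uint8_t a,uint8_t b){
  /*pavgb computes (a+b+1)>>1; subtracting ((a^b)&1) restores (a+b)>>1,
    because the extra rounding bit appears exactly when the low bits of the
    two operands differ.*/
  uint8_t avg_round_up=(uint8_t)((a+b+1)>>1);
  return (uint8_t)(avg_round_up-((a^b)&1));
}

static int max_abs_via_butterfly(int a,int b){
  /*(|a+b|+|a-b|)/2==max(|a|,|b|); the sum is always even, so the division is
    exact. This is what lets the last Hadamard butterfly be merged with the
    absolute-value step without needing pabsw.*/
  return (abs(a+b)+abs(a-b))/2;
}

static int16_t avg16_safe(int16_t r,int16_t s){
  /*(r+s)>>1 can overflow a 16-bit intermediate; (r&s)+((r^s)>>1) computes the
    same floor average without ever forming the full sum.*/
  return (int16_t)((r&s)+((r^s)>>1));
}

int main(void){
  int a;
  int b;
  for(a=0;a<256;a++)for(b=0;b<256;b++){
    assert(avg_floor((uint8_t)a,(uint8_t)b)==((a+b)>>1));
    assert(max_abs_via_butterfly(a-128,b-128)==
     (abs(a-128)>abs(b-128)?abs(a-128):abs(b-128)));
  }
  assert(avg16_safe(30000,30002)==30001);
  return 0;
}
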
diff --git a/drivers/theora/x86/mmxencfrag.c b/drivers/theora/x86/mmxencfrag.c deleted file mode 100644 index c79ff01fcc..0000000000 --- a/drivers/theora/x86/mmxencfrag.c +++ /dev/null @@ -1,900 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ - - ********************************************************************/ -#include <stddef.h> -#include "x86enc.h" - -#if defined(OC_X86_ASM) - -unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride){ - ptrdiff_t ystride3; - ptrdiff_t ret; - __asm__ __volatile__( - /*Load the first 4 rows of each block.*/ - "movq (%[src]),%%mm0\n\t" - "movq (%[ref]),%%mm1\n\t" - "movq (%[src],%[ystride]),%%mm2\n\t" - "movq (%[ref],%[ystride]),%%mm3\n\t" - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" - "movq (%[src],%[ystride],2),%%mm4\n\t" - "movq (%[ref],%[ystride],2),%%mm5\n\t" - "movq (%[src],%[ystride3]),%%mm6\n\t" - "movq (%[ref],%[ystride3]),%%mm7\n\t" - /*Compute their SADs and add them in %%mm0*/ - "psadbw %%mm1,%%mm0\n\t" - "psadbw %%mm3,%%mm2\n\t" - "lea (%[src],%[ystride],4),%[src]\n\t" - "paddw %%mm2,%%mm0\n\t" - "lea (%[ref],%[ystride],4),%[ref]\n\t" - /*Load the next 3 rows as registers become available.*/ - "movq (%[src]),%%mm2\n\t" - "movq (%[ref]),%%mm3\n\t" - "psadbw %%mm5,%%mm4\n\t" - "psadbw %%mm7,%%mm6\n\t" - "paddw %%mm4,%%mm0\n\t" - "movq (%[ref],%[ystride]),%%mm5\n\t" - "movq (%[src],%[ystride]),%%mm4\n\t" - "paddw %%mm6,%%mm0\n\t" - "movq (%[ref],%[ystride],2),%%mm7\n\t" - "movq (%[src],%[ystride],2),%%mm6\n\t" - /*Start adding their SADs to %%mm0*/ - "psadbw %%mm3,%%mm2\n\t" - "psadbw %%mm5,%%mm4\n\t" - "paddw %%mm2,%%mm0\n\t" - "psadbw %%mm7,%%mm6\n\t" - /*Load last row as registers become available.*/ - "movq (%[src],%[ystride3]),%%mm2\n\t" - "movq (%[ref],%[ystride3]),%%mm3\n\t" - /*And finish adding up their SADs.*/ - "paddw %%mm4,%%mm0\n\t" - "psadbw %%mm3,%%mm2\n\t" - "paddw %%mm6,%%mm0\n\t" - "paddw %%mm2,%%mm0\n\t" - "movd %%mm0,%[ret]\n\t" - :[ret]"=a"(ret),[src]"+%r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3) - :[ystride]"r"((ptrdiff_t)_ystride) - ); - return (unsigned)ret; -} - -unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh){ - /*Early termination is for suckers.*/ - return oc_enc_frag_sad_mmxext(_src,_ref,_ystride); -} - -/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the - first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7. - We pre-load the next two rows of data as registers become available.*/ -#define OC_SAD2_LOOP \ - "#OC_SAD2_LOOP\n\t" \ - /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \ - pavgb computes (%%mm0+%%mm1+1>>1). \ - The latter is exactly 1 too large when the low bit of two corresponding \ - bytes is only set in one of them. 
\ - Therefore we pxor the operands, pand to mask out the low bits, and psubb to \ - correct the output of pavgb.*/ \ - "movq %%mm0,%%mm6\n\t" \ - "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \ - "pxor %%mm1,%%mm0\n\t" \ - "pavgb %%mm1,%%mm6\n\t" \ - "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \ - "movq %%mm2,%%mm1\n\t" \ - "pand %%mm7,%%mm0\n\t" \ - "pavgb %%mm3,%%mm2\n\t" \ - "pxor %%mm3,%%mm1\n\t" \ - "movq (%[ref2],%[ystride]),%%mm3\n\t" \ - "psubb %%mm0,%%mm6\n\t" \ - "movq (%[ref1]),%%mm0\n\t" \ - "pand %%mm7,%%mm1\n\t" \ - "psadbw %%mm6,%%mm4\n\t" \ - "movd %[ret],%%mm6\n\t" \ - "psubb %%mm1,%%mm2\n\t" \ - "movq (%[ref2]),%%mm1\n\t" \ - "lea (%[src],%[ystride],2),%[src]\n\t" \ - "psadbw %%mm2,%%mm5\n\t" \ - "movq (%[ref1],%[ystride]),%%mm2\n\t" \ - "paddw %%mm4,%%mm5\n\t" \ - "movq (%[src]),%%mm4\n\t" \ - "paddw %%mm5,%%mm6\n\t" \ - "movq (%[src],%[ystride]),%%mm5\n\t" \ - "movd %%mm6,%[ret]\n\t" \ - -/*Same as above, but does not pre-load the next two rows.*/ -#define OC_SAD2_TAIL \ - "#OC_SAD2_TAIL\n\t" \ - "movq %%mm0,%%mm6\n\t" \ - "pavgb %%mm1,%%mm0\n\t" \ - "pxor %%mm1,%%mm6\n\t" \ - "movq %%mm2,%%mm1\n\t" \ - "pand %%mm7,%%mm6\n\t" \ - "pavgb %%mm3,%%mm2\n\t" \ - "pxor %%mm3,%%mm1\n\t" \ - "psubb %%mm6,%%mm0\n\t" \ - "pand %%mm7,%%mm1\n\t" \ - "psadbw %%mm0,%%mm4\n\t" \ - "psubb %%mm1,%%mm2\n\t" \ - "movd %[ret],%%mm6\n\t" \ - "psadbw %%mm2,%%mm5\n\t" \ - "paddw %%mm4,%%mm5\n\t" \ - "paddw %%mm5,%%mm6\n\t" \ - "movd %%mm6,%[ret]\n\t" \ - -unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh){ - ptrdiff_t ret; - __asm__ __volatile__( - "movq (%[ref1]),%%mm0\n\t" - "movq (%[ref2]),%%mm1\n\t" - "movq (%[ref1],%[ystride]),%%mm2\n\t" - "movq (%[ref2],%[ystride]),%%mm3\n\t" - "xor %[ret],%[ret]\n\t" - "movq (%[src]),%%mm4\n\t" - "pxor %%mm7,%%mm7\n\t" - "pcmpeqb %%mm6,%%mm6\n\t" - "movq (%[src],%[ystride]),%%mm5\n\t" - "psubb %%mm6,%%mm7\n\t" - OC_SAD2_LOOP - OC_SAD2_LOOP - OC_SAD2_LOOP - OC_SAD2_TAIL - :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+%r"(_ref1),[ref2]"+r"(_ref2) - :[ystride]"r"((ptrdiff_t)_ystride) - ); - return (unsigned)ret; -} - -/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their - 16-bit difference in %%mm0...%%mm7.*/ -#define OC_LOAD_SUB_8x4(_off) \ - "#OC_LOAD_SUB_8x4\n\t" \ - "movd "_off"(%[src]),%%mm0\n\t" \ - "movd "_off"(%[ref]),%%mm4\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm1\n\t" \ - "lea (%[src],%[src_ystride],2),%[src]\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \ - "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ - "movd "_off"(%[src]),%%mm2\n\t" \ - "movd "_off"(%[ref]),%%mm7\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm3\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \ - "punpcklbw %%mm4,%%mm0\n\t" \ - "lea (%[src],%[src_ystride],2),%[src]\n\t" \ - "punpcklbw %%mm4,%%mm4\n\t" \ - "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ - "psubw %%mm4,%%mm0\n\t" \ - "movd "_off"(%[src]),%%mm4\n\t" \ - "movq %%mm0,"_off"*2(%[buf])\n\t" \ - "movd "_off"(%[ref]),%%mm0\n\t" \ - "punpcklbw %%mm5,%%mm1\n\t" \ - "punpcklbw %%mm5,%%mm5\n\t" \ - "psubw %%mm5,%%mm1\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm2\n\t" \ - "punpcklbw %%mm7,%%mm7\n\t" \ - "psubw %%mm7,%%mm2\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \ - "punpcklbw %%mm6,%%mm3\n\t" \ - "lea (%[src],%[src_ystride],2),%[src]\n\t" \ - "punpcklbw %%mm6,%%mm6\n\t" \ - "psubw %%mm6,%%mm3\n\t" \ - "movd 
"_off"(%[src]),%%mm6\n\t" \ - "punpcklbw %%mm0,%%mm4\n\t" \ - "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ - "punpcklbw %%mm0,%%mm0\n\t" \ - "lea (%[src],%[src_ystride],2),%[src]\n\t" \ - "psubw %%mm0,%%mm4\n\t" \ - "movd "_off"(%[ref]),%%mm0\n\t" \ - "punpcklbw %%mm7,%%mm5\n\t" \ - "neg %[src_ystride]\n\t" \ - "punpcklbw %%mm7,%%mm7\n\t" \ - "psubw %%mm7,%%mm5\n\t" \ - "movd "_off"(%[src],%[src_ystride]),%%mm7\n\t" \ - "punpcklbw %%mm0,%%mm6\n\t" \ - "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ - "punpcklbw %%mm0,%%mm0\n\t" \ - "neg %[ref_ystride]\n\t" \ - "psubw %%mm0,%%mm6\n\t" \ - "movd "_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \ - "lea (%[src],%[src_ystride],8),%[src]\n\t" \ - "punpcklbw %%mm0,%%mm7\n\t" \ - "neg %[src_ystride]\n\t" \ - "punpcklbw %%mm0,%%mm0\n\t" \ - "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \ - "psubw %%mm0,%%mm7\n\t" \ - "neg %[ref_ystride]\n\t" \ - "movq "_off"*2(%[buf]),%%mm0\n\t" \ - -/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/ -#define OC_LOAD_8x4(_off) \ - "#OC_LOAD_8x4\n\t" \ - "movd "_off"(%[src]),%%mm0\n\t" \ - "movd "_off"(%[src],%[ystride]),%%mm1\n\t" \ - "movd "_off"(%[src],%[ystride],2),%%mm2\n\t" \ - "pxor %%mm7,%%mm7\n\t" \ - "movd "_off"(%[src],%[ystride3]),%%mm3\n\t" \ - "punpcklbw %%mm7,%%mm0\n\t" \ - "movd "_off"(%[src4]),%%mm4\n\t" \ - "punpcklbw %%mm7,%%mm1\n\t" \ - "movd "_off"(%[src4],%[ystride]),%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm2\n\t" \ - "movd "_off"(%[src4],%[ystride],2),%%mm6\n\t" \ - "punpcklbw %%mm7,%%mm3\n\t" \ - "movd "_off"(%[src4],%[ystride3]),%%mm7\n\t" \ - "punpcklbw %%mm4,%%mm4\n\t" \ - "punpcklbw %%mm5,%%mm5\n\t" \ - "psrlw $8,%%mm4\n\t" \ - "psrlw $8,%%mm5\n\t" \ - "punpcklbw %%mm6,%%mm6\n\t" \ - "punpcklbw %%mm7,%%mm7\n\t" \ - "psrlw $8,%%mm6\n\t" \ - "psrlw $8,%%mm7\n\t" \ - -/*Performs the first two stages of an 8-point 1-D Hadamard transform. - The transform is performed in place, except that outputs 0-3 are swapped with - outputs 4-7. - Outputs 2, 3, 6 and 7 from the second stage are negated (which allows us to - perform this stage in place with no temporary registers).*/ -#define OC_HADAMARD_AB_8x4 \ - "#OC_HADAMARD_AB_8x4\n\t" \ - /*Stage A: \ - Outputs 0-3 are swapped with 4-7 here.*/ \ - "paddw %%mm1,%%mm5\n\t" \ - "paddw %%mm2,%%mm6\n\t" \ - "paddw %%mm1,%%mm1\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "psubw %%mm5,%%mm1\n\t" \ - "psubw %%mm6,%%mm2\n\t" \ - "paddw %%mm3,%%mm7\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - "paddw %%mm0,%%mm0\n\t" \ - "psubw %%mm7,%%mm3\n\t" \ - "psubw %%mm4,%%mm0\n\t" \ - /*Stage B:*/ \ - "paddw %%mm2,%%mm0\n\t" \ - "paddw %%mm3,%%mm1\n\t" \ - "paddw %%mm6,%%mm4\n\t" \ - "paddw %%mm7,%%mm5\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - "paddw %%mm6,%%mm6\n\t" \ - "paddw %%mm7,%%mm7\n\t" \ - "psubw %%mm0,%%mm2\n\t" \ - "psubw %%mm1,%%mm3\n\t" \ - "psubw %%mm4,%%mm6\n\t" \ - "psubw %%mm5,%%mm7\n\t" \ - -/*Performs the last stage of an 8-point 1-D Hadamard transform in place. 
- Ouputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in - place with no temporary registers).*/ -#define OC_HADAMARD_C_8x4 \ - "#OC_HADAMARD_C_8x4\n\t" \ - /*Stage C:*/ \ - "paddw %%mm1,%%mm0\n\t" \ - "paddw %%mm3,%%mm2\n\t" \ - "paddw %%mm5,%%mm4\n\t" \ - "paddw %%mm7,%%mm6\n\t" \ - "paddw %%mm1,%%mm1\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - "paddw %%mm7,%%mm7\n\t" \ - "psubw %%mm0,%%mm1\n\t" \ - "psubw %%mm2,%%mm3\n\t" \ - "psubw %%mm4,%%mm5\n\t" \ - "psubw %%mm6,%%mm7\n\t" \ - -/*Performs an 8-point 1-D Hadamard transform. - The transform is performed in place, except that outputs 0-3 are swapped with - outputs 4-7. - Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform - in place with no temporary registers).*/ -#define OC_HADAMARD_8x4 \ - OC_HADAMARD_AB_8x4 \ - OC_HADAMARD_C_8x4 \ - -/*Performs the first part of the final stage of the Hadamard transform and - summing of absolute values. - At the end of this part, %%mm1 will contain the DC coefficient of the - transform.*/ -#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ - /*We use the fact that \ - (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ - to merge the final butterfly with the abs and the first stage of \ - accumulation. \ - Thus we can avoid using pabsw, which is not available until SSSE3. \ - Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \ - implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ - registers). \ - Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \ - This implementation is only 26 (+4 for spilling registers).*/ \ - "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \ - "movq %%mm7,"_r7"(%[buf])\n\t" \ - "movq %%mm6,"_r6"(%[buf])\n\t" \ - /*mm7={0x7FFF}x4 \ - mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \ - "pcmpeqb %%mm7,%%mm7\n\t" \ - "movq %%mm0,%%mm6\n\t" \ - "psrlw $1,%%mm7\n\t" \ - "paddw %%mm1,%%mm6\n\t" \ - "pmaxsw %%mm1,%%mm0\n\t" \ - "paddsw %%mm7,%%mm6\n\t" \ - "psubw %%mm6,%%mm0\n\t" \ - /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \ - mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \ - "movq %%mm2,%%mm6\n\t" \ - "movq %%mm4,%%mm1\n\t" \ - "pmaxsw %%mm3,%%mm2\n\t" \ - "pmaxsw %%mm5,%%mm4\n\t" \ - "paddw %%mm3,%%mm6\n\t" \ - "paddw %%mm5,%%mm1\n\t" \ - "movq "_r7"(%[buf]),%%mm3\n\t" \ - -/*Performs the second part of the final stage of the Hadamard transform and - summing of absolute values.*/ -#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ - "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \ - "paddsw %%mm7,%%mm6\n\t" \ - "movq "_r6"(%[buf]),%%mm5\n\t" \ - "paddsw %%mm7,%%mm1\n\t" \ - "psubw %%mm6,%%mm2\n\t" \ - "psubw %%mm1,%%mm4\n\t" \ - /*mm7={1}x4 (needed for the horizontal add that follows) \ - mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \ - "movq %%mm3,%%mm6\n\t" \ - "pmaxsw %%mm5,%%mm3\n\t" \ - "paddw %%mm2,%%mm0\n\t" \ - "paddw %%mm5,%%mm6\n\t" \ - "paddw %%mm4,%%mm0\n\t" \ - "paddsw %%mm7,%%mm6\n\t" \ - "paddw %%mm3,%%mm0\n\t" \ - "psrlw $14,%%mm7\n\t" \ - "psubw %%mm6,%%mm0\n\t" \ - -/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the - absolute value of each component, and accumulates everything into mm0. 
- This is the only portion of SATD which requires MMXEXT (we could use plain - MMX, but it takes 4 instructions and an extra register to work around the - lack of a pmaxsw, which is a pretty serious penalty).*/ -#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \ - OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ - OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ - -/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each - component, and accumulates everything into mm0. - Note that mm0 will have an extra 4 added to each column, and that after - removing this value, the remainder will be half the conventional value.*/ -#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \ - OC_HADAMARD_AB_8x4 \ - OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) - -/*Performs two 4x4 transposes (mostly) in place. - On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7} - contains rows {a,b,c,d}. - On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and - {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/ -#define OC_TRANSPOSE_4x4x2(_off) \ - "#OC_TRANSPOSE_4x4x2\n\t" \ - /*First 4x4 transpose:*/ \ - "movq %%mm5,0x10+"_off"(%[buf])\n\t" \ - /*mm0 = e3 e2 e1 e0 \ - mm1 = f3 f2 f1 f0 \ - mm2 = g3 g2 g1 g0 \ - mm3 = h3 h2 h1 h0*/ \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklwd %%mm3,%%mm2\n\t" \ - "punpckhwd %%mm3,%%mm5\n\t" \ - "movq %%mm0,%%mm3\n\t" \ - "punpcklwd %%mm1,%%mm0\n\t" \ - "punpckhwd %%mm1,%%mm3\n\t" \ - /*mm0 = f1 e1 f0 e0 \ - mm3 = f3 e3 f2 e2 \ - mm2 = h1 g1 h0 g0 \ - mm5 = h3 g3 h2 g2*/ \ - "movq %%mm0,%%mm1\n\t" \ - "punpckldq %%mm2,%%mm0\n\t" \ - "punpckhdq %%mm2,%%mm1\n\t" \ - "movq %%mm3,%%mm2\n\t" \ - "punpckhdq %%mm5,%%mm3\n\t" \ - "movq %%mm0,0x40+"_off"(%[buf])\n\t" \ - "punpckldq %%mm5,%%mm2\n\t" \ - /*mm0 = h0 g0 f0 e0 \ - mm1 = h1 g1 f1 e1 \ - mm2 = h2 g2 f2 e2 \ - mm3 = h3 g3 f3 e3*/ \ - "movq 0x10+"_off"(%[buf]),%%mm5\n\t" \ - /*Second 4x4 transpose:*/ \ - /*mm4 = a3 a2 a1 a0 \ - mm5 = b3 b2 b1 b0 \ - mm6 = c3 c2 c1 c0 \ - mm7 = d3 d2 d1 d0*/ \ - "movq %%mm6,%%mm0\n\t" \ - "punpcklwd %%mm7,%%mm6\n\t" \ - "movq %%mm1,0x50+"_off"(%[buf])\n\t" \ - "punpckhwd %%mm7,%%mm0\n\t" \ - "movq %%mm4,%%mm7\n\t" \ - "punpcklwd %%mm5,%%mm4\n\t" \ - "movq %%mm2,0x60+"_off"(%[buf])\n\t" \ - "punpckhwd %%mm5,%%mm7\n\t" \ - /*mm4 = b1 a1 b0 a0 \ - mm7 = b3 a3 b2 a2 \ - mm6 = d1 c1 d0 c0 \ - mm0 = d3 c3 d2 c2*/ \ - "movq %%mm4,%%mm5\n\t" \ - "punpckldq %%mm6,%%mm4\n\t" \ - "movq %%mm3,0x70+"_off"(%[buf])\n\t" \ - "punpckhdq %%mm6,%%mm5\n\t" \ - "movq %%mm7,%%mm6\n\t" \ - "punpckhdq %%mm0,%%mm7\n\t" \ - "punpckldq %%mm0,%%mm6\n\t" \ - /*mm4 = d0 c0 b0 a0 \ - mm5 = d1 c1 b1 a1 \ - mm6 = d2 c2 b2 a2 \ - mm7 = d3 c3 b3 a3*/ \ - -static unsigned oc_int_frag_satd_thresh_mmxext(const unsigned char *_src, - int _src_ystride,const unsigned char *_ref,int _ref_ystride,unsigned _thresh){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret; - unsigned ret2; - bufp=buf; - __asm__ __volatile__( - OC_LOAD_SUB_8x4("0x00") - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x00") - /*Finish swapping out this 8x4 block to make room for the next one. - mm0...mm3 have been swapped out already.*/ - "movq %%mm4,0x00(%[buf])\n\t" - "movq %%mm5,0x10(%[buf])\n\t" - "movq %%mm6,0x20(%[buf])\n\t" - "movq %%mm7,0x30(%[buf])\n\t" - OC_LOAD_SUB_8x4("0x04") - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x08") - /*Here the first 4x4 block of output from the last transpose is the second - 4x4 block of input for the next transform. 
- We have cleverly arranged that it already be in the appropriate place, so - we only have to do half the loads.*/ - "movq 0x10(%[buf]),%%mm1\n\t" - "movq 0x20(%[buf]),%%mm2\n\t" - "movq 0x30(%[buf]),%%mm3\n\t" - "movq 0x00(%[buf]),%%mm0\n\t" - OC_HADAMARD_ABS_ACCUM_8x4("0x28","0x38") - /*Up to this point, everything fit in 16 bits (8 input + 1 for the - difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 - for the factor of two we dropped + 3 for the vertical accumulation). - Now we finally have to promote things to dwords. - We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long - latency of pmaddwd by starting the next series of loads now.*/ - "mov %[thresh],%[ret2]\n\t" - "pmaddwd %%mm7,%%mm0\n\t" - "movq 0x50(%[buf]),%%mm1\n\t" - "movq 0x58(%[buf]),%%mm5\n\t" - "movq %%mm0,%%mm4\n\t" - "movq 0x60(%[buf]),%%mm2\n\t" - "punpckhdq %%mm0,%%mm0\n\t" - "movq 0x68(%[buf]),%%mm6\n\t" - "paddd %%mm0,%%mm4\n\t" - "movq 0x70(%[buf]),%%mm3\n\t" - "movd %%mm4,%[ret]\n\t" - "movq 0x78(%[buf]),%%mm7\n\t" - /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 - added to them, and a factor of two removed; correct the final sum here.*/ - "lea -32(%[ret],%[ret]),%[ret]\n\t" - "movq 0x40(%[buf]),%%mm0\n\t" - "cmp %[ret2],%[ret]\n\t" - "movq 0x48(%[buf]),%%mm4\n\t" - "jae 1f\n\t" - OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78") - "pmaddwd %%mm7,%%mm0\n\t" - /*There isn't much to stick in here to hide the latency this time, but the - alternative to pmaddwd is movq->punpcklwd->punpckhwd->paddd, whose - latency is even worse.*/ - "sub $32,%[ret]\n\t" - "movq %%mm0,%%mm4\n\t" - "punpckhdq %%mm0,%%mm0\n\t" - "paddd %%mm0,%%mm4\n\t" - "movd %%mm4,%[ret2]\n\t" - "lea (%[ret],%[ret2],2),%[ret]\n\t" - ".p2align 4,,15\n\t" - "1:\n\t" - /*Although it looks like we're using 7 registers here, gcc can alias %[ret] - and %[ret2] with some of the inputs, since for once we don't write to - them until after we're done using everything but %[buf] (which is also - listed as an output to ensure gcc _doesn't_ alias them against it).*/ - /*Note that _src_ystride and _ref_ystride must be given non-overlapping - constraints, otherewise if gcc can prove they're equal it will allocate - them to the same register (which is bad); _src and _ref face a similar - problem, though those are never actually the same.*/ - :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp) - :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride), - [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride), - [thresh]"m"(_thresh) - /*We have to use neg, so we actually clobber the condition codes for once - (not to mention cmp, sub, and add).*/ - :"cc" - ); - return ret; -} - -unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh){ - return oc_int_frag_satd_thresh_mmxext(_src,_ystride,_ref,_ystride,_thresh); -} - -/*Our internal implementation of frag_copy2 takes an extra stride parameter so - we can share code with oc_enc_frag_satd2_thresh_mmxext().*/ -static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, - const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ - __asm__ __volatile__( - /*Load the first 3 rows.*/ - "movq (%[src1]),%%mm0\n\t" - "movq (%[src2]),%%mm1\n\t" - "movq (%[src1],%[src_ystride]),%%mm2\n\t" - "lea (%[src1],%[src_ystride],2),%[src1]\n\t" - "movq (%[src2],%[src_ystride]),%%mm3\n\t" - "lea (%[src2],%[src_ystride],2),%[src2]\n\t" - "pxor %%mm7,%%mm7\n\t" - "movq (%[src1]),%%mm4\n\t" - 
"pcmpeqb %%mm6,%%mm6\n\t" - "movq (%[src2]),%%mm5\n\t" - /*mm7={1}x8.*/ - "psubb %%mm6,%%mm7\n\t" - /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ - "movq %%mm0,%%mm6\n\t" - "pxor %%mm1,%%mm0\n\t" - "pavgb %%mm1,%%mm6\n\t" - /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/ - "movq %%mm2,%%mm1\n\t" - "pand %%mm7,%%mm0\n\t" - "pavgb %%mm3,%%mm2\n\t" - "pxor %%mm3,%%mm1\n\t" - /*%%mm3 is free.*/ - "psubb %%mm0,%%mm6\n\t" - /*%%mm0 is free, start loading the next row.*/ - "movq (%[src1],%[src_ystride]),%%mm0\n\t" - /*Start averaging %%mm5 and %%mm4 using %%mm3.*/ - "movq %%mm4,%%mm3\n\t" - /*%%mm6 (row 0) is done; write it out.*/ - "movq %%mm6,(%[dst])\n\t" - "pand %%mm7,%%mm1\n\t" - "pavgb %%mm5,%%mm4\n\t" - "psubb %%mm1,%%mm2\n\t" - /*%%mm1 is free, continue loading the next row.*/ - "movq (%[src2],%[src_ystride]),%%mm1\n\t" - "pxor %%mm5,%%mm3\n\t" - "lea (%[src1],%[src_ystride],2),%[src1]\n\t" - /*%%mm2 (row 1) is done; write it out.*/ - "movq %%mm2,(%[dst],%[dst_ystride])\n\t" - "pand %%mm7,%%mm3\n\t" - /*Start loading the next row.*/ - "movq (%[src1]),%%mm2\n\t" - "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" - "psubb %%mm3,%%mm4\n\t" - "lea (%[src2],%[src_ystride],2),%[src2]\n\t" - /*%%mm4 (row 2) is done; write it out.*/ - "movq %%mm4,(%[dst])\n\t" - /*Continue loading the next row.*/ - "movq (%[src2]),%%mm3\n\t" - /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ - "movq %%mm0,%%mm6\n\t" - "pxor %%mm1,%%mm0\n\t" - /*Start loading the next row.*/ - "movq (%[src1],%[src_ystride]),%%mm4\n\t" - "pavgb %%mm1,%%mm6\n\t" - /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/ - "movq %%mm2,%%mm1\n\t" - "pand %%mm7,%%mm0\n\t" - /*Continue loading the next row.*/ - "movq (%[src2],%[src_ystride]),%%mm5\n\t" - "pavgb %%mm3,%%mm2\n\t" - "lea (%[src1],%[src_ystride],2),%[src1]\n\t" - "pxor %%mm3,%%mm1\n\t" - /*%%mm3 is free.*/ - "psubb %%mm0,%%mm6\n\t" - /*%%mm0 is free, start loading the next row.*/ - "movq (%[src1]),%%mm0\n\t" - /*Start averaging %%mm5 into %%mm4 using %%mm3.*/ - "movq %%mm4,%%mm3\n\t" - /*%%mm6 (row 3) is done; write it out.*/ - "movq %%mm6,(%[dst],%[dst_ystride])\n\t" - "pand %%mm7,%%mm1\n\t" - "lea (%[src2],%[src_ystride],2),%[src2]\n\t" - "pavgb %%mm5,%%mm4\n\t" - "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" - "psubb %%mm1,%%mm2\n\t" - /*%%mm1 is free; continue loading the next row.*/ - "movq (%[src2]),%%mm1\n\t" - "pxor %%mm5,%%mm3\n\t" - /*%%mm2 (row 4) is done; write it out.*/ - "movq %%mm2,(%[dst])\n\t" - "pand %%mm7,%%mm3\n\t" - /*Start loading the next row.*/ - "movq (%[src1],%[src_ystride]),%%mm2\n\t" - "psubb %%mm3,%%mm4\n\t" - /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ - "movq %%mm0,%%mm6\n\t" - /*Continue loading the next row.*/ - "movq (%[src2],%[src_ystride]),%%mm3\n\t" - /*%%mm4 (row 5) is done; write it out.*/ - "movq %%mm4,(%[dst],%[dst_ystride])\n\t" - "pxor %%mm1,%%mm0\n\t" - "pavgb %%mm1,%%mm6\n\t" - /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/ - "movq %%mm2,%%mm4\n\t" - "pand %%mm7,%%mm0\n\t" - "pavgb %%mm3,%%mm2\n\t" - "pxor %%mm3,%%mm4\n\t" - "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" - "psubb %%mm0,%%mm6\n\t" - "pand %%mm7,%%mm4\n\t" - /*%%mm6 (row 6) is done, write it out.*/ - "movq %%mm6,(%[dst])\n\t" - "psubb %%mm4,%%mm2\n\t" - /*%%mm2 (row 7) is done, write it out.*/ - "movq %%mm2,(%[dst],%[dst_ystride])\n\t" - :[dst]"+r"(_dst),[src1]"+%r"(_src1),[src2]"+r"(_src2) - :[dst_ystride]"r"((ptrdiff_t)_dst_ystride), - [src_ystride]"r"((ptrdiff_t)_src_ystride) - :"memory" - ); -} - -unsigned 
oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh){ - OC_ALIGN8(unsigned char ref[64]); - oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); - return oc_int_frag_satd_thresh_mmxext(_src,_ystride,ref,8,_thresh); -} - -unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src, - int _ystride){ - OC_ALIGN8(ogg_int16_t buf[64]); - ogg_int16_t *bufp; - unsigned ret; - unsigned ret2; - bufp=buf; - __asm__ __volatile__( - OC_LOAD_8x4("0x00") - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x00") - /*Finish swapping out this 8x4 block to make room for the next one. - mm0...mm3 have been swapped out already.*/ - "movq %%mm4,0x00(%[buf])\n\t" - "movq %%mm5,0x10(%[buf])\n\t" - "movq %%mm6,0x20(%[buf])\n\t" - "movq %%mm7,0x30(%[buf])\n\t" - OC_LOAD_8x4("0x04") - OC_HADAMARD_8x4 - OC_TRANSPOSE_4x4x2("0x08") - /*Here the first 4x4 block of output from the last transpose is the second - 4x4 block of input for the next transform. - We have cleverly arranged that it already be in the appropriate place, so - we only have to do half the loads.*/ - "movq 0x10(%[buf]),%%mm1\n\t" - "movq 0x20(%[buf]),%%mm2\n\t" - "movq 0x30(%[buf]),%%mm3\n\t" - "movq 0x00(%[buf]),%%mm0\n\t" - /*We split out the stages here so we can save the DC coefficient in the - middle.*/ - OC_HADAMARD_AB_8x4 - OC_HADAMARD_C_ABS_ACCUM_A_8x4("0x28","0x38") - "movd %%mm1,%[ret]\n\t" - OC_HADAMARD_C_ABS_ACCUM_B_8x4("0x28","0x38") - /*Up to this point, everything fit in 16 bits (8 input + 1 for the - difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 - for the factor of two we dropped + 3 for the vertical accumulation). - Now we finally have to promote things to dwords. - We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long - latency of pmaddwd by starting the next series of loads now.*/ - "pmaddwd %%mm7,%%mm0\n\t" - "movq 0x50(%[buf]),%%mm1\n\t" - "movq 0x58(%[buf]),%%mm5\n\t" - "movq 0x60(%[buf]),%%mm2\n\t" - "movq %%mm0,%%mm4\n\t" - "movq 0x68(%[buf]),%%mm6\n\t" - "punpckhdq %%mm0,%%mm0\n\t" - "movq 0x70(%[buf]),%%mm3\n\t" - "paddd %%mm0,%%mm4\n\t" - "movq 0x78(%[buf]),%%mm7\n\t" - "movd %%mm4,%[ret2]\n\t" - "movq 0x40(%[buf]),%%mm0\n\t" - "movq 0x48(%[buf]),%%mm4\n\t" - OC_HADAMARD_ABS_ACCUM_8x4("0x68","0x78") - "pmaddwd %%mm7,%%mm0\n\t" - /*We assume that the DC coefficient is always positive (which is true, - because the input to the INTRA transform was not a difference).*/ - "movzx %w[ret],%[ret]\n\t" - "add %[ret2],%[ret2]\n\t" - "sub %[ret],%[ret2]\n\t" - "movq %%mm0,%%mm4\n\t" - "punpckhdq %%mm0,%%mm0\n\t" - "paddd %%mm0,%%mm4\n\t" - "movd %%mm4,%[ret]\n\t" - "lea -64(%[ret2],%[ret],2),%[ret]\n\t" - /*Although it looks like we're using 7 registers here, gcc can alias %[ret] - and %[ret2] with some of the inputs, since for once we don't write to - them until after we're done using everything but %[buf] (which is also - listed as an output to ensure gcc _doesn't_ alias them against it).*/ - :[ret]"=a"(ret),[ret2]"=r"(ret2),[buf]"+r"(bufp) - :[src]"r"(_src),[src4]"r"(_src+4*_ystride), - [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride) - /*We have to use sub, so we actually clobber the condition codes for once - (not to mention add).*/ - :"cc" - ); - return ret; -} - -void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64], - const unsigned char *_src,const unsigned char *_ref,int _ystride){ - int i; - __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::); - for(i=4;i-->0;){ - __asm__ 
__volatile__( - /*mm0=[src]*/ - "movq (%[src]),%%mm0\n\t" - /*mm1=[ref]*/ - "movq (%[ref]),%%mm1\n\t" - /*mm4=[src+ystride]*/ - "movq (%[src],%[ystride]),%%mm4\n\t" - /*mm5=[ref+ystride]*/ - "movq (%[ref],%[ystride]),%%mm5\n\t" - /*Compute [src]-[ref].*/ - "movq %%mm0,%%mm2\n\t" - "punpcklbw %%mm7,%%mm0\n\t" - "movq %%mm1,%%mm3\n\t" - "punpckhbw %%mm7,%%mm2\n\t" - "punpcklbw %%mm7,%%mm1\n\t" - "punpckhbw %%mm7,%%mm3\n\t" - "psubw %%mm1,%%mm0\n\t" - "psubw %%mm3,%%mm2\n\t" - /*Compute [src+ystride]-[ref+ystride].*/ - "movq %%mm4,%%mm1\n\t" - "punpcklbw %%mm7,%%mm4\n\t" - "movq %%mm5,%%mm3\n\t" - "punpckhbw %%mm7,%%mm1\n\t" - "lea (%[src],%[ystride],2),%[src]\n\t" - "punpcklbw %%mm7,%%mm5\n\t" - "lea (%[ref],%[ystride],2),%[ref]\n\t" - "punpckhbw %%mm7,%%mm3\n\t" - "psubw %%mm5,%%mm4\n\t" - "psubw %%mm3,%%mm1\n\t" - /*Write the answer out.*/ - "movq %%mm0,0x00(%[residue])\n\t" - "movq %%mm2,0x08(%[residue])\n\t" - "movq %%mm4,0x10(%[residue])\n\t" - "movq %%mm1,0x18(%[residue])\n\t" - "lea 0x20(%[residue]),%[residue]\n\t" - :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref) - :[ystride]"r"((ptrdiff_t)_ystride) - :"memory" - ); - } -} - -void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64], - const unsigned char *_src,int _ystride){ - ptrdiff_t ystride3; - __asm__ __volatile__( - /*mm0=[src]*/ - "movq (%[src]),%%mm0\n\t" - /*mm1=[src+ystride]*/ - "movq (%[src],%[ystride]),%%mm1\n\t" - /*mm6={-1}x4*/ - "pcmpeqw %%mm6,%%mm6\n\t" - /*mm2=[src+2*ystride]*/ - "movq (%[src],%[ystride],2),%%mm2\n\t" - /*[ystride3]=3*[ystride]*/ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" - /*mm6={1}x4*/ - "psllw $15,%%mm6\n\t" - /*mm3=[src+3*ystride]*/ - "movq (%[src],%[ystride3]),%%mm3\n\t" - /*mm6={128}x4*/ - "psrlw $8,%%mm6\n\t" - /*mm7=0*/ - "pxor %%mm7,%%mm7\n\t" - /*[src]=[src]+4*[ystride]*/ - "lea (%[src],%[ystride],4),%[src]\n\t" - /*Compute [src]-128 and [src+ystride]-128*/ - "movq %%mm0,%%mm4\n\t" - "punpcklbw %%mm7,%%mm0\n\t" - "movq %%mm1,%%mm5\n\t" - "punpckhbw %%mm7,%%mm4\n\t" - "psubw %%mm6,%%mm0\n\t" - "punpcklbw %%mm7,%%mm1\n\t" - "psubw %%mm6,%%mm4\n\t" - "punpckhbw %%mm7,%%mm5\n\t" - "psubw %%mm6,%%mm1\n\t" - "psubw %%mm6,%%mm5\n\t" - /*Write the answer out.*/ - "movq %%mm0,0x00(%[residue])\n\t" - "movq %%mm4,0x08(%[residue])\n\t" - "movq %%mm1,0x10(%[residue])\n\t" - "movq %%mm5,0x18(%[residue])\n\t" - /*mm0=[src+4*ystride]*/ - "movq (%[src]),%%mm0\n\t" - /*mm1=[src+5*ystride]*/ - "movq (%[src],%[ystride]),%%mm1\n\t" - /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/ - "movq %%mm2,%%mm4\n\t" - "punpcklbw %%mm7,%%mm2\n\t" - "movq %%mm3,%%mm5\n\t" - "punpckhbw %%mm7,%%mm4\n\t" - "psubw %%mm6,%%mm2\n\t" - "punpcklbw %%mm7,%%mm3\n\t" - "psubw %%mm6,%%mm4\n\t" - "punpckhbw %%mm7,%%mm5\n\t" - "psubw %%mm6,%%mm3\n\t" - "psubw %%mm6,%%mm5\n\t" - /*Write the answer out.*/ - "movq %%mm2,0x20(%[residue])\n\t" - "movq %%mm4,0x28(%[residue])\n\t" - "movq %%mm3,0x30(%[residue])\n\t" - "movq %%mm5,0x38(%[residue])\n\t" - /*mm2=[src+6*ystride]*/ - "movq (%[src],%[ystride],2),%%mm2\n\t" - /*mm3=[src+7*ystride]*/ - "movq (%[src],%[ystride3]),%%mm3\n\t" - /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/ - "movq %%mm0,%%mm4\n\t" - "punpcklbw %%mm7,%%mm0\n\t" - "movq %%mm1,%%mm5\n\t" - "punpckhbw %%mm7,%%mm4\n\t" - "psubw %%mm6,%%mm0\n\t" - "punpcklbw %%mm7,%%mm1\n\t" - "psubw %%mm6,%%mm4\n\t" - "punpckhbw %%mm7,%%mm5\n\t" - "psubw %%mm6,%%mm1\n\t" - "psubw %%mm6,%%mm5\n\t" - /*Write the answer out.*/ - "movq %%mm0,0x40(%[residue])\n\t" - "movq %%mm4,0x48(%[residue])\n\t" - "movq 
%%mm1,0x50(%[residue])\n\t" - "movq %%mm5,0x58(%[residue])\n\t" - /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ - "movq %%mm2,%%mm4\n\t" - "punpcklbw %%mm7,%%mm2\n\t" - "movq %%mm3,%%mm5\n\t" - "punpckhbw %%mm7,%%mm4\n\t" - "psubw %%mm6,%%mm2\n\t" - "punpcklbw %%mm7,%%mm3\n\t" - "psubw %%mm6,%%mm4\n\t" - "punpckhbw %%mm7,%%mm5\n\t" - "psubw %%mm6,%%mm3\n\t" - "psubw %%mm6,%%mm5\n\t" - /*Write the answer out.*/ - "movq %%mm2,0x60(%[residue])\n\t" - "movq %%mm4,0x68(%[residue])\n\t" - "movq %%mm3,0x70(%[residue])\n\t" - "movq %%mm5,0x78(%[residue])\n\t" - :[src]"+r"(_src),[ystride3]"=&r"(ystride3) - :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride) - :"memory" - ); -} - -void oc_enc_frag_copy2_mmxext(unsigned char *_dst, - const unsigned char *_src1,const unsigned char *_src2,int _ystride){ - oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride); -} - -#endif diff --git a/drivers/theora/x86/mmxfdct.c b/drivers/theora/x86/mmxfdct.c deleted file mode 100644 index 211875255e..0000000000 --- a/drivers/theora/x86/mmxfdct.c +++ /dev/null @@ -1,665 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ********************************************************************/ -/*MMX fDCT implementation for x86_32*/ -/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ -#include "x86enc.h" - -#if defined(OC_X86_ASM) - -# define OC_FDCT_STAGE1_8x4 \ - "#OC_FDCT_STAGE1_8x4\n\t" \ - /*Stage 1:*/ \ - /*mm0=t7'=t0-t7*/ \ - "psubw %%mm7,%%mm0\n\t" \ - "paddw %%mm7,%%mm7\n\t" \ - /*mm1=t6'=t1-t6*/ \ - "psubw %%mm6,%%mm1\n\t" \ - "paddw %%mm6,%%mm6\n\t" \ - /*mm2=t5'=t2-t5*/ \ - "psubw %%mm5,%%mm2\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - /*mm3=t4'=t3-t4*/ \ - "psubw %%mm4,%%mm3\n\t" \ - "paddw %%mm4,%%mm4\n\t" \ - /*mm7=t0'=t0+t7*/ \ - "paddw %%mm0,%%mm7\n\t" \ - /*mm6=t1'=t1+t6*/ \ - "paddw %%mm1,%%mm6\n\t" \ - /*mm5=t2'=t2+t5*/ \ - "paddw %%mm2,%%mm5\n\t" \ - /*mm4=t3'=t3+t4*/ \ - "paddw %%mm3,%%mm4\n\t" \ - -# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \ - "#OC_FDCT8x4\n\t" \ - /*Stage 2:*/ \ - /*mm7=t3''=t0'-t3'*/ \ - "psubw %%mm4,%%mm7\n\t" \ - "paddw %%mm4,%%mm4\n\t" \ - /*mm6=t2''=t1'-t2'*/ \ - "psubw %%mm5,%%mm6\n\t" \ - "movq %%mm7,"_r6"(%[y])\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - /*mm1=t5''=t6'-t5'*/ \ - "psubw %%mm2,%%mm1\n\t" \ - "movq %%mm6,"_r2"(%[y])\n\t" \ - /*mm4=t0''=t0'+t3'*/ \ - "paddw %%mm7,%%mm4\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - /*mm5=t1''=t1'+t2'*/ \ - "movq %%mm4,"_r0"(%[y])\n\t" \ - "paddw %%mm6,%%mm5\n\t" \ - /*mm2=t6''=t6'+t5'*/ \ - "paddw %%mm1,%%mm2\n\t" \ - "movq %%mm5,"_r4"(%[y])\n\t" \ - /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \ - /*mm4, mm5, mm6, mm7 are free.*/ \ - /*Stage 3:*/ \ - /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \ - "mov $0x5A806A0A,%[a]\n\t" \ - "pcmpeqb %%mm6,%%mm6\n\t" \ - "movd %[a],%%mm7\n\t" \ - "psrlw $15,%%mm6\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "paddw %%mm6,%%mm6\n\t" \ - /*mm0=0, m2={-1}x4 \ - mm5:mm4=t5''*27146+0xB500*/ \ - "movq %%mm1,%%mm4\n\t" \ - "movq %%mm1,%%mm5\n\t" \ - "punpcklwd %%mm6,%%mm4\n\t" \ - "movq %%mm2,"_r3"(%[y])\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "movq %%mm0,"_r7"(%[y])\n\t" 
\ - "punpckhwd %%mm6,%%mm5\n\t" \ - "pxor %%mm0,%%mm0\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "pcmpeqb %%mm2,%%mm2\n\t" \ - /*mm2=t6'', mm1=t5''+(t5''!=0) \ - mm4=(t5''*27146+0xB500>>16)*/ \ - "pcmpeqw %%mm1,%%mm0\n\t" \ - "psrad $16,%%mm4\n\t" \ - "psubw %%mm2,%%mm0\n\t" \ - "movq "_r3"(%[y]),%%mm2\n\t" \ - "psrad $16,%%mm5\n\t" \ - "paddw %%mm0,%%mm1\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \ - "paddw %%mm1,%%mm4\n\t" \ - "movq "_r7"(%[y]),%%mm0\n\t" \ - "psraw $1,%%mm4\n\t" \ - "movq %%mm3,%%mm1\n\t" \ - /*mm3=t4''=t4'+s*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*mm1=t5'''=t4'-s*/ \ - "psubw %%mm4,%%mm1\n\t" \ - /*mm1=0, mm3={-1}x4 \ - mm5:mm4=t6''*27146+0xB500*/ \ - "movq %%mm2,%%mm4\n\t" \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklwd %%mm6,%%mm4\n\t" \ - "movq %%mm1,"_r5"(%[y])\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "movq %%mm3,"_r1"(%[y])\n\t" \ - "punpckhwd %%mm6,%%mm5\n\t" \ - "pxor %%mm1,%%mm1\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "pcmpeqb %%mm3,%%mm3\n\t" \ - /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \ - "psrad $16,%%mm4\n\t" \ - "pcmpeqw %%mm2,%%mm1\n\t" \ - "psrad $16,%%mm5\n\t" \ - "psubw %%mm3,%%mm1\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - /*mm1=t1'' \ - mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \ - "paddw %%mm2,%%mm4\n\t" \ - "movq "_r4"(%[y]),%%mm1\n\t" \ - "psraw $1,%%mm4\n\t" \ - "movq %%mm0,%%mm2\n\t" \ - /*mm7={54491-0x7FFF,0x7FFF}x2 \ - mm0=t7''=t7'+s*/ \ - "paddw %%mm4,%%mm0\n\t" \ - /*mm2=t6'''=t7'-s*/ \ - "psubw %%mm4,%%mm2\n\t" \ - /*Stage 4:*/ \ - /*mm0=0, mm2=t0'' \ - mm5:mm4=t1''*27146+0xB500*/ \ - "movq %%mm1,%%mm4\n\t" \ - "movq %%mm1,%%mm5\n\t" \ - "punpcklwd %%mm6,%%mm4\n\t" \ - "movq %%mm2,"_r3"(%[y])\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "movq "_r0"(%[y]),%%mm2\n\t" \ - "punpckhwd %%mm6,%%mm5\n\t" \ - "movq %%mm0,"_r7"(%[y])\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "pxor %%mm0,%%mm0\n\t" \ - /*mm7={27146,0x4000>>1}x2 \ - mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \ - "psrad $16,%%mm4\n\t" \ - "mov $0x20006A0A,%[a]\n\t" \ - "pcmpeqw %%mm1,%%mm0\n\t" \ - "movd %[a],%%mm7\n\t" \ - "psrad $16,%%mm5\n\t" \ - "psubw %%mm3,%%mm0\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - "paddw %%mm1,%%mm0\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "paddw %%mm4,%%mm0\n\t" \ - /*mm6={0x00000E3D}x2 \ - mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \ - "movq %%mm2,%%mm4\n\t" \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklwd %%mm6,%%mm4\n\t" \ - "mov $0x0E3D,%[a]\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "punpckhwd %%mm6,%%mm5\n\t" \ - "movd %[a],%%mm6\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "pxor %%mm1,%%mm1\n\t" \ - "punpckldq %%mm6,%%mm6\n\t" \ - "pcmpeqw %%mm2,%%mm1\n\t" \ - /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \ - "psrad $16,%%mm4\n\t" \ - "psubw %%mm3,%%mm1\n\t" \ - "psrad $16,%%mm5\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - "movq "_r5"(%[y]),%%mm1\n\t" \ - "paddw %%mm2,%%mm4\n\t" \ - /*mm2=t6'', mm0=_y[0]=u=r+s>>1 \ - The naive implementation could cause overflow, so we use \ - u=(r&s)+((r^s)>>1).*/ \ - "movq "_r3"(%[y]),%%mm2\n\t" \ - "movq %%mm0,%%mm7\n\t" \ - "pxor %%mm4,%%mm0\n\t" \ - "pand %%mm4,%%mm7\n\t" \ - "psraw $1,%%mm0\n\t" \ - "mov $0x7FFF54DC,%[a]\n\t" \ - "paddw %%mm7,%%mm0\n\t" \ - "movd %[a],%%mm7\n\t" \ - /*mm7={54491-0x7FFF,0x7FFF}x2 \ - mm4=_y[4]=v=r-u*/ \ - "psubw %%mm0,%%mm4\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "movq %%mm4,"_r4"(%[y])\n\t" \ - /*mm0=0, mm7={36410}x4 \ - mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \ - "movq %%mm1,%%mm4\n\t" \ - 
"movq %%mm1,%%mm5\n\t" \ - "punpcklwd %%mm1,%%mm4\n\t" \ - "mov $0x8E3A8E3A,%[a]\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "movq %%mm0,"_r0"(%[y])\n\t" \ - "punpckhwd %%mm1,%%mm5\n\t" \ - "pxor %%mm0,%%mm0\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "pcmpeqw %%mm0,%%mm1\n\t" \ - "movd %[a],%%mm7\n\t" \ - "psubw %%mm3,%%mm1\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "paddd %%mm6,%%mm4\n\t" \ - "paddd %%mm6,%%mm5\n\t" \ - /*mm0=0 \ - mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \ - "movq %%mm2,%%mm6\n\t" \ - "movq %%mm2,%%mm3\n\t" \ - "pmulhw %%mm7,%%mm6\n\t" \ - "paddw %%mm2,%%mm1\n\t" \ - "pmullw %%mm7,%%mm3\n\t" \ - "pxor %%mm0,%%mm0\n\t" \ - "paddw %%mm1,%%mm6\n\t" \ - "movq %%mm3,%%mm1\n\t" \ - "punpckhwd %%mm6,%%mm3\n\t" \ - "punpcklwd %%mm6,%%mm1\n\t" \ - /*mm3={-1}x4, mm6={1}x4 \ - mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \ - "paddd %%mm3,%%mm5\n\t" \ - "paddd %%mm1,%%mm4\n\t" \ - "psrad $16,%%mm5\n\t" \ - "pxor %%mm6,%%mm6\n\t" \ - "psrad $16,%%mm4\n\t" \ - "pcmpeqb %%mm3,%%mm3\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - "psubw %%mm3,%%mm6\n\t" \ - /*mm1=t7'', mm7={26568,0x3400}x2 \ - mm2=s=t6'''-(36410*u>>16)*/ \ - "movq %%mm4,%%mm1\n\t" \ - "mov $0x340067C8,%[a]\n\t" \ - "pmulhw %%mm7,%%mm4\n\t" \ - "movd %[a],%%mm7\n\t" \ - "movq %%mm1,"_r5"(%[y])\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "paddw %%mm1,%%mm4\n\t" \ - "movq "_r7"(%[y]),%%mm1\n\t" \ - "psubw %%mm4,%%mm2\n\t" \ - /*mm6={0x00007B1B}x2 \ - mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \ - "movq %%mm2,%%mm4\n\t" \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklwd %%mm6,%%mm4\n\t" \ - "pcmpeqw %%mm2,%%mm0\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "mov $0x7B1B,%[a]\n\t" \ - "punpckhwd %%mm6,%%mm5\n\t" \ - "movd %[a],%%mm6\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "psubw %%mm3,%%mm0\n\t" \ - "punpckldq %%mm6,%%mm6\n\t" \ - /*mm7={64277-0x7FFF,0x7FFF}x2 \ - mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \ - "psrad $17,%%mm4\n\t" \ - "paddw %%mm0,%%mm2\n\t" \ - "psrad $17,%%mm5\n\t" \ - "mov $0x7FFF7B16,%[a]\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - "movd %[a],%%mm7\n\t" \ - "paddw %%mm4,%%mm2\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - /*mm0=0, mm7={12785}x4 \ - mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \ - "movq %%mm1,%%mm4\n\t" \ - "movq %%mm1,%%mm5\n\t" \ - "movq %%mm2,"_r3"(%[y])\n\t" \ - "punpcklwd %%mm1,%%mm4\n\t" \ - "movq "_r1"(%[y]),%%mm2\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "mov $0x31F131F1,%[a]\n\t" \ - "punpckhwd %%mm1,%%mm5\n\t" \ - "pxor %%mm0,%%mm0\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "pcmpeqw %%mm0,%%mm1\n\t" \ - "movd %[a],%%mm7\n\t" \ - "psubw %%mm3,%%mm1\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "paddd %%mm6,%%mm4\n\t" \ - "paddd %%mm6,%%mm5\n\t" \ - /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \ - "movq %%mm2,%%mm6\n\t" \ - "movq %%mm2,%%mm3\n\t" \ - "pmulhw %%mm7,%%mm6\n\t" \ - "pmullw %%mm7,%%mm3\n\t" \ - "paddw %%mm1,%%mm6\n\t" \ - "movq %%mm3,%%mm1\n\t" \ - "punpckhwd %%mm6,%%mm3\n\t" \ - "punpcklwd %%mm6,%%mm1\n\t" \ - /*mm3={-1}x4, mm6={1}x4 \ - mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \ - "paddd %%mm3,%%mm5\n\t" \ - "paddd %%mm1,%%mm4\n\t" \ - "psrad $16,%%mm5\n\t" \ - "pxor %%mm6,%%mm6\n\t" \ - "psrad $16,%%mm4\n\t" \ - "pcmpeqb %%mm3,%%mm3\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - "psubw %%mm3,%%mm6\n\t" \ - /*mm1=t3'', mm7={20539,0x3000}x2 \ - mm4=s=(12785*u>>16)-t4''*/ \ - "movq %%mm4,"_r1"(%[y])\n\t" \ - "pmulhw %%mm7,%%mm4\n\t" \ - "mov $0x3000503B,%[a]\n\t" \ - "movq "_r6"(%[y]),%%mm1\n\t" \ - "movd %[a],%%mm7\n\t" \ - "psubw %%mm2,%%mm4\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - 
/*mm6={0x00006CB7}x2 \ - mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \ - "movq %%mm4,%%mm5\n\t" \ - "movq %%mm4,%%mm2\n\t" \ - "punpcklwd %%mm6,%%mm4\n\t" \ - "pcmpeqw %%mm2,%%mm0\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "mov $0x6CB7,%[a]\n\t" \ - "punpckhwd %%mm6,%%mm5\n\t" \ - "movd %[a],%%mm6\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "psubw %%mm3,%%mm0\n\t" \ - "punpckldq %%mm6,%%mm6\n\t" \ - /*mm7={60547-0x7FFF,0x7FFF}x2 \ - mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \ - "psrad $20,%%mm4\n\t" \ - "paddw %%mm0,%%mm2\n\t" \ - "psrad $20,%%mm5\n\t" \ - "mov $0x7FFF6C84,%[a]\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - "movd %[a],%%mm7\n\t" \ - "paddw %%mm4,%%mm2\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - /*mm0=0, mm7={25080}x4 \ - mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \ - "movq %%mm1,%%mm4\n\t" \ - "movq %%mm1,%%mm5\n\t" \ - "movq %%mm2,"_r7"(%[y])\n\t" \ - "punpcklwd %%mm1,%%mm4\n\t" \ - "movq "_r2"(%[y]),%%mm2\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "mov $0x61F861F8,%[a]\n\t" \ - "punpckhwd %%mm1,%%mm5\n\t" \ - "pxor %%mm0,%%mm0\n\t" \ - "pmaddwd %%mm7,%%mm5\n\t" \ - "movd %[a],%%mm7\n\t" \ - "pcmpeqw %%mm0,%%mm1\n\t" \ - "psubw %%mm3,%%mm1\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "paddd %%mm6,%%mm4\n\t" \ - "paddd %%mm6,%%mm5\n\t" \ - /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \ - "movq %%mm2,%%mm6\n\t" \ - "movq %%mm2,%%mm3\n\t" \ - "pmulhw %%mm7,%%mm6\n\t" \ - "pmullw %%mm7,%%mm3\n\t" \ - "paddw %%mm1,%%mm6\n\t" \ - "movq %%mm3,%%mm1\n\t" \ - "punpckhwd %%mm6,%%mm3\n\t" \ - "punpcklwd %%mm6,%%mm1\n\t" \ - /*mm1={-1}x4 \ - mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \ - "paddd %%mm3,%%mm5\n\t" \ - "paddd %%mm1,%%mm4\n\t" \ - "psrad $16,%%mm5\n\t" \ - "mov $0x28005460,%[a]\n\t" \ - "psrad $16,%%mm4\n\t" \ - "pcmpeqb %%mm1,%%mm1\n\t" \ - "packssdw %%mm5,%%mm4\n\t" \ - /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \ - mm4=s=(25080*u>>16)-t2''*/ \ - "movq %%mm4,%%mm6\n\t" \ - "pmulhw %%mm7,%%mm4\n\t" \ - "pxor %%mm5,%%mm5\n\t" \ - "movd %[a],%%mm7\n\t" \ - "psubw %%mm1,%%mm5\n\t" \ - "punpckldq %%mm7,%%mm7\n\t" \ - "psubw %%mm2,%%mm4\n\t" \ - /*mm2=s+(s!=0) \ - mm4:mm3=s*21600+0x2800*/ \ - "movq %%mm4,%%mm3\n\t" \ - "movq %%mm4,%%mm2\n\t" \ - "punpckhwd %%mm5,%%mm4\n\t" \ - "pcmpeqw %%mm2,%%mm0\n\t" \ - "pmaddwd %%mm7,%%mm4\n\t" \ - "psubw %%mm1,%%mm0\n\t" \ - "punpcklwd %%mm5,%%mm3\n\t" \ - "paddw %%mm0,%%mm2\n\t" \ - "pmaddwd %%mm7,%%mm3\n\t" \ - /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \ - mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \ - "movq "_r4"(%[y]),%%mm0\n\t" \ - "psrad $18,%%mm4\n\t" \ - "movq "_r5"(%[y]),%%mm5\n\t" \ - "psrad $18,%%mm3\n\t" \ - "movq "_r7"(%[y]),%%mm1\n\t" \ - "packssdw %%mm4,%%mm3\n\t" \ - "movq "_r0"(%[y]),%%mm4\n\t" \ - "paddw %%mm2,%%mm3\n\t" \ - -/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7]. 
- On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and - {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/ -# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \ - "#OC_TRANSPOSE8x4\n\t" \ - /*First 4x4 transpose:*/ \ - /*mm0 = e3 e2 e1 e0 \ - mm5 = f3 f2 f1 f0 \ - mm3 = g3 g2 g1 g0 \ - mm1 = h3 h2 h1 h0*/ \ - "movq %%mm0,%%mm2\n\t" \ - "punpcklwd %%mm5,%%mm0\n\t" \ - "punpckhwd %%mm5,%%mm2\n\t" \ - "movq %%mm3,%%mm5\n\t" \ - "punpcklwd %%mm1,%%mm3\n\t" \ - "punpckhwd %%mm1,%%mm5\n\t" \ - /*mm0 = f1 e1 f0 e0 \ - mm2 = f3 e3 f2 e2 \ - mm3 = h1 g1 h0 g0 \ - mm5 = h3 g3 h2 g2*/ \ - "movq %%mm0,%%mm1\n\t" \ - "punpckldq %%mm3,%%mm0\n\t" \ - "movq %%mm0,"_r4"(%[y])\n\t" \ - "punpckhdq %%mm3,%%mm1\n\t" \ - "movq "_r1"(%[y]),%%mm0\n\t" \ - "movq %%mm2,%%mm3\n\t" \ - "punpckldq %%mm5,%%mm2\n\t" \ - "punpckhdq %%mm5,%%mm3\n\t" \ - "movq "_r3"(%[y]),%%mm5\n\t" \ - /*_y[4] = h0 g0 f0 e0 \ - mm1 = h1 g1 f1 e1 \ - mm2 = h2 g2 f2 e2 \ - mm3 = h3 g3 f3 e3*/ \ - /*Second 4x4 transpose:*/ \ - /*mm4 = a3 a2 a1 a0 \ - mm0 = b3 b2 b1 b0 \ - mm6 = c3 c2 c1 c0 \ - mm5 = d3 d2 d1 d0*/ \ - "movq %%mm4,%%mm7\n\t" \ - "punpcklwd %%mm0,%%mm4\n\t" \ - "punpckhwd %%mm0,%%mm7\n\t" \ - "movq %%mm6,%%mm0\n\t" \ - "punpcklwd %%mm5,%%mm6\n\t" \ - "punpckhwd %%mm5,%%mm0\n\t" \ - /*mm4 = b1 a1 b0 a0 \ - mm7 = b3 a3 b2 a2 \ - mm6 = d1 c1 d0 c0 \ - mm0 = d3 c3 d2 c2*/ \ - "movq %%mm4,%%mm5\n\t" \ - "punpckldq %%mm6,%%mm4\n\t" \ - "punpckhdq %%mm6,%%mm5\n\t" \ - "movq %%mm7,%%mm6\n\t" \ - "punpckhdq %%mm0,%%mm7\n\t" \ - "punpckldq %%mm0,%%mm6\n\t" \ - /*mm4 = d0 c0 b0 a0 \ - mm5 = d1 c1 b1 a1 \ - mm6 = d2 c2 b2 a2 \ - mm7 = d3 c3 b3 a3*/ \ - -/*MMX implementation of the fDCT.*/ -void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - ptrdiff_t a; - __asm__ __volatile__( - /*Add two extra bits of working precision to improve accuracy; any more and - we could overflow.*/ - /*We also add biases to correct for some systematic error that remains in - the full fDCT->iDCT round trip.*/ - "movq 0x00(%[x]),%%mm0\n\t" - "movq 0x10(%[x]),%%mm1\n\t" - "movq 0x20(%[x]),%%mm2\n\t" - "movq 0x30(%[x]),%%mm3\n\t" - "pcmpeqb %%mm4,%%mm4\n\t" - "pxor %%mm7,%%mm7\n\t" - "movq %%mm0,%%mm5\n\t" - "psllw $2,%%mm0\n\t" - "pcmpeqw %%mm7,%%mm5\n\t" - "movq 0x70(%[x]),%%mm7\n\t" - "psllw $2,%%mm1\n\t" - "psubw %%mm4,%%mm5\n\t" - "psllw $2,%%mm2\n\t" - "mov $1,%[a]\n\t" - "pslld $16,%%mm5\n\t" - "movd %[a],%%mm6\n\t" - "psllq $16,%%mm5\n\t" - "mov $0x10001,%[a]\n\t" - "psllw $2,%%mm3\n\t" - "movd %[a],%%mm4\n\t" - "punpckhwd %%mm6,%%mm5\n\t" - "psubw %%mm6,%%mm1\n\t" - "movq 0x60(%[x]),%%mm6\n\t" - "paddw %%mm5,%%mm0\n\t" - "movq 0x50(%[x]),%%mm5\n\t" - "paddw %%mm4,%%mm0\n\t" - "movq 0x40(%[x]),%%mm4\n\t" - /*We inline stage1 of the transform here so we can get better instruction - scheduling with the shifts.*/ - /*mm0=t7'=t0-t7*/ - "psllw $2,%%mm7\n\t" - "psubw %%mm7,%%mm0\n\t" - "psllw $2,%%mm6\n\t" - "paddw %%mm7,%%mm7\n\t" - /*mm1=t6'=t1-t6*/ - "psllw $2,%%mm5\n\t" - "psubw %%mm6,%%mm1\n\t" - "psllw $2,%%mm4\n\t" - "paddw %%mm6,%%mm6\n\t" - /*mm2=t5'=t2-t5*/ - "psubw %%mm5,%%mm2\n\t" - "paddw %%mm5,%%mm5\n\t" - /*mm3=t4'=t3-t4*/ - "psubw %%mm4,%%mm3\n\t" - "paddw %%mm4,%%mm4\n\t" - /*mm7=t0'=t0+t7*/ - "paddw %%mm0,%%mm7\n\t" - /*mm6=t1'=t1+t6*/ - "paddw %%mm1,%%mm6\n\t" - /*mm5=t2'=t2+t5*/ - "paddw %%mm2,%%mm5\n\t" - /*mm4=t3'=t3+t4*/ - "paddw %%mm3,%%mm4\n\t" - OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70") - OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70") - 
/*Swap out this 8x4 block for the next one.*/ - "movq 0x08(%[x]),%%mm0\n\t" - "movq %%mm7,0x30(%[y])\n\t" - "movq 0x78(%[x]),%%mm7\n\t" - "movq %%mm1,0x50(%[y])\n\t" - "movq 0x18(%[x]),%%mm1\n\t" - "movq %%mm6,0x20(%[y])\n\t" - "movq 0x68(%[x]),%%mm6\n\t" - "movq %%mm2,0x60(%[y])\n\t" - "movq 0x28(%[x]),%%mm2\n\t" - "movq %%mm5,0x10(%[y])\n\t" - "movq 0x58(%[x]),%%mm5\n\t" - "movq %%mm3,0x70(%[y])\n\t" - "movq 0x38(%[x]),%%mm3\n\t" - /*And increase its working precision, too.*/ - "psllw $2,%%mm0\n\t" - "movq %%mm4,0x00(%[y])\n\t" - "psllw $2,%%mm7\n\t" - "movq 0x48(%[x]),%%mm4\n\t" - /*We inline stage1 of the transform here so we can get better instruction - scheduling with the shifts.*/ - /*mm0=t7'=t0-t7*/ - "psubw %%mm7,%%mm0\n\t" - "psllw $2,%%mm1\n\t" - "paddw %%mm7,%%mm7\n\t" - "psllw $2,%%mm6\n\t" - /*mm1=t6'=t1-t6*/ - "psubw %%mm6,%%mm1\n\t" - "psllw $2,%%mm2\n\t" - "paddw %%mm6,%%mm6\n\t" - "psllw $2,%%mm5\n\t" - /*mm2=t5'=t2-t5*/ - "psubw %%mm5,%%mm2\n\t" - "psllw $2,%%mm3\n\t" - "paddw %%mm5,%%mm5\n\t" - "psllw $2,%%mm4\n\t" - /*mm3=t4'=t3-t4*/ - "psubw %%mm4,%%mm3\n\t" - "paddw %%mm4,%%mm4\n\t" - /*mm7=t0'=t0+t7*/ - "paddw %%mm0,%%mm7\n\t" - /*mm6=t1'=t1+t6*/ - "paddw %%mm1,%%mm6\n\t" - /*mm5=t2'=t2+t5*/ - "paddw %%mm2,%%mm5\n\t" - /*mm4=t3'=t3+t4*/ - "paddw %%mm3,%%mm4\n\t" - OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78") - OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78") - /*Here the first 4x4 block of output from the last transpose is the second - 4x4 block of input for the next transform. - We have cleverly arranged that it already be in the appropriate place, - so we only have to do half the stores and loads.*/ - "movq 0x00(%[y]),%%mm0\n\t" - "movq %%mm1,0x58(%[y])\n\t" - "movq 0x10(%[y]),%%mm1\n\t" - "movq %%mm2,0x68(%[y])\n\t" - "movq 0x20(%[y]),%%mm2\n\t" - "movq %%mm3,0x78(%[y])\n\t" - "movq 0x30(%[y]),%%mm3\n\t" - OC_FDCT_STAGE1_8x4 - OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38") - OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38") - /*mm0={-2}x4*/ - "pcmpeqw %%mm0,%%mm0\n\t" - "paddw %%mm0,%%mm0\n\t" - /*Round the results.*/ - "psubw %%mm0,%%mm1\n\t" - "psubw %%mm0,%%mm2\n\t" - "psraw $2,%%mm1\n\t" - "psubw %%mm0,%%mm3\n\t" - "movq %%mm1,0x18(%[y])\n\t" - "psraw $2,%%mm2\n\t" - "psubw %%mm0,%%mm4\n\t" - "movq 0x08(%[y]),%%mm1\n\t" - "psraw $2,%%mm3\n\t" - "psubw %%mm0,%%mm5\n\t" - "psraw $2,%%mm4\n\t" - "psubw %%mm0,%%mm6\n\t" - "psraw $2,%%mm5\n\t" - "psubw %%mm0,%%mm7\n\t" - "psraw $2,%%mm6\n\t" - "psubw %%mm0,%%mm1\n\t" - "psraw $2,%%mm7\n\t" - "movq 0x40(%[y]),%%mm0\n\t" - "psraw $2,%%mm1\n\t" - "movq %%mm7,0x30(%[y])\n\t" - "movq 0x78(%[y]),%%mm7\n\t" - "movq %%mm1,0x08(%[y])\n\t" - "movq 0x50(%[y]),%%mm1\n\t" - "movq %%mm6,0x20(%[y])\n\t" - "movq 0x68(%[y]),%%mm6\n\t" - "movq %%mm2,0x28(%[y])\n\t" - "movq 0x60(%[y]),%%mm2\n\t" - "movq %%mm5,0x10(%[y])\n\t" - "movq 0x58(%[y]),%%mm5\n\t" - "movq %%mm3,0x38(%[y])\n\t" - "movq 0x70(%[y]),%%mm3\n\t" - "movq %%mm4,0x00(%[y])\n\t" - "movq 0x48(%[y]),%%mm4\n\t" - OC_FDCT_STAGE1_8x4 - OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78") - OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78") - /*mm0={-2}x4*/ - "pcmpeqw %%mm0,%%mm0\n\t" - "paddw %%mm0,%%mm0\n\t" - /*Round the results.*/ - "psubw %%mm0,%%mm1\n\t" - "psubw %%mm0,%%mm2\n\t" - "psraw $2,%%mm1\n\t" - "psubw %%mm0,%%mm3\n\t" - "movq %%mm1,0x58(%[y])\n\t" - "psraw $2,%%mm2\n\t" - "psubw %%mm0,%%mm4\n\t" - "movq 0x48(%[y]),%%mm1\n\t" - "psraw 
$2,%%mm3\n\t" - "psubw %%mm0,%%mm5\n\t" - "movq %%mm2,0x68(%[y])\n\t" - "psraw $2,%%mm4\n\t" - "psubw %%mm0,%%mm6\n\t" - "movq %%mm3,0x78(%[y])\n\t" - "psraw $2,%%mm5\n\t" - "psubw %%mm0,%%mm7\n\t" - "movq %%mm4,0x40(%[y])\n\t" - "psraw $2,%%mm6\n\t" - "psubw %%mm0,%%mm1\n\t" - "movq %%mm5,0x50(%[y])\n\t" - "psraw $2,%%mm7\n\t" - "movq %%mm6,0x60(%[y])\n\t" - "psraw $2,%%mm1\n\t" - "movq %%mm7,0x70(%[y])\n\t" - "movq %%mm1,0x48(%[y])\n\t" - :[a]"=&r"(a) - :[y]"r"(_y),[x]"r"(_x) - :"memory" - ); -} - -#endif diff --git a/drivers/theora/x86/mmxfrag.c b/drivers/theora/x86/mmxfrag.c deleted file mode 100644 index 2c732939c3..0000000000 --- a/drivers/theora/x86/mmxfrag.c +++ /dev/null @@ -1,293 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxfrag.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -/*MMX acceleration of fragment reconstruction for motion compensation. - Originally written by Rudolf Marek. - Additional optimization by Nils Pipenbrinck. - Note: Loops are unrolled for best performance. - The iteration each instruction belongs to is marked in the comments as #i.*/ -#include <stddef.h> -#include "x86int.h" -#include "mmxfrag.h" - -#if defined(OC_X86_ASM) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -void oc_frag_copy_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride){ - OC_FRAG_COPY_MMX(_dst,_src,_ystride); -} - -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue){ - __asm__ __volatile__( - /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/ - "pcmpeqw %%mm0,%%mm0\n\t" - /*#0 Load low residue.*/ - "movq 0*8(%[residue]),%%mm1\n\t" - /*#0 Load high residue.*/ - "movq 1*8(%[residue]),%%mm2\n\t" - /*Set mm0 to 0x8000800080008000.*/ - "psllw $15,%%mm0\n\t" - /*#1 Load low residue.*/ - "movq 2*8(%[residue]),%%mm3\n\t" - /*#1 Load high residue.*/ - "movq 3*8(%[residue]),%%mm4\n\t" - /*Set mm0 to 0x0080008000800080.*/ - "psrlw $8,%%mm0\n\t" - /*#2 Load low residue.*/ - "movq 4*8(%[residue]),%%mm5\n\t" - /*#2 Load high residue.*/ - "movq 5*8(%[residue]),%%mm6\n\t" - /*#0 Bias low residue.*/ - "paddsw %%mm0,%%mm1\n\t" - /*#0 Bias high residue.*/ - "paddsw %%mm0,%%mm2\n\t" - /*#0 Pack to byte.*/ - "packuswb %%mm2,%%mm1\n\t" - /*#1 Bias low residue.*/ - "paddsw %%mm0,%%mm3\n\t" - /*#1 Bias high residue.*/ - "paddsw %%mm0,%%mm4\n\t" - /*#1 Pack to byte.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#2 Bias low residue.*/ - "paddsw %%mm0,%%mm5\n\t" - /*#2 Bias high residue.*/ - "paddsw %%mm0,%%mm6\n\t" - /*#2 Pack to byte.*/ - "packuswb %%mm6,%%mm5\n\t" - /*#0 Write row.*/ - "movq %%mm1,(%[dst])\n\t" - /*#1 Write row.*/ - "movq %%mm3,(%[dst],%[ystride])\n\t" - /*#2 Write row.*/ - "movq %%mm5,(%[dst],%[ystride],2)\n\t" - /*#3 Load low residue.*/ - "movq 6*8(%[residue]),%%mm1\n\t" - /*#3 Load high residue.*/ - "movq 7*8(%[residue]),%%mm2\n\t" - /*#4 Load high residue.*/ - "movq 8*8(%[residue]),%%mm3\n\t" - /*#4 Load 
high residue.*/ - "movq 9*8(%[residue]),%%mm4\n\t" - /*#5 Load high residue.*/ - "movq 10*8(%[residue]),%%mm5\n\t" - /*#5 Load high residue.*/ - "movq 11*8(%[residue]),%%mm6\n\t" - /*#3 Bias low residue.*/ - "paddsw %%mm0,%%mm1\n\t" - /*#3 Bias high residue.*/ - "paddsw %%mm0,%%mm2\n\t" - /*#3 Pack to byte.*/ - "packuswb %%mm2,%%mm1\n\t" - /*#4 Bias low residue.*/ - "paddsw %%mm0,%%mm3\n\t" - /*#4 Bias high residue.*/ - "paddsw %%mm0,%%mm4\n\t" - /*#4 Pack to byte.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#5 Bias low residue.*/ - "paddsw %%mm0,%%mm5\n\t" - /*#5 Bias high residue.*/ - "paddsw %%mm0,%%mm6\n\t" - /*#5 Pack to byte.*/ - "packuswb %%mm6,%%mm5\n\t" - /*#3 Write row.*/ - "movq %%mm1,(%[dst],%[ystride3])\n\t" - /*#4 Write row.*/ - "movq %%mm3,(%[dst4])\n\t" - /*#5 Write row.*/ - "movq %%mm5,(%[dst4],%[ystride])\n\t" - /*#6 Load low residue.*/ - "movq 12*8(%[residue]),%%mm1\n\t" - /*#6 Load high residue.*/ - "movq 13*8(%[residue]),%%mm2\n\t" - /*#7 Load low residue.*/ - "movq 14*8(%[residue]),%%mm3\n\t" - /*#7 Load high residue.*/ - "movq 15*8(%[residue]),%%mm4\n\t" - /*#6 Bias low residue.*/ - "paddsw %%mm0,%%mm1\n\t" - /*#6 Bias high residue.*/ - "paddsw %%mm0,%%mm2\n\t" - /*#6 Pack to byte.*/ - "packuswb %%mm2,%%mm1\n\t" - /*#7 Bias low residue.*/ - "paddsw %%mm0,%%mm3\n\t" - /*#7 Bias high residue.*/ - "paddsw %%mm0,%%mm4\n\t" - /*#7 Pack to byte.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#6 Write row.*/ - "movq %%mm1,(%[dst4],%[ystride],2)\n\t" - /*#7 Write row.*/ - "movq %%mm3,(%[dst4],%[ystride3])\n\t" - : - :[residue]"r"(_residue), - [dst]"r"(_dst), - [dst4]"r"(_dst+(_ystride<<2)), - [ystride]"r"((ptrdiff_t)_ystride), - [ystride3]"r"((ptrdiff_t)_ystride*3) - :"memory" - ); -} - -void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src, - int _ystride,const ogg_int16_t *_residue){ - int i; - /*Zero mm0.*/ - __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); - for(i=4;i-->0;){ - __asm__ __volatile__( - /*#0 Load source.*/ - "movq (%[src]),%%mm3\n\t" - /*#1 Load source.*/ - "movq (%[src],%[ystride]),%%mm7\n\t" - /*#0 Get copy of src.*/ - "movq %%mm3,%%mm4\n\t" - /*#0 Expand high source.*/ - "punpckhbw %%mm0,%%mm4\n\t" - /*#0 Expand low source.*/ - "punpcklbw %%mm0,%%mm3\n\t" - /*#0 Add residue high.*/ - "paddsw 8(%[residue]),%%mm4\n\t" - /*#1 Get copy of src.*/ - "movq %%mm7,%%mm2\n\t" - /*#0 Add residue low.*/ - "paddsw (%[residue]), %%mm3\n\t" - /*#1 Expand high source.*/ - "punpckhbw %%mm0,%%mm2\n\t" - /*#0 Pack final row pixels.*/ - "packuswb %%mm4,%%mm3\n\t" - /*#1 Expand low source.*/ - "punpcklbw %%mm0,%%mm7\n\t" - /*#1 Add residue low.*/ - "paddsw 16(%[residue]),%%mm7\n\t" - /*#1 Add residue high.*/ - "paddsw 24(%[residue]),%%mm2\n\t" - /*Advance residue.*/ - "lea 32(%[residue]),%[residue]\n\t" - /*#1 Pack final row pixels.*/ - "packuswb %%mm2,%%mm7\n\t" - /*Advance src.*/ - "lea (%[src],%[ystride],2),%[src]\n\t" - /*#0 Write row.*/ - "movq %%mm3,(%[dst])\n\t" - /*#1 Write row.*/ - "movq %%mm7,(%[dst],%[ystride])\n\t" - /*Advance dst.*/ - "lea (%[dst],%[ystride],2),%[dst]\n\t" - :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src) - :[ystride]"r"((ptrdiff_t)_ystride) - :"memory" - ); - } -} - -void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){ - int i; - /*Zero mm7.*/ - __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::); - for(i=4;i-->0;){ - __asm__ __volatile__( - /*#0 Load src1.*/ - "movq (%[src1]),%%mm0\n\t" - /*#0 Load src2.*/ - "movq (%[src2]),%%mm2\n\t" - /*#0 
Copy src1.*/ - "movq %%mm0,%%mm1\n\t" - /*#0 Copy src2.*/ - "movq %%mm2,%%mm3\n\t" - /*#1 Load src1.*/ - "movq (%[src1],%[ystride]),%%mm4\n\t" - /*#0 Unpack lower src1.*/ - "punpcklbw %%mm7,%%mm0\n\t" - /*#1 Load src2.*/ - "movq (%[src2],%[ystride]),%%mm5\n\t" - /*#0 Unpack higher src1.*/ - "punpckhbw %%mm7,%%mm1\n\t" - /*#0 Unpack lower src2.*/ - "punpcklbw %%mm7,%%mm2\n\t" - /*#0 Unpack higher src2.*/ - "punpckhbw %%mm7,%%mm3\n\t" - /*Advance src1 ptr.*/ - "lea (%[src1],%[ystride],2),%[src1]\n\t" - /*Advance src2 ptr.*/ - "lea (%[src2],%[ystride],2),%[src2]\n\t" - /*#0 Lower src1+src2.*/ - "paddsw %%mm2,%%mm0\n\t" - /*#0 Higher src1+src2.*/ - "paddsw %%mm3,%%mm1\n\t" - /*#1 Copy src1.*/ - "movq %%mm4,%%mm2\n\t" - /*#0 Build lo average.*/ - "psraw $1,%%mm0\n\t" - /*#1 Copy src2.*/ - "movq %%mm5,%%mm3\n\t" - /*#1 Unpack lower src1.*/ - "punpcklbw %%mm7,%%mm4\n\t" - /*#0 Build hi average.*/ - "psraw $1,%%mm1\n\t" - /*#1 Unpack higher src1.*/ - "punpckhbw %%mm7,%%mm2\n\t" - /*#0 low+=residue.*/ - "paddsw (%[residue]),%%mm0\n\t" - /*#1 Unpack lower src2.*/ - "punpcklbw %%mm7,%%mm5\n\t" - /*#0 high+=residue.*/ - "paddsw 8(%[residue]),%%mm1\n\t" - /*#1 Unpack higher src2.*/ - "punpckhbw %%mm7,%%mm3\n\t" - /*#1 Lower src1+src2.*/ - "paddsw %%mm4,%%mm5\n\t" - /*#0 Pack and saturate.*/ - "packuswb %%mm1,%%mm0\n\t" - /*#1 Higher src1+src2.*/ - "paddsw %%mm2,%%mm3\n\t" - /*#0 Write row.*/ - "movq %%mm0,(%[dst])\n\t" - /*#1 Build lo average.*/ - "psraw $1,%%mm5\n\t" - /*#1 Build hi average.*/ - "psraw $1,%%mm3\n\t" - /*#1 low+=residue.*/ - "paddsw 16(%[residue]),%%mm5\n\t" - /*#1 high+=residue.*/ - "paddsw 24(%[residue]),%%mm3\n\t" - /*#1 Pack and saturate.*/ - "packuswb %%mm3,%%mm5\n\t" - /*#1 Write row ptr.*/ - "movq %%mm5,(%[dst],%[ystride])\n\t" - /*Advance residue ptr.*/ - "add $32,%[residue]\n\t" - /*Advance dest ptr.*/ - "lea (%[dst],%[ystride],2),%[dst]\n\t" - :[dst]"+r"(_dst),[residue]"+r"(_residue), - [src1]"+%r"(_src1),[src2]"+r"(_src2) - :[ystride]"r"((ptrdiff_t)_ystride) - :"memory" - ); - } -} - -void oc_restore_fpu_mmx(void){ - __asm__ __volatile__("emms\n\t"); -} -#endif diff --git a/drivers/theora/x86/mmxfrag.h b/drivers/theora/x86/mmxfrag.h deleted file mode 100644 index a398427629..0000000000 --- a/drivers/theora/x86/mmxfrag.h +++ /dev/null @@ -1,64 +0,0 @@ -#if !defined(_x86_mmxfrag_H) -# define _x86_mmxfrag_H (1) -# include <stddef.h> -# include "x86int.h" - -#if defined(OC_X86_ASM) - -/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes - between rows.*/ -#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ - do{ \ - const unsigned char *src; \ - unsigned char *dst; \ - ptrdiff_t ystride3; \ - src=(_src); \ - dst=(_dst); \ - __asm__ __volatile__( \ - /*src+0*ystride*/ \ - "movq (%[src]),%%mm0\n\t" \ - /*src+1*ystride*/ \ - "movq (%[src],%[ystride]),%%mm1\n\t" \ - /*ystride3=ystride*3*/ \ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ - /*src+2*ystride*/ \ - "movq (%[src],%[ystride],2),%%mm2\n\t" \ - /*src+3*ystride*/ \ - "movq (%[src],%[ystride3]),%%mm3\n\t" \ - /*dst+0*ystride*/ \ - "movq %%mm0,(%[dst])\n\t" \ - /*dst+1*ystride*/ \ - "movq %%mm1,(%[dst],%[ystride])\n\t" \ - /*Pointer to next 4.*/ \ - "lea (%[src],%[ystride],4),%[src]\n\t" \ - /*dst+2*ystride*/ \ - "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ - /*dst+3*ystride*/ \ - "movq %%mm3,(%[dst],%[ystride3])\n\t" \ - /*Pointer to next 4.*/ \ - "lea (%[dst],%[ystride],4),%[dst]\n\t" \ - /*src+0*ystride*/ \ - "movq (%[src]),%%mm0\n\t" \ - /*src+1*ystride*/ \ - "movq (%[src],%[ystride]),%%mm1\n\t" \ - 
/*src+2*ystride*/ \ - "movq (%[src],%[ystride],2),%%mm2\n\t" \ - /*src+3*ystride*/ \ - "movq (%[src],%[ystride3]),%%mm3\n\t" \ - /*dst+0*ystride*/ \ - "movq %%mm0,(%[dst])\n\t" \ - /*dst+1*ystride*/ \ - "movq %%mm1,(%[dst],%[ystride])\n\t" \ - /*dst+2*ystride*/ \ - "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ - /*dst+3*ystride*/ \ - "movq %%mm3,(%[dst],%[ystride3])\n\t" \ - :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \ - :[ystride]"r"((ptrdiff_t)(_ystride)) \ - :"memory" \ - ); \ - } \ - while(0) - -# endif -#endif diff --git a/drivers/theora/x86/mmxidct.c b/drivers/theora/x86/mmxidct.c deleted file mode 100644 index 76424e6364..0000000000 --- a/drivers/theora/x86/mmxidct.c +++ /dev/null @@ -1,564 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -/*MMX acceleration of Theora's iDCT. - Originally written by Rudolf Marek, based on code from On2's VP3.*/ -#include "x86int.h" -#include "../dct.h" - -#if defined(OC_X86_ASM) - -/*These are offsets into the table of constants below.*/ -/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/ -#define OC_COSINE_OFFSET (0) -/*A row of 8's.*/ -#define OC_EIGHT_OFFSET (56) - - - -/*A table of constants used by the MMX routines.*/ -static const ogg_uint16_t __attribute__((aligned(8),used)) - OC_IDCT_CONSTS[(7+1)*4]={ - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, - 8, 8, 8, 8 -}; - -/*Converts the expression in the argument to a string.*/ -#define OC_M2STR(_s) #_s - -/*38 cycles*/ -#define OC_IDCT_BEGIN \ - "#OC_IDCT_BEGIN\n\t" \ - "movq "OC_I(3)",%%mm2\n\t" \ - "movq "OC_C(3)",%%mm6\n\t" \ - "movq %%mm2,%%mm4\n\t" \ - "movq "OC_J(5)",%%mm7\n\t" \ - "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_C(5)",%%mm1\n\t" \ - "pmulhw %%mm7,%%mm6\n\t" \ - "movq %%mm1,%%mm5\n\t" \ - "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_I(1)",%%mm3\n\t" \ - "pmulhw %%mm7,%%mm5\n\t" \ - "movq "OC_C(1)",%%mm0\n\t" \ - "paddw %%mm2,%%mm4\n\t" \ - "paddw %%mm7,%%mm6\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - "movq "OC_J(7)",%%mm1\n\t" \ - "paddw %%mm5,%%mm7\n\t" \ - "movq %%mm0,%%mm5\n\t" \ - "pmulhw %%mm3,%%mm0\n\t" \ - "paddw %%mm7,%%mm4\n\t" \ - "pmulhw %%mm1,%%mm5\n\t" \ - "movq "OC_C(7)",%%mm7\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "paddw %%mm3,%%mm0\n\t" \ - "pmulhw %%mm7,%%mm3\n\t" \ - "movq "OC_I(2)",%%mm2\n\t" \ - "pmulhw 
%%mm1,%%mm7\n\t" \ - "paddw %%mm1,%%mm5\n\t" \ - "movq %%mm2,%%mm1\n\t" \ - "pmulhw "OC_C(2)",%%mm2\n\t" \ - "psubw %%mm5,%%mm3\n\t" \ - "movq "OC_J(6)",%%mm5\n\t" \ - "paddw %%mm7,%%mm0\n\t" \ - "movq %%mm5,%%mm7\n\t" \ - "psubw %%mm4,%%mm0\n\t" \ - "pmulhw "OC_C(2)",%%mm5\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - "pmulhw "OC_C(6)",%%mm1\n\t" \ - "paddw %%mm4,%%mm4\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "psubw %%mm6,%%mm3\n\t" \ - "paddw %%mm7,%%mm5\n\t" \ - "paddw %%mm6,%%mm6\n\t" \ - "pmulhw "OC_C(6)",%%mm7\n\t" \ - "paddw %%mm3,%%mm6\n\t" \ - "movq %%mm4,"OC_I(1)"\n\t" \ - "psubw %%mm5,%%mm1\n\t" \ - "movq "OC_C(4)",%%mm4\n\t" \ - "movq %%mm3,%%mm5\n\t" \ - "pmulhw %%mm4,%%mm3\n\t" \ - "paddw %%mm2,%%mm7\n\t" \ - "movq %%mm6,"OC_I(2)"\n\t" \ - "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0)",%%mm6\n\t" \ - "pmulhw %%mm4,%%mm0\n\t" \ - "paddw %%mm3,%%mm5\n\t" \ - "movq "OC_J(4)",%%mm3\n\t" \ - "psubw %%mm1,%%mm5\n\t" \ - "paddw %%mm0,%%mm2\n\t" \ - "psubw %%mm3,%%mm6\n\t" \ - "movq %%mm6,%%mm0\n\t" \ - "pmulhw %%mm4,%%mm6\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - "paddw %%mm1,%%mm1\n\t" \ - "paddw %%mm0,%%mm3\n\t" \ - "paddw %%mm5,%%mm1\n\t" \ - "pmulhw %%mm3,%%mm4\n\t" \ - "paddw %%mm0,%%mm6\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1)",%%mm0\n\t" \ - "paddw %%mm6,%%mm2\n\t" \ - "paddw %%mm3,%%mm4\n\t" \ - "psubw %%mm1,%%mm2\n\t" \ - "#end OC_IDCT_BEGIN\n\t" \ - -/*38+8=46 cycles.*/ -#define OC_ROW_IDCT \ - "#OC_ROW_IDCT\n" \ - OC_IDCT_BEGIN \ - /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw %%mm0,%%mm0\n\t" \ - /*Save R1.*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ - /*r0=R0=G.+C.*/ \ - "paddw %%mm7,%%mm0\n\t" \ - "#end OC_ROW_IDCT\n\t" \ - -/*The following macro does two 4x4 transposes in place. - At entry, we assume: - r0 = a3 a2 a1 a0 - I(1) = b3 b2 b1 b0 - r2 = c3 c2 c1 c0 - r3 = d3 d2 d1 d0 - - r4 = e3 e2 e1 e0 - r5 = f3 f2 f1 f0 - r6 = g3 g2 g1 g0 - r7 = h3 h2 h1 h0 - - At exit, we have: - I(0) = d0 c0 b0 a0 - I(1) = d1 c1 b1 a1 - I(2) = d2 c2 b2 a2 - I(3) = d3 c3 b3 a3 - - J(4) = h0 g0 f0 e0 - J(5) = h1 g1 f1 e1 - J(6) = h2 g2 f2 e2 - J(7) = h3 g3 f3 e3 - - I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. - J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 
- - Since r1 is free at entry, we calculate the Js first.*/ -/*19 cycles.*/ -#define OC_TRANSPOSE \ - "#OC_TRANSPOSE\n\t" \ - "movq %%mm4,%%mm1\n\t" \ - "punpcklwd %%mm5,%%mm4\n\t" \ - "movq %%mm0,"OC_I(0)"\n\t" \ - "punpckhwd %%mm5,%%mm1\n\t" \ - "movq %%mm6,%%mm0\n\t" \ - "punpcklwd %%mm7,%%mm6\n\t" \ - "movq %%mm4,%%mm5\n\t" \ - "punpckldq %%mm6,%%mm4\n\t" \ - "punpckhdq %%mm6,%%mm5\n\t" \ - "movq %%mm1,%%mm6\n\t" \ - "movq %%mm4,"OC_J(4)"\n\t" \ - "punpckhwd %%mm7,%%mm0\n\t" \ - "movq %%mm5,"OC_J(5)"\n\t" \ - "punpckhdq %%mm0,%%mm6\n\t" \ - "movq "OC_I(0)",%%mm4\n\t" \ - "punpckldq %%mm0,%%mm1\n\t" \ - "movq "OC_I(1)",%%mm5\n\t" \ - "movq %%mm4,%%mm0\n\t" \ - "movq %%mm6,"OC_J(7)"\n\t" \ - "punpcklwd %%mm5,%%mm0\n\t" \ - "movq %%mm1,"OC_J(6)"\n\t" \ - "punpckhwd %%mm5,%%mm4\n\t" \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklwd %%mm3,%%mm2\n\t" \ - "movq %%mm0,%%mm1\n\t" \ - "punpckldq %%mm2,%%mm0\n\t" \ - "punpckhdq %%mm2,%%mm1\n\t" \ - "movq %%mm4,%%mm2\n\t" \ - "movq %%mm0,"OC_I(0)"\n\t" \ - "punpckhwd %%mm3,%%mm5\n\t" \ - "movq %%mm1,"OC_I(1)"\n\t" \ - "punpckhdq %%mm5,%%mm4\n\t" \ - "punpckldq %%mm5,%%mm2\n\t" \ - "movq %%mm4,"OC_I(3)"\n\t" \ - "movq %%mm2,"OC_I(2)"\n\t" \ - "#end OC_TRANSPOSE\n\t" \ - -/*38+19=57 cycles.*/ -#define OC_COLUMN_IDCT \ - "#OC_COLUMN_IDCT\n" \ - OC_IDCT_BEGIN \ - "paddw "OC_8",%%mm2\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r2=NR2*/ \ - "psraw $4,%%mm2\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=NR1*/ \ - "psraw $4,%%mm1\n\t" \ - /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2)"\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*Store NR1 at I(1).*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_8",%%mm4\n\t" \ - /*r3=D'+D'*/ \ - "paddw %%mm3,%%mm3\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r4=NR4*/ \ - "psraw $4,%%mm4\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - /*r3=NR3*/ \ - "psraw $4,%%mm3\n\t" \ - "paddw "OC_8",%%mm6\n\t" \ - /*r5=B''+B''*/ \ - "paddw %%mm5,%%mm5\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r6=NR6*/ \ - "psraw $4,%%mm6\n\t" \ - /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4)"\n\t" \ - /*r5=NR5*/ \ - "psraw $4,%%mm5\n\t" \ - /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3)"\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_8",%%mm7\n\t" \ - /*r0=C'+C'*/ \ - "paddw %%mm0,%%mm0\n\t" \ - /*r0=R0=G'+C'*/ \ - "paddw %%mm7,%%mm0\n\t" \ - /*r7=NR7*/ \ - "psraw $4,%%mm7\n\t" \ - /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6)"\n\t" \ - /*r0=NR0*/ \ - "psraw $4,%%mm0\n\t" \ - /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5)"\n\t" \ - /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7)"\n\t" \ - /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0)"\n\t" \ - "#end OC_COLUMN_IDCT\n\t" \ - -#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])" -#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1) -#define OC_8 OC_MID(OC_EIGHT_OFFSET,0) - -static void oc_idct8x8_slow(ogg_int16_t _y[64]){ - /*This routine accepts an 8x8 matrix, but in partially transposed form. 
- Every 4x4 block is transposed.*/ - __asm__ __volatile__( -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])" - OC_ROW_IDCT - OC_TRANSPOSE -#undef OC_I -#undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+64)"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+72)"(%[y])" - OC_ROW_IDCT - OC_TRANSPOSE -#undef OC_I -#undef OC_J -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT -#undef OC_I -#undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT -#undef OC_I -#undef OC_J - : - :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS) - ); -} - -/*25 cycles.*/ -#define OC_IDCT_BEGIN_10 \ - "#OC_IDCT_BEGIN_10\n\t" \ - "movq "OC_I(3)",%%mm2\n\t" \ - "nop\n\t" \ - "movq "OC_C(3)",%%mm6\n\t" \ - "movq %%mm2,%%mm4\n\t" \ - "movq "OC_C(5)",%%mm1\n\t" \ - "pmulhw %%mm6,%%mm4\n\t" \ - "movq "OC_I(1)",%%mm3\n\t" \ - "pmulhw %%mm2,%%mm1\n\t" \ - "movq "OC_C(1)",%%mm0\n\t" \ - "paddw %%mm2,%%mm4\n\t" \ - "pxor %%mm6,%%mm6\n\t" \ - "paddw %%mm1,%%mm2\n\t" \ - "movq "OC_I(2)",%%mm5\n\t" \ - "pmulhw %%mm3,%%mm0\n\t" \ - "movq %%mm5,%%mm1\n\t" \ - "paddw %%mm3,%%mm0\n\t" \ - "pmulhw "OC_C(7)",%%mm3\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "pmulhw "OC_C(2)",%%mm5\n\t" \ - "psubw %%mm4,%%mm0\n\t" \ - "movq "OC_I(2)",%%mm7\n\t" \ - "paddw %%mm4,%%mm4\n\t" \ - "paddw %%mm5,%%mm7\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "pmulhw "OC_C(6)",%%mm1\n\t" \ - "psubw %%mm6,%%mm3\n\t" \ - "movq %%mm4,"OC_I(1)"\n\t" \ - "paddw %%mm6,%%mm6\n\t" \ - "movq "OC_C(4)",%%mm4\n\t" \ - "paddw %%mm3,%%mm6\n\t" \ - "movq %%mm3,%%mm5\n\t" \ - "pmulhw %%mm4,%%mm3\n\t" \ - "movq %%mm6,"OC_I(2)"\n\t" \ - "movq %%mm0,%%mm2\n\t" \ - "movq "OC_I(0)",%%mm6\n\t" \ - "pmulhw %%mm4,%%mm0\n\t" \ - "paddw %%mm3,%%mm5\n\t" \ - "paddw %%mm0,%%mm2\n\t" \ - "psubw %%mm1,%%mm5\n\t" \ - "pmulhw %%mm4,%%mm6\n\t" \ - "paddw "OC_I(0)",%%mm6\n\t" \ - "paddw %%mm1,%%mm1\n\t" \ - "movq %%mm6,%%mm4\n\t" \ - "paddw %%mm5,%%mm1\n\t" \ - "psubw %%mm2,%%mm6\n\t" \ - "paddw %%mm2,%%mm2\n\t" \ - "movq "OC_I(1)",%%mm0\n\t" \ - "paddw %%mm6,%%mm2\n\t" \ - "psubw %%mm1,%%mm2\n\t" \ - "nop\n\t" \ - "#end OC_IDCT_BEGIN_10\n\t" \ - -/*25+8=33 cycles.*/ -#define OC_ROW_IDCT_10 \ - "#OC_ROW_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10 \ - /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw %%mm3,%%mm3\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - "paddw %%mm5,%%mm5\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw %%mm0,%%mm0\n\t" \ - /*Save R1.*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ - /*r0=R0=G'+C'*/ \ - "paddw %%mm7,%%mm0\n\t" \ - "#end OC_ROW_IDCT_10\n\t" \ - -/*25+19=44 cycles'*/ -#define OC_COLUMN_IDCT_10 \ - "#OC_COLUMN_IDCT_10\n\t" \ - OC_IDCT_BEGIN_10 \ - "paddw "OC_8",%%mm2\n\t" \ - /*r1=H'+H'*/ \ - "paddw %%mm1,%%mm1\n\t" \ - /*r1=R1=A''+H'*/ \ - "paddw %%mm2,%%mm1\n\t" \ - /*r2=NR2*/ \ - "psraw $4,%%mm2\n\t" \ - /*r4=E'=E-G*/ \ - "psubw %%mm7,%%mm4\n\t" \ - /*r1=NR1*/ \ - "psraw $4,%%mm1\n\t" \ - /*r3=D'*/ \ - "movq "OC_I(2)",%%mm3\n\t" \ - /*r7=G+G*/ \ - "paddw %%mm7,%%mm7\n\t" \ - /*Store NR2 at I(2).*/ \ - "movq %%mm2,"OC_I(2)"\n\t" \ - /*r7=G'=E+G*/ \ - "paddw %%mm4,%%mm7\n\t" \ - /*Store NR1 
at I(1).*/ \ - "movq %%mm1,"OC_I(1)"\n\t" \ - /*r4=R4=E'-D'*/ \ - "psubw %%mm3,%%mm4\n\t" \ - "paddw "OC_8",%%mm4\n\t" \ - /*r3=D'+D'*/ \ - "paddw %%mm3,%%mm3\n\t" \ - /*r3=R3=E'+D'*/ \ - "paddw %%mm4,%%mm3\n\t" \ - /*r4=NR4*/ \ - "psraw $4,%%mm4\n\t" \ - /*r6=R6=F'-B''*/ \ - "psubw %%mm5,%%mm6\n\t" \ - /*r3=NR3*/ \ - "psraw $4,%%mm3\n\t" \ - "paddw "OC_8",%%mm6\n\t" \ - /*r5=B''+B''*/ \ - "paddw %%mm5,%%mm5\n\t" \ - /*r5=R5=F'+B''*/ \ - "paddw %%mm6,%%mm5\n\t" \ - /*r6=NR6*/ \ - "psraw $4,%%mm6\n\t" \ - /*Store NR4 at J(4).*/ \ - "movq %%mm4,"OC_J(4)"\n\t" \ - /*r5=NR5*/ \ - "psraw $4,%%mm5\n\t" \ - /*Store NR3 at I(3).*/ \ - "movq %%mm3,"OC_I(3)"\n\t" \ - /*r7=R7=G'-C'*/ \ - "psubw %%mm0,%%mm7\n\t" \ - "paddw "OC_8",%%mm7\n\t" \ - /*r0=C'+C'*/ \ - "paddw %%mm0,%%mm0\n\t" \ - /*r0=R0=G'+C'*/ \ - "paddw %%mm7,%%mm0\n\t" \ - /*r7=NR7*/ \ - "psraw $4,%%mm7\n\t" \ - /*Store NR6 at J(6).*/ \ - "movq %%mm6,"OC_J(6)"\n\t" \ - /*r0=NR0*/ \ - "psraw $4,%%mm0\n\t" \ - /*Store NR5 at J(5).*/ \ - "movq %%mm5,"OC_J(5)"\n\t" \ - /*Store NR7 at J(7).*/ \ - "movq %%mm7,"OC_J(7)"\n\t" \ - /*Store NR0 at I(0).*/ \ - "movq %%mm0,"OC_I(0)"\n\t" \ - "#end OC_COLUMN_IDCT_10\n\t" \ - -static void oc_idct8x8_10(ogg_int16_t _y[64]){ - __asm__ __volatile__( -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])" - /*Done with dequant, descramble, and partial transpose. - Now do the iDCT itself.*/ - OC_ROW_IDCT_10 - OC_TRANSPOSE -#undef OC_I -#undef OC_J -#define OC_I(_k) OC_M2STR((_k*16))"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 -#undef OC_I -#undef OC_J -#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])" -#define OC_J(_k) OC_I(_k) - OC_COLUMN_IDCT_10 -#undef OC_I -#undef OC_J - : - :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS) - ); -} - -/*Performs an inverse 8x8 Type-II DCT transform. - The input is assumed to be scaled by a factor of 4 relative to orthonormal - version of the transform.*/ -void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){ - /*_last_zzi is subtly different from an actual count of the number of - coefficients we decoded for this block. - It contains the value of zzi BEFORE the final token in the block was - decoded. - In most cases this is an EOB token (the continuation of an EOB run from a - previous block counts), and so this is the same as the coefficient count. - However, in the case that the last token was NOT an EOB token, but filled - the block up with exactly 64 coefficients, _last_zzi will be less than 64. - Provided the last token was not a pure zero run, the minimum value it can - be is 46, and so that doesn't affect any of the cases in this routine. - However, if the last token WAS a pure zero run of length 63, then _last_zzi - will be 1 while the number of coefficients decoded is 64. - Thus, we will trigger the following special case, where the real - coefficient count would not. - Note also that a zero run of length 64 will give _last_zzi a value of 0, - but we still process the DC coefficient, which might have a non-zero value - due to DC prediction. - Although convoluted, this is arguably the correct behavior: it allows us to - use a smaller transform when the block ends with a long zero run instead - of a normal EOB token. - It could be smarter... multiple separate zero runs at the end of a block - will fool it, but an encoder that generates these really deserves what it - gets. 
- Needless to say we inherited this approach from VP3.*/ - /*Then perform the iDCT.*/ - if(_last_zzi<10)oc_idct8x8_10(_y); - else oc_idct8x8_slow(_y); -} - -#endif diff --git a/drivers/theora/x86/mmxloop.h b/drivers/theora/x86/mmxloop.h deleted file mode 100644 index 2e870c795d..0000000000 --- a/drivers/theora/x86/mmxloop.h +++ /dev/null @@ -1,215 +0,0 @@ -#if !defined(_x86_mmxloop_H) -# define _x86_mmxloop_H (1) -# include <stddef.h> -# include "x86int.h" - -#if defined(OC_X86_ASM) - -/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. - On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and - mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ -#define OC_LOOP_FILTER8_MMX \ - "#OC_LOOP_FILTER8_MMX\n\t" \ - /*mm7=0*/ \ - "pxor %%mm7,%%mm7\n\t" \ - /*mm6:mm0={a0,...,a7}*/ \ - "movq %%mm0,%%mm6\n\t" \ - "punpcklbw %%mm7,%%mm0\n\t" \ - "punpckhbw %%mm7,%%mm6\n\t" \ - /*mm3:mm5={d0,...,d7}*/ \ - "movq %%mm3,%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm3\n\t" \ - "punpckhbw %%mm7,%%mm5\n\t" \ - /*mm6:mm0={a0-d0,...,a7-d7}*/ \ - "psubw %%mm3,%%mm0\n\t" \ - "psubw %%mm5,%%mm6\n\t" \ - /*mm3:mm1={b0,...,b7}*/ \ - "movq %%mm1,%%mm3\n\t" \ - "punpcklbw %%mm7,%%mm1\n\t" \ - "movq %%mm2,%%mm4\n\t" \ - "punpckhbw %%mm7,%%mm3\n\t" \ - /*mm5:mm4={c0,...,c7}*/ \ - "movq %%mm2,%%mm5\n\t" \ - "punpcklbw %%mm7,%%mm4\n\t" \ - "punpckhbw %%mm7,%%mm5\n\t" \ - /*mm7={3}x4 \ - mm5:mm4={c0-b0,...,c7-b7}*/ \ - "pcmpeqw %%mm7,%%mm7\n\t" \ - "psubw %%mm1,%%mm4\n\t" \ - "psrlw $14,%%mm7\n\t" \ - "psubw %%mm3,%%mm5\n\t" \ - /*Scale by 3.*/ \ - "pmullw %%mm7,%%mm4\n\t" \ - "pmullw %%mm7,%%mm5\n\t" \ - /*mm7={4}x4 \ - mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ - "psrlw $1,%%mm7\n\t" \ - "paddw %%mm0,%%mm4\n\t" \ - "psllw $2,%%mm7\n\t" \ - "movq (%[ll]),%%mm0\n\t" \ - "paddw %%mm6,%%mm5\n\t" \ - /*R_i has the range [-127,128], so we compute -R_i instead. \ - mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ - "psubw %%mm7,%%mm4\n\t" \ - "psubw %%mm7,%%mm5\n\t" \ - "psraw $3,%%mm4\n\t" \ - "psraw $3,%%mm5\n\t" \ - "pcmpeqb %%mm7,%%mm7\n\t" \ - "packsswb %%mm5,%%mm4\n\t" \ - "pxor %%mm6,%%mm6\n\t" \ - "pxor %%mm7,%%mm4\n\t" \ - "packuswb %%mm3,%%mm1\n\t" \ - /*Now compute lflim of -mm4 cf. Section 7.10 of the sepc.*/ \ - /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ - we have to split things by sign (the other option is to work in 16 bits, \ - but working in 8 bits gives much better parallelism). \ - We compute abs(R_i), but save a mask of which terms were negative in mm6. \ - Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). 
\ - Finally, we split mm4 into positive and negative pieces using the mask in \ - mm6, and add and subtract them as appropriate.*/ \ - /*mm4=abs(-R_i)*/ \ - /*mm7=255-2*L*/ \ - "pcmpgtb %%mm4,%%mm6\n\t" \ - "psubb %%mm0,%%mm7\n\t" \ - "pxor %%mm6,%%mm4\n\t" \ - "psubb %%mm0,%%mm7\n\t" \ - "psubb %%mm6,%%mm4\n\t" \ - /*mm7=255-max(2*L-abs(R_i),0)*/ \ - "paddusb %%mm4,%%mm7\n\t" \ - /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ - "paddusb %%mm7,%%mm4\n\t" \ - "psubusb %%mm7,%%mm4\n\t" \ - /*Now split mm4 by the original sign of -R_i.*/ \ - "movq %%mm4,%%mm5\n\t" \ - "pand %%mm6,%%mm4\n\t" \ - "pandn %%mm5,%%mm6\n\t" \ - /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ - /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ - "paddusb %%mm4,%%mm1\n\t" \ - "psubusb %%mm4,%%mm2\n\t" \ - "psubusb %%mm6,%%mm1\n\t" \ - "paddusb %%mm6,%%mm2\n\t" \ - -#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \ - do{ \ - ptrdiff_t ystride3__; \ - __asm__ __volatile__( \ - /*mm0={a0,...,a7}*/ \ - "movq (%[pix]),%%mm0\n\t" \ - /*ystride3=_ystride*3*/ \ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ - /*mm3={d0,...,d7}*/ \ - "movq (%[pix],%[ystride3]),%%mm3\n\t" \ - /*mm1={b0,...,b7}*/ \ - "movq (%[pix],%[ystride]),%%mm1\n\t" \ - /*mm2={c0,...,c7}*/ \ - "movq (%[pix],%[ystride],2),%%mm2\n\t" \ - OC_LOOP_FILTER8_MMX \ - /*Write it back out.*/ \ - "movq %%mm1,(%[pix],%[ystride])\n\t" \ - "movq %%mm2,(%[pix],%[ystride],2)\n\t" \ - :[ystride3]"=&r"(ystride3__) \ - :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \ - [ll]"r"(_ll) \ - :"memory" \ - ); \ - } \ - while(0) - -#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \ - do{ \ - unsigned char *pix__; \ - ptrdiff_t ystride3__; \ - ptrdiff_t d__; \ - pix__=(_pix)-2; \ - __asm__ __volatile__( \ - /*x x x x d0 c0 b0 a0*/ \ - "movd (%[pix]),%%mm0\n\t" \ - /*x x x x d1 c1 b1 a1*/ \ - "movd (%[pix],%[ystride]),%%mm1\n\t" \ - /*ystride3=_ystride*3*/ \ - "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ - /*x x x x d2 c2 b2 a2*/ \ - "movd (%[pix],%[ystride],2),%%mm2\n\t" \ - /*x x x x d3 c3 b3 a3*/ \ - "lea (%[pix],%[ystride],4),%[d]\n\t" \ - "movd (%[pix],%[ystride3]),%%mm3\n\t" \ - /*x x x x d4 c4 b4 a4*/ \ - "movd (%[d]),%%mm4\n\t" \ - /*x x x x d5 c5 b5 a5*/ \ - "movd (%[d],%[ystride]),%%mm5\n\t" \ - /*x x x x d6 c6 b6 a6*/ \ - "movd (%[d],%[ystride],2),%%mm6\n\t" \ - /*x x x x d7 c7 b7 a7*/ \ - "movd (%[d],%[ystride3]),%%mm7\n\t" \ - /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \ - "punpcklbw %%mm1,%%mm0\n\t" \ - /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \ - "punpcklbw %%mm3,%%mm2\n\t" \ - /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \ - "movq %%mm0,%%mm3\n\t" \ - /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \ - "punpcklwd %%mm2,%%mm0\n\t" \ - /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \ - "punpckhwd %%mm2,%%mm3\n\t" \ - /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \ - "movq %%mm0,%%mm1\n\t" \ - /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \ - "punpcklbw %%mm5,%%mm4\n\t" \ - /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \ - "punpcklbw %%mm7,%%mm6\n\t" \ - /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \ - "movq %%mm4,%%mm5\n\t" \ - /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \ - "punpcklwd %%mm6,%%mm4\n\t" \ - /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \ - "punpckhwd %%mm6,%%mm5\n\t" \ - /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \ - "movq %%mm3,%%mm2\n\t" \ - /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \ - "punpckldq %%mm4,%%mm0\n\t" \ - /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \ - "punpckhdq %%mm4,%%mm1\n\t" \ - /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \ - "punpckldq %%mm5,%%mm2\n\t" \ - /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ - "punpckhdq %%mm5,%%mm3\n\t" \ - OC_LOOP_FILTER8_MMX \ - /*mm2={b0+R_0'',...,b7+R_7''}*/ \ - "movq 
%%mm1,%%mm0\n\t" \ - /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ - "punpcklbw %%mm2,%%mm1\n\t" \ - /*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \ - "punpckhbw %%mm2,%%mm0\n\t" \ - /*[d]=c1 b1 c0 b0*/ \ - "movd %%mm1,%[d]\n\t" \ - "movw %w[d],1(%[pix])\n\t" \ - "psrlq $32,%%mm1\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride])\n\t" \ - /*[d]=c3 b3 c2 b2*/ \ - "movd %%mm1,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride3])\n\t" \ - "lea (%[pix],%[ystride],4),%[pix]\n\t" \ - /*[d]=c5 b5 c4 b4*/ \ - "movd %%mm0,%[d]\n\t" \ - "movw %w[d],1(%[pix])\n\t" \ - "psrlq $32,%%mm0\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride])\n\t" \ - /*[d]=c7 b7 c6 b6*/ \ - "movd %%mm0,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ - "shr $16,%[d]\n\t" \ - "movw %w[d],1(%[pix],%[ystride3])\n\t" \ - :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \ - :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \ - :"memory" \ - ); \ - } \ - while(0) - -# endif -#endif diff --git a/drivers/theora/x86/mmxstate.c b/drivers/theora/x86/mmxstate.c deleted file mode 100644 index 808b0a789b..0000000000 --- a/drivers/theora/x86/mmxstate.c +++ /dev/null @@ -1,188 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -/*MMX acceleration of complete fragment reconstruction algorithm. 
- Originally written by Rudolf Marek.*/ -#include <string.h> -#include "x86int.h" -#include "mmxfrag.h" -#include "mmxloop.h" - -#if defined(OC_X86_ASM) - -void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){ - unsigned char *dst; - ptrdiff_t frag_buf_off; - int ystride; - int mb_mode; - /*Apply the inverse transform.*/ - /*Special case only having a DC component.*/ - if(_last_zzi<2){ - /*Note that this value must be unsigned, to keep the __asm__ block from - sign-extending it when it puts it in a register.*/ - ogg_uint16_t p; - /*We round this dequant product (and not any of the others) because there's - no iDCT rounding.*/ - p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); - /*Fill _dct_coeffs with p.*/ - __asm__ __volatile__( - /*mm0=0000 0000 0000 AAAA*/ - "movd %[p],%%mm0\n\t" - /*mm0=0000 0000 AAAA AAAA*/ - "punpcklwd %%mm0,%%mm0\n\t" - /*mm0=AAAA AAAA AAAA AAAA*/ - "punpckldq %%mm0,%%mm0\n\t" - "movq %%mm0,(%[y])\n\t" - "movq %%mm0,8(%[y])\n\t" - "movq %%mm0,16(%[y])\n\t" - "movq %%mm0,24(%[y])\n\t" - "movq %%mm0,32(%[y])\n\t" - "movq %%mm0,40(%[y])\n\t" - "movq %%mm0,48(%[y])\n\t" - "movq %%mm0,56(%[y])\n\t" - "movq %%mm0,64(%[y])\n\t" - "movq %%mm0,72(%[y])\n\t" - "movq %%mm0,80(%[y])\n\t" - "movq %%mm0,88(%[y])\n\t" - "movq %%mm0,96(%[y])\n\t" - "movq %%mm0,104(%[y])\n\t" - "movq %%mm0,112(%[y])\n\t" - "movq %%mm0,120(%[y])\n\t" - : - :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p) - :"memory" - ); - } - else{ - /*Dequantize the DC coefficient.*/ - _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); - oc_idct8x8_mmx(_dct_coeffs,_last_zzi); - } - /*Fill in the target buffer.*/ - frag_buf_off=_state->frag_buf_offs[_fragi]; - mb_mode=_state->frags[_fragi].mb_mode; - ystride=_state->ref_ystride[_pli]; - dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off; - if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs); - else{ - const unsigned char *ref; - int mvoffsets[2]; - ref= - _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]] - +frag_buf_off; - if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, - _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){ - oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, - _dct_coeffs); - } - else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs); - } -} - -/*We copy these entire function to inline the actual MMX routines so that we - use only a single indirect call.*/ - -/*Copies the fragments specified by the lists of fragment indices from one - frame to another. - _fragis: A pointer to a list of fragment indices. - _nfragis: The number of fragment indices to copy. - _dst_frame: The reference frame to copy to. - _src_frame: The reference frame to copy from. 
- _pli: The color plane the fragments lie in.*/ -void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli){ - const ptrdiff_t *frag_buf_offs; - const unsigned char *src_frame_data; - unsigned char *dst_frame_data; - ptrdiff_t fragii; - int ystride; - dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]]; - src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]]; - ystride=_state->ref_ystride[_pli]; - frag_buf_offs=_state->frag_buf_offs; - for(fragii=0;fragii<_nfragis;fragii++){ - ptrdiff_t frag_buf_off; - frag_buf_off=frag_buf_offs[_fragis[fragii]]; - OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off, - src_frame_data+frag_buf_off,ystride); - } -} - -/*Apply the loop filter to a given set of fragment rows in the given plane. - The filter may be run on the bottom edge, affecting pixels in the next row of - fragments, so this row also needs to be available. - _bv: The bounding values array. - _refi: The index of the frame buffer to filter. - _pli: The color plane to filter. - _fragy0: The Y coordinate of the first fragment row to filter. - _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ -void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ - OC_ALIGN8(unsigned char ll[8]); - const oc_fragment_plane *fplane; - const oc_fragment *frags; - const ptrdiff_t *frag_buf_offs; - unsigned char *ref_frame_data; - ptrdiff_t fragi_top; - ptrdiff_t fragi_bot; - ptrdiff_t fragi0; - ptrdiff_t fragi0_end; - int ystride; - int nhfrags; - memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); - fplane=_state->fplanes+_pli; - nhfrags=fplane->nhfrags; - fragi_top=fplane->froffset; - fragi_bot=fragi_top+fplane->nfrags; - fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; - fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; - ystride=_state->ref_ystride[_pli]; - frags=_state->frags; - frag_buf_offs=_state->frag_buf_offs; - ref_frame_data=_state->ref_frame_data[_refi]; - /*The following loops are constructed somewhat non-intuitively on purpose. - The main idea is: if a block boundary has at least one coded fragment on - it, the filter is applied to it. - However, the order that the filters are applied in matters, and VP3 chose - the somewhat strange ordering used below.*/ - while(fragi0<fragi0_end){ - ptrdiff_t fragi; - ptrdiff_t fragi_end; - fragi=fragi0; - fragi_end=fragi+nhfrags; - while(fragi<fragi_end){ - if(frags[fragi].coded){ - unsigned char *ref; - ref=ref_frame_data+frag_buf_offs[fragi]; - if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll); - if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll); - if(fragi+1<fragi_end&&!frags[fragi+1].coded){ - OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll); - } - if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ - OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll); - } - } - fragi++; - } - fragi0+=nhfrags; - } -} - -#endif diff --git a/drivers/theora/x86/sse2fdct.c b/drivers/theora/x86/sse2fdct.c deleted file mode 100644 index 86c17d68b1..0000000000 --- a/drivers/theora/x86/sse2fdct.c +++ /dev/null @@ -1,523 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. 
PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 * - * by the Xiph.Org Foundation http://www.xiph.org/ * - * * - ********************************************************************/ -/*SSE2 fDCT implementation for x86_64.*/ -/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ -#include <stddef.h> -#include "x86enc.h" - -#if defined(OC_X86_64_ASM) - -# define OC_FDCT8x8 \ - /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \ - "#OC_FDCT8x8\n\t" \ - /*Stage 1:*/ \ - "movdqa %%xmm0,%%xmm11\n\t" \ - "movdqa %%xmm1,%%xmm10\n\t" \ - "movdqa %%xmm2,%%xmm9\n\t" \ - "movdqa %%xmm3,%%xmm8\n\t" \ - /*xmm11=t7'=t0-t7*/ \ - "psubw %%xmm7,%%xmm11\n\t" \ - /*xmm10=t6'=t1-t6*/ \ - "psubw %%xmm6,%%xmm10\n\t" \ - /*xmm9=t5'=t2-t5*/ \ - "psubw %%xmm5,%%xmm9\n\t" \ - /*xmm8=t4'=t3-t4*/ \ - "psubw %%xmm4,%%xmm8\n\t" \ - /*xmm0=t0'=t0+t7*/ \ - "paddw %%xmm7,%%xmm0\n\t" \ - /*xmm1=t1'=t1+t6*/ \ - "paddw %%xmm6,%%xmm1\n\t" \ - /*xmm5=t2'=t2+t5*/ \ - "paddw %%xmm2,%%xmm5\n\t" \ - /*xmm4=t3'=t3+t4*/ \ - "paddw %%xmm3,%%xmm4\n\t" \ - /*xmm2,3,6,7 are now free.*/ \ - /*Stage 2:*/ \ - "movdqa %%xmm0,%%xmm3\n\t" \ - "mov $0x5A806A0A,%[a]\n\t" \ - "movdqa %%xmm1,%%xmm2\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "movdqa %%xmm10,%%xmm6\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - /*xmm2=t2''=t1'-t2'*/ \ - "psubw %%xmm5,%%xmm2\n\t" \ - "pxor %%xmm12,%%xmm12\n\t" \ - /*xmm3=t3''=t0'-t3'*/ \ - "psubw %%xmm4,%%xmm3\n\t" \ - "psubw %%xmm14,%%xmm12\n\t" \ - /*xmm10=t5''=t6'-t5'*/ \ - "psubw %%xmm9,%%xmm10\n\t" \ - "paddw %%xmm12,%%xmm12\n\t" \ - /*xmm4=t0''=t0'+t3'*/ \ - "paddw %%xmm0,%%xmm4\n\t" \ - /*xmm1=t1''=t1'+t2'*/ \ - "paddw %%xmm5,%%xmm1\n\t" \ - /*xmm6=t6''=t6'+t5'*/ \ - "paddw %%xmm9,%%xmm6\n\t" \ - /*xmm0,xmm5,xmm9 are now free.*/ \ - /*Stage 3:*/ \ - /*xmm10:xmm5=t5''*27146+0xB500 \ - xmm0=t5''*/ \ - "movdqa %%xmm10,%%xmm5\n\t" \ - "movdqa %%xmm10,%%xmm0\n\t" \ - "punpckhwd %%xmm12,%%xmm10\n\t" \ - "pmaddwd %%xmm13,%%xmm10\n\t" \ - "punpcklwd %%xmm12,%%xmm5\n\t" \ - "pmaddwd %%xmm13,%%xmm5\n\t" \ - /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \ - "psrad $16,%%xmm10\n\t" \ - "psrad $16,%%xmm5\n\t" \ - "packssdw %%xmm10,%%xmm5\n\t" \ - "paddw %%xmm0,%%xmm5\n\t" \ - /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \ - "pcmpeqw %%xmm15,%%xmm0\n\t" \ - "psubw %%xmm14,%%xmm0\n\t" \ - "paddw %%xmm5,%%xmm0\n\t" \ - "movdqa %%xmm8,%%xmm5\n\t" \ - "psraw $1,%%xmm0\n\t" \ - /*xmm5=t5'''=t4'-s*/ \ - "psubw %%xmm0,%%xmm5\n\t" \ - /*xmm8=t4''=t4'+s*/ \ - "paddw %%xmm0,%%xmm8\n\t" \ - /*xmm0,xmm7,xmm9,xmm10 are free.*/ \ - /*xmm7:xmm9=t6''*27146+0xB500*/ \ - "movdqa %%xmm6,%%xmm7\n\t" \ - "movdqa %%xmm6,%%xmm9\n\t" \ - "punpckhwd %%xmm12,%%xmm7\n\t" \ - "pmaddwd %%xmm13,%%xmm7\n\t" \ - "punpcklwd %%xmm12,%%xmm9\n\t" \ - "pmaddwd %%xmm13,%%xmm9\n\t" \ - /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \ - "psrad $16,%%xmm7\n\t" \ - "psrad $16,%%xmm9\n\t" \ - "packssdw %%xmm7,%%xmm9\n\t" \ - "paddw %%xmm6,%%xmm9\n\t" \ - /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \ - "pcmpeqw %%xmm15,%%xmm6\n\t" \ - "psubw %%xmm14,%%xmm6\n\t" \ - "paddw %%xmm6,%%xmm9\n\t" \ - "movdqa %%xmm11,%%xmm7\n\t" \ - "psraw $1,%%xmm9\n\t" \ - /*xmm7=t6'''=t7'-s*/ \ - "psubw %%xmm9,%%xmm7\n\t" \ - /*xmm9=t7''=t7'+s*/ \ - "paddw %%xmm11,%%xmm9\n\t" \ - /*xmm0,xmm6,xmm10,xmm11 are free.*/ \ - /*Stage 4:*/ \ - /*xmm10:xmm0=t1''*27146+0xB500*/ \ - "movdqa %%xmm1,%%xmm0\n\t" \ - "movdqa %%xmm1,%%xmm10\n\t" \ - "punpcklwd %%xmm12,%%xmm0\n\t" \ - "pmaddwd %%xmm13,%%xmm0\n\t" \ - "punpckhwd %%xmm12,%%xmm10\n\t" \ - 
"pmaddwd %%xmm13,%%xmm10\n\t" \ - /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \ - "psrad $16,%%xmm0\n\t" \ - "psrad $16,%%xmm10\n\t" \ - "mov $0x20006A0A,%[a]\n\t" \ - "packssdw %%xmm10,%%xmm0\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "paddw %%xmm1,%%xmm0\n\t" \ - /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \ - "pcmpeqw %%xmm15,%%xmm1\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "psubw %%xmm14,%%xmm1\n\t" \ - "paddw %%xmm1,%%xmm0\n\t" \ - /*xmm10:xmm4=t0''*27146+0x4000*/ \ - "movdqa %%xmm4,%%xmm1\n\t" \ - "movdqa %%xmm4,%%xmm10\n\t" \ - "punpcklwd %%xmm12,%%xmm4\n\t" \ - "pmaddwd %%xmm13,%%xmm4\n\t" \ - "punpckhwd %%xmm12,%%xmm10\n\t" \ - "pmaddwd %%xmm13,%%xmm10\n\t" \ - /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \ - "psrad $16,%%xmm4\n\t" \ - "psrad $16,%%xmm10\n\t" \ - "mov $0x6CB7,%[a]\n\t" \ - "packssdw %%xmm10,%%xmm4\n\t" \ - "movd %[a],%%xmm12\n\t" \ - "paddw %%xmm1,%%xmm4\n\t" \ - /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \ - "pcmpeqw %%xmm15,%%xmm1\n\t" \ - "pshufd $00,%%xmm12,%%xmm12\n\t" \ - "psubw %%xmm14,%%xmm1\n\t" \ - "mov $0x7FFF6C84,%[a]\n\t" \ - "paddw %%xmm1,%%xmm4\n\t" \ - /*xmm0=_y[0]=u=r+s>>1 \ - The naive implementation could cause overflow, so we use \ - u=(r&s)+((r^s)>>1).*/ \ - "movdqa %%xmm0,%%xmm6\n\t" \ - "pxor %%xmm4,%%xmm0\n\t" \ - "pand %%xmm4,%%xmm6\n\t" \ - "psraw $1,%%xmm0\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "paddw %%xmm6,%%xmm0\n\t" \ - /*xmm4=_y[4]=v=r-u*/ \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "psubw %%xmm0,%%xmm4\n\t" \ - /*xmm1,xmm6,xmm10,xmm11 are free.*/ \ - /*xmm6:xmm10=60547*t3''+0x6CB7*/ \ - "movdqa %%xmm3,%%xmm10\n\t" \ - "movdqa %%xmm3,%%xmm6\n\t" \ - "punpcklwd %%xmm3,%%xmm10\n\t" \ - "pmaddwd %%xmm13,%%xmm10\n\t" \ - "mov $0x61F861F8,%[a]\n\t" \ - "punpckhwd %%xmm3,%%xmm6\n\t" \ - "pmaddwd %%xmm13,%%xmm6\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "paddd %%xmm12,%%xmm10\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "paddd %%xmm12,%%xmm6\n\t" \ - /*xmm1:xmm2=25080*t2'' \ - xmm12=t2''*/ \ - "movdqa %%xmm2,%%xmm11\n\t" \ - "movdqa %%xmm2,%%xmm12\n\t" \ - "pmullw %%xmm13,%%xmm2\n\t" \ - "pmulhw %%xmm13,%%xmm11\n\t" \ - "movdqa %%xmm2,%%xmm1\n\t" \ - "punpcklwd %%xmm11,%%xmm2\n\t" \ - "punpckhwd %%xmm11,%%xmm1\n\t" \ - /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \ - "paddd %%xmm2,%%xmm10\n\t" \ - "paddd %%xmm1,%%xmm6\n\t" \ - "psrad $16,%%xmm10\n\t" \ - "pcmpeqw %%xmm15,%%xmm3\n\t" \ - "psrad $16,%%xmm6\n\t" \ - "psubw %%xmm14,%%xmm3\n\t" \ - "packssdw %%xmm6,%%xmm10\n\t" \ - "paddw %%xmm3,%%xmm10\n\t" \ - /*xmm2=_y[2]=u \ - xmm10=s=(25080*u>>16)-t2''*/ \ - "movdqa %%xmm10,%%xmm2\n\t" \ - "pmulhw %%xmm13,%%xmm10\n\t" \ - "psubw %%xmm12,%%xmm10\n\t" \ - /*xmm1:xmm6=s*21600+0x2800*/ \ - "pxor %%xmm12,%%xmm12\n\t" \ - "psubw %%xmm14,%%xmm12\n\t" \ - "mov $0x28005460,%[a]\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "movdqa %%xmm10,%%xmm6\n\t" \ - "movdqa %%xmm10,%%xmm1\n\t" \ - "punpcklwd %%xmm12,%%xmm6\n\t" \ - "pmaddwd %%xmm13,%%xmm6\n\t" \ - "mov $0x0E3D,%[a]\n\t" \ - "punpckhwd %%xmm12,%%xmm1\n\t" \ - "pmaddwd %%xmm13,%%xmm1\n\t" \ - /*xmm6=(s*21600+0x2800>>18)+s*/ \ - "psrad $18,%%xmm6\n\t" \ - "psrad $18,%%xmm1\n\t" \ - "movd %[a],%%xmm12\n\t" \ - "packssdw %%xmm1,%%xmm6\n\t" \ - "pshufd $00,%%xmm12,%%xmm12\n\t" \ - "paddw %%xmm10,%%xmm6\n\t" \ - /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \ - "mov $0x7FFF54DC,%[a]\n\t" \ - "pcmpeqw %%xmm15,%%xmm10\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "psubw %%xmm14,%%xmm10\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "paddw %%xmm10,%%xmm6\n\t " \ - 
/*xmm1,xmm3,xmm10,xmm11 are free.*/ \ - /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \ - "movdqa %%xmm5,%%xmm10\n\t" \ - "movdqa %%xmm5,%%xmm11\n\t" \ - "punpcklwd %%xmm5,%%xmm10\n\t" \ - "pmaddwd %%xmm13,%%xmm10\n\t" \ - "mov $0x8E3A8E3A,%[a]\n\t" \ - "punpckhwd %%xmm5,%%xmm11\n\t" \ - "pmaddwd %%xmm13,%%xmm11\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "paddd %%xmm12,%%xmm10\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "paddd %%xmm12,%%xmm11\n\t" \ - /*xmm7:xmm12=36410*t6''' \ - xmm1=t6'''*/ \ - "movdqa %%xmm7,%%xmm3\n\t" \ - "movdqa %%xmm7,%%xmm1\n\t" \ - "pmulhw %%xmm13,%%xmm3\n\t" \ - "pmullw %%xmm13,%%xmm7\n\t" \ - "paddw %%xmm1,%%xmm3\n\t" \ - "movdqa %%xmm7,%%xmm12\n\t" \ - "punpckhwd %%xmm3,%%xmm7\n\t" \ - "punpcklwd %%xmm3,%%xmm12\n\t" \ - /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \ - "paddd %%xmm12,%%xmm10\n\t" \ - "paddd %%xmm7,%%xmm11\n\t" \ - "psrad $16,%%xmm10\n\t" \ - "pcmpeqw %%xmm15,%%xmm5\n\t" \ - "psrad $16,%%xmm11\n\t" \ - "psubw %%xmm14,%%xmm5\n\t" \ - "packssdw %%xmm11,%%xmm10\n\t" \ - "pxor %%xmm12,%%xmm12\n\t" \ - "paddw %%xmm5,%%xmm10\n\t" \ - /*xmm5=_y[5]=u \ - xmm1=s=t6'''-(36410*u>>16)*/ \ - "psubw %%xmm14,%%xmm12\n\t" \ - "movdqa %%xmm10,%%xmm5\n\t" \ - "mov $0x340067C8,%[a]\n\t" \ - "pmulhw %%xmm13,%%xmm10\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "paddw %%xmm5,%%xmm10\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "psubw %%xmm10,%%xmm1\n\t" \ - /*xmm11:xmm3=s*26568+0x3400*/ \ - "movdqa %%xmm1,%%xmm3\n\t" \ - "movdqa %%xmm1,%%xmm11\n\t" \ - "punpcklwd %%xmm12,%%xmm3\n\t" \ - "pmaddwd %%xmm13,%%xmm3\n\t" \ - "mov $0x7B1B,%[a]\n\t" \ - "punpckhwd %%xmm12,%%xmm11\n\t" \ - "pmaddwd %%xmm13,%%xmm11\n\t" \ - /*xmm3=(s*26568+0x3400>>17)+s*/ \ - "psrad $17,%%xmm3\n\t" \ - "psrad $17,%%xmm11\n\t" \ - "movd %[a],%%xmm12\n\t" \ - "packssdw %%xmm11,%%xmm3\n\t" \ - "pshufd $00,%%xmm12,%%xmm12\n\t" \ - "paddw %%xmm1,%%xmm3\n\t" \ - /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \ - "mov $0x7FFF7B16,%[a]\n\t" \ - "pcmpeqw %%xmm15,%%xmm1\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "psubw %%xmm14,%%xmm1\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "paddw %%xmm1,%%xmm3\n\t " \ - /*xmm1,xmm7,xmm10,xmm11 are free.*/ \ - /*xmm11:xmm10=64277*t7''+0x7B1B*/ \ - "movdqa %%xmm9,%%xmm10\n\t" \ - "movdqa %%xmm9,%%xmm11\n\t" \ - "punpcklwd %%xmm9,%%xmm10\n\t" \ - "pmaddwd %%xmm13,%%xmm10\n\t" \ - "mov $0x31F131F1,%[a]\n\t" \ - "punpckhwd %%xmm9,%%xmm11\n\t" \ - "pmaddwd %%xmm13,%%xmm11\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "paddd %%xmm12,%%xmm10\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - "paddd %%xmm12,%%xmm11\n\t" \ - /*xmm12:xmm7=12785*t4''*/ \ - "movdqa %%xmm8,%%xmm7\n\t" \ - "movdqa %%xmm8,%%xmm1\n\t" \ - "pmullw %%xmm13,%%xmm7\n\t" \ - "pmulhw %%xmm13,%%xmm1\n\t" \ - "movdqa %%xmm7,%%xmm12\n\t" \ - "punpcklwd %%xmm1,%%xmm7\n\t" \ - "punpckhwd %%xmm1,%%xmm12\n\t" \ - /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \ - "paddd %%xmm7,%%xmm10\n\t" \ - "paddd %%xmm12,%%xmm11\n\t" \ - "psrad $16,%%xmm10\n\t" \ - "pcmpeqw %%xmm15,%%xmm9\n\t" \ - "psrad $16,%%xmm11\n\t" \ - "psubw %%xmm14,%%xmm9\n\t" \ - "packssdw %%xmm11,%%xmm10\n\t" \ - "pxor %%xmm12,%%xmm12\n\t" \ - "paddw %%xmm9,%%xmm10\n\t" \ - /*xmm1=_y[1]=u \ - xmm10=s=(12785*u>>16)-t4''*/ \ - "psubw %%xmm14,%%xmm12\n\t" \ - "movdqa %%xmm10,%%xmm1\n\t" \ - "mov $0x3000503B,%[a]\n\t" \ - "pmulhw %%xmm13,%%xmm10\n\t" \ - "movd %[a],%%xmm13\n\t" \ - "psubw %%xmm8,%%xmm10\n\t" \ - "pshufd $00,%%xmm13,%%xmm13\n\t" \ - /*xmm8:xmm7=s*20539+0x3000*/ \ - "movdqa %%xmm10,%%xmm7\n\t" \ - "movdqa %%xmm10,%%xmm8\n\t" \ - "punpcklwd 
%%xmm12,%%xmm7\n\t" \ - "pmaddwd %%xmm13,%%xmm7\n\t" \ - "punpckhwd %%xmm12,%%xmm8\n\t" \ - "pmaddwd %%xmm13,%%xmm8\n\t" \ - /*xmm7=(s*20539+0x3000>>20)+s*/ \ - "psrad $20,%%xmm7\n\t" \ - "psrad $20,%%xmm8\n\t" \ - "packssdw %%xmm8,%%xmm7\n\t" \ - "paddw %%xmm10,%%xmm7\n\t" \ - /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \ - "pcmpeqw %%xmm15,%%xmm10\n\t" \ - "psubw %%xmm14,%%xmm10\n\t" \ - "paddw %%xmm10,%%xmm7\n\t " \ - -# define OC_TRANSPOSE8x8 \ - "#OC_TRANSPOSE8x8\n\t" \ - "movdqa %%xmm4,%%xmm8\n\t" \ - /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ - "punpcklwd %%xmm5,%%xmm4\n\t" \ - /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ - "punpckhwd %%xmm5,%%xmm8\n\t" \ - /*xmm5 is free.*/ \ - "movdqa %%xmm0,%%xmm5\n\t" \ - /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ - "punpcklwd %%xmm1,%%xmm0\n\t" \ - /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ - "punpckhwd %%xmm1,%%xmm5\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm6,%%xmm1\n\t" \ - /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ - "punpcklwd %%xmm7,%%xmm6\n\t" \ - /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ - "punpckhwd %%xmm7,%%xmm1\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm2,%%xmm7\n\t" \ - /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ - "punpcklwd %%xmm3,%%xmm7\n\t" \ - /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ - "punpckhwd %%xmm3,%%xmm2\n\t" \ - /*xmm3 is free.*/ \ - "movdqa %%xmm0,%%xmm3\n\t" \ - /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ - "punpckldq %%xmm7,%%xmm0\n\t" \ - /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ - "punpckhdq %%xmm7,%%xmm3\n\t" \ - /*xmm7 is free.*/ \ - "movdqa %%xmm5,%%xmm7\n\t" \ - /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ - "punpckldq %%xmm2,%%xmm5\n\t" \ - /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ - "punpckhdq %%xmm2,%%xmm7\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm4,%%xmm2\n\t" \ - /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ - "punpckldq %%xmm6,%%xmm2\n\t" \ - /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ - "punpckhdq %%xmm6,%%xmm4\n\t" \ - /*xmm6 is free.*/ \ - "movdqa %%xmm8,%%xmm6\n\t" \ - /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ - "punpckldq %%xmm1,%%xmm6\n\t" \ - /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ - "punpckhdq %%xmm1,%%xmm8\n\t" \ - /*xmm1 is free.*/ \ - "movdqa %%xmm0,%%xmm1\n\t" \ - /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ - "punpcklqdq %%xmm2,%%xmm0\n\t" \ - /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ - "punpckhqdq %%xmm2,%%xmm1\n\t" \ - /*xmm2 is free.*/ \ - "movdqa %%xmm3,%%xmm2\n\t" \ - /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ - "punpcklqdq %%xmm4,%%xmm2\n\t" \ - /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ - "punpckhqdq %%xmm4,%%xmm3\n\t" \ - /*xmm4 is free.*/ \ - "movdqa %%xmm5,%%xmm4\n\t" \ - /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ - "punpcklqdq %%xmm6,%%xmm4\n\t" \ - /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ - "punpckhqdq %%xmm6,%%xmm5\n\t" \ - /*xmm6 is free.*/ \ - "movdqa %%xmm7,%%xmm6\n\t" \ - /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ - "punpcklqdq %%xmm8,%%xmm6\n\t" \ - /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ - "punpckhqdq %%xmm8,%%xmm7\n\t" \ - /*xmm8 is free.*/ \ - -/*SSE2 implementation of the fDCT for x86-64 only. 
- Because of the 8 extra XMM registers on x86-64, this version can operate - without any temporary stack access at all.*/ -void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){ - ptrdiff_t a; - __asm__ __volatile__( - /*Load the input.*/ - "movdqa 0x00(%[x]),%%xmm0\n\t" - "movdqa 0x10(%[x]),%%xmm1\n\t" - "movdqa 0x20(%[x]),%%xmm2\n\t" - "movdqa 0x30(%[x]),%%xmm3\n\t" - "movdqa 0x40(%[x]),%%xmm4\n\t" - "movdqa 0x50(%[x]),%%xmm5\n\t" - "movdqa 0x60(%[x]),%%xmm6\n\t" - "movdqa 0x70(%[x]),%%xmm7\n\t" - /*Add two extra bits of working precision to improve accuracy; any more and - we could overflow.*/ - /*We also add a few biases to correct for some systematic error that - remains in the full fDCT->iDCT round trip.*/ - /*xmm15={0}x8*/ - "pxor %%xmm15,%%xmm15\n\t" - /*xmm14={-1}x8*/ - "pcmpeqb %%xmm14,%%xmm14\n\t" - "psllw $2,%%xmm0\n\t" - /*xmm8=xmm0*/ - "movdqa %%xmm0,%%xmm8\n\t" - "psllw $2,%%xmm1\n\t" - /*xmm8={_x[7...0]==0}*/ - "pcmpeqw %%xmm15,%%xmm8\n\t" - "psllw $2,%%xmm2\n\t" - /*xmm8={_x[7...0]!=0}*/ - "psubw %%xmm14,%%xmm8\n\t" - "psllw $2,%%xmm3\n\t" - /*%[a]=1*/ - "mov $1,%[a]\n\t" - /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/ - "pslld $16,%%xmm8\n\t" - "psllw $2,%%xmm4\n\t" - /*xmm9={0,0,0,0,0,0,0,1}*/ - "movd %[a],%%xmm9\n\t" - /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/ - "pshufhw $0x00,%%xmm8,%%xmm8\n\t" - "psllw $2,%%xmm5\n\t" - /*%[a]={1}x2*/ - "mov $0x10001,%[a]\n\t" - /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/ - "pshuflw $0x01,%%xmm8,%%xmm8\n\t" - "psllw $2,%%xmm6\n\t" - /*xmm10={0,0,0,0,0,0,1,1}*/ - "movd %[a],%%xmm10\n\t" - /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/ - "paddw %%xmm8,%%xmm0\n\t" - "psllw $2,%%xmm7\n\t" - /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/ - "paddw %%xmm10,%%xmm0\n\t" - /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/ - "psubw %%xmm9,%%xmm1\n\t" - /*Transform columns.*/ - OC_FDCT8x8 - /*Transform rows.*/ - OC_TRANSPOSE8x8 - OC_FDCT8x8 - /*TODO: zig-zag ordering?*/ - OC_TRANSPOSE8x8 - /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/ - "paddw %%xmm14,%%xmm14\n\t" - "psubw %%xmm14,%%xmm0\n\t" - "psubw %%xmm14,%%xmm1\n\t" - "psraw $2,%%xmm0\n\t" - "psubw %%xmm14,%%xmm2\n\t" - "psraw $2,%%xmm1\n\t" - "psubw %%xmm14,%%xmm3\n\t" - "psraw $2,%%xmm2\n\t" - "psubw %%xmm14,%%xmm4\n\t" - "psraw $2,%%xmm3\n\t" - "psubw %%xmm14,%%xmm5\n\t" - "psraw $2,%%xmm4\n\t" - "psubw %%xmm14,%%xmm6\n\t" - "psraw $2,%%xmm5\n\t" - "psubw %%xmm14,%%xmm7\n\t" - "psraw $2,%%xmm6\n\t" - "psraw $2,%%xmm7\n\t" - /*Store the result.*/ - "movdqa %%xmm0,0x00(%[y])\n\t" - "movdqa %%xmm1,0x10(%[y])\n\t" - "movdqa %%xmm2,0x20(%[y])\n\t" - "movdqa %%xmm3,0x30(%[y])\n\t" - "movdqa %%xmm4,0x40(%[y])\n\t" - "movdqa %%xmm5,0x50(%[y])\n\t" - "movdqa %%xmm6,0x60(%[y])\n\t" - "movdqa %%xmm7,0x70(%[y])\n\t" - :[a]"=&r"(a) - :[y]"r"(_y),[x]"r"(_x) - :"memory" - ); -} -#endif diff --git a/drivers/theora/x86/x86enc.c b/drivers/theora/x86/x86enc.c deleted file mode 100644 index 43b7be3ea3..0000000000 --- a/drivers/theora/x86/x86enc.c +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $ - - ********************************************************************/ -#include "x86enc.h" - -#if defined(OC_X86_ASM) - -#include "../cpu.c" - -void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){ - ogg_uint32_t cpu_flags; - cpu_flags=oc_cpu_flags_get(); - oc_enc_vtable_init_c(_enc); - if(cpu_flags&OC_CPU_X86_MMX){ - _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx; - _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx; - _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; - _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; - _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx; - } - if(cpu_flags&OC_CPU_X86_MMXEXT){ - _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext; - _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext; - _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext; - _enc->opt_vtable.frag_satd_thresh=oc_enc_frag_satd_thresh_mmxext; - _enc->opt_vtable.frag_satd2_thresh=oc_enc_frag_satd2_thresh_mmxext; - _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext; - _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext; - } - if(cpu_flags&OC_CPU_X86_SSE2){ -# if defined(OC_X86_64_ASM) - /*_enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;*/ -# endif - } -} -#endif diff --git a/drivers/theora/x86/x86enc.h b/drivers/theora/x86/x86enc.h deleted file mode 100644 index 06c3908bcd..0000000000 --- a/drivers/theora/x86/x86enc.h +++ /dev/null @@ -1,47 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $ - - ********************************************************************/ - -#if !defined(_x86_x86enc_H) -# define _x86_x86enc_H (1) -# include "../encint.h" -# include "x86int.h" - -void oc_enc_vtable_init_x86(oc_enc_ctx *_enc); - -unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride); -unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh); -unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh); -unsigned oc_enc_frag_satd_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref,int _ystride,unsigned _thresh); -unsigned oc_enc_frag_satd2_thresh_mmxext(const unsigned char *_src, - const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, - unsigned _thresh); -unsigned oc_enc_frag_intra_satd_mmxext(const unsigned char *_src,int _ystride); -void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64], - const unsigned char *_x,const unsigned char *_y,int _stride); -void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64], - const unsigned char *_x,int _stride); -void oc_enc_frag_copy2_mmxext(unsigned char *_dst, - const unsigned char *_src1,const unsigned char *_src2,int _ystride); -void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]); -void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]); - -#endif diff --git a/drivers/theora/x86/x86int.h b/drivers/theora/x86/x86int.h deleted file mode 100644 index ede724f5aa..0000000000 --- a/drivers/theora/x86/x86int.h +++ /dev/null @@ -1,42 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86int.h 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#if !defined(_x86_x86int_H) -# define _x86_x86int_H (1) -# include "../internal.h" - -void oc_state_vtable_init_x86(oc_theora_state *_state); - -void oc_frag_copy_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride); -void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, - const ogg_int16_t *_residue); -void oc_frag_recon_inter_mmx(unsigned char *_dst, - const unsigned char *_src,int _ystride,const ogg_int16_t *_residue); -void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, - const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue); -void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi); -void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, - int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant); -void oc_state_frag_copy_list_mmx(const oc_theora_state *_state, - const ptrdiff_t *_fragis,ptrdiff_t _nfragis, - int _dst_frame,int _src_frame,int _pli); -void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, - int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end); -void oc_restore_fpu_mmx(void); - -#endif diff --git a/drivers/theora/x86/x86state.c b/drivers/theora/x86/x86state.c deleted file mode 100644 index a786bec284..0000000000 --- a/drivers/theora/x86/x86state.c +++ /dev/null @@ -1,62 +0,0 @@ -/******************************************************************** - * * - * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * - * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * - * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * - * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* - * * - * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * - * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * - * * - ******************************************************************** - - function: - last mod: $Id: x86state.c 16503 2009-08-22 18:14:02Z giles $ - - ********************************************************************/ - -#include "x86int.h" - -#if defined(OC_X86_ASM) - -#include "../cpu.c" - -/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into - each quadrant of the destination.*/ -static const unsigned char OC_FZIG_ZAG_MMX[128]={ - 0, 8, 1, 2, 9,16,24,17, - 10, 3,32,11,18,25, 4,12, - 5,26,19,40,33,34,41,48, - 27, 6,13,20,28,21,14, 7, - 56,49,42,35,43,50,57,36, - 15,22,29,30,23,44,37,58, - 51,59,38,45,52,31,60,53, - 46,39,47,54,61,62,55,63, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, - 64,64,64,64,64,64,64,64, -}; - -void oc_state_vtable_init_x86(oc_theora_state *_state){ - _state->cpu_flags=oc_cpu_flags_get(); - if(_state->cpu_flags&OC_CPU_X86_MMX){ - _state->opt_vtable.frag_copy=oc_frag_copy_mmx; - _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx; - _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx; - _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx; - _state->opt_vtable.idct8x8=oc_idct8x8_mmx; - _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx; - _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx; - _state->opt_vtable.state_loop_filter_frag_rows= - oc_state_loop_filter_frag_rows_mmx; - _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx; - _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX; - } - else oc_state_vtable_init_c(_state); -} -#endif |
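The OC_FZIG_ZAG_MMX table above is described as the standard zig-zag scan with a 4x4 transpose baked into each quadrant of the destination, which is exactly the partially transposed layout oc_idct8x8_mmx expects. Below is a minimal standalone C sketch (not the library's own code) of that remapping under the assumption that the base OC_FZIG_ZAG table, defined elsewhere in the library and not shown in this diff, maps zig-zag index to raster coefficient index: swap the row and column offsets of each destination index within its 4x4 quadrant. The first few remapped entries of the usual zig-zag order (0 8 1 2 9 16 24 17 10 3 32 11 18 25 4 ...) match the table listed above.

#include <stdio.h>

/*Swap the row and column of an 8x8 raster coefficient index within its 4x4
   quadrant.
  This is the transform that appears to turn OC_FZIG_ZAG entries into the
   OC_FZIG_ZAG_MMX entries above; it is a sketch for illustration only.*/
static unsigned char oc_quadrant_transpose(unsigned char _idx){
  unsigned row;
  unsigned col;
  row=_idx>>3;
  col=_idx&7;
  /*Keep the quadrant bits (row&4, col&4) and swap the 2-bit offsets inside
     the quadrant.*/
  return (unsigned char)((((row&4)|(col&3))<<3)|((col&4)|(row&3)));
}

int main(void){
  /*The first 15 entries of the usual 8x8 zig-zag scan order, assumed here
     only to spot-check the remapping against the table above.*/
  static const unsigned char zig_zag_head[15]={
    0,1,8,16,9,2,3,10,17,24,32,25,18,11,4
  };
  int i;
  for(i=0;i<15;i++)printf("%d ",oc_quadrant_transpose(zig_zag_head[i]));
  printf("\n");
  /*Expected output: 0 8 1 2 9 16 24 17 10 3 32 11 18 25 4*/
  return 0;
}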