thirdparty/libtheora/x86/x86enquant.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149

/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $

 ********************************************************************/

#include "x86enc.h"

#if defined(OC_X86_ASM)


/*Builds the SIMD-friendly form of the enquant table from a 64-entry
   dequantization table.
  The default enquant table layout does not suit SIMD code for two reasons:
  1) An entire row of m's or of l's must be loadable at once, so the two
      parameters are kept in separate arrays: the 64 m values come first,
      followed by 64 values derived from l.
  2) x86 SIMD has no element-wise arithmetic right-shift, so the shift is
      emulated with a multiply; each shift count l is therefore stored as the
      scale factor 1<<(16-l).
  _enquant: Output buffer; 128 ogg_int16_t values are written to it.
  _dequant: The 64 dequantizer values the table is derived from.*/
void oc_enc_enquant_table_init_x86(void *_enquant,
 const ogg_uint16_t _dequant[64]){
  ogg_int16_t *mults;
  ogg_int16_t *scales;
  int          ci;
  /*The multipliers occupy the first 64 entries, the scale factors the next
     64.*/
  mults=(ogg_int16_t *)_enquant;
  scales=&mults[64];
  for(ci=0;ci<64;ci++){
    oc_iquant iq;
    oc_iquant_init(&iq,_dequant[ci]);
    mults[ci]=iq.m;
    /*iq.l must be at least 2 for this to work (otherwise the scale factor
       would not fit in a positive signed 16-bit value); fortunately, once all
       the scale factors are baked in, the minimum quantizer is much larger
       than that.*/
    scales[ci]=1<<(16-iq.l);
  }
}

void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
  int pli;
  int qii;
  int qti;
  for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
    ((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
     ((ogg_int16_t *)_enquant[pli][0][qti])[0];
    ((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
     ((ogg_int16_t *)_enquant[pli][0][qti])[64];
  }
}

/*Quantizes 64 DCT coefficients with SSE2 and reports the position of the last
   non-zero quantized value.
  Every array is accessed with movdqa, so all four pointers must be 16-byte
   aligned.
  _qdct:    Output: the 64 quantized coefficients.
  _dct:     Input: the 64 coefficients to quantize.
  _dequant: The 64 dequantizer values; used here as the (sign-matched)
             rounding offset.
  _enquant: The SIMD quantization table: 64 16-bit multipliers starting at
             byte offset 0x00 and 64 16-bit scale factors starting at byte
             offset 0x80 (the layout written by
             oc_enc_enquant_table_init_x86()).
  Return: The index within _qdct of the last non-zero value, or 0 if all 64
           values are zero.
  NOTE(review): xmm0 through xmm7 are overwritten by the asm below but are not
   listed as clobbers; confirm this is safe with the compilers in use.*/
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
 const ogg_uint16_t _dequant[64],const void *_enquant){
  /*Byte offset into the arrays during the loop; the result afterwards.*/
  ptrdiff_t r;
  __asm__ __volatile__(
    "xor %[r],%[r]\n\t"
    /*Loop through two rows (16 coefficients, 32 bytes) at a time.*/
    ".p2align 4\n\t"
    "0:\n\t"
    /*Load the next two rows of the data (xmm0, xmm1), the dequantizers
       (xmm2, xmm3) and the multipliers (xmm4, xmm5).*/
    "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
    "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
    "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
    "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
    "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
    "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
    /*Double the input and propagate its sign to the rounding factor.
      Using SSSE3's psignw would help here, but we need the mask later anyway.
      After this, xmm0/xmm1 hold the per-element sign mask s (0 or -1),
       xmm6/xmm7 hold twice the input, and xmm2/xmm3 hold (dq+s)^s, which is
       dq negated wherever the input is negative.*/
    "movdqa %%xmm0,%%xmm6\n\t"
    "psraw $15,%%xmm0\n\t"
    "movdqa %%xmm1,%%xmm7\n\t"
    "paddw %%xmm6,%%xmm6\n\t"
    "psraw $15,%%xmm1\n\t"
    "paddw %%xmm7,%%xmm7\n\t"
    "paddw %%xmm0,%%xmm2\n\t"
    "paddw %%xmm1,%%xmm3\n\t"
    "pxor %%xmm0,%%xmm2\n\t"
    "pxor %%xmm1,%%xmm3\n\t"
    /*Add the rounding factor and perform the first multiply.
      pmulhw keeps the high 16 bits of each signed product, leaving them in
       xmm4/xmm5; that result is then added back to the rounded value.
      The scale factors are loaded from 0x80 bytes (64 16-bit entries) past
       the multipliers.*/
    "paddw %%xmm2,%%xmm6\n\t"
    "paddw %%xmm3,%%xmm7\n\t"
    "pmulhw %%xmm6,%%xmm4\n\t"
    "pmulhw %%xmm7,%%xmm5\n\t"
    "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
    "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
    "paddw %%xmm4,%%xmm6\n\t"
    "paddw %%xmm5,%%xmm7\n\t"
    /*Emulate an element-wise right-shift via a second multiply.*/
    "pmulhw %%xmm2,%%xmm6\n\t"
    "pmulhw %%xmm3,%%xmm7\n\t"
    /*Advance the offset and compare early; the SSE instructions between here
       and the jle do not modify the flags.*/
    "add $32,%[r]\n\t"
    "cmp $96,%[r]\n\t"
    /*Correct for the sign: subtracting the mask adds 1 to the results for
       negative inputs.*/
    "psubw %%xmm0,%%xmm6\n\t"
    "psubw %%xmm1,%%xmm7\n\t"
    /*Save the result.
      The displacements are negative because r has already been advanced.*/
    "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
    "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
    /*Continue while the advanced offset is at most 96, i.e., for four
       iterations covering all 128 bytes.*/
    "jle 0b\n\t"
    /*Now find the location of the last non-zero value.
      Start with the upper 32 coefficients: xmm6/xmm7 still hold the last two
       rows from the final loop iteration, so only the two rows at byte
       offsets 0x40 and 0x50 need to be reloaded.
      packsswb saturates each word to a byte, so a non-zero word always
       yields a non-zero byte.*/
    "movdqa 0x50(%[qdct]),%%xmm5\n\t"
    "movdqa 0x40(%[qdct]),%%xmm4\n\t"
    "packsswb %%xmm7,%%xmm6\n\t"
    "packsswb %%xmm5,%%xmm4\n\t"
    "pxor %%xmm0,%%xmm0\n\t"
    /*dq is no longer needed as a pointer; reuse it to hold all ones.*/
    "mov $-1,%k[dq]\n\t"
    /*Build a 32-bit mask in q with bit i set when coefficient 32+i is zero:
       coefficients 48...63 come from xmm6 (shifted into the high half) and
       coefficients 32...47 from xmm4.*/
    "pcmpeqb %%xmm0,%%xmm6\n\t"
    "pcmpeqb %%xmm0,%%xmm4\n\t"
    "pmovmskb %%xmm6,%k[q]\n\t"
    "pmovmskb %%xmm4,%k[r]\n\t"
    "shl $16,%k[q]\n\t"
    "or %k[r],%k[q]\n\t"
    /*Base index for the upper half.*/
    "mov $32,%[r]\n\t"
    /*We have to use xor here instead of not in order to set the flags.
      This inverts the mask so that set bits mark non-zero coefficients.*/
    "xor %k[dq],%k[q]\n\t"
    /*If any of the upper 32 coefficients is non-zero, go straight to the bit
       scan.*/
    "jnz 1f\n\t"
    /*Otherwise build the same mask for the lower 32 coefficients.*/
    "movdqa 0x30(%[qdct]),%%xmm7\n\t"
    "movdqa 0x20(%[qdct]),%%xmm6\n\t"
    "movdqa 0x10(%[qdct]),%%xmm5\n\t"
    "movdqa 0x00(%[qdct]),%%xmm4\n\t"
    "packsswb %%xmm7,%%xmm6\n\t"
    "packsswb %%xmm5,%%xmm4\n\t"
    "pcmpeqb %%xmm0,%%xmm6\n\t"
    "pcmpeqb %%xmm0,%%xmm4\n\t"
    "pmovmskb %%xmm6,%k[q]\n\t"
    "pmovmskb %%xmm4,%k[r]\n\t"
    "shl $16,%k[q]\n\t"
    "or %k[r],%k[q]\n\t"
    /*Base index for the lower half.*/
    "xor %[r],%[r]\n\t"
    "not %k[q]\n\t"
    /*Force bit 0 on so that the bsr below never sees a zero source; an
       all-zero block then yields index 0.*/
    "or $1,%k[q]\n\t"
    "1:\n\t"
    /*The index of the highest set bit, plus the base, is the position of the
       last non-zero coefficient.*/
    "bsr %k[q],%k[q]\n\t"
    "add %k[q],%k[r]\n\t"
    /*r is early-clobbered because it is written before the inputs are last
       read; q and dq are read-write because they are reused as scratch
       registers after the loop.*/
    :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
    :[dct]"r"(_dct),[qdct]"r"(_qdct)
    :"cc","memory"
  );
  return (int)r;
}

#endif