diff options
Diffstat (limited to 'drivers/theora/mathops.c')
-rw-r--r-- | drivers/theora/mathops.c | 296 |
1 files changed, 296 insertions, 0 deletions
diff --git a/drivers/theora/mathops.c b/drivers/theora/mathops.c new file mode 100644 index 0000000000..d3fb909194 --- /dev/null +++ b/drivers/theora/mathops.c @@ -0,0 +1,296 @@ +#include "mathops.h" +#include <limits.h> + +/*The fastest fallback strategy for platforms with fast multiplication appears + to be based on de Bruijn sequences~\cite{LP98}. + Tests confirmed this to be true even on an ARM11, where it is actually faster + than using the native clz instruction. + Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where + multiplication or table lookups are too expensive. + + @UNPUBLISHED{LP98, + author="Charles E. Leiserson and Harald Prokop", + title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word", + month=Jun, + year=1998, + note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}" + }*/ +#if !defined(OC_ILOG_NODEBRUIJN)&& \ + !defined(OC_CLZ32)||!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL +static const unsigned char OC_DEBRUIJN_IDX32[32]={ + 0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8, + 31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9 +}; +#endif + +int oc_ilog32(ogg_uint32_t _v){ +#if defined(OC_CLZ32) + return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v; +#else +/*On a Pentium M, this branchless version tested as the fastest version without + multiplications on 1,000,000,000 random 32-bit integers, edging out a + similar version with branches, and a 256-entry LUT version.*/ +# if defined(OC_ILOG_NODEBRUIJN) + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFU)<<4; + _v>>=m; + ret|=m; + m=(_v>0xFFU)<<3; + _v>>=m; + ret|=m; + m=(_v>0xFU)<<2; + _v>>=m; + ret|=m; + m=(_v>3)<<1; + _v>>=m; + ret|=m; + ret+=_v>1; + return ret; +/*This de Bruijn sequence version is faster if you have a fast multiplier.*/ +# else + int ret; + ret=_v>0; + _v|=_v>>1; + _v|=_v>>2; + _v|=_v>>4; + _v|=_v>>8; + _v|=_v>>16; + _v=(_v>>1)+1; + ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F]; + return ret; +# endif +#endif +} + +int oc_ilog64(ogg_int64_t _v){ +#if defined(OC_CLZ64) + return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v; +#else +# if defined(OC_ILOG_NODEBRUIJN) + ogg_uint32_t v; + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFFFFFU)<<5; + v=(ogg_uint32_t)(_v>>m); + ret|=m; + m=(v>0xFFFFU)<<4; + v>>=m; + ret|=m; + m=(v>0xFFU)<<3; + v>>=m; + ret|=m; + m=(v>0xFU)<<2; + v>>=m; + ret|=m; + m=(v>3)<<1; + v>>=m; + ret|=m; + ret+=v>1; + return ret; +# else +/*If we don't have a 64-bit word, split it into two 32-bit halves.*/ +# if LONG_MAX<9223372036854775807LL + ogg_uint32_t v; + int ret; + int m; + ret=_v>0; + m=(_v>0xFFFFFFFFU)<<5; + v=(ogg_uint32_t)(_v>>m); + ret|=m; + v|=v>>1; + v|=v>>2; + v|=v>>4; + v|=v>>8; + v|=v>>16; + v=(v>>1)+1; + ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F]; + return ret; +/*Otherwise do it in one 64-bit operation.*/ +# else + static const unsigned char OC_DEBRUIJN_IDX64[64]={ + 0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40, + 5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57, + 63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56, + 62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58 + }; + int ret; + ret=_v>0; + _v|=_v>>1; + _v|=_v>>2; + _v|=_v>>4; + _v|=_v>>8; + _v|=_v>>16; + _v|=_v>>32; + _v=(_v>>1)+1; + ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F]; + return ret; +# endif +# endif +#endif +} + +/*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/ +static const ogg_int64_t OC_ATANH_LOG2[32]={ + 0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL, + 0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL, + 0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL, + 0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL, + 0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL, + 0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL, + 0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL, + 0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL, + 0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL, + 0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL, + 0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL +}; + +/*Computes the binary exponential of _z, a log base 2 in Q57 format.*/ +ogg_int64_t oc_bexp64(ogg_int64_t _z){ + ogg_int64_t w; + ogg_int64_t z; + int ipart; + ipart=(int)(_z>>57); + if(ipart<0)return 0; + if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL; + z=_z-OC_Q57(ipart); + if(z){ + ogg_int64_t mask; + long wlo; + int i; + /*C doesn't give us 64x64->128 muls, so we use CORDIC. + This is not particularly fast, but it's not being used in time-critical + code; it is very accurate.*/ + /*z is the fractional part of the log in Q62 format. + We need 1 bit of headroom since the magnitude can get larger than 1 + during the iteration, and a sign bit.*/ + z<<=5; + /*w is the exponential in Q61 format (since it also needs headroom and can + get as large as 2.0); we could get another bit if we dropped the sign, + but we'll recover that bit later anyway. + Ideally this should start out as + \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}} + but in order to guarantee convergence we have to repeat iterations 4, + 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/ + w=0x26A3D0E401DD846DLL; + for(i=0;;i++){ + mask=-(z<0); + w+=(w>>i+1)+mask^mask; + z-=OC_ATANH_LOG2[i]+mask^mask; + /*Repeat iteration 4.*/ + if(i>=3)break; + z<<=1; + } + for(;;i++){ + mask=-(z<0); + w+=(w>>i+1)+mask^mask; + z-=OC_ATANH_LOG2[i]+mask^mask; + /*Repeat iteration 13.*/ + if(i>=12)break; + z<<=1; + } + for(;i<32;i++){ + mask=-(z<0); + w+=(w>>i+1)+mask^mask; + z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1; + } + wlo=0; + /*Skip the remaining iterations unless we really require that much + precision. + We could have bailed out earlier for smaller iparts, but that would + require initializing w from a table, as the limit doesn't converge to + 61-bit precision until n=30.*/ + if(ipart>30){ + /*For these iterations, we just update the low bits, as the high bits + can't possibly be affected. + OC_ATANH_LOG2 has also converged (it actually did so one iteration + earlier, but that's no reason for an extra special case).*/ + for(;;i++){ + mask=-(z<0); + wlo+=(w>>i)+mask^mask; + z-=OC_ATANH_LOG2[31]+mask^mask; + /*Repeat iteration 40.*/ + if(i>=39)break; + z<<=1; + } + for(;i<61;i++){ + mask=-(z<0); + wlo+=(w>>i)+mask^mask; + z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1; + } + } + w=(w<<1)+wlo; + } + else w=(ogg_int64_t)1<<62; + if(ipart<62)w=(w>>61-ipart)+1>>1; + return w; +} + +/*Computes the binary logarithm of _w, returned in Q57 format.*/ +ogg_int64_t oc_blog64(ogg_int64_t _w){ + ogg_int64_t z; + int ipart; + if(_w<=0)return -1; + ipart=OC_ILOGNZ_64(_w)-1; + if(ipart>61)_w>>=ipart-61; + else _w<<=61-ipart; + z=0; + if(_w&_w-1){ + ogg_int64_t x; + ogg_int64_t y; + ogg_int64_t u; + ogg_int64_t mask; + int i; + /*C doesn't give us 64x64->128 muls, so we use CORDIC. + This is not particularly fast, but it's not being used in time-critical + code; it is very accurate.*/ + /*z is the fractional part of the log in Q61 format.*/ + /*x and y are the cosh() and sinh(), respectively, in Q61 format. + We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/ + x=_w+((ogg_int64_t)1<<61); + y=_w-((ogg_int64_t)1<<61); + for(i=0;i<4;i++){ + mask=-(y<0); + z+=(OC_ATANH_LOG2[i]>>i)+mask^mask; + u=x>>i+1; + x-=(y>>i+1)+mask^mask; + y-=u+mask^mask; + } + /*Repeat iteration 4.*/ + for(i--;i<13;i++){ + mask=-(y<0); + z+=(OC_ATANH_LOG2[i]>>i)+mask^mask; + u=x>>i+1; + x-=(y>>i+1)+mask^mask; + y-=u+mask^mask; + } + /*Repeat iteration 13.*/ + for(i--;i<32;i++){ + mask=-(y<0); + z+=(OC_ATANH_LOG2[i]>>i)+mask^mask; + u=x>>i+1; + x-=(y>>i+1)+mask^mask; + y-=u+mask^mask; + } + /*OC_ATANH_LOG2 has converged.*/ + for(;i<40;i++){ + mask=-(y<0); + z+=(OC_ATANH_LOG2[31]>>i)+mask^mask; + u=x>>i+1; + x-=(y>>i+1)+mask^mask; + y-=u+mask^mask; + } + /*Repeat iteration 40.*/ + for(i--;i<62;i++){ + mask=-(y<0); + z+=(OC_ATANH_LOG2[31]>>i)+mask^mask; + u=x>>i+1; + x-=(y>>i+1)+mask^mask; + y-=u+mask^mask; + } + z=z+8>>4; + } + return OC_Q57(ipart)+z; +} |