diff options
author | Rémi Verschelde <rverschelde@gmail.com> | 2016-10-13 19:40:40 +0200 |
---|---|---|
committer | Rémi Verschelde <rverschelde@gmail.com> | 2016-10-15 11:50:41 +0200 |
commit | cfcc8a20e862b758c32bd3f152186e6df0591a24 (patch) | |
tree | e9cf6ebad67c17ae60b8beb4d1468cbc94dbb982 /thirdparty/libtheora/mcenc.c | |
parent | da09c6131bcdace7e8e62c3dabc62890e9564c97 (diff) |
theora: Move to a module and split thirdparty lib
Same rationale as the previous commits.
Diffstat (limited to 'thirdparty/libtheora/mcenc.c')
-rw-r--r-- | thirdparty/libtheora/mcenc.c | 767 |
1 files changed, 767 insertions, 0 deletions
diff --git a/thirdparty/libtheora/mcenc.c b/thirdparty/libtheora/mcenc.c new file mode 100644 index 0000000000..797e81f4f9 --- /dev/null +++ b/thirdparty/libtheora/mcenc.c @@ -0,0 +1,767 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id$ + + ********************************************************************/ +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include "encint.h" + + + +typedef struct oc_mcenc_ctx oc_mcenc_ctx; + + + +/*Temporary state used for motion estimation.*/ +struct oc_mcenc_ctx{ + /*The candidate motion vectors.*/ + int candidates[13][2]; + /*The start of the Set B candidates.*/ + int setb0; + /*The total number of candidates.*/ + int ncandidates; +}; + + + +/*The maximum Y plane SAD value for accepting the median predictor.*/ +#define OC_YSAD_THRESH1 (256) +/*The amount to right shift the minimum error by when inflating it for + computing the second maximum Y plane SAD threshold.*/ +#define OC_YSAD_THRESH2_SCALE_BITS (4) +/*The amount to add to the second maximum Y plane threshold when inflating + it.*/ +#define OC_YSAD_THRESH2_OFFSET (64) + +/*The vector offsets in the X direction for each search site in the square + pattern.*/ +static const int OC_SQUARE_DX[9]={-1,0,1,-1,0,1,-1,0,1}; +/*The vector offsets in the Y direction for each search site in the square + pattern.*/ +static const int OC_SQUARE_DY[9]={-1,-1,-1,0,0,0,1,1,1}; +/*The number of sites to search for each boundary condition in the square + pattern. + Bit flags for the boundary conditions are as follows: + 1: -16==dx + 2: dx==15(.5) + 4: -16==dy + 8: dy==15(.5)*/ +static const int OC_SQUARE_NSITES[11]={8,5,5,0,5,3,3,0,5,3,3}; +/*The list of sites to search for each boundary condition in the square + pattern.*/ +static const int OC_SQUARE_SITES[11][8]={ + /* -15.5<dx<31, -15.5<dy<15(.5)*/ + {0,1,2,3,5,6,7,8}, + /*-15.5==dx, -15.5<dy<15(.5)*/ + {1,2,5,7,8}, + /* dx==15(.5), -15.5<dy<15(.5)*/ + {0,1,3,6,7}, + /*-15.5==dx==15(.5), -15.5<dy<15(.5)*/ + {-1}, + /* -15.5<dx<15(.5), -15.5==dy*/ + {3,5,6,7,8}, + /*-15.5==dx, -15.5==dy*/ + {5,7,8}, + /* dx==15(.5), -15.5==dy*/ + {3,6,7}, + /*-15.5==dx==15(.5), -15.5==dy*/ + {-1}, + /*-15.5dx<15(.5), dy==15(.5)*/ + {0,1,2,3,5}, + /*-15.5==dx, dy==15(.5)*/ + {1,2,5}, + /* dx==15(.5), dy==15(.5)*/ + {0,1,3} +}; + + +static void oc_mcenc_find_candidates(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, + int _accum[2],int _mbi,int _frame){ + oc_mb_enc_info *embs; + int a[3][2]; + int ncandidates; + unsigned nmbi; + int i; + embs=_enc->mb_info; + /*Skip a position to store the median predictor in.*/ + ncandidates=1; + if(embs[_mbi].ncneighbors>0){ + /*Fill in the first part of set A: the vectors from adjacent blocks.*/ + for(i=0;i<embs[_mbi].ncneighbors;i++){ + nmbi=embs[_mbi].cneighbors[i]; + _mcenc->candidates[ncandidates][0]=embs[nmbi].analysis_mv[0][_frame][0]; + _mcenc->candidates[ncandidates][1]=embs[nmbi].analysis_mv[0][_frame][1]; + ncandidates++; + } + } + /*Add a few additional vectors to set A: the vectors used in the previous + frames and the (0,0) vector.*/ + _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,_accum[0],31); + _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,_accum[1],31); + ncandidates++; + _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, + embs[_mbi].analysis_mv[1][_frame][0]+_accum[0],31); + _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, + embs[_mbi].analysis_mv[1][_frame][1]+_accum[1],31); + ncandidates++; + _mcenc->candidates[ncandidates][0]=0; + _mcenc->candidates[ncandidates][1]=0; + ncandidates++; + /*Use the first three vectors of set A to find our best predictor: their + median.*/ + memcpy(a,_mcenc->candidates+1,sizeof(a)); + OC_SORT2I(a[0][0],a[1][0]); + OC_SORT2I(a[0][1],a[1][1]); + OC_SORT2I(a[1][0],a[2][0]); + OC_SORT2I(a[1][1],a[2][1]); + OC_SORT2I(a[0][0],a[1][0]); + OC_SORT2I(a[0][1],a[1][1]); + _mcenc->candidates[0][0]=a[1][0]; + _mcenc->candidates[0][1]=a[1][1]; + /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/ + _mcenc->setb0=ncandidates; + /*The first time through the loop use the current macro block.*/ + nmbi=_mbi; + for(i=0;;i++){ + _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, + 2*embs[_mbi].analysis_mv[1][_frame][0] + -embs[_mbi].analysis_mv[2][_frame][0]+_accum[0],31); + _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, + 2*embs[_mbi].analysis_mv[1][_frame][1] + -embs[_mbi].analysis_mv[2][_frame][1]+_accum[1],31); + ncandidates++; + if(i>=embs[_mbi].npneighbors)break; + nmbi=embs[_mbi].pneighbors[i]; + } + /*Truncate to full-pel positions.*/ + for(i=0;i<ncandidates;i++){ + _mcenc->candidates[i][0]=OC_DIV2(_mcenc->candidates[i][0]); + _mcenc->candidates[i][1]=OC_DIV2(_mcenc->candidates[i][1]); + } + _mcenc->ncandidates=ncandidates; +} + +#if 0 +static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc, + const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], + int _mvoffset0,int _mvoffset1,const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _best_err){ + unsigned err; + int bi; + err=0; + for(bi=0;bi<4;bi++){ + ptrdiff_t frag_offs; + frag_offs=_frag_buf_offs[_fragis[bi]]; + err+=oc_enc_frag_sad2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0, + _ref+frag_offs+_mvoffset1,_ystride,_best_err-err); + } + return err; +} +#endif + +static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc, + const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], + int _mvoffset0,int _mvoffset1,const unsigned char *_src, + const unsigned char *_ref,int _ystride,unsigned _best_err){ + unsigned err; + int bi; + err=0; + for(bi=0;bi<4;bi++){ + ptrdiff_t frag_offs; + frag_offs=_frag_buf_offs[_fragis[bi]]; + err+=oc_enc_frag_satd2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0, + _ref+frag_offs+_mvoffset1,_ystride,_best_err-err); + } + return err; +} + +static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, + const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, + const unsigned char *_src,const unsigned char *_ref,int _ystride, + unsigned _block_err[4]){ + unsigned err; + int mvoffset; + int bi; + mvoffset=_dx+_dy*_ystride; + err=0; + for(bi=0;bi<4;bi++){ + ptrdiff_t frag_offs; + unsigned block_err; + frag_offs=_frag_buf_offs[_fragis[bi]]; + block_err=oc_enc_frag_sad(_enc, + _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); + _block_err[bi]=block_err; + err+=block_err; + } + return err; +} + +static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, + const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, + const unsigned char *_src,const unsigned char *_ref,int _ystride){ + int mvoffset; + int err; + int bi; + mvoffset=_dx+_dy*_ystride; + err=0; + for(bi=0;bi<4;bi++){ + ptrdiff_t frag_offs; + frag_offs=_frag_buf_offs[_fragis[bi]]; + err+=oc_enc_frag_satd_thresh(_enc, + _src+frag_offs,_ref+frag_offs+mvoffset,_ystride,UINT_MAX); + } + return err; +} + +static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc, + ptrdiff_t _frag_offs,int _dx,int _dy, + const unsigned char *_src,const unsigned char *_ref,int _ystride){ + return oc_enc_frag_satd_thresh(_enc, + _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride,UINT_MAX); +} + +/*Perform a motion vector search for this macro block against a single + reference frame. + As a bonus, individual block motion vectors are computed as well, as much of + the work can be shared. + The actual motion vector is stored in the appropriate place in the + oc_mb_enc_info structure. + _mcenc: The motion compensation context. + _accum: Drop frame/golden MV accumulators. + _mbi: The macro block index. + _frame: The frame to search, either OC_FRAME_PREV or OC_FRAME_GOLD.*/ +void oc_mcenc_search_frame(oc_enc_ctx *_enc,int _accum[2],int _mbi,int _frame){ + /*Note: Traditionally this search is done using a rate-distortion objective + function of the form D+lambda*R. + However, xiphmont tested this and found it produced a small degredation, + while requiring extra computation. + This is most likely due to Theora's peculiar MV encoding scheme: MVs are + not coded relative to a predictor, and the only truly cheap way to use a + MV is in the LAST or LAST2 MB modes, which are not being considered here. + Therefore if we use the MV found here, it's only because both LAST and + LAST2 performed poorly, and therefore the MB is not likely to be uniform + or suffer from the aperture problem. + Furthermore we would like to re-use the MV found here for as many MBs as + possible, so picking a slightly sub-optimal vector to save a bit or two + may cause increased degredation in many blocks to come. + We could artificially reduce lambda to compensate, but it's faster to just + disable it entirely, and use D (the distortion) as the sole criterion.*/ + oc_mcenc_ctx mcenc; + const ptrdiff_t *frag_buf_offs; + const ptrdiff_t *fragis; + const unsigned char *src; + const unsigned char *ref; + int ystride; + oc_mb_enc_info *embs; + ogg_int32_t hit_cache[31]; + ogg_int32_t hitbit; + unsigned best_block_err[4]; + unsigned block_err[4]; + unsigned best_err; + int best_vec[2]; + int best_block_vec[4][2]; + int candx; + int candy; + int bi; + embs=_enc->mb_info; + /*Find some candidate motion vectors.*/ + oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame); + /*Clear the cache of locations we've examined.*/ + memset(hit_cache,0,sizeof(hit_cache)); + /*Start with the median predictor.*/ + candx=mcenc.candidates[0][0]; + candy=mcenc.candidates[0][1]; + hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15; + frag_buf_offs=_enc->state.frag_buf_offs; + fragis=_enc->state.mb_maps[_mbi][0]; + src=_enc->state.ref_frame_data[OC_FRAME_IO]; + ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]]; + ystride=_enc->state.ref_ystride[0]; + /*TODO: customize error function for speed/(quality+size) tradeoff.*/ + best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, + frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); + best_vec[0]=candx; + best_vec[1]=candy; + if(_frame==OC_FRAME_PREV){ + for(bi=0;bi<4;bi++){ + best_block_err[bi]=block_err[bi]; + best_block_vec[bi][0]=candx; + best_block_vec[bi][1]=candy; + } + } + /*If this predictor fails, move on to set A.*/ + if(best_err>OC_YSAD_THRESH1){ + unsigned err; + unsigned t2; + int ncs; + int ci; + /*Compute the early termination threshold for set A.*/ + t2=embs[_mbi].error[_frame]; + ncs=OC_MINI(3,embs[_mbi].ncneighbors); + for(ci=0;ci<ncs;ci++){ + t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]); + } + t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET; + /*Examine the candidates in set A.*/ + for(ci=1;ci<mcenc.setb0;ci++){ + candx=mcenc.candidates[ci][0]; + candy=mcenc.candidates[ci][1]; + /*If we've already examined this vector, then we would be using it if it + was better than what we are using.*/ + hitbit=(ogg_int32_t)1<<candx+15; + if(hit_cache[candy+15]&hitbit)continue; + hit_cache[candy+15]|=hitbit; + err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, + frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); + if(err<best_err){ + best_err=err; + best_vec[0]=candx; + best_vec[1]=candy; + } + if(_frame==OC_FRAME_PREV){ + for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ + best_block_err[bi]=block_err[bi]; + best_block_vec[bi][0]=candx; + best_block_vec[bi][1]=candy; + } + } + } + if(best_err>t2){ + /*Examine the candidates in set B.*/ + for(;ci<mcenc.ncandidates;ci++){ + candx=mcenc.candidates[ci][0]; + candy=mcenc.candidates[ci][1]; + hitbit=(ogg_int32_t)1<<candx+15; + if(hit_cache[candy+15]&hitbit)continue; + hit_cache[candy+15]|=hitbit; + err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, + frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); + if(err<best_err){ + best_err=err; + best_vec[0]=candx; + best_vec[1]=candy; + } + if(_frame==OC_FRAME_PREV){ + for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ + best_block_err[bi]=block_err[bi]; + best_block_vec[bi][0]=candx; + best_block_vec[bi][1]=candy; + } + } + } + /*Use the same threshold for set B as in set A.*/ + if(best_err>t2){ + int best_site; + int nsites; + int sitei; + int site; + int b; + /*Square pattern search.*/ + for(;;){ + best_site=4; + /*Compose the bit flags for boundary conditions.*/ + b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1| + OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3; + nsites=OC_SQUARE_NSITES[b]; + for(sitei=0;sitei<nsites;sitei++){ + site=OC_SQUARE_SITES[b][sitei]; + candx=best_vec[0]+OC_SQUARE_DX[site]; + candy=best_vec[1]+OC_SQUARE_DY[site]; + hitbit=(ogg_int32_t)1<<candx+15; + if(hit_cache[candy+15]&hitbit)continue; + hit_cache[candy+15]|=hitbit; + err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, + frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); + if(err<best_err){ + best_err=err; + best_site=site; + } + if(_frame==OC_FRAME_PREV){ + for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ + best_block_err[bi]=block_err[bi]; + best_block_vec[bi][0]=candx; + best_block_vec[bi][1]=candy; + } + } + } + if(best_site==4)break; + best_vec[0]+=OC_SQUARE_DX[best_site]; + best_vec[1]+=OC_SQUARE_DY[best_site]; + } + /*Final 4-MV search.*/ + /*Simply use 1/4 of the macro block set A and B threshold as the + individual block threshold.*/ + if(_frame==OC_FRAME_PREV){ + t2>>=2; + for(bi=0;bi<4;bi++){ + if(best_block_err[bi]>t2){ + /*Square pattern search. + We do this in a slightly interesting manner. + We continue to check the SAD of all four blocks in the + macro block. + This gives us two things: + 1) We can continue to use the hit_cache to avoid duplicate + checks. + Otherwise we could continue to read it, but not write to it + without saving and restoring it for each block. + Note that we could still eliminate a large number of + duplicate checks by taking into account the site we came + from when choosing the site list. + We can still do that to avoid extra hit_cache queries, and + it might even be a speed win. + 2) It gives us a slightly better chance of escaping local + minima. + We would not be here if we weren't doing a fairly bad job + in finding a good vector, and checking these vectors can + save us from 100 to several thousand points off our SAD 1 + in 15 times. + TODO: Is this a good idea? + Who knows. + It needs more testing.*/ + for(;;){ + int bestx; + int besty; + int bj; + bestx=best_block_vec[bi][0]; + besty=best_block_vec[bi][1]; + /*Compose the bit flags for boundary conditions.*/ + b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1| + OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3; + nsites=OC_SQUARE_NSITES[b]; + for(sitei=0;sitei<nsites;sitei++){ + site=OC_SQUARE_SITES[b][sitei]; + candx=bestx+OC_SQUARE_DX[site]; + candy=besty+OC_SQUARE_DY[site]; + hitbit=(ogg_int32_t)1<<candx+15; + if(hit_cache[candy+15]&hitbit)continue; + hit_cache[candy+15]|=hitbit; + err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, + frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); + if(err<best_err){ + best_err=err; + best_vec[0]=candx; + best_vec[1]=candy; + } + for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){ + best_block_err[bj]=block_err[bj]; + best_block_vec[bj][0]=candx; + best_block_vec[bj][1]=candy; + } + } + if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){ + break; + } + } + } + } + } + } + } + } + embs[_mbi].error[_frame]=(ogg_uint16_t)best_err; + candx=best_vec[0]; + candy=best_vec[1]; + embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc, + frag_buf_offs,fragis,candx,candy,src,ref,ystride); + embs[_mbi].analysis_mv[0][_frame][0]=(signed char)(candx<<1); + embs[_mbi].analysis_mv[0][_frame][1]=(signed char)(candy<<1); + if(_frame==OC_FRAME_PREV){ + for(bi=0;bi<4;bi++){ + candx=best_block_vec[bi][0]; + candy=best_block_vec[bi][1]; + embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc, + frag_buf_offs[fragis[bi]],candx,candy,src,ref,ystride); + embs[_mbi].block_mv[bi][0]=(signed char)(candx<<1); + embs[_mbi].block_mv[bi][1]=(signed char)(candy<<1); + } + } +} + +void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){ + oc_mv2 *mvs; + int accum_p[2]; + int accum_g[2]; + mvs=_enc->mb_info[_mbi].analysis_mv; + if(_enc->prevframe_dropped){ + accum_p[0]=mvs[0][OC_FRAME_PREV][0]; + accum_p[1]=mvs[0][OC_FRAME_PREV][1]; + } + else accum_p[1]=accum_p[0]=0; + accum_g[0]=mvs[2][OC_FRAME_GOLD][0]; + accum_g[1]=mvs[2][OC_FRAME_GOLD][1]; + mvs[0][OC_FRAME_PREV][0]-=mvs[2][OC_FRAME_PREV][0]; + mvs[0][OC_FRAME_PREV][1]-=mvs[2][OC_FRAME_PREV][1]; + /*Move the motion vector predictors back a frame.*/ + memmove(mvs+1,mvs,2*sizeof(*mvs)); + /*Search the last frame.*/ + oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV); + mvs[2][OC_FRAME_PREV][0]=accum_p[0]; + mvs[2][OC_FRAME_PREV][1]=accum_p[1]; + /*GOLDEN MVs are different from PREV MVs in that they're each absolute + offsets from some frame in the past rather than relative offsets from the + frame before. + For predictor calculation to make sense, we need them to be in the same + form as PREV MVs.*/ + mvs[1][OC_FRAME_GOLD][0]-=mvs[2][OC_FRAME_GOLD][0]; + mvs[1][OC_FRAME_GOLD][1]-=mvs[2][OC_FRAME_GOLD][1]; + mvs[2][OC_FRAME_GOLD][0]-=accum_g[0]; + mvs[2][OC_FRAME_GOLD][1]-=accum_g[1]; + /*Search the golden frame.*/ + oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD); + /*Put GOLDEN MVs back into absolute offset form. + The newest MV is already an absolute offset.*/ + mvs[2][OC_FRAME_GOLD][0]+=accum_g[0]; + mvs[2][OC_FRAME_GOLD][1]+=accum_g[1]; + mvs[1][OC_FRAME_GOLD][0]+=mvs[2][OC_FRAME_GOLD][0]; + mvs[1][OC_FRAME_GOLD][1]+=mvs[2][OC_FRAME_GOLD][1]; +} + +#if 0 +static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi, + int _vec[2],int _best_err,int _frame){ + const unsigned char *src; + const unsigned char *ref; + const ptrdiff_t *frag_buf_offs; + const ptrdiff_t *fragis; + int offset_y[9]; + int ystride; + int mvoffset_base; + int best_site; + int sitei; + int err; + src=_enc->state.ref_frame_data[OC_FRAME_IO]; + ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_framei]]; + frag_buf_offs=_enc->state.frag_buf_offs; + fragis=_enc->state.mb_maps[_mbi][0]; + ystride=_enc->state.ref_ystride[0]; + mvoffset_base=_vec[0]+_vec[1]*ystride; + offset_y[0]=offset_y[1]=offset_y[2]=-ystride; + offset_y[3]=offset_y[5]=0; + offset_y[6]=offset_y[7]=offset_y[8]=ystride; + best_site=4; + for(sitei=0;sitei<8;sitei++){ + int site; + int xmask; + int ymask; + int dx; + int dy; + int mvoffset0; + int mvoffset1; + site=OC_SQUARE_SITES[0][sitei]; + dx=OC_SQUARE_DX[site]; + dy=OC_SQUARE_DY[site]; + /*The following code SHOULD be equivalent to + oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1, + (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0); + However, it should also be much faster, as it involves no multiplies and + doesn't have to handle chroma vectors.*/ + xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); + ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); + mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask); + mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask); + err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis, + mvoffset0,mvoffset1,src,ref,ystride,_best_err); + if(err<_best_err){ + _best_err=err; + best_site=site; + } + } + _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; + _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; + return _best_err; +} +#endif + +static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc, + int _mbi,int _vec[2],unsigned _best_err,int _frame){ + const unsigned char *src; + const unsigned char *ref; + const ptrdiff_t *frag_buf_offs; + const ptrdiff_t *fragis; + int offset_y[9]; + int ystride; + int mvoffset_base; + int best_site; + int sitei; + int err; + src=_enc->state.ref_frame_data[OC_FRAME_IO]; + ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]]; + frag_buf_offs=_enc->state.frag_buf_offs; + fragis=_enc->state.mb_maps[_mbi][0]; + ystride=_enc->state.ref_ystride[0]; + mvoffset_base=_vec[0]+_vec[1]*ystride; + offset_y[0]=offset_y[1]=offset_y[2]=-ystride; + offset_y[3]=offset_y[5]=0; + offset_y[6]=offset_y[7]=offset_y[8]=ystride; + best_site=4; + for(sitei=0;sitei<8;sitei++){ + int site; + int xmask; + int ymask; + int dx; + int dy; + int mvoffset0; + int mvoffset1; + site=OC_SQUARE_SITES[0][sitei]; + dx=OC_SQUARE_DX[site]; + dy=OC_SQUARE_DY[site]; + /*The following code SHOULD be equivalent to + oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1, + (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0); + However, it should also be much faster, as it involves no multiplies and + doesn't have to handle chroma vectors.*/ + xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); + ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); + mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask); + mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask); + err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis, + mvoffset0,mvoffset1,src,ref,ystride,_best_err); + if(err<_best_err){ + _best_err=err; + best_site=site; + } + } + _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; + _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; + return _best_err; +} + +void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){ + oc_mb_enc_info *embs; + int vec[2]; + embs=_enc->mb_info; + vec[0]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][0]); + vec[1]=OC_DIV2(embs[_mbi].analysis_mv[0][_frame][1]); + embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc, + _mbi,vec,embs[_mbi].satd[_frame],_frame); + embs[_mbi].analysis_mv[0][_frame][0]=(signed char)vec[0]; + embs[_mbi].analysis_mv[0][_frame][1]=(signed char)vec[1]; +} + +#if 0 +static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc, + int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride, + int _offset_y[9],unsigned _best_err){ + int mvoffset_base; + int best_site; + int sitei; + mvoffset_base=_vec[0]+_vec[1]*_ystride; + best_site=4; + for(sitei=0;sitei<8;sitei++){ + unsigned err; + int site; + int xmask; + int ymask; + int dx; + int dy; + int mvoffset0; + int mvoffset1; + site=OC_SQUARE_SITES[0][sitei]; + dx=OC_SQUARE_DX[site]; + dy=OC_SQUARE_DY[site]; + /*The following code SHOULD be equivalent to + oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1, + (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0); + However, it should also be much faster, as it involves no multiplies and + doesn't have to handle chroma vectors.*/ + xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); + ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); + mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask); + mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask); + err=oc_enc_frag_sad2_thresh(_enc,_src, + _ref+mvoffset0,_ref+mvoffset1,ystride,_best_err); + if(err<_best_err){ + _best_err=err; + best_site=site; + } + } + _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; + _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; + return _best_err; +} +#endif + +static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc, + int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride, + int _offset_y[9],unsigned _best_err){ + int mvoffset_base; + int best_site; + int sitei; + mvoffset_base=_vec[0]+_vec[1]*_ystride; + best_site=4; + for(sitei=0;sitei<8;sitei++){ + unsigned err; + int site; + int xmask; + int ymask; + int dx; + int dy; + int mvoffset0; + int mvoffset1; + site=OC_SQUARE_SITES[0][sitei]; + dx=OC_SQUARE_DX[site]; + dy=OC_SQUARE_DY[site]; + /*The following code SHOULD be equivalent to + oc_state_get_mv_offsets(&_enc->state,&mvoffsets,0, + (_vec[0]<<1)+dx,(_vec[1]<<1)+dy); + However, it should also be much faster, as it involves no multiplies and + doesn't have to handle chroma vectors.*/ + xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); + ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); + mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask); + mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask); + err=oc_enc_frag_satd2_thresh(_enc,_src, + _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err); + if(err<_best_err){ + _best_err=err; + best_site=site; + } + } + _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; + _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; + return _best_err; +} + +void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){ + oc_mb_enc_info *embs; + const ptrdiff_t *frag_buf_offs; + const ptrdiff_t *fragis; + const unsigned char *src; + const unsigned char *ref; + int offset_y[9]; + int ystride; + int bi; + ystride=_enc->state.ref_ystride[0]; + frag_buf_offs=_enc->state.frag_buf_offs; + fragis=_enc->state.mb_maps[_mbi][0]; + src=_enc->state.ref_frame_data[OC_FRAME_IO]; + ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]]; + offset_y[0]=offset_y[1]=offset_y[2]=-ystride; + offset_y[3]=offset_y[5]=0; + offset_y[6]=offset_y[7]=offset_y[8]=ystride; + embs=_enc->mb_info; + for(bi=0;bi<4;bi++){ + ptrdiff_t frag_offs; + int vec[2]; + frag_offs=frag_buf_offs[fragis[bi]]; + vec[0]=OC_DIV2(embs[_mbi].block_mv[bi][0]); + vec[1]=OC_DIV2(embs[_mbi].block_mv[bi][1]); + embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec, + src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]); + embs[_mbi].ref_mv[bi][0]=(signed char)vec[0]; + embs[_mbi].ref_mv[bi][1]=(signed char)vec[1]; + } +} |