H.264完整的C语言代码和DCT的代码

源代码在线查看: blocksearch_kc.cpp

软件大小: 4747 K
上传用户: aoaoaoao
关键词: 264 DCT C语言代码 代码
下载地址: 免注册下载 普通下载 VIP

相关代码

				#include "idb_kernelc.hpp"   
				#include "mpeg.hpp"
				#include "idb_kernelc2.hpp"   
				
				KERNELDEF(blocksearch, "mpeg_sc/blocksearch_kc.uc");
				
				// UBYTE4_DIV4( x ): performs a right shift by two (division by four) on a
				// ubyte4.
				//
				// The argument is reinterpreted as a half2, shifted by `minus_2`, converted to
				// int, ANDed with `shift_mask`, and converted back to ubyte4.  `minus_2` and
				// `shift_mask` are NOT macro parameters: they must be variables that are in
				// scope wherever the macro is expanded.
				// NOTE(review): the mask presumably clears the bits that the 16-bit shift
				// carries across the byte boundary inside each half -- confirm against the
				// semantics of shift().
				#define UBYTE4_DIV4( x ) \
				  ubyte4(int(shift( half2(x), minus_2 )) & shift_mask )
				
				// COMPUTE_SAD does the sad calculation for a complete macroblock against one
				// vertical position in the current reference column.  Two different reference
				// variables are passed in because the compare can straddle the rows in some
				// cases.
				//
				// m      - macroblock expand
				// moff   - offset into the macroblock expand (for picking one of the two macroblocks)
				// r1     - first reference row
				// r1_off - offset into the first reference row
				// r2     - second reference row
				// r2_off - offset into the second reference row
				// valid  - non-zero if the row should be considered valid (allows mv_sad and
				//            mv_xy to be updated). This is used at the top and bottom of the image.
				// xoff   - Horizontal offset of the current search (used for motion vector).
				// yoff   - Vertical offset of the current search (used for motion vector).
				// mv_sad - Lowest current sad of the given macroblock.
				// mv_xy  - Motion vector of the lowest sad.
				//
				// Words moff, moff+2, moff+4, moff+6 of the macroblock are compared with r1 and
				// words moff+1, moff+3, moff+5, moff+7 with r2.  All eight words are taken from
				// the `m` argument (earlier revisions read the odd words from a variable that
				// had to be called `mb`, regardless of what was passed as `m`).
				//
				// The macro expands to several statements and uses the following names from
				// the expansion site as scratch/constants: sad4, t1, t2, sad_total, sad_less,
				// tree_sum1, tree_sum2, tree_sum3.
				//
				// The candidate replaces the current best only when its sad is strictly lower
				// than mv_sad and `valid` is non-zero.  The stored vector packs the low 16 bits
				// of yoff above the low 16 bits of xoff.
				//
				// NOTES: I didn't put in anything special to limit the chance of saturation of
				// the ubyte4's when summing the rows of the macroblock.
				
				#define COMPUTE_SAD( m, moff, r1, r1_off, r2, r2_off, valid, xoff, yoff, mv_sad, mv_xy ) \
				  sad4 = \
				      ( ( abd( m[(moff)], r1[(r1_off)] )     + abd( m[(moff)+1], r2[(r2_off)] ) ) + \
				        ( abd( m[(moff)+2], r1[(r1_off)+2] ) + abd( m[(moff)+3], r2[(r2_off)+2] ) ) ); \
				  t1 = shuffled( sad4, 0x88318820 ); \
				  sad_total = half2(hi(t1)) + half2(lo(t1)); \
				  sad4 = \
				      ( ( abd( m[(moff)+4], r1[(r1_off)+4] ) + abd( m[(moff)+5], r2[(r2_off)+4] ) ) + \
				        ( abd( m[(moff)+6], r1[(r1_off)+6] ) + abd( m[(moff)+7], r2[(r2_off)+6] ) ) ); \
				  t1 = shuffled( sad4, 0x88318820 ); \
				  sad_total = sad_total + half2(hi(t1)) + half2(lo(t1)); \
				  t2 = shuffled( sad_total, 0x88883120 ); \
				  sad_total = half2(hi(t2)) + half2(lo(t2)); \
				  sad_total = sad_total + commucperm( tree_sum1, sad_total ); \
				  sad_total = sad_total + commucperm( tree_sum2, sad_total ); \
				  sad_total = sad_total + commucperm( tree_sum3, sad_total ); \
				  \
				  sad_less = itocc( ( int( sad_total ) < int( mv_sad ) ) & int(valid) ); \
				  mv_sad = select( sad_less, sad_total, mv_sad ); \
				  mv_xy  = select( sad_less, ( shift( int(yoff) & 0xffff, 16 ) | ( int(xoff) & 0xffff ) ), mv_xy );
				
				// COMPUTE_ALL_MB does the sad calculation for both macroblocks against
				// the current reference column.
				//
				// r1     - first reference row
				// r1_off - offset into the first reference row
				// r2     - second reference row
				// r2_off - offset into the second reference row
				// valid  - non-zero if the row should be considered valid (allows mv_sad and
				//            mv_xy to be updated). This is used at the top and bottom of the image.
				// xoff   - Horizontal offset of the current search (used for motion vector).
				// yoff   - Vertical offset of the current search (used for motion vector).
				//
				// The macroblock at mb[0..7] is scored with a horizontal offset of -16+xoff and
				// updates mv_sad[0]/mv_xy[0]; the macroblock at mb[8..15] is scored with xoff
				// and updates mv_sad[1]/mv_xy[1].  `mb`, `mv_sad` and `mv_xy` are taken from the
				// expansion site, not passed in.
				// NOTE(review): the -16 presumably reflects that the mb[0..7] macroblock lies
				// one macroblock to the right of the mb[8..15] one -- confirm.
				
				#define COMPUTE_ALL_MB( r1, r1_off, r2, r2_off, valid, xoff, yoff ) \
				  COMPUTE_SAD( mb, 0,  r1, r1_off, r2, r2_off, valid, -16+xoff, yoff, mv_sad[0], mv_xy[0] ); \
				  COMPUTE_SAD( mb, 8,  r1, r1_off, r2, r2_off, valid, xoff, yoff, mv_sad[1], mv_xy[1] );
				
				// Disabled third macroblock slot, kept for reference.
				// NOTE(review): this call has no `valid` argument, so it does not match the
				// current COMPUTE_SAD parameter list and cannot be re-enabled as written.
				//  COMPUTE_SAD( mb, 16, r1, r1_off, r2, r2_off, 16+xoff, yoff, mv_sad[2], mv_xy[2] );
				
				// VERTICAL_SHIFT rotates the entire current column up (towards the top of the
				// image).  This eliminates the topmost row.
				//
				// Operates in place on r0, r1, r2 and uses `node0` and `rotate1` from the
				// expansion site.  For each of the four word columns (indices 0/1, 2/3, 4/5,
				// 6/7) the words form the chain
				//     r0[even] <- r0[odd] <- r1[even] <- r1[odd] <- r2[even] <- r2[odd]
				// Each word except the last is rebuilt as
				//     commucperm( rotate1, select( node0, <next word in chain>, <itself> ) )
				// and the last word of the chain (r2[odd]) is only passed through
				// commucperm( rotate1, ... ) because nothing follows it.
				//
				// The statements are ordered so that every word reads its successor before
				// that successor is overwritten; do not reorder them.
				// NOTE(review): select( node0, a, b ) presumably yields `a` where node0 is
				// true and `b` elsewhere, and commucperm( rotate1, v ) presumably moves values
				// between clusters according to the rotate1 pattern -- confirm against the
				// KernelC reference.
				
				#define VERTICAL_SHIFT \
				  r0[0] = commucperm( rotate1, select( node0, r0[1], r0[0] ) ); \
				  r0[2] = commucperm( rotate1, select( node0, r0[3], r0[2] ) ); \
				  r0[4] = commucperm( rotate1, select( node0, r0[5], r0[4] ) ); \
				  r0[6] = commucperm( rotate1, select( node0, r0[7], r0[6] ) ); \
				  \
				  r0[1] = commucperm( rotate1, select( node0, r1[0], r0[1] ) ); \
				  r0[3] = commucperm( rotate1, select( node0, r1[2], r0[3] ) ); \
				  r0[5] = commucperm( rotate1, select( node0, r1[4], r0[5] ) ); \
				  r0[7] = commucperm( rotate1, select( node0, r1[6], r0[7] ) ); \
				  \
				  r1[0] = commucperm( rotate1, select( node0, r1[1], r1[0] ) ); \
				  r1[2] = commucperm( rotate1, select( node0, r1[3], r1[2] ) ); \
				  r1[4] = commucperm( rotate1, select( node0, r1[5], r1[4] ) ); \
				  r1[6] = commucperm( rotate1, select( node0, r1[7], r1[6] ) ); \
				  \
				  r1[1] = commucperm( rotate1, select( node0, r2[0], r1[1] ) ); \
				  r1[3] = commucperm( rotate1, select( node0, r2[2], r1[3] ) ); \
				  r1[5] = commucperm( rotate1, select( node0, r2[4], r1[5] ) ); \
				  r1[7] = commucperm( rotate1, select( node0, r2[6], r1[7] ) ); \
				  \
				  r2[0] = commucperm( rotate1, select( node0, r2[1], r2[0] ) ); \
				  r2[2] = commucperm( rotate1, select( node0, r2[3], r2[2] ) ); \
				  r2[4] = commucperm( rotate1, select( node0, r2[5], r2[4] ) ); \
				  r2[6] = commucperm( rotate1, select( node0, r2[7], r2[6] ) ); \
				  \
				  r2[1] = commucperm( rotate1, r2[1] ); \
				  r2[3] = commucperm( rotate1, r2[3] ); \
				  r2[5] = commucperm( rotate1, r2[5] ); \
				  r2[7] = commucperm( rotate1, r2[7] );
				
				// COMPUTE_VERT does the 4 overlapping macroblock sad's that can be
				// computed for a single rotation of the reference column.  (COMPUTE_VERT itself
				// is defined further down, after the PASS_* helper macros it uses.)
				
				// PASS_MB_STATE applies `x %= x` to every piece of macroblock state: all 16
				// words of `mb` and entries 0 and 1 of `mv_sad` and `mv_xy`.  All three names
				// are taken from the expansion site.
				// NOTE(review): `%=` is not defined in this file; it is presumably a KernelC
				// operator that passes a value through at this point of the kernel -- confirm
				// against idb_kernelc.hpp.
				#define PASS_MB_STATE \
				  mb[0] %= mb[0]; \
				  mb[1] %= mb[1]; \
				  mb[2] %= mb[2]; \
				  mb[3] %= mb[3]; \
				  mb[4] %= mb[4]; \
				  mb[5] %= mb[5]; \
				  mb[6] %= mb[6]; \
				  mb[7] %= mb[7]; \
				  mb[8] %= mb[8]; \
				  mb[9] %= mb[9]; \
				  mb[10] %= mb[10]; \
				  mb[11] %= mb[11]; \
				  mb[12] %= mb[12]; \
				  mb[13] %= mb[13]; \
				  mb[14] %= mb[14]; \
				  mb[15] %= mb[15];  \
				  \
				  mv_sad[0] %= mv_sad[0]; \
				  mv_sad[1] %= mv_sad[1]; \
				  \
				  mv_xy[0] %= mv_xy[0]; \
				  mv_xy[1] %= mv_xy[1];
				
				
				// PASS_R_STATE(n) applies `x %= x` to all eight words of the reference row
				// expand named r<n> (the argument is token-pasted onto `r`, so PASS_R_STATE(0)
				// touches r0[0..7], PASS_R_STATE(1) touches r1[0..7], and so on).
				// NOTE(review): `%=` is not defined in this file; it is presumably a KernelC
				// operator that passes a value through at this point of the kernel -- confirm
				// against idb_kernelc.hpp.
				#define PASS_R_STATE(n) \
				  r##n[0] %= r##n[0]; \
				  r##n[1] %= r##n[1]; \
				  r##n[2] %= r##n[2]; \
				  r##n[3] %= r##n[3]; \
				  r##n[4] %= r##n[4]; \
				  r##n[5] %= r##n[5]; \
				  r##n[6] %= r##n[6]; \
				  r##n[7] %= r##n[7];
				
				
				// COMPUTE_VERT( xoff, yoff, valid ) scores both macroblocks at four vertical
				// positions of the current reference column and then shifts the column up.
				//
				// xoff  - horizontal offset of the current search, forwarded unchanged.
				// yoff  - vertical offset of the current search; the four compares use
				//         -16+yoff, -8+yoff, yoff and 8+yoff.
				// valid - gates only the third compare (the one at yoff).
				//
				// Compare   first row   second row   gate
				//   1       r0[0..]     r0[1..]      row0_valid
				//   2       r0[1..]     r1[0..]      row0_valid
				//   3       r1[0..]     r1[1..]      int( valid )
				//   4       r1[1..]     r2[0..]      row2_valid
				//
				// `row0_valid`, `row2_valid`, r0, r1, r2 and everything COMPUTE_ALL_MB and
				// VERTICAL_SHIFT need are taken from the expansion site.  The PASS_* macros and
				// barrier() are interleaved with the compares; their placement is presumably a
				// scheduling matter, so keep the statement order as is.
				// The COMPUTE_ALL_MB invocations carry no trailing `;` because that macro's
				// expansion already ends with one.
				#define COMPUTE_VERT( xoff, yoff, valid ) \
				  PASS_MB_STATE \
				  PASS_R_STATE(0) \
				  COMPUTE_ALL_MB( r0, 0, r0, 1, row0_valid, xoff, -16+yoff ) \
				  PASS_R_STATE(1) \
				  COMPUTE_ALL_MB( r0, 1, r1, 0, row0_valid, xoff, -8+yoff  ) \
				  barrier(); \
				  PASS_MB_STATE \
				  COMPUTE_ALL_MB( r1, 0, r1, 1, int( valid ), xoff, yoff   ) \
				  PASS_R_STATE(2) \
				  COMPUTE_ALL_MB( r1, 1, r2, 0, row2_valid, xoff, 8+yoff   ) \
				  VERTICAL_SHIFT
				
				// Blocksearch
				//
				// Searches, for every macroblock of one macroblock row, the reference position
				// with the lowest sad and writes the result to `motions`.
				//
				// row0,row1,row2 - The rows of the reference image.
				// mblocks - The row of macroblocks to be checked against the image.
				// motions - The motion vectors of the mblocks (one per mblock).
				// location - bit 0 is set for top row, bit 1 is set for bottom row
				//
				// *The mblocks stream should be the same length as all of the row streams.
				//
				// Two macroblocks are kept at a time: the most recently read one in mb[0..7]
				// (best result in mv_sad[0]/mv_xy[0]) and the previous one in mb[8..15] (best
				// result in mv_sad[1]/mv_xy[1]).  Each pass of the stream loop finishes the
				// search for the previous macroblock and emits its vector; the last macroblock
				// is emitted after the loop.
				//
				
				kernel blocksearch( istream row0, istream row1, istream row2, 
				                    istream mblocks, costream motions, 
				                    uc& location )
				{
				  // stored rows
				  expand r0( 8 );
				  expand r1( 8 );
				  expand r2( 8 );
				  // holds r0[0], r0[2], r0[4], r0[6] across the vertical loop
				  array r_save( 4 );
				
				  // permutation pattern used by VERTICAL_SHIFT
				  uc rotate1   = 0x07654321;
				
				  // permutation patterns used by the final summing steps of COMPUTE_SAD
				  uc tree_sum1 = 0x67452301;
				  uc tree_sum2 = 0x44660022;
				  uc tree_sum3 = 0x00004444;
				
				  half2 mv;
				
				  // stored macroblocks
				  expand mb( 16 );
				
				  // best motion vectors (only entries 0 and 1 are used in this kernel)
				  expand mv_xy( 3 );
				  expand mv_sad( 3 );
				
				  // mask and shift amount used by UBYTE4_DIV4 to make a right shift by 2
				  // (divide by 4) work for ubyte4's
				  int shift_mask = 0xff3fff3f;
				  int minus_2    = 0 - 2;
				
				  // prime the pipeline: first macroblock (8 words), scaled by UBYTE4_DIV4
				  mblocks >> mb[0]; mb[0] = UBYTE4_DIV4( mb[0] );
				  mblocks >> mb[1]; mb[1] = UBYTE4_DIV4( mb[1] );
				  mblocks >> mb[2]; mb[2] = UBYTE4_DIV4( mb[2] );
				  mblocks >> mb[3]; mb[3] = UBYTE4_DIV4( mb[3] );
				  mblocks >> mb[4]; mb[4] = UBYTE4_DIV4( mb[4] );
				  mblocks >> mb[5]; mb[5] = UBYTE4_DIV4( mb[5] );
				  mblocks >> mb[6]; mb[6] = UBYTE4_DIV4( mb[6] );
				  mblocks >> mb[7]; mb[7] = UBYTE4_DIV4( mb[7] );
				
				  // first 8 words of each reference row, scaled the same way
				  row0 >> r0[0]; r0[0] = UBYTE4_DIV4( r0[0] );
				  row0 >> r0[1]; r0[1] = UBYTE4_DIV4( r0[1] );
				  row0 >> r0[2]; r0[2] = UBYTE4_DIV4( r0[2] );
				  row0 >> r0[3]; r0[3] = UBYTE4_DIV4( r0[3] );
				  row0 >> r0[4]; r0[4] = UBYTE4_DIV4( r0[4] );
				  row0 >> r0[5]; r0[5] = UBYTE4_DIV4( r0[5] );
				  row0 >> r0[6]; r0[6] = UBYTE4_DIV4( r0[6] );
				  row0 >> r0[7]; r0[7] = UBYTE4_DIV4( r0[7] );
				
				  row1 >> r1[0]; r1[0] = UBYTE4_DIV4( r1[0] );
				  row1 >> r1[1]; r1[1] = UBYTE4_DIV4( r1[1] );
				  row1 >> r1[2]; r1[2] = UBYTE4_DIV4( r1[2] );
				  row1 >> r1[3]; r1[3] = UBYTE4_DIV4( r1[3] );
				  row1 >> r1[4]; r1[4] = UBYTE4_DIV4( r1[4] );
				  row1 >> r1[5]; r1[5] = UBYTE4_DIV4( r1[5] );
				  row1 >> r1[6]; r1[6] = UBYTE4_DIV4( r1[6] );
				  row1 >> r1[7]; r1[7] = UBYTE4_DIV4( r1[7] );
				
				  row2 >> r2[0]; r2[0] = UBYTE4_DIV4( r2[0] );
				  row2 >> r2[1]; r2[1] = UBYTE4_DIV4( r2[1] );
				  row2 >> r2[2]; r2[2] = UBYTE4_DIV4( r2[2] );
				  row2 >> r2[3]; r2[3] = UBYTE4_DIV4( r2[3] );
				  row2 >> r2[4]; r2[4] = UBYTE4_DIV4( r2[4] );
				  row2 >> r2[5]; r2[5] = UBYTE4_DIV4( r2[5] );
				  row2 >> r2[6]; r2[6] = UBYTE4_DIV4( r2[6] );
				  row2 >> r2[7]; r2[7] = UBYTE4_DIV4( r2[7] );
				
				  // condition that is true only where cid() returns 0
				  // NOTE(review): cid() is presumably the cluster id -- confirm.
				  cc node0 = itocc( cid( ) == 0 );
				
				  // row0 is usable when location bit 0 is clear, row2 when bit 1 is clear
				  int row0_valid = ( ( commclperm( 8, 0, location ) & 0x1 ) == 0 );
				  int row2_valid = ( ( commclperm( 8, 0, location ) & 0x2 ) == 0 );
				
				  // if both the top and bottom rows are invalid we can only search
				  // vertical offsets of zero - store this in master_vert
				
				  int master_vert;
				
				  // 8 vertical steps when either neighbouring row is valid, otherwise 1
				  master_vert = select( itocc( row0_valid | row2_valid ), 8, 1 );
				
				  // start with the worst possible sad so the first valid candidate wins
				  mv_sad[0] = 0x7fff;
				  mv_sad[1] = 0x7fff;
				
				  mv_xy[0]  = 0;
				  mv_xy[1]  = 0;
				
				  loop_stream( row0 ) {
				   
				    // rotate the stored macroblock and read the next one
				
				    mb[8]  = mb[0]; mb[9]  = mb[1]; mb[10] = mb[2]; mb[11] = mb[3];
				    mb[12] = mb[4]; mb[13] = mb[5]; mb[14] = mb[6]; mb[15] = mb[7];
				    mv_sad[1] = mv_sad[0]; mv_xy[1] = mv_xy[0];
				
				    mblocks >> mb[0]; mb[0] = UBYTE4_DIV4( mb[0] );
				    mblocks >> mb[1]; mb[1] = UBYTE4_DIV4( mb[1] );
				    mblocks >> mb[2]; mb[2] = UBYTE4_DIV4( mb[2] );
				    mblocks >> mb[3]; mb[3] = UBYTE4_DIV4( mb[3] );
				    mblocks >> mb[4]; mb[4] = UBYTE4_DIV4( mb[4] );
				    mblocks >> mb[5]; mb[5] = UBYTE4_DIV4( mb[5] );
				    mblocks >> mb[6]; mb[6] = UBYTE4_DIV4( mb[6] );
				    mblocks >> mb[7]; mb[7] = UBYTE4_DIV4( mb[7] );
				
				    // the newly read macroblock starts with no best match
				    mv_sad[0] = 0x7fff; mv_xy[0] = 0;
				
				    // four horizontal steps; horz_off advances by 4 per step
				    uc horz_cnt = 4;
				    int horz_off = 0;
				
				    loop_count( horz_cnt ) { 
				      uc vert_cnt;
				      int vert_off = 0;
				      int dummy;
				
				      // load master_vert into vert_cnt (the returned value is unused)
				      dummy = commclperm( 0, master_vert, 0, vert_cnt );
				
				      // save the "top" row data which gets
				      // destroyed by the vertical rotations
				
				      r_save[0] = r0[0]; r_save[1] = r0[2];
				      r_save[2] = r0[4]; r_save[3] = r0[6];
				
				      // Do all the vertical compares for the current
				      // reference column.
				
				      // all ones on the first vertical step only, so the zero-offset compare
				      // is allowed there even when row2 is not valid
				      int no_vert_offset = 0xffffffff;
				      loop_count( vert_cnt ) {
				        // scratch names that COMPUTE_SAD expects to find in scope
				        ubyte4 sad4;
				        double t1;
				        double  t2;
				
				        half2 sad_total;
				        cc sad_less;
				
				        COMPUTE_VERT( horz_off, vert_off, row2_valid | no_vert_offset );
				
				        vert_off = vert_off + 1;
				        no_vert_offset = 0;
				      }
				
				      // These copies do a couple of things:
				      //
				      //  - Restores the rows after vertical shifting (using
				      //      the r_save values).
				      //  - Does a horizontal shift of the image to make
				      //      room for the next row.
				      //
				
				      r2[1] = r2[2]; r2[3] = r2[4]; r2[5] = r2[6]; 
				      r2[0] = r1[3]; r2[2] = r1[5]; r2[4] = r1[7]; 
				      r1[1] = r1[2]; r1[3] = r1[4]; r1[5] = r1[6]; 
				      r1[0] = r0[3]; r1[2] = r0[5]; r1[4] = r0[7]; 
				      r0[1] = r0[2]; r0[3] = r0[4]; r0[5] = r0[6]; 
				      r0[0] = r_save[1]; r0[2] = r_save[2]; r0[4] = r_save[3];
				
				      horz_off = horz_off + 4;
				
				      // read the next byte4 column for the reference
				      row0 >> r0[6]; r0[6] = UBYTE4_DIV4( r0[6] );
				      row0 >> r0[7]; r0[7] = UBYTE4_DIV4( r0[7] );
				      row1 >> r1[6]; r1[6] = UBYTE4_DIV4( r1[6] );
				      row1 >> r1[7]; r1[7] = UBYTE4_DIV4( r1[7] );
				      row2 >> r2[6]; r2[6] = UBYTE4_DIV4( r2[6] );
				      row2 >> r2[7]; r2[7] = UBYTE4_DIV4( r2[7] );
				    }
				
				    // output the motion vector of the current macroblock
				
				    // Shift to make MV's 1/2 pixel values
				    mv = shift(half2(mv_xy[1]), 1);
				    // NOTE(review): the next line looks damaged -- it shows `motions(node0)`
				    // twice with no operator or operand, and the closing brace of the stream
				    // loop shares the line.  Restore the two output statements from a clean
				    // copy of this file.
				    motions(node0) 				    motions(node0) 				  }
				
				  // there is a single block leftover ... write it out.
				
				  // Shift to make MV's 1/2 pixel values
				  mv = shift(half2(mv_xy[0]), 1);
				  // NOTE(review): same damage as above -- two `motions(node0)` fragments with
				  // their operator and operand missing, followed by the flush on the same line.
				  motions(node0) 				  motions(node0) 				  flush( motions, 0 );
				}
							

相关资源