#include "idb_kernelc.hpp"
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"
// Declares the `blocksearch` kernel and associates it with the file
// "mpeg_sc/blocksearch_kc.uc".
// NOTE(review): KERNELDEF is defined in the included headers; the .uc file is
// presumably the compiled microcode for this kernel -- confirm there.
KERNELDEF(blocksearch, "mpeg_sc/blocksearch_kc.uc");
// UBYTE4_DIV4 divides each byte of a packed ubyte4 by four (a right shift by
// two per byte).
//
// The value is reinterpreted as a half2 and shifted by `minus_2`; the result
// is then ANDed with `shift_mask` and converted back to ubyte4. Both
// `minus_2` and `shift_mask` must be in scope wherever the macro is expanded
// (the blocksearch kernel declares them as -2 and 0xff3fff3f). With that
// mask the top two bits of the low byte of each 16-bit half are cleared --
// presumably the bits that the half2 shift carried in from the neighbouring
// byte.
#define UBYTE4_DIV4( quad ) \
    ubyte4( shift_mask & int( shift( half2( quad ), minus_2 ) ) )
// COMPUTE_SAD does the sad (sum of absolute differences) calculation for a
// complete macroblock against one vertical position in the current reference
// column, then updates the best match seen so far. Two different reference
// variables are passed in because the compare can straddle the rows in some
// cases.
//
// m      - macroblock expand; elements moff .. moff+7 are read, all of them
//          through this parameter
// moff   - offset into the macroblock expand (for picking one of the two
//          macroblocks)
// r1     - first reference row (compared against m[moff], +2, +4, +6)
// r1_off - offset into the first reference row
// r2     - second reference row (compared against m[moff+1], +3, +5, +7)
// r2_off - offset into the second reference row
// valid  - non-zero if the row should be considered valid (allows mv_sad and
//          mv_xy to be updated). This is used at the top and bottom of the
//          image. It is ANDed with the result of the "less than" compare.
// xoff   - Horizontal offset of the current search (used for motion vector).
// yoff   - Vertical offset of the current search (used for motion vector).
// mv_sad - Lowest current sad of the given macroblock; replaced only when the
//          new total is strictly lower.
// mv_xy  - Motion vector of the lowest sad, packed as the low 16 bits of yoff
//          shifted by 16, OR'ed with the low 16 bits of xoff.
//
// The expansion site must provide the scratch variables sad4, t1, t2,
// sad_total and sad_less, and the permutation patterns tree_sum1, tree_sum2
// and tree_sum3. The three commucperm steps presumably fold the per-cluster
// partial sums into a total that every cluster sees -- confirm against the
// KernelC documentation.
//
// NOTES: I didn't put in anything special to limit the chance of saturation of
// the ubyte4's when summing the rows of the macroblock.
#define COMPUTE_SAD( m, moff, r1, r1_off, r2, r2_off, valid, xoff, yoff, mv_sad, mv_xy ) \
    sad4 = \
        ( ( abd( m[moff], r1[r1_off] ) + abd( m[moff+1], r2[r2_off] ) ) + \
          ( abd( m[moff+2], r1[r1_off+2] ) + abd( m[moff+3], r2[r2_off+2] ) ) ); \
    t1 = shuffled( sad4, 0x88318820 ); \
    sad_total = half2(hi(t1)) + half2(lo(t1)); \
    sad4 = \
        ( ( abd( m[moff+4], r1[r1_off+4] ) + abd( m[moff+5], r2[r2_off+4] ) ) + \
          ( abd( m[moff+6], r1[r1_off+6] ) + abd( m[moff+7], r2[r2_off+6] ) ) ); \
    t1 = shuffled( sad4, 0x88318820 ); \
    sad_total = sad_total + half2(hi(t1)) + half2(lo(t1)); \
    t2 = shuffled( sad_total, 0x88883120 ); \
    sad_total = half2(hi(t2)) + half2(lo(t2)); \
    sad_total = sad_total + commucperm( tree_sum1, sad_total ); \
    sad_total = sad_total + commucperm( tree_sum2, sad_total ); \
    sad_total = sad_total + commucperm( tree_sum3, sad_total ); \
    \
    sad_less = itocc( ( int( sad_total ) < int( mv_sad ) ) & int(valid) ); \
    mv_sad = select( sad_less, sad_total, mv_sad ); \
    mv_xy = select( sad_less, shift( int(yoff) & 0xffff, 16 ) | ( int(xoff) & 0xffff ), mv_xy );
// COMPUTE_ALL_MB runs COMPUTE_SAD once for each of the two macroblocks held
// in `mb`, both against the same position of the current reference column.
//
// The macroblock at mb[0..7] is scored with a horizontal offset of
// -16+col_off and updates mv_sad[0] / mv_xy[0]; the macroblock at mb[8..15]
// is scored with col_off and updates mv_sad[1] / mv_xy[1]. `mb`, `mv_sad`
// and `mv_xy` are taken from the expansion site.
//
// ref_a     - first reference row
// ref_a_off - offset into the first reference row
// ref_b     - second reference row
// ref_b_off - offset into the second reference row
// is_valid  - non-zero if the row should be considered valid (allows mv_sad
//             and mv_xy to be updated); used at the top and bottom of the
//             image
// col_off   - horizontal offset of the current search (for the motion vector)
// row_off   - vertical offset of the current search (for the motion vector)
#define COMPUTE_ALL_MB( ref_a, ref_a_off, ref_b, ref_b_off, is_valid, col_off, row_off ) \
    COMPUTE_SAD( mb, 0, ref_a, ref_a_off, ref_b, ref_b_off, is_valid, \
                 -16+col_off, row_off, mv_sad[0], mv_xy[0] ); \
    COMPUTE_SAD( mb, 8, ref_a, ref_a_off, ref_b, ref_b_off, is_valid, \
                 col_off, row_off, mv_sad[1], mv_xy[1] );
// Disabled compare for a third macroblock, kept for reference. Note that it
// lacks the `valid` argument COMPUTE_SAD takes, so it cannot be re-enabled
// as written:
// COMPUTE_SAD( mb, 16, r1, r1_off, r2, r2_off, 16+xoff, yoff, mv_sad[2], mv_xy[2] );
// VSHIFT_PULL_UP( dst, below ) is one step of VERTICAL_SHIFT: where node0 is
// true `below` is substituted for `dst`, and the result is then permuted with
// the rotate1 pattern and stored back into `dst`.
#define VSHIFT_PULL_UP( dst, below ) \
    dst = commucperm( rotate1, select( node0, below, dst ) );
// VSHIFT_WRAP( dst ) permutes `dst` with rotate1 without pulling in any new
// data; used for the last elements of the column, which have nothing below.
#define VSHIFT_WRAP( dst ) \
    dst = commucperm( rotate1, dst );
// VERTICAL_SHIFT rotates the entire current reference column (r0, r1, r2) up
// by one position, towards the top of the image, which eliminates the topmost
// row. Each element takes its replacement data from the element that follows
// it in the column: r0[even] <- r0[odd] <- r1[even] <- r1[odd] <- r2[even]
// <- r2[odd]. The statements run in exactly that order so every source is
// read before it is itself overwritten.
//
// Relies on `rotate1`, `node0`, `r0`, `r1` and `r2` from the expansion site.
#define VERTICAL_SHIFT \
    VSHIFT_PULL_UP( r0[0], r0[1] ) \
    VSHIFT_PULL_UP( r0[2], r0[3] ) \
    VSHIFT_PULL_UP( r0[4], r0[5] ) \
    VSHIFT_PULL_UP( r0[6], r0[7] ) \
    \
    VSHIFT_PULL_UP( r0[1], r1[0] ) \
    VSHIFT_PULL_UP( r0[3], r1[2] ) \
    VSHIFT_PULL_UP( r0[5], r1[4] ) \
    VSHIFT_PULL_UP( r0[7], r1[6] ) \
    \
    VSHIFT_PULL_UP( r1[0], r1[1] ) \
    VSHIFT_PULL_UP( r1[2], r1[3] ) \
    VSHIFT_PULL_UP( r1[4], r1[5] ) \
    VSHIFT_PULL_UP( r1[6], r1[7] ) \
    \
    VSHIFT_PULL_UP( r1[1], r2[0] ) \
    VSHIFT_PULL_UP( r1[3], r2[2] ) \
    VSHIFT_PULL_UP( r1[5], r2[4] ) \
    VSHIFT_PULL_UP( r1[7], r2[6] ) \
    \
    VSHIFT_PULL_UP( r2[0], r2[1] ) \
    VSHIFT_PULL_UP( r2[2], r2[3] ) \
    VSHIFT_PULL_UP( r2[4], r2[5] ) \
    VSHIFT_PULL_UP( r2[6], r2[7] ) \
    \
    VSHIFT_WRAP( r2[1] ) \
    VSHIFT_WRAP( r2[3] ) \
    VSHIFT_WRAP( r2[5] ) \
    VSHIFT_WRAP( r2[7] )
// COMPUTE_VERT (defined further below, after the PASS_MB_STATE and
// PASS_R_STATE helpers it uses) does the 4 overlapping macroblock sad's that
// can be computed for a single rotation of the reference column.
// PASS_MB_STATE applies the `%=` operator to all sixteen elements of `mb` and
// to entries 0 and 1 of `mv_sad` and `mv_xy`, each with itself as the
// right-hand side. `mb`, `mv_sad` and `mv_xy` come from the expansion site.
// NOTE(review): `%=` is presumably not arithmetic modulo here (x %= x would
// zero the value); going by the macro name it passes these values through as
// kernel state. Its meaning is defined by the KernelC headers/compiler --
// confirm there.
#define PASS_MB_STATE \
mb[0] %= mb[0]; \
mb[1] %= mb[1]; \
mb[2] %= mb[2]; \
mb[3] %= mb[3]; \
mb[4] %= mb[4]; \
mb[5] %= mb[5]; \
mb[6] %= mb[6]; \
mb[7] %= mb[7]; \
mb[8] %= mb[8]; \
mb[9] %= mb[9]; \
mb[10] %= mb[10]; \
mb[11] %= mb[11]; \
mb[12] %= mb[12]; \
mb[13] %= mb[13]; \
mb[14] %= mb[14]; \
mb[15] %= mb[15]; \
\
mv_sad[0] %= mv_sad[0]; \
mv_sad[1] %= mv_sad[1]; \
\
mv_xy[0] %= mv_xy[0]; \
mv_xy[1] %= mv_xy[1];
// PASS_R_STATE(n) applies the `%=` operator to all eight elements of the
// reference row `r<n>` (the name is formed by pasting `n` onto `r`, so n is
// 0, 1 or 2 for r0, r1, r2), each with itself as the right-hand side.
// NOTE(review): as with PASS_MB_STATE, `%=` is presumably a KernelC
// state-passing operator rather than arithmetic modulo -- confirm against the
// KernelC headers/compiler.
#define PASS_R_STATE(n) \
r##n[0] %= r##n[0]; \
r##n[1] %= r##n[1]; \
r##n[2] %= r##n[2]; \
r##n[3] %= r##n[3]; \
r##n[4] %= r##n[4]; \
r##n[5] %= r##n[5]; \
r##n[6] %= r##n[6]; \
r##n[7] %= r##n[7];
// COMPUTE_VERT performs the four macroblock compares available for the
// current state of the reference column, then shifts the column up by one.
//
// xoff  - horizontal offset passed on to every compare
// yoff  - base vertical offset; the four compares use -16+yoff, -8+yoff,
//         yoff and 8+yoff
// valid - validity flag for the yoff compare only (wrapped in int())
//
// The two compares above yoff are gated by `row0_valid` and the one below by
// `row2_valid`; both names, together with r0/r1/r2, mb, mv_sad and mv_xy, are
// taken from the expansion site. The compares pair up the reference elements
// as (r0[0..], r0[1..]), (r0[1..], r1[0..]), (r1[0..], r1[1..]) and
// (r1[1..], r2[0..]).
//
// The PASS_*_STATE expansions and barrier() are interleaved with the
// compares; their placement is presumably significant to the kernel
// scheduler, so keep the statement order as is. The invoked macros already
// end in ';', which is why no separators appear between them. The macro
// finishes with VERTICAL_SHIFT, so r0/r1/r2 are modified on every expansion.
#define COMPUTE_VERT( xoff, yoff, valid ) \
PASS_MB_STATE \
PASS_R_STATE(0) \
COMPUTE_ALL_MB( r0, 0, r0, 1, row0_valid, xoff, -16+yoff ) \
PASS_R_STATE(1) \
COMPUTE_ALL_MB( r0, 1, r1, 0, row0_valid, xoff, -8+yoff ) \
barrier(); \
PASS_MB_STATE \
COMPUTE_ALL_MB( r1, 0, r1, 1, int( valid ), xoff, yoff ) \
PASS_R_STATE(2) \
COMPUTE_ALL_MB( r1, 1, r2, 0, row2_valid, xoff, 8+yoff ) \
VERTICAL_SHIFT
// Blocksearch
//
// Motion-search kernel: for every macroblock read from `mblocks` it compares
// the macroblock against a range of positions in the reference rows, tracks
// the position with the lowest sum of absolute differences (SAD), and emits
// the corresponding motion vector.
//
// row0,row1,row2 - The rows of the reference image.
// mblocks - The row of macroblocks to be checked against the image.
// motions - The motion vectors of the mblocks (one per mblock).
// location - bit 0 is set for top row, bit 1 is set for bottom row
//
// *The mblocks stream should be the same length as all of the row streams.
//
// Every word read from mblocks and from the row streams is passed through
// UBYTE4_DIV4 immediately, so all compares work on byte values divided by 4.
//
kernel blocksearch( istream row0, istream row1, istream row2,
istream mblocks, costream motions,
uc& location )
{
// stored rows
expand r0( 8 );
expand r1( 8 );
expand r2( 8 );
// copy of r0[0], r0[2], r0[4], r0[6] taken before the vertical search, which
// overwrites them (see the restore step after the vertical loop)
array r_save( 4 );
// permutation patterns: rotate1 is used by VERTICAL_SHIFT, tree_sum1..3 by
// the commucperm reduction steps in COMPUTE_SAD
uc rotate1 = 0x07654321;
uc tree_sum1 = 0x67452301;
uc tree_sum2 = 0x44660022;
uc tree_sum3 = 0x00004444;
// motion vector staged for output
half2 mv;
// stored macroblocks: mb[0..7] is the most recently read macroblock,
// mb[8..15] the one read before it
expand mb( 16 );
// best motion vectors (only entries 0 and 1 are used in the code below)
expand mv_xy( 3 );
expand mv_sad( 3 );
// mask to make the right shift by 2 (divide by 4) in UBYTE4_DIV4 work for
// ubyte4's: it clears the top two bits of the low byte of each 16-bit half
int shift_mask = 0xff3fff3f;
// shift amount used by UBYTE4_DIV4 (a negative amount shifts right)
int minus_2 = 0 - 2;
// prime the first macroblock
mblocks >> mb[0]; mb[0] = UBYTE4_DIV4( mb[0] );
mblocks >> mb[1]; mb[1] = UBYTE4_DIV4( mb[1] );
mblocks >> mb[2]; mb[2] = UBYTE4_DIV4( mb[2] );
mblocks >> mb[3]; mb[3] = UBYTE4_DIV4( mb[3] );
mblocks >> mb[4]; mb[4] = UBYTE4_DIV4( mb[4] );
mblocks >> mb[5]; mb[5] = UBYTE4_DIV4( mb[5] );
mblocks >> mb[6]; mb[6] = UBYTE4_DIV4( mb[6] );
mblocks >> mb[7]; mb[7] = UBYTE4_DIV4( mb[7] );
// prime the first eight words of each reference row
row0 >> r0[0]; r0[0] = UBYTE4_DIV4( r0[0] );
row0 >> r0[1]; r0[1] = UBYTE4_DIV4( r0[1] );
row0 >> r0[2]; r0[2] = UBYTE4_DIV4( r0[2] );
row0 >> r0[3]; r0[3] = UBYTE4_DIV4( r0[3] );
row0 >> r0[4]; r0[4] = UBYTE4_DIV4( r0[4] );
row0 >> r0[5]; r0[5] = UBYTE4_DIV4( r0[5] );
row0 >> r0[6]; r0[6] = UBYTE4_DIV4( r0[6] );
row0 >> r0[7]; r0[7] = UBYTE4_DIV4( r0[7] );
row1 >> r1[0]; r1[0] = UBYTE4_DIV4( r1[0] );
row1 >> r1[1]; r1[1] = UBYTE4_DIV4( r1[1] );
row1 >> r1[2]; r1[2] = UBYTE4_DIV4( r1[2] );
row1 >> r1[3]; r1[3] = UBYTE4_DIV4( r1[3] );
row1 >> r1[4]; r1[4] = UBYTE4_DIV4( r1[4] );
row1 >> r1[5]; r1[5] = UBYTE4_DIV4( r1[5] );
row1 >> r1[6]; r1[6] = UBYTE4_DIV4( r1[6] );
row1 >> r1[7]; r1[7] = UBYTE4_DIV4( r1[7] );
row2 >> r2[0]; r2[0] = UBYTE4_DIV4( r2[0] );
row2 >> r2[1]; r2[1] = UBYTE4_DIV4( r2[1] );
row2 >> r2[2]; r2[2] = UBYTE4_DIV4( r2[2] );
row2 >> r2[3]; r2[3] = UBYTE4_DIV4( r2[3] );
row2 >> r2[4]; r2[4] = UBYTE4_DIV4( r2[4] );
row2 >> r2[5]; r2[5] = UBYTE4_DIV4( r2[5] );
row2 >> r2[6]; r2[6] = UBYTE4_DIV4( r2[6] );
row2 >> r2[7]; r2[7] = UBYTE4_DIV4( r2[7] );
// condition code that is true only where cid() == 0 (presumably cluster 0)
cc node0 = itocc( cid( ) == 0 );
// row0_valid / row2_valid are non-zero when the "top row" / "bottom row" bit
// of location is clear. commclperm( 8, 0, location ) presumably makes the uc
// value available to the clusters -- confirm against the KernelC reference.
int row0_valid = ( ( commclperm( 8, 0, location ) & 0x1 ) == 0 );
int row2_valid = ( ( commclperm( 8, 0, location ) & 0x2 ) == 0 );
// if both the top and bottom rows are invalid we can only search
// vertical offsets of zero - store this in master_vert
// (8 vertical steps when either neighbour row is valid, otherwise 1)
int master_vert;
master_vert = select( itocc( row0_valid | row2_valid ), 8, 1 );
// start both best-SAD slots at 0x7fff so that any lower total replaces them
mv_sad[0] = 0x7fff;
mv_sad[1] = 0x7fff;
mv_xy[0] = 0;
mv_xy[1] = 0;
loop_stream( row0 ) {
// rotate the stored macroblock and read the next one
// (the previous macroblock and its best match so far move to slot 1)
mb[8] = mb[0]; mb[9] = mb[1]; mb[10] = mb[2]; mb[11] = mb[3];
mb[12] = mb[4]; mb[13] = mb[5]; mb[14] = mb[6]; mb[15] = mb[7];
mv_sad[1] = mv_sad[0]; mv_xy[1] = mv_xy[0];
mblocks >> mb[0]; mb[0] = UBYTE4_DIV4( mb[0] );
mblocks >> mb[1]; mb[1] = UBYTE4_DIV4( mb[1] );
mblocks >> mb[2]; mb[2] = UBYTE4_DIV4( mb[2] );
mblocks >> mb[3]; mb[3] = UBYTE4_DIV4( mb[3] );
mblocks >> mb[4]; mb[4] = UBYTE4_DIV4( mb[4] );
mblocks >> mb[5]; mb[5] = UBYTE4_DIV4( mb[5] );
mblocks >> mb[6]; mb[6] = UBYTE4_DIV4( mb[6] );
mblocks >> mb[7]; mb[7] = UBYTE4_DIV4( mb[7] );
// the newly read macroblock starts with no best match
mv_sad[0] = 0x7fff; mv_xy[0] = 0;
// four horizontal steps per macroblock; horz_off advances by 4 per step
uc horz_cnt = 4;
int horz_off = 0;
loop_count( horz_cnt ) {
uc vert_cnt;
int vert_off = 0;
int dummy;
// NOTE(review): this four-argument commclperm presumably transfers
// master_vert into the uc vert_cnt; the int result is unused -- confirm.
dummy = commclperm( 0, master_vert, 0, vert_cnt );
// save the "top" row data which gets
// destroyed by the vertical rotations
r_save[0] = r0[0]; r_save[1] = r0[2];
r_save[2] = r0[4]; r_save[3] = r0[6];
// Do all the vertical compares for the current
// reference column.
// no_vert_offset is all ones on the first pass only, so the compare at
// vertical offset zero is always treated as valid; on later passes the
// yoff compare depends on row2_valid alone.
int no_vert_offset = 0xffffffff;
loop_count( vert_cnt ) {
// scratch variables referenced by name inside COMPUTE_SAD
ubyte4 sad4;
double t1;
double t2;
half2 sad_total;
cc sad_less;
COMPUTE_VERT( horz_off, vert_off, row2_valid | no_vert_offset );
vert_off = vert_off + 1;
no_vert_offset = 0;
}
// These copies do a couple of things:
//
// - Restores the rows after vertical shifting (using
// the r_save values).
// - Does a horizontal shift of the image to make
// room for the next row.
//
// r_save[0] is not restored: the horizontal shift drops that column.
r2[1] = r2[2]; r2[3] = r2[4]; r2[5] = r2[6];
r2[0] = r1[3]; r2[2] = r1[5]; r2[4] = r1[7];
r1[1] = r1[2]; r1[3] = r1[4]; r1[5] = r1[6];
r1[0] = r0[3]; r1[2] = r0[5]; r1[4] = r0[7];
r0[1] = r0[2]; r0[3] = r0[4]; r0[5] = r0[6];
r0[0] = r_save[1]; r0[2] = r_save[2]; r0[4] = r_save[3];
horz_off = horz_off + 4;
// read the next byte4 column for the reference
row0 >> r0[6]; r0[6] = UBYTE4_DIV4( r0[6] );
row0 >> r0[7]; r0[7] = UBYTE4_DIV4( r0[7] );
row1 >> r1[6]; r1[6] = UBYTE4_DIV4( r1[6] );
row1 >> r1[7]; r1[7] = UBYTE4_DIV4( r1[7] );
row2 >> r2[6]; r2[6] = UBYTE4_DIV4( r2[6] );
row2 >> r2[7]; r2[7] = UBYTE4_DIV4( r2[7] );
}
// output the motion vector of the current macroblock
// Shift to make MV's 1/2 pixel values
mv = shift(half2(mv_xy[1]), 1);
// NOTE(review): the next line does not look like a complete statement -- no
// stream operator, operand or ';' follows either `motions(node0)`. The write
// of `mv` to `motions` appears to have been lost; restore it from the
// original source.
motions(node0) motions(node0) }
// there is a single block leftover ... write it out.
// Shift to make MV's 1/2 pixel values
mv = shift(half2(mv_xy[0]), 1);
// NOTE(review): same apparent truncation as in the loop above -- only the
// trailing flush( motions, 0 ) is a complete statement on this line.
motions(node0) motions(node0) flush( motions, 0 );
}