#include "idb_kernelc.hpp"
#include "mpeg.hpp"
#include "idb_kernelc2.hpp"
KERNELDEF(icolor, KERNELS_DIR "icolor_kc.uc");
// color.i (originally rgc_yc.i)
// Ujval Kapasi
// 3/28/97
// 7/22/97
// 6/31/98, revised by Manman Ren
// 12/10/99, revised by ujk
//
// Color space warp (RGB space ---> YCrCb space) for JPEG
// Includes a 2:1 subsampling in the horizontal and vertical direction
// for Cr and Cb. These are obtained by interpolating between two pixels.
// The sampling is done as follows (4:2:0 MPEG-2 format) :
//     o   o   o   o   o      (the "o" represents a luminance sample,
//     .       .       .       the "." represents a chrominance sample)
//     o   o   o   o   o
//
//     o   o   o   o   o
//     .       .       .
//     o   o   o   o   o
//
//
// Block layout :  1 2    -- each block is 8x8
//                 3 4
//
// The input comes in such that the first row of block 1 comes in, one
// per cluster. Then the first row of block 2, then the second row of
// block 1, second row of block 2, and so on. Then after the eighth
// row of 2, the same pattern repeats for the rows of blocks 3 and 4.
//
// Input : color = 0 | R || G | B // in 8.0 format
//
// Output : Y = Y2 || Y1 // in 16.0 format
// Y (contd) = Y4 || Y3 // in 16.0 format
// C = Cr || Cb // in 16.0 format
//
//
// NOTE : IGNORING GAMMA CORRECTION for now
//
// icolor : RGB -> YCrCb conversion kernel.
//
// Streams :
//   datain  - packed pixels, four words consumed per loop iteration
//   Yout    - luminance output
//   CrCbout - chrominance output
//
// Per iteration the kernel reads four pixels per cluster, computes a
// luminance value for each, and forms two chrominance words, each from
// a pair of pixels (1 & 3, 2 & 4).
//
// NOTE(review): the two stream-output statements in this copy of the file
// ("Yout Yout" and "CrCbout }" below) have lost their operators and
// operands, apparently through a text-extraction step. Restore them from
// the original source before attempting to build this kernel.
kernel icolor(istream datain,
ostream Yout,
ostream CrCbout)
{
// constants
// low : condition code set from (cluster id < 4).
// NOTE(review): not referenced in the visible code; presumably consumed by
// the damaged CrCbout output statement -- confirm against the original.
cc low = itocc(cid() < 4);
cc Y_combine = itocc(half2(0) == half2(1)); // TRUE || FALSE
// Shuffle control words
byte4 shuf_func1 = 0x08020800; // zero | 3rd || zero | 1st
byte4 shuf_func2 = 0x08080801; // zero | zero || zero | 2nd
byte4 shuf_func3 = 0x01000100; // 1st || 1st
// Luminance transform constants (in 1.15 format)
half2 RB_SCALE = 0x26460e98; // 0.299 || 0.114
half2 G_SCALE = 0x00004b23; // 0 || 0.587
// Chrominance scale factors (in 1.15 format)
half2 C_SCALE = 0x4fe33f35; // 0.62411 || 0.4938
// For adding by 128
half2 one_two_eight = 0x00800080;
// communication permutations
// cluster : 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
// perm_a : C7 | C6 | C5 | C4 | C6 | C4 | C2 | C0 |
// perm_b : C6 | C4 | C2 | C0 | C3 | C2 | C1 | C0 |
uc perm_a = 0x76546420;
uc perm_b = 0x64203210;
// each iter. does one pixel in each of two blocks for two rows (4 pixels)
loop_stream(datain) pipeline(1) {
ubyte4 color1, color2, color3, color4;
half2 first, second;
half2 a1, a2, a3, a4, b1, b2, b3, b4, c1, c2, c3, c4, d1, d2, d3, d4;
half2 e1, e2, e3, e4, a1a3, a2a4, y1, y2, y3, y4, z1, z2, z3, z4;
// NOTE(review): another0/1 and out00..out11 are declared but never used in
// the visible code; they were probably used by the statements lost at the
// damaged "Yout Yout" line.
half2 temp0, temp1, another0, another1, out00, out01, out10, out11;
// The input data are in 16.0.
// NOTE(review): the file header describes the input as 8.0 format packed
// as 0 | R || G | B -- confirm which description is intended.
datain >> color1 >> color2 >> color3 >> color4;
// Unpack each pixel into two half2 words:
// a = R || B
// b = 0 || G
a1 = half2(shuffle(color1, shuf_func1));
b1 = half2(shuffle(color1, shuf_func2));
a2 = half2(shuffle(color2, shuf_func1));
b2 = half2(shuffle(color2, shuf_func2));
a3 = half2(shuffle(color3, shuf_func1));
b3 = half2(shuffle(color3, shuf_func2));
a4 = half2(shuffle(color4, shuf_func1));
b4 = half2(shuffle(color4, shuf_func2));
// After shifting, the inputs are in 15.1 format. The constants are in
// 1.15 format, so the result will be in 16.0 format. The result is
// c = 0.299R || 0.114B
c1 = hi(mulrnd(RB_SCALE, shift(a1, 1)));
c2 = hi(mulrnd(RB_SCALE, shift(a2, 1)));
c3 = hi(mulrnd(RB_SCALE, shift(a3, 1)));
c4 = hi(mulrnd(RB_SCALE, shift(a4, 1)));
// d = - || 0.114B + 0.587G
// (G_SCALE's upper half is 0, so the upper half of d still carries c's
// 0.299R term; only the lower half is meaningful from here on.)
d1 = c1 + hi(mulrnd(G_SCALE, shift(b1, 1)));
d2 = c2 + hi(mulrnd(G_SCALE, shift(b2, 1)));
d3 = c3 + hi(mulrnd(G_SCALE, shift(b3, 1)));
d4 = c4 + hi(mulrnd(G_SCALE, shift(b4, 1)));
// e = 0 || 0.299R (upper half of c moved down by a 16-bit shift)
e1 = half2(shift(int(c1), -16));
e2 = half2(shift(int(c2), -16));
e3 = half2(shift(int(c3), -16));
e4 = half2(shift(int(c4), -16));
// Sums of the R || B words of the two pixels that share a chrominance
// sample (pixels 1 & 3, and pixels 2 & 4).
a1a3 = a1 + a3;
a2a4 = a2 + a4;
// y = - || Y (0.114B + 0.587G + 0.299R)
y1 = d1 + e1;
y2 = d2 + e2;
y3 = d3 + e3;
y4 = d4 + e4;
// z = Y || Y
z1 = half2(shuffle(y1, shuf_func3));
z2 = half2(shuffle(y2, shuf_func3));
z3 = half2(shuffle(y3, shuf_func3));
z4 = half2(shuffle(y4, shuf_func3));
// Pack two luminance values per word using the TRUE || FALSE condition code;
// presumably temp0 = Y2 || Y1 and temp1 = Y4 || Y3, matching the output
// layout given in the file header -- confirm select() operand order.
temp0 = select(Y_combine, z2, z1);
temp1 = select(Y_combine, z4, z3);
// Luminance range adjustment constants (1.15 scale factor and 16.0 offset).
// NOTE(review): not referenced in the visible code; presumably applied to
// temp0/temp1 by the statements lost at the damaged line below.
half2 Ymadj = 0x6d806d80; // 219/256
half2 Yaadj = 0x00100010; // 16
// NOTE(review): DAMAGED LINE. This was evidently one or more stream-output
// statements writing luminance to Yout; the operators and operands are
// missing from this copy. Left untouched -- restore from the original source.
Yout Yout
// a and z are in 16.0. first and second are averages of two pixels,
// where each pixel adds the following weight to the average :
// (a-z)/1.6 + 128 || (a-z)/2 + 128. A little math is done to factor
// the multiplication (division) out of the average to reduce the number
// of necessary multiplications and to obtain the maximum precision
// without shifts. C_SCALE is in 1.15, so the division by two is implicit,
// and no shift is necessary. The addition by 128 is also factored out.
// Each pixel has an additive factor of 128, for a total of 256, divided
// by two is 128.
first = hi(mulrnd((a1a3 - (z1 + z3)), C_SCALE)) + one_two_eight;
second = hi(mulrnd((a2a4 - (z2 + z4)), C_SCALE)) + one_two_eight;
// The first four clusters get first, and the second half get second.
// Inter-cluster permutes gather the even-numbered clusters' values (see the
// perm_a / perm_b tables above), which gives the 2:1 horizontal subsampling.
first = commucperm(perm_a, first);
second = commucperm(perm_b, second);
// NOTE(review): DAMAGED LINE. This was evidently a stream-output statement
// writing chrominance to CrCbout (presumably choosing between first and
// second using `low`); its operator and operands are missing from this copy.
// Left untouched -- restore from the original source.
CrCbout }
}