// dct.i
// Ujval Kapasi
// 1/22/97
// 3/28/97
//
// 8x8 DCT (for JPEG and MPEG)
//
// Test out a fast 1-d dct algorithm for the imagine chip implementation
// From Pennebaker/Mitchell, pg. 50-52. See also Arai, Agui, Nakajima.
// This algorithm is based on the 16-pt DFT. Basically, the 8-pt DCT can
// be calculated by scaling the real parts of the output of the 16-pt DFT.
// ONE-TIME SETUP -- executed once, i.e., outside of the loop
// DEBUG : ISTREAM 2 : constants are stored in the VRF until the ability to load
// ----- constants onto Imagine is implemented in the simulator
// Unnecessary : these exist only because constants are not yet handled
// Small integer helpers.  `one` is read from input stream 1 in place of the
// commented-out literal; `two` (= one + one) is the shift amount passed to
// shift() inside the main loop.
int one, two;
//one = 0x00000001;
one = istream(1);
two = one + one;
// Control words for shuffle(), one per byte position to extract.  Going by the
// commented-out literals below, each selector byte either names a source byte
// (0x00-0x03 = 1st-4th) or is 0x08, which the notes mark as "zero".
//   shuf_func1-4 : chosen byte goes to the lowest byte of the result.
//   shuf_func5-8 : chosen byte goes to the low byte of the upper 16-bit half.
// The values are read from input stream 1 rather than assigned as literals.
// NOTE(review): presumably each istream(1) consumes the next word of the
// stream, so the order of these reads must match the stream layout -- confirm.
byte4 shuf_func1, shuf_func2, shuf_func3, shuf_func4;
byte4 shuf_func5, shuf_func6, shuf_func7, shuf_func8;
//shuf_func1 = 0x08080800; // zero | zero || zero | 1st
//shuf_func2 = 0x08080801; // zero | zero || zero | 2nd
//shuf_func3 = 0x08080802; // zero | zero || zero | 3rd
//shuf_func4 = 0x08080803; // zero | zero || zero | 4th
//shuf_func5 = 0x08000808; // zero | 1st || zero | zero
//shuf_func6 = 0x08010808; // zero | 2nd || zero | zero
//shuf_func7 = 0x08020808; // zero | 3rd || zero | zero
//shuf_func8 = 0x08030808; // zero | 4th || zero | zero
shuf_func1 = istream(1);
shuf_func2 = istream(1);
shuf_func3 = istream(1);
shuf_func4 = istream(1);
shuf_func5 = istream(1);
shuf_func6 = istream(1);
shuf_func7 = istream(1);
shuf_func8 = istream(1);
// Cosine multipliers used by the multiply stage (m5..m9) of the main loop.
// Per the commented-out literals, each word carries the same 2.14 fixed-point
// value (value * 2^14) in both of its 16-bit halves:
//   COS_2              ~ 0.7071
//   COS_3              ~ 0.3827
//   COS_1_plus_COS_3   ~ 1.3066
//   COS_1_minus_COS_3  ~ 0.5412
// They are read from input stream 1 instead of being assigned as literals.
half2 COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3;
// Stored in 2.14 format
//COS_2 = 0x2d412d41; // cos(2*pi/8) || cos(2*pi/8);
//COS_3 = 0x187e187e; // cos(3*pi/8) || cos(3*pi/8);
//COS_1_plus_COS_3 = 0x539f539f; // cos(pi/8) + cos(3*pi/8) || same
//COS_1_minus_COS_3 = 0x22a322a3; // cos(pi/8) - cos(3*pi/8) || same
COS_2 = istream(1);
COS_3 = istream(1);
COS_1_plus_COS_3 = istream(1);
COS_1_minus_COS_3 = istream(1);
// Output scale factors: Kk multiplies the k-th value written to the output
// stream at the end of the main loop.  As with the cosine constants, the
// commented-out literals hold the same value in both 16-bit halves, and the
// hex values match the stated formulas in 2.14 fixed point
// (e.g. 0x16a1 / 2^14 ~ 0.3536 = 0.25 * sqrt(2)).  K0 and K4 are the same value.
// They are read from input stream 1 instead of being assigned as literals.
half2 K0, K1, K2, K3, K4, K5, K6, K7;
//K0 = 0x16a116a1 // 0.25 * sqrt(2) || 0.25 * sqrt(2);
//K1 = 0x10501050 // 0.25 * sec(pi/16) || 0.25 * sec(pi/16);
//K2 = 0x11511151 // 0.25 * sec(2*pi/16) || 0.25 * sec(2*pi/16);
//K3 = 0x133e133e // 0.25 * sec(3*pi/16) || 0.25 * sec(3*pi/16);
//K4 = 0x16a116a1 // 0.25 * sec(4*pi/16) || 0.25 * sec(4*pi/16);
//K5 = 0x1ccd1ccd // 0.25 * sec(5*pi/16) || 0.25 * sec(5*pi/16);
//K6 = 0x29cf29cf // 0.25 * sec(6*pi/16) || 0.25 * sec(6*pi/16);
//K7 = 0x52035203 // 0.25 * sec(7*pi/16) || 0.25 * sec(7*pi/16);
K0 = istream(1);
K1 = istream(1);
K2 = istream(1);
K3 = istream(1);
K4 = istream(1);
K5 = istream(1);
K6 = istream(1);
K7 = istream(1);
// Read and discard three more words from input stream 1.  The 21 constants
// read above (one + 8 shuffle words + 4 cosines + 8 scale factors) plus these
// 3 make 24 words, a multiple of 8.
int junk;
junk = istream(1); // pad out stream length to a multiple of 8 words
junk = istream(1);
junk = istream(1);
// Trip count for the main loop below (`loop count i`).
// NOTE(review): at two rows per iteration, 8 iterations cover 16 rows --
// confirm this matches the amount of data supplied on input stream 0.
persistent uc int i = 8;
// Main loop.  Each iteration --> 2 rows: every half2 value below carries
// row n in its lower 16-bit half and row n+1 in its upper half, so each
// statement advances the 1-D DCT of both rows together.
loop count i {
    // Input: four packed words per iteration, all read from input stream 0.
    // With n = 2 * (iteration index), they hold rows n and n+1, four
    // samples per word.
    // NOTE(review): an earlier comment here listed two input streams
    // (0 : row n, 1 : row n+1), but the code reads every word from stream 0
    // and uses stream 1 for the constants -- confirm the intended layout.
    byte4 i0, i1, i2, i3;
    // notation : (row, column); the rightmost entry is the byte that
    // shuf_func1 / shuf_func5 ("1st") extracts.
    i0 = istream(0); // i0 = (n,3) | (n,2) | (n,1) | (n,0)
    i1 = istream(0); // i1 = (n,7) | (n,6) | (n,5) | (n,4)
    i2 = istream(0); // i2 = (n+1,3) | (n+1,2) | (n+1,1) | (n+1,0)
    i3 = istream(0); // i3 = (n+1,7) | (n+1,6) | (n+1,5) | (n+1,4)

    // Unpack: one sample per variable.  shuf_func1-4 put a row-n sample in the
    // lower half; shuf_func5-8 put a row-(n+1) sample in the upper half.  The
    // unused half is zero-filled ("----").
    half2 a0, a1, a2, a3, a4, a5, a6, a7, c0, c1, c2, c3, c4, c5, c6, c7;
    a0 = half2(shuffle(i0,shuf_func1)); // a0 = ---- || (n,0)
    a1 = half2(shuffle(i0,shuf_func2)); // a1 = ---- || (n,1)
    a2 = half2(shuffle(i0,shuf_func3)); // a2 = ---- || (n,2)
    a3 = half2(shuffle(i0,shuf_func4)); // a3 = ---- || (n,3)
    a4 = half2(shuffle(i1,shuf_func1)); // a4 = ---- || (n,4)
    a5 = half2(shuffle(i1,shuf_func2)); // a5 = ---- || (n,5)
    a6 = half2(shuffle(i1,shuf_func3)); // a6 = ---- || (n,6)
    a7 = half2(shuffle(i1,shuf_func4)); // a7 = ---- || (n,7)
    c0 = half2(shuffle(i2,shuf_func5)); // c0 = (n+1,0) || ----
    c1 = half2(shuffle(i2,shuf_func6)); // c1 = (n+1,1) || ----
    c2 = half2(shuffle(i2,shuf_func7)); // c2 = (n+1,2) || ----
    c3 = half2(shuffle(i2,shuf_func8)); // c3 = (n+1,3) || ----
    c4 = half2(shuffle(i3,shuf_func5)); // c4 = (n+1,4) || ----
    c5 = half2(shuffle(i3,shuf_func6)); // c5 = (n+1,5) || ----
    c6 = half2(shuffle(i3,shuf_func7)); // c6 = (n+1,6) || ----
    c7 = half2(shuffle(i3,shuf_func8)); // c7 = (n+1,7) || ----

    // Combine a's and c's: the occupied halves are disjoint, so OR packs the
    // two rows into one word per column.
    a0 = a0 | c0; // a0 = (n+1,0) || (n,0)
    a1 = a1 | c1; // a1 = (n+1,1) || (n,1)
    a2 = a2 | c2; // a2 = (n+1,2) || (n,2)
    a3 = a3 | c3; // a3 = (n+1,3) || (n,3)
    a4 = a4 | c4; // a4 = (n+1,4) || (n,4)
    a5 = a5 | c5; // a5 = (n+1,5) || (n,5)
    a6 = a6 | c6; // a6 = (n+1,6) || (n,6)
    a7 = a7 | c7; // a7 = (n+1,7) || (n,7)

    // Sums of mirrored columns (k, 7-k), then sums of those sums.
    half2 s16, s07, s25, s34, s1625, s0734;
    s07 = a0 + a7;
    s16 = a1 + a6;
    s25 = a2 + a5;
    s34 = a3 + a4;
    s1625 = s16 + s25;
    s0734 = s07 + s34;

    // Differences of mirrored columns.  Note that d1625 and d0734 are
    // differences of the *sums* above, not of the d-terms.
    half2 d16, d07, d25, d34, d1625, d0734;
    d07 = a0 - a7;
    d16 = a1 - a6;
    d25 = a2 - a5;
    d34 = a3 - a4;
    d1625 = s16 - s25;
    d0734 = s07 - s34;

    // Pairwise sums of the column differences, inputs to m7, m8 and m9.
    half2 sd16d07, sd25d34;
    sd16d07 = d07 + d16;
    sd25d34 = d25 + d34;

    // Multiply stage.  m1_over_2 and m2 need no multiply; m5..m9 are the five
    // fixed-point multiplies by the cosine constants.
    // hi(C * shift(x, two)): presumably shift() moves x left by two bits and
    // hi() keeps the upper 16 bits of each product, which together amount to
    // x * C / 2^14, i.e. a multiply by a 2.14 constant -- confirm against the
    // intrinsic definitions.
    half2 m1_over_2, m2, m5, m6, m7, m8, m9;
    m1_over_2 = s0734 + s1625;
    m2 = s0734 - s1625;
    m5 = hi(COS_2 * shift(d1625 + d0734, two));
    m6 = hi(COS_2 * shift(d25 + d16, two));
    m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
    m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
    m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));

    // Recombine the products for the odd-numbered outputs.
    half2 s5, s6, s7, s8;
    s5 = d07 + m6;
    s6 = d07 - m6;
    s7 = m8 - m7;
    s8 = m9 - m7;

    // Output: eight words per iteration on output stream 0, in order
    // k = 0..7, each scaled by its factor Kk using the same
    // hi(K * shift(x, two)) pattern as the multiply stage.
    ostream(0) = hi(K0 * shift(m1_over_2, two));
    ostream(0) = hi(K1 * shift(s5 + s7, two));
    ostream(0) = hi(K2 * shift(d0734 + m5, two));
    ostream(0) = hi(K3 * shift(s6 - s8, two));
    ostream(0) = hi(K4 * shift(m2, two));
    ostream(0) = hi(K5 * shift(s6 + s8, two));
    ostream(0) = hi(K6 * shift(d0734 - m5, two));
    ostream(0) = hi(K7 * shift(s5 - s7, two));
}