H.264完整的C语言代码和DCT的代码

源代码在线查看: dct8c.i

软件大小: 4747 K
上传用户: aoaoaoao
关键词: 264 DCT C语言代码 代码
下载地址: 免注册下载 普通下载 VIP

相关代码

				// dct.i      (based on dct/dct.i)
				// Ujval Kapasi
				// 1/22/97
				// 3/28/97
				// 7/22/97
				//
				// 8x8 DCT (for JPEG and MPEG)
				// 
				// From Pennebaker/Mitchell, pg. 50-52.  See also Arai, Agui, Nakajima.
				// This algorithm is based on the 16-pt DFT.  Basically, the 8-pt DCT can
				//   be calculated by scaling the real parts of the output of the 16-pt DFT.
				//
				// This code performs two DCTs every iteration of the loop. Thus the input
				// data must have one 8x8 block in the upper 16 bits of every word, and another
				// 8x8 block in the lower 16 bits of every word.
				
				// dct -- separable 8x8 DCT computed as two identical 1-D passes with a
				// transpose in between.  All arithmetic is done on half2 values, so
				// every operation processes two packed 16-bit blocks at once.
				//
				//   consts : supplies, in this order, COS_2, COS_3, COS_1_plus_COS_3,
				//            COS_1_minus_COS_3, then K0..K7 (12 words total).
				//   datain : eight half2 words (a0..a7) are consumed per loop iteration.
				//   out    : eight half2 results (d0..d7) are produced per loop iteration.
				//
				// NOTE(review): in this copy the stream and array declarations carry no
				// element type.  If the original used template-style element types they
				// may have been lost when the file was scraped -- confirm against a
				// pristine copy before building.
				kernel dct(istream consts,
				           istream datain,
				           ostream   out)
				{
				  // DCT constants
				
				  // Stored in 2.14 format
				  // COS_2             = 0x2d412d41;    // cos(2*pi/8) || cos(2*pi/8);
				  // COS_3             = 0x187e187e;    // cos(3*pi/8) || cos(3*pi/8);
				  // COS_1_plus_COS_3  = 0x539f539f;    // cos(pi/8) + cos(3*pi/8) || same
				  // COS_1_minus_COS_3 = 0x22a322a3;    // cos(pi/8) - cos(3*pi/8) || same
				
				  half2 COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3;
				
				  consts >> COS_2 >> COS_3 >> COS_1_plus_COS_3 >> COS_1_minus_COS_3;
				
				
				  // Per-coefficient output scale factors.
				  // Stored in 2.14 format
				  // K0 = 0x16a116a1           // 0.25 * sqrt(2)       || 0.25 * sqrt(2);
				  // K1 = 0x10501050           // 0.25 * sec(pi/16)    || 0.25 * sec(pi/16);
				  // K2 = 0x11511151           // 0.25 * sec(2*pi/16)  || 0.25 * sec(2*pi/16);
				  // K3 = 0x133e133e           // 0.25 * sec(3*pi/16)  || 0.25 * sec(3*pi/16);
				  // K4 = 0x16a116a1           // 0.25 * sec(4*pi/16)  || 0.25 * sec(4*pi/16);
				  // K5 = 0x1ccd1ccd           // 0.25 * sec(5*pi/16)  || 0.25 * sec(5*pi/16);
				  // K6 = 0x29cf29cf           // 0.25 * sec(6*pi/16)  || 0.25 * sec(6*pi/16);
				  // K7 = 0x52035203           // 0.25 * sec(7*pi/16)  || 0.25 * sec(7*pi/16);
				
				  half2 K0, K1, K2, K3, K4, K5, K6, K7;
				
				  consts >> K0 >> K1 >> K2 >> K3 >> K4 >> K5 >> K6 >> K7;
				
				
				  // SP arrays  (not really persistent)
				
				  array buf1(8); // output of the first 1-D pass
				  array buf2(8); // the same data after the inter-cluster transpose;
				                        //   this is the input of the second 1-D pass
				
				  // Comm permutations used to transpose the block
				
				  uc perm_a = 0x07654321;
				  uc perm_b = 0x10765432;
				  uc perm_c = 0x21076543;
				  uc perm_d = 0x32107654;
				  uc perm_e = 0x43210765;
				  uc perm_f = 0x54321076;
				  uc perm_g = 0x65432107;
				
				  // Buffer indices used by the transpose: idx0 is this cluster's id and
				  // idxN is (idx0 - N) modulo 8.
				  int idx0 = cid();
				  int idx1 = (idx0 - 1) & 7;
				  int idx2 = (idx0 - 2) & 7;
				  int idx3 = (idx0 - 3) & 7;
				  int idx4 = (idx0 - 4) & 7;
				  int idx5 = (idx0 - 5) & 7;
				  int idx6 = (idx0 - 6) & 7;
				  int idx7 = (idx0 - 7) & 7;
				
				  loop_stream(datain) pipeline(1) {
				    half2 a0, a1, a2, a3, a4, a5, a6, a7;
				
				    datain >> a0 >> a1 >> a2 >> a3 >> a4 >> a5 >> a6 >> a7;
				
				    // ---- first 1-D DCT pass --------------------------------------
				
				    // Butterfly sums
				    half2 s16, s07, s25, s34, s1625, s0734;
				
				    s07 = a0 + a7;
				    s16 = a1 + a6;
				    s25 = a2 + a5;
				    s34 = a3 + a4;
				    s1625 = s16 + s25;
				    s0734 = s07 + s34;
				    // 12 OPS (count double because we are using half2's)
				
				    // Butterfly differences
				    half2 d16, d07, d25, d34, d1625, d0734;
				
				    d07 = a0 - a7;
				    d16 = a1 - a6;
				    d25 = a2 - a5;
				    d34 = a3 - a4;
				    d1625 = s16 - s25;
				    d0734 = s07 - s34;
				    // 12 OPS
				
				    half2 sd16d07, sd25d34;
				
				    sd16d07 = d07 + d16;
				    sd25d34 = d25 + d34;
				    // 4 OPS
				
				    half2 m1_over_2, m2, m5, m6, m7, m8, m9;
				
				    // Multiplications by the 2.14 cosine constants.
				    // NOTE(review): shift(x, 2) presumably pre-scales the 16.0 operand so
				    // that hi() of the product with a 2.14 constant is again 16.0 --
				    // confirm against the KernelC definitions of shift() and hi().
				    // All results in 16.0
				    m1_over_2 = s0734 + s1625;
				    m2 = s0734 - s1625;
				    m5 = hi(COS_2 * shift(d1625 + d0734, 2));
				    m6 = hi(COS_2 * shift(d25 + d16, 2));
				    m7 = hi(COS_3 * shift(sd16d07 - sd25d34, 2));
				    m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, 2));
				    m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, 2));
				    // 30 OPS
				
				    half2 s5, s6, s7, s8;
				
				    s5 = d07 + m6;
				    s6 = d07 - m6;
				    s7 = m8 - m7;
				    s8 = m9 - m7;
				    // 8 OPS
				
				    // Scale each coefficient by its K factor and park it in buf1.
				    // All results in 16.0
				    buf1[0] = hi(K0 * shift(m1_over_2, 2));
				    buf1[1] = hi(K1 * shift(s5 + s7, 2));
				    buf1[2] = hi(K2 * shift(d0734 + m5, 2));
				    buf1[3] = hi(K3 * shift(s6 - s8, 2));
				    buf1[4] = hi(K4 * shift(m2, 2));
				    buf1[5] = hi(K5 * shift(s6 + s8, 2));
				    buf1[6] = hi(K6 * shift(d0734 - m5, 2));
				    buf1[7] = hi(K7 * shift(s5 - s7, 2));
				    // 44 OPS
				
				    // ---- transpose ------------------------------------------------
				    // Use the comm permutations to transpose the block so the second
				    // pass runs along the other dimension.  The entry at this
				    // cluster's own index (idx0) is copied without communication.
				    
				    buf2[idx0] = buf1[idx0];
				    buf2[idx7] = commucperm(perm_a, buf1[idx1]);
				    buf2[idx6] = commucperm(perm_b, buf1[idx2]);
				    buf2[idx5] = commucperm(perm_c, buf1[idx3]);
				    buf2[idx4] = commucperm(perm_d, buf1[idx4]);
				    buf2[idx3] = commucperm(perm_e, buf1[idx5]);
				    buf2[idx2] = commucperm(perm_f, buf1[idx6]);
				    buf2[idx1] = commucperm(perm_g, buf1[idx7]);
				    // 0 OPS
				  
				    // ---- second 1-D DCT pass (same arithmetic as the first) -------
				
				    // get a's from scratchpad -- In 16.0 format
				    a0 = buf2[0];
				    a1 = buf2[1];
				    a2 = buf2[2];
				    a3 = buf2[3];
				    a4 = buf2[4];
				    a5 = buf2[5];
				    a6 = buf2[6];
				    a7 = buf2[7];
				
				    s07 = a0 + a7;
				    s16 = a1 + a6;
				    s25 = a2 + a5;
				    s34 = a3 + a4;
				
				    s1625 = s16 + s25;
				    s0734 = s07 + s34;
				    // 12 OPS
				
				    d07 = a0 - a7;
				    d16 = a1 - a6;
				    d25 = a2 - a5;
				    d34 = a3 - a4;
				    d1625 = s16 - s25;
				    d0734 = s07 - s34;
				    // 12 OPS
				
				    sd16d07 = d07 + d16;
				    sd25d34 = d25 + d34;
				    // 4 OPS
				
				    // All results in 16.0
				    m1_over_2 = s0734 + s1625;
				    m2 = s0734 - s1625;
				    m5 = hi(COS_2 * shift(d1625 + d0734, 2));
				    m6 = hi(COS_2 * shift(d25 + d16, 2));
				    m7 = hi(COS_3 * shift(sd16d07 - sd25d34, 2));
				    m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, 2));
				    m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, 2));
				    // 30 OPS
				
				    s5 = d07 + m6;
				    s6 = d07 - m6;
				    s7 = m8 - m7;
				    s8 = m9 - m7;
				    // 8 OPS
				
				    half2 d0, d1, d2, d3, d4, d5, d6, d7;
				
				    // All results in 16.0
				    d0 = hi(K0 * shift(m1_over_2, 2));
				    d1 = hi(K1 * shift(s5 + s7, 2));
				    d2 = hi(K2 * shift(d0734 + m5, 2));
				    d3 = hi(K3 * shift(s6 - s8, 2));
				    d4 = hi(K4 * shift(m2, 2));
				    d5 = hi(K5 * shift(s6 + s8, 2));
				    d6 = hi(K6 * shift(d0734 - m5, 2));
				    d7 = hi(K7 * shift(s5 - s7, 2));
				    // 44 OPS
				
				    // TOTAL :  220 OPS per loop iter per cluster
				    //          1760 OPS TOTAL for 8 clusters
				
				    ///////////////////////////////////
				    //      -->  110 OPS per BLOCK  //
				    //           880   OPS total     //
				    ///////////////////////////////////
				
				    // Emit the eight coefficients produced by the second pass.
				    // NOTE(review): this statement was truncated to a bare "out" in the
				    // copy under review; the operand list and its d0..d7 order are a
				    // reconstruction -- confirm against a pristine copy.
				    out << d0 << d1 << d2 << d3 << d4 << d5 << d6 << d7;
				  }
				}
							

相关资源