// dct1c.i
// (code-viewer label: "Font size:")
// dct.i
// Ujval Kapasi
// 1/22/97
// 3/28/97
//
// 8x8 DCT (for JPEG and MPEG)
//
// Test out a fast 1-d dct algorithm for the imagine chip implementation
// From Pennebaker/Mitchell, pg. 50-52. See also Arai, Agui, Nakajima.
// This algorithm is based on the 16-pt DFT. Basically, the 8-pt DCT can
// be calculated by scaling the real parts of the output of the 16-pt DFT.
// This kernel processes two blocks at the same time, one in each half
// of the half2 registers
// STUFF TO DO ONLY ONCE -- I.E., OUTSIDE OF LOOP
// DEBUG : ISTREAM 0 : constants stored in VRF until ability to load constants
// ----- onto imagine is implemented in simulator
// Unnecessary : only exist because constants as of yet are not handled
// dct : 8x8 DCT kernel built from two passes of a fast 8-point 1-D transform.
//
// Per the file header, each half2 carries one sample from each of two
// blocks, so every operation below works on both blocks at once.
//
// Streams:
//   consts - 16 words, read once before the block loop:
//            COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3,
//            K0..K7, followed by 4 words that are read and ignored.
//   datain - 64 half2 words per block, consumed 8 at a time (one group of
//            eight per iteration of the first inner loop).
//   out    - 64 words per block; each iteration of the second inner loop
//            writes the eight results for one column of buf consecutively.
//
// NOTE(review): out is declared ostream<int>, but the values written to it
// are hi(...) results, which elsewhere in this kernel are stored into half2
// variables -- confirm the intended element type / conversion.
kernel dct(istream<half2> consts,
istream<half2> datain,
ostream<int> out)
{
// Small integer constants are built up from 1 + 1 instead of being written
// as literals -- presumably because constants were not yet handled by the
// tools (see the note above the kernel); TODO confirm.
int two = 1 + 1;
// four is only used to build eight.
int four = two + two;
int eight = four + four;
// NOTE(review): minus_eight is not referenced anywhere else in this kernel.
int minus_eight = 0 - eight;
// Rotation constants for the 1-D transform; the values are supplied by the
// consts stream, the commented-out assignments document the expected values.
half2 COS_2, COS_3, COS_1_plus_COS_3, COS_1_minus_COS_3;
// Stored in 2.14 format
//COS_2 = 0x2d412d41; // cos(2*pi/8) || cos(2*pi/8);
//COS_3 = 0x187e187e; // cos(3*pi/8) || cos(3*pi/8);
//COS_1_plus_COS_3 = 0x539f539f; // cos(pi/8) + cos(3*pi/8) || same
//COS_1_minus_COS_3 = 0x22a322a3; // cos(pi/8) - cos(3*pi/8) || same
consts >> COS_2 >> COS_3 >> COS_1_plus_COS_3 >> COS_1_minus_COS_3;
// Per-output scale factors applied in the final step of each 1-D pass;
// also supplied by the consts stream.
half2 K0, K1, K2, K3, K4, K5, K6, K7;
// Stored in 2.14 format
//K0 = 0x16a116a1 // 0.25 * sqrt(2) || 0.25 * sqrt(2);
//K1 = 0x10501050 // 0.25 * sec(pi/16) || 0.25 * sec(pi/16);
//K2 = 0x11511151 // 0.25 * sec(2*pi/16) || 0.25 * sec(2*pi/16);
//K3 = 0x133e133e // 0.25 * sec(3*pi/16) || 0.25 * sec(3*pi/16);
//K4 = 0x16a116a1 // 0.25 * sec(4*pi/16) || 0.25 * sec(4*pi/16);
//K5 = 0x1ccd1ccd // 0.25 * sec(5*pi/16) || 0.25 * sec(5*pi/16);
//K6 = 0x29cf29cf // 0.25 * sec(6*pi/16) || 0.25 * sec(6*pi/16);
//K7 = 0x52035203 // 0.25 * sec(7*pi/16) || 0.25 * sec(7*pi/16);
consts >> K0 >> K1 >> K2 >> K3 >> K4 >> K5 >> K6 >> K7;
// have to consume a multiple of 8 words from stream:
// 12 constants were read above, so 4 padding words are read and discarded.
half2 junk;
consts >> junk >> junk >> junk >> junk;
array<half2> buf(64); // intermediate dct output. ie, do rows then
// store here. Then index into this
// differently to get the columns
loop_stream(datain) { // loop over blocks
// index starts at -8 because it is advanced by 8 before the first store,
// so the first group of results lands in buf[0..7].
int index = 0 - eight;
// index2 selects the column of buf read by the second pass (starts at 0).
int index2 = 0 + 0;
// Loop counts for the two passes.
// assumes loop_count(n) runs its body n times (8 here) -- TODO confirm
uc<int> i = 8;
uc<int> i2 = 8;
// ---- Pass 1: 1-D transform of eight consecutive input words ----
loop_count(i) pipeline(1) {
half2 a0, a1, a2, a3, a4, a5, a6, a7;
datain >> a0 >> a1 >> a2 >> a3 >> a4 >> a5 >> a6 >> a7;
// Sums of mirrored input pairs (a0/a7, a1/a6, a2/a5, a3/a4), then sums
// of those sums.
half2 s16, s07, s25, s34, s1625, s0734;
s07 = a0 + a7;
s16 = a1 + a6;
s25 = a2 + a5;
s34 = a3 + a4;
s1625 = s16 + s25;
s0734 = s07 + s34;
// 12 OPS (count double because we are using half2's)
// Differences of the same mirrored pairs, and differences of the pair sums.
half2 d16, d07, d25, d34, d1625, d0734;
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
// 12 OPS
// Sums of adjacent differences, used by the odd-numbered outputs.
half2 sd16d07, sd25d34;
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
// 4 OPS
half2 m1_over_2, m2, m5, m6, m7, m8, m9;
// All results in 16.0
// Operands are passed through shift(x, two) before being multiplied by a
// 2.14 constant, and hi() is taken of the product.
// presumably the shift by 2 offsets the 14 fraction bits so that hi()
// yields a 16.0 value -- TODO confirm shift()/hi() semantics
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(COS_2 * shift(d1625 + d0734, two));
m6 = hi(COS_2 * shift(d25 + d16, two));
m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
// 30 OPS
// Combine the products into the terms feeding outputs 1, 3, 5 and 7.
half2 s5, s6, s7, s8;
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
// 8 OPS
// All results in 16.0
// Advance to the next group of eight slots in buf, then store the eight
// scaled outputs of this pass.
index = index + eight;
buf[0+index] = hi(K0 * shift(m1_over_2, two));
buf[1+index] = hi(K1 * shift(s5 + s7, two));
buf[2+index] = hi(K2 * shift(d0734 + m5, two));
buf[3+index] = hi(K3 * shift(s6 - s8, two));
buf[4+index] = hi(K4 * shift(m2, two));
buf[5+index] = hi(K5 * shift(s6 + s8, two));
buf[6+index] = hi(K6 * shift(d0734 - m5, two));
buf[7+index] = hi(K7 * shift(s5 - s7, two));
// 44 OPS
// TOTAL : 110 per loop iter (same for next loop also)
}
// do the columns now
// ---- Pass 2: same 1-D transform, reading buf with a stride of 8 ----
// NOTE(review): a0..a7 and the s*/d*/m* temporaries used below are declared
// inside the body of the first loop_count and are not redeclared here --
// confirm that the kernel language's scoping rules allow this.
loop_count(i2) pipeline(1) {
// Gather one column: element index2 of each of the eight stored groups.
a0 = buf[0+index2];
a1 = buf[8+index2];
a2 = buf[16+index2];
a3 = buf[24+index2];
a4 = buf[32+index2];
a5 = buf[40+index2];
a6 = buf[48+index2];
a7 = buf[56+index2];
// Move on to the next column for the following iteration.
index2 = index2 + 1;
// Sums of mirrored pairs (same sequence as pass 1).
s07 = a0 + a7;
s16 = a1 + a6;
s25 = a2 + a5;
s34 = a3 + a4;
s1625 = s16 + s25;
s0734 = s07 + s34;
// Differences of mirrored pairs and of the pair sums.
d07 = a0 - a7;
d16 = a1 - a6;
d25 = a2 - a5;
d34 = a3 - a4;
d1625 = s16 - s25;
d0734 = s07 - s34;
// Sums of adjacent differences.
sd16d07 = d07 + d16;
sd25d34 = d25 + d34;
// Multiplications by the 2.14 rotation constants.
m1_over_2 = s0734 + s1625;
m2 = s0734 - s1625;
m5 = hi(COS_2 * shift(d1625 + d0734, two));
m6 = hi(COS_2 * shift(d25 + d16, two));
m7 = hi(COS_3 * shift(sd16d07 - sd25d34, two));
m8 = hi((COS_1_plus_COS_3) * shift(sd16d07, two));
m9 = hi((COS_1_minus_COS_3) * shift(sd25d34, two));
// Terms feeding outputs 1, 3, 5 and 7.
s5 = d07 + m6;
s6 = d07 - m6;
s7 = m8 - m7;
s8 = m9 - m7;
// Scale and emit the eight results for this column, in order 0..7.
out << hi(K0 * shift(m1_over_2, two));
out << hi(K1 * shift(s5 + s7, two));
out << hi(K2 * shift(d0734 + m5, two));
out << hi(K3 * shift(s6 - s8, two));
out << hi(K4 * shift(m2, two));
out << hi(K5 * shift(s6 + s8, two));
out << hi(K6 * shift(d0734 - m5, two));
out << hi(K7 * shift(s5 - s7, two));
}
}
}
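// Code-viewer keyboard shortcuts (web listing residue, not part of the kernel):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -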