fdct_8x8.asm
字號:
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* fdct_8x8 -- 8x8 block forward DCT *
* *
* *
* REVISION DATE *
* 19-May-2005 *
* *
* USAGE *
* This routine has following C prototype: *
* void fdct_8x8(short dct_data[], unsigned num_dcts) *
* *
* The fdct_8x8 routine accepts a list of 8x8 input data blocks *
* and performs DCTs on each. The array should be aligned to a *
* 64-bit boundary, and be laid out equivalently to the C array *
* dct_data[num_dcts][8][8]. The input data should be in 9Q0 *
* format. *
* *
* The routine operates entirely in-place, requiring no additional *
* storage for intermediate results. *
* *
* *
* DESCRIPTION *
* The fdct_8x8 algorithm performs a DCT based on Chen's algorithm. *
* The input coefficients are assumed to be signed 16-bit data in *
* 9Q0 format. *
* *
* void fdct_8x8_cn(short *dct_data, unsigned num_dcts) *
* { *
* const unsigned short C1 = 0x7D8A, C2 = 0x7642; *
* const unsigned short C3 = 0x6A6E, C4 = 0x5A82; *
* const unsigned short C5 = 0x471D, C6 = 0x30FC; *
* const unsigned short C7 = 0x18F9; *
* const unsigned short D1 = C1>>1, D2 = C2>>1; *
* const unsigned short D3 = C3>>1, D4 = C4>>1; *
* const unsigned short D5 = C5>>1, D6 = C6>>1; *
* const unsigned short D7 = C7>>1; *
* *
* short f0, f1, f2, f3, f4, f5, f6, f7; /* Spatial domain samples.*/ *
* short g0, g1, h0, h1, p0, p1; /* Even-half intermediate.*/ *
* short r0, r1; /* Even-half intermediate.*/ *
* short P0, P1, R0, R1; /* Even-half intermediate.*/ *
* short g2, g3, h2, h3; /* Odd-half intermediate. */ *
* short q0, q1, s0, s1; /* Odd-half intermediate. */ *
* short Q0, Q1, S0, S1; /* Odd-half intermediate. */ *
* short F0, F1, F2, F3, F4, F5, F6, F7; /* Freq. domain results. */ *
* int i, j; /* loop counts */ *
* short (*dct)[8][8] = (short (*)[8][8])dct_data; *
* *
* if (!num_dcts) return; *
* *
* /* ------------------------------------------------------------ */ *
* /* Vertical Pass */ *
* /* ------------------------------------------------------------ */ *
* for (i = 0; i < num_dcts; i++) *
* { *
* for (j = 0; j < 8; j++) *
* { *
* /* -------------------------------------------------------- */ *
* /* Stage 0: Load in sample-domain coefficients. */ *
* /* -------------------------------------------------------- */ *
* f0 = dct[i][0][j]; *
* f1 = dct[i][1][j]; *
* f2 = dct[i][2][j]; *
* f3 = dct[i][3][j]; *
* f4 = dct[i][4][j]; *
* f5 = dct[i][5][j]; *
* f6 = dct[i][6][j]; *
* f7 = dct[i][7][j]; *
* *
* /* -------------------------------------------------------- */ *
* /* Stage 1: Separate into even and odd halves. */ *
* /* -------------------------------------------------------- */ *
* g0 = f0 + f7; h2 = f0 - f7; *
* g1 = f1 + f6; h3 = f1 - f6; *
* h1 = f2 + f5; g3 = f2 - f5; *
* h0 = f3 + f4; g2 = f3 - f4; *
* *
* /* -------------------------------------------------------- */ *
* /* Stage 2 */ *
* /* -------------------------------------------------------- */ *
* p0 = g0 + h0; r0 = g0 - h0; *
* p1 = g1 + h1; r1 = g1 - h1; *
* q1 = g2; s1 = h2; *
* *
* q0 = ((h3*C4 - g3*C4) + 0x4000) >> 15; *
* s0 = ((h3*C4 + g3*C4) + 0x4000) >> 15; *
* *
* /* -------------------------------------------------------- */ *
* /* Stage 3 */ *
* /* -------------------------------------------------------- */ *
* P0 = ((C4 * p0 + C4 * p1) + 0x4000) >> 15; *
* P1 = ((C4 * p0 - C4 * p1) + 0x4000) >> 15; *
* R1 = ((C6 * r1 + C2 * r0) + 0x4000) >> 15; *
* R0 = ((C6 * r0 - C2 * r1) + 0x4000) >> 15; *
* *
* Q1 = q1 + q0; Q0 = q1 - q0; *
* S1 = s1 + s0; S0 = s1 - s0; *
* *
* /* -------------------------------------------------------- */ *
* /* Stage 4 */ *
* /* -------------------------------------------------------- */ *
* F0 = P0; F4 = P1; *
* F2 = R1; F6 = R0; *
* *
* F1 = ((C7 * Q1 + C1 * S1) + 0x4000) >> 15; *
* F7 = ((C7 * S1 - C1 * Q1) + 0x4000) >> 15; *
* F5 = ((C3 * Q0 + C5 * S0) + 0x4000) >> 15; *
* F3 = ((C3 * S0 - C5 * Q0) + 0x4000) >> 15; *
* *
* /* -------------------------------------------------------- */ *
* /* Stage 5: Write frequency-domain results. */ *
* /* -------------------------------------------------------- */ *
* dct[i][0][j] = F0; *
* dct[i][1][j] = F1; *
* dct[i][2][j] = F2; *
* dct[i][3][j] = F3; *
* dct[i][4][j] = F4; *
* dct[i][5][j] = F5; *
* dct[i][6][j] = F6; *
* dct[i][7][j] = F7; *
* } *
* } *
* *
* /* ------------------------------------------------------------- */ *
* /* Horizontal Pass */ *
* /* ------------------------------------------------------------- */ *
* for (i = 0; i < num_dcts; i++) *
* { *
* for (j = 0; j < 8; j++) *
* { *
* /* --------------------------------------------------------- */ *
* /* Stage 0: Load in sample-domain coefficients. */ *
* /* --------------------------------------------------------- */ *
* f0 = dct[i][j][0]; *
* f1 = dct[i][j][1]; *
* f2 = dct[i][j][2]; *
* f3 = dct[i][j][3]; *
* f4 = dct[i][j][4]; *
* f5 = dct[i][j][5]; *
* f6 = dct[i][j][6]; *
* f7 = dct[i][j][7]; *
* *
* /* --------------------------------------------------------- */ *
* /* Stage 1: Separate into even and odd halves. */ *
* /* --------------------------------------------------------- */ *
* g0 = f0 + f7; h2 = f0 - f7; *
* g1 = f1 + f6; h3 = f1 - f6; *
* h1 = f2 + f5; g3 = f2 - f5; *
* h0 = f3 + f4; g2 = f3 - f4; *
* *
* /* --------------------------------------------------------- */ *
* /* Stage 2 */ *
* /* --------------------------------------------------------- */ *
* p0 = g0 + h0; r0 = g0 - h0; *
* p1 = g1 + h1; r1 = g1 - h1; *
* q1 = g2; s1 = h2; *
* *
* q0 = ((h3*C4 - g3*C4) + 0x4000) >> 15; *
* s0 = ((h3*C4 + g3*C4) + 0x4000) >> 15; *
* *
* /* --------------------------------------------------------- */ *
* /* Stage 3 */ *
* /* --------------------------------------------------------- */ *
* P0 = ((D4 * p0 + D4 * p1) + 0x8000) >> 16; *
* P1 = ((D4 * p0 - D4 * p1) + 0x8000) >> 16; *
* R1 = ((D6 * r1 + D2 * r0) + 0x8000) >> 16; *
* R0 = ((D6 * r0 - D2 * r1) + 0x8000) >> 16; *
* *
* Q1 = q1 + q0; Q0 = q1 - q0; *
* S1 = s1 + s0; S0 = s1 - s0; *
* *
* /* --------------------------------------------------------- */ *
* /* Stage 4 */ *
* /* --------------------------------------------------------- */ *
* F0 = P0; F4 = P1; *
* F2 = R1; F6 = R0; *
* *
* F1 = ((D7 * Q1 + D1 * S1) + 0x8000) >> 16; *
* F7 = ((D7 * S1 - D1 * Q1) + 0x8000) >> 16; *
* F5 = ((D3 * Q0 + D5 * S0) + 0x8000) >> 16; *
* F3 = ((D3 * S0 - D5 * Q0) + 0x8000) >> 16; *
* *
* /* --------------------------------------------------------- */ *
* /* Stage 5: Store frequency-domain results. */ *
* /* --------------------------------------------------------- */ *
* dct[i][j][0] = F0; *
* dct[i][j][1] = F1; *
* dct[i][j][2] = F2; *
* dct[i][j][3] = F3; *
* dct[i][j][4] = F4; *
* dct[i][j][5] = F5; *
* dct[i][j][6] = F6; *
* dct[i][j][7] = F7; *
* } *
* } *
* *
* return; *
* } *
* *
* TECHNIQUES *
* All levels of looping are collapsed into single loops which are *
* pipelined. The outer loop focuses on 8-pt DCTs, whereas the *
* inner loop controls the column-pointer to handle jumps between *
* DCT blocks. (The column-pointer adjustment is handled by a *
* four-phase rotating "fixup" constant which takes the place of *
* the original inner-loop.) *
* *
* For performance, portions of the outer-loop code have been *
* inter-scheduled with the prologs and epilogs of both loops. *
* *
* *
* ASSUMPTIONS *
* The input array must be aligned on a double-word boundary. *
* *
* *
快捷鍵說明
複製代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -