// r8x8invdct.asm  (file-name caption and "font size" label left over from the web code viewer)
/*******************************************************************************************
Copyright(c) 2000 Analog Devices/Intel
Developed by JD(FRIO) Software Application Team, IPDC, Bangalore, India
********************************************************************************************
File Name : r8x8invdct.asm
Module Name : The implementation of Inverse DCT for 8x8 real data.
Label Name : __r8x8invdct
Description : This is the implementation of Chen's algorithm of IDCT.
It is based on the separable nature of IDCT for multi-
dimension. The input matrix is 8x8 real data. First, a one-dimensional
8-point IDCT is calculated for each of the 8 rows. The
output is stored in a separate matrix after transpose. Then again
8-point IDCT is calculated on each row of matrix. The output
is again stored in a transpose matrix. This is final output.
Chen's algorithm has 4 stages (parts) of implementation.
This implementation works only for 8x8 input. The input data
should be real. The range of input should be -256 to 255.
The algorithm works in place: the final output overwrites the "in" buffer.
The prototype of the C callable is as follows:
_r8x8invdct(fract16 *in, fract16 *coeff, fract16 *temp);
*in -> Pointer to Input vector.
*coeff -> Pointer to coefficients.
*temp -> Pointer to temporary data.
Note : The algorithm reads the input data from the "in" matrix.
First 8-point IDCT will be calculated for all the 8 rows.
This output is stored in "temp" buffer in the transposed
form at bit reversed locations.
Again the 8-point IDCT is applied on all the 8 rows of
"temp" buffer. Final output computed is stored in "in"
buffer in transposed form at bit reversed locations.
The operation of transposing the matrix and calculation of
bit reversed are carried out while writing the data without
any explicit code.
Output of function is provided in the "in" buffer in normal order.
Registers Used : R0, R1, R2, R3, R4, R5, R6, R7, P0, P1, P2, P3, P4, P5, A0, A1.
Other Register Used : I0, I1, I2, I3, B0, B1, B2, B3, M1, M2, M3, L0, L1, L2, L3 registers, LC0 and LC1.
Performance : (Timer version 0.6.33)
Code Size : 344 Bytes.
Memory Required :
Input Matrix : 8 * 8 * 2 Bytes.
Coefficients : 16 Bytes
Temporary Matrix: 8 * 8 * 2 Bytes
Cycle Count :
-----------------------------------------
| Size | Forward DCT | Inverse DCT |
-----------------------------------------
| 8x8 | 284 Cycles | 311 Cycles |
-----------------------------------------
**************************************************************************************/
/*
* All the buffers input, temp and coeff are allocated here. The 4096-byte
* alignment guarantees that they are placed in different memory banks.
*/
/*
 * Data section holding the three working buffers, each declared global and
 * each preceded by its own 4096 alignment directive.
 * NOTE(review): the storage size of one .var element depends on the assembler
 * in use; the file header budgets 8*8*2 bytes per matrix and 16 bytes of
 * coefficients -- confirm these declarations reserve at least that much.
 * NOTE(review): in the code visible here the routine receives its buffers
 * through R0, R1 and R2 and does not reference these symbols by name;
 * presumably they are meant for the calling test harness -- confirm.
 */
.section data1;
.align 4096;
.global _in;
.var _in[64]; // 64-element (8x8) input matrix.
.align 4096;
.global _temp;
.var _temp[64]; // 64-element (8x8) temporary matrix.
.align 4096;
.global _coeff;
.var _coeff[10]; // Coefficient table (10 elements declared).
/**************************************************************************************/
/*
 * Entry point of __r8x8invdct and its prologue.
 * On entry R0, R1 and R2 hold the three pointer arguments (input matrix,
 * coefficients, temporary matrix). They are copied into B0, B3 and B2 so that
 * the data registers are free for the computation, and the modify/pointer
 * registers are loaded with the constant byte offsets used further down.
 */
.section program;
.global __r8x8invdct;
.align 8;
__r8x8invdct:
/******************************** Function Prologue ***********************************/
[--SP] = (R7:4, P5:3); // Push R7-R4 and P5-P3 on the stack (popped again at Terminate).
B0 = R0; // Pointer to Input matrix.
B3 = R1; // Pointer to Coefficients
B2 = R2; // Pointer to Temporary matrix.
L0 = 0; // L0-L2 are zeroed so that I0, I1 and I2 address memory linearly.
L1 = 0; // --------- do --------
L2 = 0; // --------- do --------
L3 = 16; // L3 = 16 bytes: makes the coefficient array read through I3
// circular (four 32-bit "R7 = [I3++]" loads per row, then wrap).
//-------------------------------------------------------------------------------------
M1 = 16 (X); // M1 = 16: added to I0, I1 and I2 once per row to step to the next input row.
M2 = 7 (X); // M2 = 7: copied into P2 later as the trip count of the inner row loop (LC1).
M3 = 8(X); // M3 = 8: byte offset added to I2 to move from element 0 to element 4 of a row.
P2 = 16; // P2 = 16: post-modify used after the fourth store through P0.
P3 = 32 (X); // P3 = 32: post-modify between successive stores through P0 and through P1.
P4 = -110 (X); // P4 = -110: P0 stores land at +0, +32, +64, +96, +112; 112 - 110 leaves P0
// 2 bytes past where it started, i.e. on the next output column.
P5 = -62 (X); // P5 = -62: P1 stores land at +16, +48, +80; 80 - 62 = 18 = 16 + 2,
// again one output column further on.
P0 = 2; // Trip count for the outer loop (LC0): two passes.
MNOP; // NOTE(review): these two no-ops look like padding before the lsetup that
NOP; // reads P0 -- presumably to avoid a stall or satisfy alignment; confirm.
/*
* According to Chen's algorithm, first 8-point IDCT will be calculated for all
* the 8 rows. The output of this calculation is stored in another transpose
* matrix. Now again the 8-point IDCT is applied on all the 8 rows. The output
* is stored in matrix transpose form. This is the final output. Therefore,
* a loop of 2 iteration (IDCT_strt, IDCT_end) is set.
*
* B0 points to the "in" buffer and B2 points to "temp" buffer in the first
* iteration. The input is read from "in" buffer and output is written to
* "temp" buffer. In the second iteration of IDCT_strt B0 points to "temp" and
* B2 points to "in" buffer. The input is read from "temp" buffer and output
* is written to "in" buffer. "in" buffer holds the final output.
*/
/*
 * Outer loop (LC0 = P0 = 2 passes) from IDCT_strt to IDCT_end.
 * This first section of the loop body sets up the input pointers I0/I1/I2 from
 * B0, the coefficient pointer I3 from B3 and the output pointers P0/P1 from B2,
 * performs "part 1" for the first row, and then arms the inner row loop.
 * Elements are accessed as 16-bit words (W[...]), so a byte offset of 4 is two
 * elements and M3 = 8 is four elements.
 */
lsetup(IDCT_strt, IDCT_end) LC0 = P0;
IDCT_strt:
I0 = B0; // I0 points to Input Element (0, 0)
I2 = B0; // I2 points to Input Element (0, 0)
I2 += M3 || R3.L = W[I0]; // I2 advances 8 bytes to Element (0, 4); Element 0 is read in R3.L
I1 = I2; // I1 = copy of I2, i.e. Element (0, 4) at this point
I1 += 4 || R3.H = W[I2++]; // I1 advances 4 bytes to Element (0, 6).
// Element 4 is read in R3.H; the post-increment leaves I2 on Element (0, 5)
I3 = B3; // I3 points to Coefficients
P0 = B2; // P0 points to array Element (0, 0) for writing output
P1 = B2;
R7.L = 0x5a82; // R7.L holds the coefficient C4 (the cos(pi/4) factor used in part 1 below).
P1 = P1 + P2; // P1 = B2 + 16 bytes: points to array element (1, 0) for writing output
/**************************** Implementation of Part 1 **********************************/
/*
* The following operation is done in 2 instructions.
* A1 = Element 0 * cos(pi/4)
* A0 = Element 0 * cos(pi/4)
* A1 = A1 - Element 4 * cos(pi/4)
* A0 = A0 + Element 4 * cos(pi/4)
* At the same time the values of Element 2 and 6 are read into R1.H and R1.L respectively,
* and the first 32-bit coefficient pair is fetched into R7 through the circular pointer I3.
*/
A1 = R3.L * R7.L, A0 = R3.L * R7.L || I0 += 4 || R1.L = W[I1++];
R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L) || R1.H = W[I0++] || R7 = [I3++];
/*
* The following two instructions do -
* A1 = Element 2 * cos(3pi/8)
* A0 = Element 6 * cos(3pi/8)
* A1 = A1 - Element 6 * cos(pi/8)
* A0 = A0 + Element 2 * cos(pi/8)
* R2 reads the input elements ( 5, 3).
* R7 reads the coefficients value C5 and C3.
*/
A1= R1.H * R7.L, A0 = R1.L * R7.L || I0 -= 4 || R2.L = W[I0];
R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) || R2.H = W[I2--] || R7 = [I3++];
/*
* The following three instructions do -
* A1 = Element 5 * cos(3pi/16)
* A0 = Element 3 * cos(3pi/16)
* A1 = A1 - Element 3 * cos(5pi/16)
* A0 = A0 + Element 5 * cos(5pi/16)
* Element 0 = (Element 0 + Element 6) / 2.
* Element 4 = (Element 4 + Element 2) / 2.
* Element 2 = (Element 4 - Element 2) / 2.
* Element 6 = (Element 0 - Element 6) / 2.
* The write W[P0] = R5.L is done for packing purposes: the output location is
* used as scratch, and the value is read back into R3.H inside the row loop
* (R3.H = W[P0]). The register locations for element 4 and 6 are swapped.
* The same instructions also load Elements 1 and 7 into R0.H and R0.L; the
* post-decrements leave I0 on Element 0 and I1 on Element 6 of the row.
*/
A1 = R2.H * R7.L, A0 = R2.L * R7.L || R0.H = W[I0--] || NOP;
R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || R0.L = W[I1--] || NOP;
R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H) || W[P0] = R5.L || R7 = [I3++];
/*
* At the end of part 1 R0 has (1, 7), R5 has (2, 6), R2 has (5, 3) and
* R3 has (4, 0).
* Where notation (x, y) means the element from column x is in upper half of register
* and element from column y is in lower half of the register.
*/
// The inner loop is set for 7 iterations. The last row is computed separately after it.
P2 = M2; // P2 = 7, used only as the LC1 trip count.
lsetup (Row_strt, Row_end) LC1 = P2;
P2 = 16; // Restore P2 to 16 for its use as a store post-modify.
/*
 * Inner row loop, executed 7 times (LC1). Each iteration:
 *   - finishes parts 2, 3 and 4 for the row whose part 1 results are already
 *     in R0, R2, R3 and R5;
 *   - stores that row's eight results through P0 and P1 at 16-byte spacing,
 *     then moves both pointers 2 bytes on (this realises the transpose);
 *   - in the same instructions loads the next input row through I0/I1/I2 and
 *     performs part 1 on it, ready for the next iteration.
 * Every "R7 = [I3++]" fetches the next coefficient pair; with L3 = 16 the four
 * fetches per row wrap back to the start of the coefficient array.
 */
Row_strt:
/**************************** Implementation of Part 2 **********************************/
/*
* The following two instructions do the following job -
* A1 = Element 1 * cos(7pi/16)
* A0 = Element 7 * cos(7pi/16)
* A1 = A1 - Element 7 * cos(pi/16)
* A0 = A0 + Element 1 * cos(pi/16)
* The read to R1 is a dummy: it is there to advance I1 by M1 (16 bytes) to the next row.
* I0 is advanced to the next row in the same way by "I0 += M1".
* W[P1] = R3.H parks R3.H in the output buffer as scratch; it is read back into R5.L in part 3.
* R7 reads the coefficient value C2 and C6.
*/
A1 = R0.H * R7.L, A0 = R0.L * R7.L || R1 = [I1++M1] || W[P1] = R3.H;
R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || I0 += M1 || R7 = [I3++];
/*
* The following single instruction operates on 4 data as -
* Element 1 = (Element 1 + Element 5) / 2.
* Element 5 = (Element 1 - Element 5) / 2.
* Element 3 = (Element 7 - Element 3) / 2.
* Element 7 = (Element 7 + Element 3) / 2.
* In parallel R3.H is reloaded from the scratch word at W[P0], and Element 6 of
* the next row is read into R1.L.
*/
R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0] || R1.L = W[I1++];
/*
* At the end of part 2 R0 has (1, 7), and R2 has (5, 3).
* The registers R3.H and R5.L are being swapped (via the scratch words at W[P0] and W[P1]).
*/
/**************************** Implementation of Part 3 **********************************/
/*
* The following instruction does the following job.
* Element 0 = Element 0 + Element 7.
* Element 7 = Element 7 - Element 0.
* Element 6 = Element 6 + Element 1.
* Element 1 = Element 6 - Element 1.
* The elements 0, 1, 6 and 7 are final.
* In parallel I2 is advanced to the next row and Element 0 of the next row is read into R3.L.
*/
R4 = R3 +|+ R0, R0 = R3 -|- R0 || I2 += M1 || R3.L = W[I0];
/*
* The following two instructions do -
* A1 = Element 3 * cos(pi/4)
* A0 = Element 3 * cos(pi/4)
* A1 = A1 - Element 5 * cos(pi/4)
* A0 = A0 + Element 5 * cos(pi/4)
* In parallel Elements 4 and 2 of the next row are read into R3.H and R1.H,
* and R5.L is reloaded from the scratch word at W[P1].
*/
A1 = R2.L * R7.L, A0 = R2.L * R7.L || I0 += 4 || R3.H = W[I2++];
R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1] || R1.H = W[I0++];
/*
* At the end of part 3 R0 has (1, 7), and R2 has (5, 3), R4 has (6, 0)
* and R5 has (2, 4). Registers R4 and R0 hold final output.
*/
/**************************** Implementation of Part 4 **********************************/
/*
* It is the final stage computation.
* Element 4 = Element 4 + Element 3.
* Element 2 = Element 2 + Element 5.
* Element 5 = Element 5 - Element 2.
* Element 3 = Element 3 - Element 4.
* The first result (R4.L) is stored through P0 in the same instruction, and
* Element 3 of the next row is read into R2.L.
*/
R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L || R2.L = W[I0--];
//R5 = 2, 4 and R6 = 5, 3 final
/*
* At the end of part 4 R0 has (1, 7), and R4 has (6, 0), R5 has (2, 4)
* and R6 has (5, 3). All the registers hold final output.
*/
/**************************** Implementation of Part 1 **********************************/
/*
* This is the same part as part 1 specified earlier. The first time the part 1 calculation is
* done outside the loop, afterwards it is done here. It serves two purposes.
* First it computes part 1 for the next row, and it writes the data 2, 1, 3, 7, 4, 6 and 5
* to its bit reversed order in transpose way.
*
* Store pattern: P0 writes at +0 (above), +32, +64, +96 and +112 bytes and is then
* moved by P4 (-110); P1 writes at +16, +48 and +80 bytes and is then moved by
* P5 (-62). Both therefore finish 2 bytes beyond where they started the row.
*
* The value loaded into R2.H by the first instruction below is overwritten by the
* "R2.H = W[I2--]" load two instructions later without being read in between;
* that first access steps I0 back by one element.
*/
A1 = R3.L * R7.L, A0 = R3.L * R7.L || W[P0++P3] = R5.H || R2.H = W[I0--];
R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L) || W[P0++P3] = R0.H || R7 = [I3++];
A1= R1.H * R7.L, A0 = R1.L * R7.L || W[P0++P2] = R6.L || R2.H = W[I2--];
R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) || W[P0++P4] = R0.L || R7 = [I3++];
A1 = R2.H * R7.L, A0 = R2.L * R7.L || W[P1++P3] = R5.L || R0.H = W[I0--];
R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || W[P1++P3] = R4.H || R0.L = W[I1--];
W[P0] = R5.L; // Park the new R5.L as scratch at the (already advanced) P0 location, as before the loop.
Row_end: R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)|| W[P1++P5] = R6.H || R7 = [I3++];
/*
* The computation for 7 rows is over. The last row computation is done here.
* This is the same as parts 2, 3 and 4 done inside the loop, but without the
* loads of a following row: the slots that carried those loads and pointer
* updates inside the loop are NOPs or are dropped here.
* Afterwards the roles of the two buffers are exchanged for the next pass of
* the outer loop.
*/
A1 = R0.H * R7.L, A0 = R0.L * R7.L || NOP || W[P1] = R3.H;
R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || R7 = [I3++] || NOP;
R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0] || NOP;
R4 = R3 +|+ R0, R0 = R3 -|- R0;
A1 = R2.L * R7.L, A0 = R2.L * R7.L;
R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1] || NOP;
R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L || NOP;
W[P0++P3] = R5.H; // The last outputs are written here, in the same order and
W[P0++P3] = R0.H; // with the same post-modify registers as inside the row loop.
W[P0++P2] = R6.L;
W[P0++P4] = R0.L;
W[P1++P3] = R5.L;
W[P1++P3] = R4.H;
W[P1++P5] = R6.H;
B1 = B2; // The pointers to output and input are swapped; B1 temporarily holds the output pointer.
B2 = B0; // B2 = buffer just read: it becomes the output buffer of the next pass.
IDCT_end: B0 = B1; // B0 = buffer just written: it becomes the input buffer of the next pass.
/*************************************************************************************************/
Terminate:
/*
 * Function epilogue.
 * The prologue sets L3 = 16 so that I3 wraps circularly over the coefficient
 * array. The Blackfin C run-time model expects the length registers L0-L3 to
 * be zero whenever control returns to compiled code; leaving L3 non-zero
 * would make later I3-based accesses in the caller wrap unexpectedly.
 * L3 is therefore cleared here. L0-L2 were zeroed in the prologue and are not
 * modified afterwards, so they need no further action.
 */
L3 = 0; // Restore linear addressing for I3 before returning to the caller.
(R7:4,P5:3)=[SP++]; //Pop the registers saved in the prologue.
RTS; //Return.
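/*
 * Residue from the web code viewer this file was copied from (not part of the
 * source). Keyboard shortcut help:
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Switch theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */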