?? r8x8invdct.asm

?? IDCT 反離散余弦變換原程序
?? ASM
字號(hào):
/*******************************************************************************************
Copyright(c) 2000 Analog Devices/Intel
Developed by JD(FRIO) Software Application Team, IPDC, Bangalore, India
********************************************************************************************
  File Name    : r8x8invdct.asm
  Module Name  : The implementation of Inverse DCT for 8x8 real data.
  Label Name   : __r8x8invdct
  Description  : This is the implementation of Chen's algorithm of IDCT.
                 It is based on the separable nature of IDCT for multi-
				 dimension. The input matrix is 8x8 real data. First, a one-
				 dimensional 8-point IDCT is calculated for each of the 8 rows. The
				 output is stored in a separate matrix after transpose. Then again 
				 8-point IDCT is calculated on each row of matrix. The output
				 is again stored in a transpose matrix. This is final output.
				 
				 Chen's algorithm has 4 stages (parts) of implementation.

				 This implementation works only for 8x8 input. The input data 
				 should be real. The range of input should be -256 to 255. 
				 
				 The algorithm operates in place: the final output overwrites the "in" buffer.
				 The prototype of the C callable is as follows:

				 _r8x8invdct(fract16 *in, fract16 *coeff, fract16 *temp);

				 *in -> Pointer to Input vector.
				 *coeff -> Pointer to coefficients.
				 *temp -> Pointer to temporary data.

				 Note :  The algorithm reads the input data from the "in" matrix.  
				         First 8-point IDCT will be calculated for all the 8 rows.
						 This output is stored in "temp" buffer in the transposed 
                         form at bit reversed locations. 
						 Again the 8-point IDCT is applied on all the 8 rows of 
						 "temp" buffer. Final output computed is stored in "in" 
						 buffer in transposed form at bit reversed locations.
						 The operation of transposing the matrix and calculation of
						 bit reversed are carried out while writing the data without
						 any explicit code.
						 
				         Output of function is provided "in" buffer in normal order.

	Registers Used : R0, R1, R2, R3, R4, R5, R6, R7, P0, P1, P2, P3, P4, P5, A0, A1.
  	Other Registers Used : I0, I1, I2, I3, B0, B1, B2, B3, M1, M2, M3, L0, L1, L2, L3, LC0 and LC1.

Performance : (Timer version 0.6.33)

              Code Size : 344 Bytes.

			  Memory Required :
			                    Input Matrix : 8 * 8 * 2 Bytes.
								Coefficients : 16 Bytes
                                Temporary Matrix: 8 * 8 * 2 Bytes
	
			  Cycle Count :

			                -----------------------------------------
							|  Size  |  Forward DCT  |  Inverse DCT |
							-----------------------------------------
							|  8x8   |   284 Cycles  |  311 Cycles  |
							-----------------------------------------

**************************************************************************************/
/*
*  All the buffers (input, temp and coeff) are allocated here. Aligning each one
*  to 4096 is intended to guarantee that they land in different memory banks.
*/

		// Statically allocated, globally visible buffers: the 8x8 input matrix,
		// the 8x8 temporary matrix and the coefficient table. Each buffer is
		// preceded by its own .align 4096 directive.
		.section					data1;
		.align						4096;
		.global             		_in;
		.var 						_in[64];      // 64 elements = one 8x8 matrix
		.align						4096;
		.global             		_temp;
		.var 						_temp[64];    // 64 elements = one 8x8 matrix
		.align						4096;
		.global             		_coeff;
		// NOTE(review): the file header lists 16 bytes of coefficients and the
		// routine sets L3 = 16 for this table; confirm that 10 elements is the
		// intended size.
		.var 						_coeff[10];

/**************************************************************************************/

.section 				program;
.global					__r8x8invdct;
.align 					8;

// __r8x8invdct: entry point of the 8x8 inverse DCT.
// On entry R0 = input matrix, R1 = coefficient table, R2 = temporary matrix;
// they are copied into B0, B3 and B2 below.
__r8x8invdct:
/******************************** Function Prologue ***********************************/
		[--SP] = (R7:4, P5:3);    // Save R7-R4 and P5-P3 on the stack.
		B0 = R0;                  // B0 = pointer to the input matrix.
		B3 = R1;                  // B3 = pointer to the coefficients.
		B2 = R2;                  // B2 = pointer to the temporary matrix.
		L0 = 0;                   // L0-L2 = 0: I0, I1 and I2 address linearly.
		L1 = 0;                   // (same as above)
		L2 = 0;                   // (same as above)
		L3 = 16;                  // L3 = 16: I3 wraps over the 16 bytes starting
                                  // at B3, making the coefficient array circular.

//-------------------------------------------------------------------------------------
// Address-offset constants. Elements are accessed as 16-bit words, so one row of
// 8 elements is 16 bytes.
		M1 = 16 (X);              // Added to I0/I1/I2 to step to the next input row.
		M2 = 7 (X);               // Copied into P2 later as the row-loop count.
		M3 = 8(X);                // Byte offset from element 0 to element 4 of a row.
		P2 = 16;                  // 16-byte output step (used with P0 and P1).
		P3 = 32 (X);              // 32-byte output step (used with P0 and P1).
 	    P4 = -110 (X);            // Last P0 step per row: 3*32 + 16 - 110 = +2 bytes net.
		P5 = -62 (X);             // Last P1 step per row: 2*32 - 62 = +2 bytes net.
		P0 = 2;                   // Outer-loop count for LC0; P0 is reused as an
		                          // output pointer once the loop has been set up.
		MNOP;                     // No-op pair ahead of the loop setup (the reason
		NOP;                      // is not stated in this file).

/*
*	According to Chen's algorithm, first an 8-point IDCT is calculated for all
*	the 8 rows. The output of this calculation is stored in another matrix in
*   transposed form. Then the 8-point IDCT is applied again on all the 8 rows.
*   The output is again stored in transposed form. This is the final output.
*   Therefore, a loop of 2 iterations (IDCT_strt, IDCT_end) is set.
*
*   B0 points to the "in" buffer and B2 points to the "temp" buffer in the first
*   iteration. The input is read from the "in" buffer and the output is written
*   to the "temp" buffer. In the second iteration of IDCT_strt B0 points to
*   "temp" and B2 points to the "in" buffer. The input is read from the "temp"
*   buffer and the output is written to the "in" buffer. The "in" buffer holds
*   the final output.
*/

	lsetup(IDCT_strt, IDCT_end) LC0 = P0;    // P0 = 2 here: two passes.

	IDCT_strt: 
		I0 = B0;			        // I0 points to input element (0, 0)
		I2 = B0;                    // I2 points to input element (0, 0)
		I2 += M3 || R3.L = W[I0];	// I2 moves to element (0, 4); element 0 is read into R3.L
		I1 = I2; 				    // I1 = I2; it becomes element (0, 6) after the "+= 4" below
        I1 += 4  || R3.H = W[I2++];	// Element 4 is read into R3.H; I2 then points
                                    // to element (0, 5)
		I3 = B3;                    // I3 points to the start of the coefficients
		P0 = B2;                    // P0 points to array element (0, 0) for writing output
		P1 = B2;
		R7.L = 0x5a82;              // R7.L holds the coefficient C4.
		P1 = P1 + P2;               // P1 = output base + 16 bytes: array element (1, 0)
		                            // for writing output

/**************************** Implementation of Part 1 **********************************/
                       
/*
*	The following operation is done in 2 instructions.
*	A1 = Element 0 * cos(pi/4) 
*	A0 =  Element 0 * cos(pi/4)
*	A1 = A1 - Element 4 * cos(pi/4)
*	A0 = A0 + Element 4 * cos(pi/4)
*	At the same time the values of Element 6 and Element 2 are read into R1.L and
*	R1.H respectively, and the first coefficient pair is loaded into R7.
*/

		A1 = R3.L * R7.L, A0 = R3.L * R7.L	|| I0 += 4	|| R1.L = W[I1++];
		R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L)	|| R1.H = W[I0++] || R7 = [I3++];

/*
*	The following two instructions do -
*	A1 = Element 2 * cos(3pi/8) 
*	A0 =  Element 6 * cos(3pi/8)
*	A1 = A1 - Element 6 * cos(pi/8)
*	A0 = A0 + Element 2 * cos(pi/8)
*   R2 reads the input elements (5, 3).
*   R7 reads the coefficient values C5 and C3.
*/

		A1= R1.H * R7.L, A0 = R1.L * R7.L || I0 -= 4 || R2.L = W[I0];
		R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H) || R2.H = W[I2--] || R7 = [I3++];

/*
*	The following three instructions do -
*	A1 = Element 5 * cos(3pi/16) 
*	A0 =  Element 3 * cos(3pi/16)
*	A1 = A1 - Element 3 * cos(5pi/16)
*	A0 = A0 + Element 5 * cos(5pi/16)
*	Element 0 = (Element 0 + Element 6) / 2.
*	Element 4 = (Element 4 + Element 2) / 2.
*	Element 2 = (Element 4 - Element 2) / 2.
*	Element 6 = (Element 0 - Element 6) / 2.
*   R0 is loaded with the input elements (1, 7) at the same time.
*   The write W[P0] = R5.L is done for packing purposes. The register locations
*   for elements 4 and 6 are swapped.
*/

		A1 = R2.H * R7.L, A0 = R2.L * R7.L	|| R0.H = W[I0--] 	|| NOP;
		R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || R0.L = W[I1--] 	|| NOP;
		R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)	|| W[P0] = R5.L	|| R7 = [I3++];

/*
*	At the end of part 1 R0 has (1, 7), R5 has (2, 6), R2 has (5, 3) and
*	R3 has (4, 0). 
*   The notation (x, y) means that the element from column x is in the upper half
*   of the register and the element from column y is in the lower half.
*/

// The row loop is set up for 7 iterations. The last row is finished separately
// after the loop.
		P2 = M2;                    // P2 = 7, used only as the loop count.
	lsetup (Row_strt, Row_end) LC1 = P2;
		P2 = 16;                    // Restore P2 to the 16-byte output step.

	// Row loop body (7 iterations). Each iteration finishes Parts 2-4 for the
	// current row using the Part 1 results already held in registers, and
	// interleaves the loads and the Part 1 computation for the next row.
	Row_strt:

/**************************** Implementation of Part 2 **********************************/ 
/*
*   The following two instructions do the following job -
*	A1 = Element 1 * cos(7pi/16) 
*	A0 =  Element 7 * cos(7pi/16)
*	A1 = A1 - Element 7 * cos(pi/16)
*	A0 = A0 + Element 1 * cos(pi/16)
*   The read to R1 is a dummy; it only advances I1 by M1 (16 bytes) to the next
*   row. I0 is advanced to the next row as well.
*   R7 reads the coefficient value C2 and C6.
*   NOTE(review): the R7 value loaded here is the one used as the cos(pi/4)
*   factor in Part 3 below - confirm the coefficient naming above.
*/

		A1 = R0.H * R7.L, A0 = R0.L * R7.L 	|| R1 = [I1++M1] || W[P1] = R3.H; 
		R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) 	|| I0 += M1	|| R7 = [I3++];

/*
*	The following single instruction operates on 4 data as -
*	Element 1 = (Element 1 + Element 5) / 2.
*	Element 5 = (Element 1 - Element 5) / 2.
*	Element 3 = (Element 7 - Element 3) / 2.
*	Element 7 = (Element 7 + Element 3) / 2.
*   R1.L is loaded from I1 for the next row at the same time.
*/

		R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0] || R1.L = W[I1++];

/*
*	At the end of part 2 R0 has (1, 7), and R2 has (5, 3).
*   The registers R3.H and R5.L are being swapped through memory: R3.H was
*   stored to W[P1] above and is reloaded from W[P0] (written earlier from R5.L);
*   R5.L is reloaded from W[P1] in part 3.
*/

/**************************** Implementation of Part 3 **********************************/

/*
*  The following instruction does the following job.
*  Element 0 = Element 0 + Element 7.
*  Element 7 = Element 7 - Element 0.
*  Element 6 = Element 6 + Element 1.
*  Element 1 = Element 6 - Element 1.
*  The elements 0, 1, 6 and 7 are final.
*  I2 is advanced to the next row and R3.L is loaded from I0 for the next row.
*/

		R4 = R3 +|+ R0, R0 = R3 -|- R0 || I2 += M1 || R3.L = W[I0];

/*
*  The following two instructions do -
*  A1 = Element 3 * cos(pi/4) 
*  A0 =  Element 3 * cos(pi/4)
*  A1 = A1 - Element 5 * cos(pi/4)
*  A0 = A0 + Element 5 * cos(pi/4)
*  R3.H and R1.H are loaded for the next row at the same time.
*/

		A1 = R2.L * R7.L, A0 = R2.L * R7.L || I0 += 4 || R3.H = W[I2++];
		R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1] || R1.H = W[I0++];

/*
*	At the end of part 3 R0 has (1, 7), and R2 has (5, 3), R4 has (6, 0) 
*   and R5 has (2, 4). Registers R4 and R0 hold final output.
*/

/**************************** Implementation of Part 4 **********************************/
/*
*  It is the final stage computation.
*  Element 4 = Element 4 + Element 3.
*  Element 2 = Element 2 + Element 5.
*  Element 5 = Element 5 - Element 2.
*  Element 3 = Element 3 - Element 4.
*  The first output word (R4.L) is written and R2.L is loaded for the next row.
*/

		R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L	|| R2.L = W[I0--];
//R5 = 2, 4 and R6 = 5, 3 final
/*
*	At the end of part 4 R0 has (1, 7), and R4 has (6, 0), R5 has (2, 4) 
*   and R6 has (5, 3). All the registers hold final output.
*/

/**************************** Implementation of Part 1 **********************************/
/*
*  This is the same as part 1 specified earlier. The first time, the part 1
*  calculation is done outside the loop; afterwards it is done here. It serves two
*  purposes. First it computes part 1 for the next row, and second it writes the
*  data 2, 1, 3, 7, 4, 6 and 5 of the current row to their bit reversed order in
*  transposed form.
*
*  Over one iteration P0 is stepped by 3*P3 + P2 + P4 and P1 by 2*P3 + P5, so
*  each output pointer ends up 2 bytes further on.
*  The R2.H load from I0 in the first instruction below only steps I0 back by one
*  element; R2.H is loaded again from I2 two instructions later, before it is used.
*/
		A1 = R3.L * R7.L, A0 = R3.L * R7.L || W[P0++P3] = R5.H || R2.H = W[I0--];
		R3.H = (A1 -= R3.H * R7.L), R3.L = ( A0 += R3.H * R7.L)	|| W[P0++P3] = R0.H || R7 = [I3++];
		A1= R1.H * R7.L, A0 = R1.L * R7.L || W[P0++P2] = R6.L	|| R2.H = W[I2--];
		R1.H = (A1 -= R1.L * R7.H), R1.L = (A0 += R1.H * R7.H)	|| W[P0++P4] = R0.L || R7 = [I3++];
		A1 = R2.H * R7.L, A0 = R2.L * R7.L || W[P1++P3] = R5.L || R0.H = W[I0--];
		R3 = R3 +|+ R1, R5 = R3 -|- R1 (ASR) || W[P1++P3] = R4.H || R0.L = W[I1--];
		W[P0] = R5.L;               // Park R5.L in memory for the R3.H / R5.L swap.
	Row_end: R2.H = ( A1 -= R2.L * R7.H), R2.L = (A0 += R2.H * R7.H)|| W[P1++P5] = R6.H	|| R7 = [I3++];

/*
*  The computation for 7 rows is over. The last row computation is done here.
*  These are the same parts 2, 3 and 4 as done inside the loop, but without the
*  interleaved loads and part 1 computation for a following row.
*/

		A1 = R0.H * R7.L, A0 = R0.L * R7.L 	|| NOP 	|| W[P1] = R3.H; 
		R0.H = (A1 -= R0.L * R7.H), R0.L = (A0 += R0.H * R7.H) || R7 = [I3++] || NOP;
		R0 = R0 +|+ R2, R2 = R0 -|- R2 (ASR) || R3.H = W[P0] || NOP;
		R4 = R3 +|+ R0, R0 = R3 -|- R0;
		A1 = R2.L * R7.L, A0 = R2.L * R7.L;
		R2.H = (A1 -= R2.H * R7.L), R2.L = (A0 += R2.H * R7.L) || R5.L = W[P1] || NOP;
		R5 = R5 +|+ R2, R6 = R5 -|- R2 || W[P0++P3] = R4.L || NOP;

		W[P0++P3] = R5.H;       // The last outputs are written here, in the same
		W[P0++P3] = R0.H;       // order and with the same pointer steps as the
		W[P0++P2] = R6.L;       // stores inside the row loop.
		W[P0++P4] = R0.L;
		W[P1++P3] = R5.L;
		W[P1++P3] = R4.H;
		W[P1++P5] = R6.H;

		// Swap the roles of the two buffers for the next pass, using B1 as scratch.
		B1 = B2;               // B1 = buffer that was just written.
		B2 = B0;               // B2 (output) = buffer that was just read.
	IDCT_end: B0 = B1;         // B0 (input) = buffer that was just written.

/*************************************************************************************************/

	Terminate:
		// Function epilogue.
		// The prologue set L3 = 16 to make I3 circular over the coefficient table.
		// The Blackfin C run-time model expects the length registers L0-L3 to be
		// zero on return to compiled code, so L3 is cleared here (L0-L2 are already
		// zero from the prologue).
		L3 = 0;                 // Put I3 back into linear addressing mode.
		(R7:4,P5:3)=[SP++];     // Restore R7-R4 and P5-P3 saved in the prologue.
		RTS;                    // Return to the caller.
?? 文件大小 9 K
?? 上傳用戶 wg204wg
?? 所屬分類 DSP編程
??? 相關(guān)標(biāo)簽

#IDCT #離散余弦 #變換 #程序
?? 快捷鍵說(shuō)明

復(fù)制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號(hào) Ctrl + =
減小字號(hào) Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? r8x8invdct.asm

?? 快捷鍵說(shuō)明