?? inverse discrete cosine transform.txt
字號:
* r1 = X2*c6 - X6*c2; *
* r0 = X2*c2 + X6*c6; *
* *
* g0 = p0 + r0; *
* g1 = p1 + r1; *
* h1 = p1 - r1; *
* h0 = p0 - r0; *
* *
* /* ---------------------------------------------------- */ *
* /* Odd part of decomp. */ *
* /* ---------------------------------------------------- */ *
* g2 = (X1*c7 - X3*c5) + (X5*c3 - X7*c1); *
* g3 = (X1*c5 - X3*c1) + (X5*c7 + X7*c3); *
* h3 = (X1*c3 - X3*c7) - (X5*c1 + X7*c5); *
* h2 = (X1*c1 + X3*c3) + (X5*c5 + X7*c7); *
* *
* /* ---------------------------------------------------- */ *
* /* Final butterfly. */ *
* /* ---------------------------------------------------- */ *
* x0 = g0 + h2; *
* x1 = g1 + h3; *
* x2 = h1 + g3; *
* x3 = h0 + g2; *
* x4 = h0 - g2; *
* x5 = h1 - g3; *
* x6 = g1 - h3; *
* x7 = g0 - h2; *
* *
* /* ---------------------------------------------------- */ *
* /* Truncate and saturate final results. */ *
* /* ---------------------------------------------------- */ *
* x0t = x0 >> trunc2; *
* x1t = x1 >> trunc2; *
* x2t = x2 >> trunc2; *
* x3t = x3 >> trunc2; *
* x4t = x4 >> trunc2; *
* x5t = x5 >> trunc2; *
* x6t = x6 >> trunc2; *
* x7t = x7 >> trunc2; *
* *
* x0s = x0t < -256 ? -256 : x0t > 255 ? 255 : x0t; *
* x1s = x1t < -256 ? -256 : x1t > 255 ? 255 : x1t; *
* x2s = x2t < -256 ? -256 : x2t > 255 ? 255 : x2t; *
* x3s = x3t < -256 ? -256 : x3t > 255 ? 255 : x3t; *
* x4s = x4t < -256 ? -256 : x4t > 255 ? 255 : x4t; *
* x5s = x5t < -256 ? -256 : x5t > 255 ? 255 : x5t; *
* x6s = x6t < -256 ? -256 : x6t > 255 ? 255 : x6t; *
* x7s = x7t < -256 ? -256 : x7t > 255 ? 255 : x7t; *
* *
* /* ---------------------------------------------------- */ *
* /* Store the results transposed in the result area. */ *
* /* ---------------------------------------------------- */ *
* o_ptr[ 0] = x0s; *
* o_ptr[ 8] = x1s; *
* o_ptr[16] = x2s; *
* o_ptr[24] = x3s; *
* o_ptr[32] = x4s; *
* o_ptr[40] = x5s; *
* o_ptr[48] = x6s; *
* o_ptr[56] = x7s; *
* *
* o_ptr++; /* increment ptr to next column */ *
* } *
* /* -------------------------------------------------------- */ *
* /* Update output pointer to point to next block. */ *
* /* -------------------------------------------------------- */ *
* o_ptr = o_ptr - 8 + 64; *
* } *
* } *
* *
* *
* Note: This code guarantees correct operation, even in the case *
* that 'num_idcts == 0'. In that case, the function runs for only *
* 35 cycles (counting 6 cycles of function-call overhead), due to *
* early-exit code. The early-exit case performs no accesses to the *
* idct_data[] array. *
* *
* TECHNIQUES *
* All levels of looping are collapsed into single loops which are *
* pipelined. The outer loop focuses on 8-pt IDCTs, whereas the *
* inner loop controls the column-pointer to handle jumps between *
* IDCT blocks. *
* *
* For performance, portions of the code outside the loops have been *
* inter-scheduled with the prolog and epilog code of the loops. *
* Also, twin stack-pointers are used to accelerate stack accesses. *
* Finally, pointer values and cosine term registers are reused *
* between the horizontal and vertical loops to save the need for *
* messy pointer and constant reinitialization. *
* *
* To save codesize, prolog and epilog collapsing have been performed *
* to the extent that it does not impact performance. Also, code *
* outside the loops has been scheduled to pack as tightly into *
* fetch packets as possible to avoid alignment padding NOPs. *
* *
* The IDCTs cannot be performed completely in-place due to the *
* transpose that each pass performs. In order to save data memory, *
* the horizontal pass works from the end of the array towards the *
* beginning, writing its result one IDCT block later in memory, *
* thus performing the IDCT nearly-in-place. The vertical pass *
* performs its IDCTs in the opposite direction, working from the *
* start of the array towards the end, writing the results in-place. *
* A nice side effect of this is that the pointer values at the *
* end of the horizontal loop are a fixed offset relative to their *
* required values for the vertical loop, regardless of the number *
* of IDCTs performed. This makes the pointer reinitialization *
* exceptionally cheap. *
* *
* Additional section-specific optimization notes are provided below. *
* *
* ASSUMPTIONS *
* The input array must be aligned on a word boundary, and one *
* extra block's worth of storage must be present after the list *
* of IDCT input blocks. *
* *
* MEMORY NOTE *
* No bank conflicts occur. The code requires 16 words of stack *
* space to save Save-On-Entry (SOE) registers, CSR, IRP, and a *
* spill value. For correct operation, the input array must be *
* aligned to a word boundary. *
* *
* Bank usage on C6201: *
* *
* Horiz loop accesses: 1 of 4 banks for 80% of cycles *
* 4 of 4 banks for 20% of cycles *
* *
* Vert loop accesses: 1 of 4 banks for 73% of cycles *
* 4 of 4 banks for 18% of cycles *
* 0 of 4 banks for 9% of cycles *
* *
* NOTES *
* This is a LITTLE ENDIAN implementation. *
* *
* This code masks interrupts for nearly its entire duration. *
* Interrupts are locked out for '53 + 168 * num_idcts' cycles. As *
* a result, the code is interrupt-tolerant, but not interruptible. *
* *
* The cosine terms have all been scaled by sqrt(2), so that the *
* "c4" term is basically an even power of 2. *
* *
* The precision of the final results can be changed by modifying *
* the constants at the top of the code and reassembling. Usually, *
* modifying the final-shift constants in the "Symbolic Constants" *
* section is sufficient. *
* *
* SOURCE *
* The IDCT form used is the Even-Odd Decomposition IDCT. *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 1999 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; ----------------------------------------------------------------------------
; Section setup: emit the copyright notice into its own data subsection, then
; switch to the code subsection and export the routine's entry symbol.
; NOTE(review): the directives below are flush-left in this copy. The TI
; assembler normally treats text that starts in column 1 as a label, so the
; original indentation was presumably lost in extraction -- confirm before
; assembling.
; ----------------------------------------------------------------------------
.sect ".data:copyright_h" ; subsection holding the copyright text
_Copyright: .string "Copyright (C) 1999 Texas Instruments Incorporated. "
.string "All Rights Reserved." ; continues the string started at _Copyright
.sect ".text:hand" ; subsection for the code that follows
.global _idct_8x8_asm ; make the entry label visible to the linker
_idct_8x8_asm:
; ============================ SYMBOLIC CONSTANTS ============================
; Build-time tuning constants, defined as .asg substitution symbols.
;
; Cosine terms: per the NOTES in the file header, every term is scaled by
; sqrt(2), and q_pt below gives the fixed-point position (11 fractional bits).
; With that scaling c4 becomes exactly 0x0800 == 1 << q_pt, which is why the
; header calls it "basically an even power of 2".
.asg 0x0B19, cst_c1 ; Cosine term c1 (Q11, scaled by sqrt(2))
.asg 0x0A74, cst_c2 ; Cosine term c2 (Q11, scaled by sqrt(2))
.asg 0x0968, cst_c3 ; Cosine term c3 (Q11, scaled by sqrt(2))
.asg 0x0800, cst_c4 ; Cosine term c4 (Q11, scaled by sqrt(2)); equals 1 << q_pt
.asg 0x0649, cst_c5 ; Cosine term c5 (Q11, scaled by sqrt(2))
.asg 0x0454, cst_c6 ; Cosine term c6 (Q11, scaled by sqrt(2))
.asg 0x0235, cst_c7 ; Cosine term c7 (Q11, scaled by sqrt(2))
.asg 11, q_pt ; Q-point for calculations (number of fractional bits)
; kq_a / kq_b: constants used to perform the c4 "multiply" by bit-field
; extraction. NOTE(review): presumably the two shift operands of an EXT
; instruction -- the use site is outside this view, confirm there.
.asg 16, kq_a ; Extract const for c4 "mpy"
.asg 16-q_pt, kq_b ; Extract const for c4 "mpy"; 16 - 11 = 5 with q_pt = 11
.asg 9, trunc1 ; Truncation (right-shift) after horizontal pass
; 'results' is the knob the header suggests changing to alter output
; precision; 9 bits matches the -256..255 saturation range shown in the
; C model in the header. trunc2 and satl are derived from it.
.asg 9, results ; Final precision of results, in bits
.asg 32-results, trunc2 ; Final truncation right-shift; 32 - 9 = 23
.asg 16-results, satl ; Final saturation left-shift; 16 - 9 = 7
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR HORIZ LOOP ===============
; Symbolic names for the registers used by the horizontal-pass loop. The A_ /
; B_ prefix of each name matches the register file (A-side / B-side) of the
; register it aliases. "packed" marks a register carrying two values at once
; (presumably two 16-bit halves of one 32-bit register -- confirm against the
; load/multiply code, which is outside this view).
;
; NOTE(review): B15 and B14 are conventionally the stack pointer and data-page
; pointer in the TI C6000 C runtime. Their reuse here as B_i_ptr and B_c6c2
; presumably relies on the register save/restore and interrupt masking
; described in the file header -- confirm in the prolog/epilog code.
.asg B13, B_c7c5 ; Cosine terms c7, c5 (packed), B-side copy
.asg A13, A_c7c5 ; Cosine terms c7, c5 (packed), A-side copy
.asg B12, B_c3c1 ; Cosine terms c3, c1 (packed), B-side copy
.asg A12, A_c3c1 ; Cosine terms c3, c1 (packed), A-side copy
.asg B14, B_c6c2 ; Cosine terms c6, c2 (packed)
.asg A14, A_i_ptr ; Input pointer #1
.asg B15, B_i_ptr ; Input pointer #2
.asg A11, A_o_ptr ; Output pointer #1
.asg B11, B_o_ptr ; Output pointer #2
.asg B2, B_o ; Outer loop counter
.asg A5, A_X1X0 ; Incoming coefs X1, X0 (packed)
.asg A10, A_X3X2 ; Incoming coefs X3, X2 (packed)
.asg B7, B_X5X4 ; Incoming coefs X5, X4 (packed)
.asg B10, B_X7X6 ; Incoming coefs X7, X6 (packed)
; Partial products of the even part; in the header's C model
; r1 = X2*c6 - X6*c2 and r0 = X2*c2 + X6*c6.
.asg A7, A_X2c6 ; X2 * c6 (term of r1)
.asg B0, B_X6c2 ; X6 * c2 (term of r1)
.asg A0, A_X2c2 ; X2 * c2 (term of r0)
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -