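; inverse discrete cosine transform.txt
; (The line that followed was a "font size" control label from the code viewer
; this listing was captured from; it is not part of the program.)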
; Symbolic register names (.asg substitution symbols) for the partial
; products, signal-flow-graph nodes and outputs used by the loop code below.
; Several symbols intentionally name the same physical register (for example
; A15 is A_h1, A_h0 and A_x5; B3 is B_g1, B_X5c5, B_X7c3, B_h2b, B_x7 and
; B_x6t). Presumably their live ranges never overlap in the hand schedule --
; verify against the kernel before reassigning any of them.
.asg B1, B_X6c6 ; X6 * c6
.asg A6, A_P0 ; Node P0 in signal flow graph
.asg B8, B_P1 ; Node P1 in signal flow graph
.asg A8, A_p0 ; Node p0 in signal flow graph
.asg A0, A_p1 ; Node p1 in signal flow graph
.asg B0, B_r1 ; Node r1 in signal flow graph
.asg B4, B_r0 ; Node r0 in signal flow graph
.asg B7, B_g0 ; Node g0 in signal flow graph
.asg B3, B_g1 ; Node g1 in signal flow graph
.asg A15, A_h1 ; Node h1 in signal flow graph
.asg A15, A_h0 ; Node h0 in signal flow graph
; Odd-coefficient partial products (Xn * ck).
.asg A3, A_X1c1 ; X1 * c1
.asg A0, A_X1c3 ; X1 * c3
.asg A3, A_X1c5 ; X1 * c5
.asg A9, A_X1c7 ; X1 * c7
.asg A9, A_X3c1 ; X3 * c1
.asg A0, A_X3c3 ; X3 * c3
.asg A5, A_X3c5 ; X3 * c5
.asg A5, A_X3c7 ; X3 * c7
.asg B0, B_X5c1 ; X5 * c1
.asg B4, B_X5c3 ; X5 * c3
.asg B3, B_X5c5 ; X5 * c5
.asg B6, B_X5c7 ; X5 * c7
.asg B0, B_X7c1 ; X7 * c1
.asg B3, B_X7c3 ; X7 * c3
.asg B9, B_X7c5 ; X7 * c5
.asg B1, B_X7c7 ; X7 * c7
; Intermediate sums: the "a" half combines the X1/X3 products (A side), the
; "b" half combines the X5/X7 products (B side).
.asg A7, A_g2a ; X1 * c7 - X3 * c5
.asg B8, B_g2b ; X5 * c3 - X7 * c1
.asg A6, A_g2 ; Node g2 in signal flow graph
.asg A3, A_g3a ; X1 * c5 - X3 * c1
.asg B6, B_g3b ; X5 * c7 + X7 * c3
.asg A4, A_g3 ; Node g3 in signal flow graph
.asg A6, A_h3a ; X1 * c3 - X3 * c7
.asg B7, B_h3b ; X5 * c1 + X7 * c5
.asg B5, B_h3n ; Node h3, negated.
.asg A0, A_h2a ; X1 * c1 + X3 * c3
.asg B3, B_h2b ; X5 * c5 + X7 * c7
.asg B1, B_h2 ; Node h2 in signal flow graph
; Outputs before and after truncation to 16 bits.
.asg B4, B_x0 ; Output x0, pre-truncation
.asg B0, B_x1 ; Output x1, pre-truncation
.asg A4, A_x2 ; Output x2, pre-truncation
.asg A4, A_x3 ; Output x3, pre-truncation
.asg A7, A_x4 ; Output x4, pre-truncation
.asg A15, A_x5 ; Output x5, pre-truncation
.asg B6, B_x6 ; Output x6, pre-truncation
.asg B3, B_x7 ; Output x7, pre-truncation
.asg B4, B_x0t ; Output x0, truncated to 16 bits
.asg B5, B_x1t ; Output x1, truncated to 16 bits
.asg A4, A_x2t ; Output x2, truncated to 16 bits
.asg A8, A_x3t ; Output x3, truncated to 16 bits
.asg A7, A_x4t ; Output x4, truncated to 16 bits
.asg A5, A_x5t ; Output x5, truncated to 16 bits
.asg B3, B_x6t ; Output x6, truncated to 16 bits
.asg B9, B_x7t ; Output x7, truncated to 16 bits
.asg A2, A_i ; Inner-loop counter.
; ============================================================================
* ========================================================================= *
* Initialization code for horizontal loop: Saves registers to *
* the stack, sets up cosine terms, pointers and loop control. *
* *
* The stack frame for this code is 16 words large. It holds the Save *
* on Entry (SOE) registers A10..A15, B10..B14, as well as the return *
* address (B3), CSR, IRP, and a single spill value. (The loop counter *
* initializer is shared between both loops and so I spill it to the *
* stack.) I twin the stack pointer to speed up stack accesses. The *
* stack frame layout is slightly funky to avoid bank conflicts while *
* allowing me to get to everything when I need it most. *
* *
* The horizontal loop starts at the end of the IDCT array and works back *
* towards the beginning. As a result, the input and output pointers are *
* initialized like so: *
* *
* -- A_i_ptr is set to point to the coefficients "X0" and "X1" in the *
* last row of the last valid IDCT block in the input. B_i_ptr is *
* set to point to the coefficients "X4" and "X5" in that same row. *
* *
* -- A_o_ptr is set to point to the coefficient "x4" in the rightmost *
* column of the scratch block I require at the end of the array. *
* B_o_ptr is set to point to "x3" in that same column. *
* *
* The loop count is simply the number of IDCTs times 8, minus 1 to *
* handle the parallel iterations in the kernel. (It would've been more, *
* except that I've performed some limited prolog and epilog collapsing, *
* so I need to iterate the kernel more times.) A happy coincidence *
* gives both horizontal and vertical loops the exact same trip count, *
* so I spill this value to the stack and simply restore it unchanged *
* for the second loop, rather than recalculating it. *
* *
* Since I was able to free up a single predication register in the first *
* loop, I prolog-collapsed one stage of the prolog. I use A1 as my *
* prolog-collapse fuse. To save an MVK (since this code bottlenecks *
* heavily on S units), I initialize it to -1 with an OR, rather than a *
* more traditional 1. *
* *
* Both loops use all 32 registers, so I have saved the stack pointer in *
* IRP. This is safe since interrupts are explicitly disabled for the *
* entire function. *
* *
* Note: This setup code could possibly be a cycle or two faster. For *
* instance, I could copy B15 to A15 before the decrement and use *
* negative indexes for the STWs through A15, saving a whole cycle on *
* the stack saves. The resulting code doesn't pack as nicely, though. *
* ========================================================================= *
;-
; Entry/setup code for the horizontal loop: builds the stack frame, disables
; interrupts, and initializes the loop counters and the input/output pointers.
;
; Stack frame as written by the stores in this file (word index relative to
; B15 after the 16-word decrement; A15 is a copy of B15 so the D1 and D2
; units can store in parallel):
;   [16] A15 (written at the pre-decrement B15)
;   [14] IRP (stored later, by the "save IRP" store in h_prolog)
;   [13] A14   [12] B14   [11] B13   [10] A13
;   [ 9] A12   [ 8] B12   [ 7] A11   [ 6] B11
;   [ 5] A10   [ 4] B10   [ 3] CSR (value before GIE was cleared)
;   [ 2] B3    [ 1] loop-count initializer (B_o)
;
; Inputs as used here: B4 is the IDCT count (B_o = B4 * 8) and A4 is the base
; to which B4 * 128 is added to reach the scratch area -- presumably the
; pointer to the coefficient array; confirm against the C prototype.
STW .D2T1 A15, *B15--[16] ; Save A15, get stack frame
|| MVC .S2 CSR, B0 ; Grab the current CSR
AND .L2 B0, -2, B1 ; Clear GIE bit in CSR
|| MV .L1X B15, A15 ; Twin the stack pointer
; The unmodified CSR is copied to A0 so it can be stored through A15 further
; down; the copy with bit 0 cleared is what gets written back to CSR.
STW .D1T1 A14, *+A15 [13] ; Save SOE reg A14
|| STW .D2T2 B14, *+B15 [12] ; Save SOE reg B14
|| MV .L1X B0, A0 ; Partitioning MV.
|| MVC .S2 B1, CSR ; Interrupts disabled here
;-
STW .D1T1 A13, *+A15 [10] ; Save SOE reg A13
|| STW .D2T2 B13, *+B15 [11] ; Save SOE reg B13
STW .D1T1 A12, *+A15 [ 9] ; Save SOE reg A12
|| STW .D2T2 B12, *+B15 [ 8] ; Save SOE reg B12
; B_o = B4 << 3. ORing A1 with -1 always yields -1, so A1's previous
; contents do not matter.
STW .D1T1 A11, *+A15 [ 7] ; Save SOE reg A11
|| STW .D2T2 B11, *+B15 [ 6] ; Save SOE reg B11
|| SHL .S2 B4, 3, B_o ; Set up outer loop counter
|| OR .L1 A1, -1, A1 ; Prolog collapse counter
;-
; B4 becomes the IDCT count << 7 (a byte offset of 128 per IDCT). The
; decrement is predicated on B_o, so a zero count stays zero and the
; [!B_o] branch in h_prolog can take the abort path.
STW .D1T1 A10, *+A15 [ 5] ; Save SOE reg A10
|| STW .D2T2 B10, *+B15 [ 4] ; Save SOE reg B10
|| SHL .S2 B4, 7, B4 ; Set up end-of-array ptr
||[B_o] SUB .L2 B_o, 1, B_o ; Loop count = IDCTs*8 - 1
; A0 still holds the CSR value read on entry. IRP is read into B0 here so
; its old value is available to be saved before IRP is overwritten below.
STW .D2T2 B3, *+B15 [ 2] ; Remember the return addr
|| STW .D1T1 A0, *+A15 [ 3] ; Remember the CSR state
|| ADD .L2X A4, B4, B4 ; Point to scratch area
|| MVC .S2 IRP, B0
;-
; Pointer setup, all relative to B4 (= A4 + IDCTs * 128):
;   B_i_ptr = B4 - 8 bytes, A_i_ptr = B_i_ptr - 8 bytes (input side)
;   A_o_ptr = B4 + 78 bytes, B_o_ptr = B4 + 31 halfwords (output side)
STW .D2T2 B_o, *+B15 [ 1] ; Spill our loop count init
|| MVC .S2 B15, IRP ; Save stack ptr in IRP
|| SUB .L2 B4, 8, B_i_ptr ; Point to X5X4, row 7
|| MV .L1X B4, A_o_ptr
|| MVK .S1 7, A_i ; Set up inner loop counter
SUB .L1X B_i_ptr, 8, A_i_ptr ; Point to X1X0, row 7
|| ADDAH .D2 B4, 31, B_o_ptr ; Point to x3, col 7
|| ADDK .S1 78, A_o_ptr ; Point to x4, col 7
;-
; ============================ PIPE LOOP PROLOG ==============================
; Prolog of the software-pipelined horizontal loop. It issues the first
; loads and multiplies, builds the packed cosine constants (A_c3c1, A_c7c5,
; B_c6c2, B_c7c5, with B_c3c1 copied from A_c3c1), stores the old IRP value
; into the stack frame, and branches to idct_8x8_abort when B_o is zero.
; The loads are predicated on B_o, so nothing is read on the abort path.
; The [n,m] tags in the trailing comments look like scheduler cycle/iteration
; markers -- TODO confirm.
h_prolog:
[ B_o]LDW .D1T1 * A_i_ptr--[4], A_X1X0 ;[ 1,1]
||[ B_o]LDW .D2T2 *+B_i_ptr[1], B_X7X6 ;[ 1,1]
|| MVK .S1 cst_c1, A_c3c1 ; c1
||[!B_o]B .S2 idct_8x8_abort ; Abort if num_idcts == 0
[ B_o]LDW .D1T1 *+A_i_ptr[5], A_X3X2 ;[ 2,1]
||[ B_o]LDW .D2T2 * B_i_ptr--[4], B_X5X4 ;[ 2,1]
|| MVK .S1 cst_c5, A_c7c5 ; c5
|| MVK .S2 cst_c2, B_c6c2 ; c2
;-
; B0 holds the IRP value read by the setup code (MVC IRP, B0).
; NOTE(review): this store is written `*A15[14]` while every other frame
; store uses the `*+A15 [n]` form -- confirm the assembler treats the two
; identically.
STW .D1T2 B0, *A15[14] ; save IRP
; Each MVK above/below sets the low half of a constant pair and the matching
; MVKLH fills the high half, giving the packed pairs named c7c5, c6c2, c3c1.
MVKLH .S1 cst_c7, A_c7c5 ; c7
|| MVKLH .S2 cst_c6, B_c6c2 ; c6
MVKLH .S1 cst_c3, A_c3c1 ; c3
|| MVK .S2 cst_c5, B_c7c5 ; c5
MPYH .M1 A_X1X0, A_c7c5, A_X1c7 ;[ 6,1]
|| MPYLH .M2 B_X7X6, B_c6c2, B_X6c6 ;[ 6,1]
|| MVKLH .S2 cst_c7, B_c7c5 ; c7
; ===== Branch Occurs =====
; (The conditional branch to idct_8x8_abort was issued in the first execute
; packet of h_prolog; five execute packets follow it before this point.)
;-
EXT .S1 A_X1X0, kq_a, kq_b, A_P0 ;[ 7,1]
|| MPY .M1X A_X3X2, B_c6c2, A_X2c2 ;[ 7,1]
|| MPYHL .M2 B_X7X6, B_c7c5, B_X7c5 ;[ 7,1]
|| MV .L2X A_c3c1, B_c3c1
ADDK .S1 256, A_P0 ;[ 8,1]
|| EXT .S2 B_X5X4, kq_a, kq_b, B_P1 ;[ 8,1]
|| MPYHL .M1 A_X1X0, A_c3c1, A_X1c1 ;[ 8,1]
|| MPYH .M2 B_X7X6, B_c7c5, B_X7c7 ;[ 8,1]
;-
; ============================ PIPE LOOP KERNEL ==============================
h_loop:
h_loop_0:
SUB .L2 B_g1, B_h3n, B_x1 ;[19,1]
|| STH .D2T2 B_x0t, *-B_o_ptr[24] ;[19,1]
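; ---- Code-viewer UI text captured with this listing (not part of the program) ----
; Keyboard shortcuts
;   Copy code            Ctrl + C
;   Search code          Ctrl + F
;   Full-screen mode     F11
;   Switch theme         Ctrl + Shift + D
;   Show shortcuts       ?
;   Increase font size   Ctrl + =
;   Decrease font size   Ctrl + -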