?? inverse discrete cosine transform.txt
字號:
|| ADD .D1 A_i, 1, A_i ;[19,1]
|| SHR .S1 A_x3, trunc1, A_x3t ;[19,1]
|| ADD .L1X A_g3a, B_g3b, A_g3 ;[19,1]
|| ADD .S2X A_X2c2, B_X6c6, B_r0 ;[ 9,2]
|| MPYH .M1 A_X3X2, A_c3c1, A_X3c3 ;[ 9,2]
|| MPYHL .M2 B_X5X4, B_c7c5, B_X5c5 ;[ 9,2]
; Kernel execute packet h_loop_1 of the horizontal-pass loop.
; NOTE(review): the trailing ;[c,n] tags appear to give the cycle within a
; single iteration's schedule (c) and which in-flight iteration the
; instruction belongs to (n) -- confirm against the file header.
;   Butterflies:  x6 = g1 + h3n,  x2 = h1 + g3,  x5 = h1 - g3,
;                 p0 = P0 + P1 (B_P1 read over the cross path).
;   Store:        A_x3t is written through B_o_ptr only when A1 is zero;
;                 the pointer is then post-decremented by one halfword.
;   Multiplies:   MPYHL takes the high half of the first operand and the low
;                 half of the second, giving X1*c5 and X7*c1.
h_loop_1:
ADD .L2 B_g1, B_h3n, B_x6 ;[20,1]
||[!A1] STH .D2T1 A_x3t, * B_o_ptr--[1] ;[20,1]
|| ADD .S1 A_h1, A_g3, A_x2 ;[20,1]
|| SUB .D1 A_h1, A_g3, A_x5 ;[20,1]
|| ADD .L1X A_P0, B_P1, A_p0 ;[10,2]
|| MPYHL .M1 A_X1X0, A_c7c5, A_X1c5 ;[10,2]
|| MPYHL .M2 B_X7X6, B_c3c1, B_X7c1 ;[10,2]
; Kernel execute packet h_loop_2.
;   Scaling:     x5 and x1 are shifted right by trunc1 to form the values
;                that are later stored (A_x5t, B_x1t).
;   Butterflies: h2a = X1c1 + X3c3,  h2b = X5c5 + X7c7.
;   Multiplies:  MPYH is high half times high half, giving X1*c3 and X5*c7.
;   Loads:       A_X1X0 is fetched from *A_i_ptr, which is then
;                post-decremented by 4 words; B_X7X6 is fetched from one word
;                above B_i_ptr without modifying the pointer.
h_loop_2:
SHR .S1 A_x5, trunc1, A_x5t ;[21,1]
|| SHR .S2 B_x1, trunc1, B_x1t ;[21,1]
|| ADD .L1 A_X1c1, A_X3c3, A_h2a ;[11,2]
|| ADD .L2 B_X5c5, B_X7c7, B_h2b ;[11,2]
|| MPYH .M1 A_X1X0, A_c3c1, A_X1c3 ;[11,2]
|| MPYH .M2 B_X5X4, B_c7c5, B_X5c7 ;[11,2]
|| LDW .D1T1 * A_i_ptr--[4], A_X1X0 ;[ 1,3]
|| LDW .D2T2 *+B_i_ptr[1], B_X7X6 ;[ 1,3]
; Kernel execute packet h_loop_3.
;   Scaling:     x6 and x2 are shifted right by trunc1 (B_x6t, A_x2t).
;   Butterflies: h0 = p0 - r0,  h2 = h2a + h2b (both use a cross path).
;   Multiplies:  MPYH (high x high) gives X3*c7 and X5*c3.
;   Loads:       A_X3X2 comes from word index 5 relative to A_i_ptr, i.e. the
;                pointer value left by the --[4] in h_loop_2; B_X5X4 comes
;                from *B_i_ptr, which is then post-decremented by 4 words.
h_loop_3:
SHR .S2 B_x6, trunc1, B_x6t ;[22,1]
|| SHR .S1 A_x2, trunc1, A_x2t ;[22,1]
|| SUB .L1X A_p0, B_r0, A_h0 ;[12,2]
|| ADD .L2X A_h2a, B_h2b, B_h2 ;[12,2]
|| MPYH .M1 A_X3X2, A_c7c5, A_X3c7 ;[12,2]
|| MPYH .M2 B_X5X4, B_c3c1, B_X5c3 ;[12,2]
|| LDW .D1T1 *+A_i_ptr[5], A_X3X2 ;[ 2,3]
|| LDW .D2T2 * B_i_ptr--[4], B_X5X4 ;[ 2,3]
; Kernel execute packet h_loop_4 -- carries the loop-back branch.
;   Branch:      taken to h_loop while B_o is nonzero. A C6x branch has five
;                delay slots, which are filled by packets h_loop_5..h_loop_9.
;   Counter:     B_o is decremented in the same packet, also only while it is
;                nonzero; the branch condition sees the value from before
;                this decrement.
;   Index wrap:  A_i is reduced modulo 8 (AND with 7) when A1 is zero.
;   Store:       A_x5t goes to halfword index +8 from A_o_ptr.
;   Scaling:     x4 is shifted right by trunc1 (A_x4t).
;   Butterfly:   g0 = p0 + r0 (A_p0 read over the cross path).
;   Multiplies:  MPYHL (high x low) gives X3*c5 and X5*c1.
h_loop_4:
[ B_o]B .S2 h_loop ;[23,1]
|| STH .D1T1 A_x5t, *+A_o_ptr[8] ;[23,1]
|| SHR .S1 A_x4, trunc1, A_x4t ;[23,1]
|| ADD .L2X A_p0, B_r0, B_g0 ;[13,2]
||[ B_o]SUB .D2 B_o, 1, B_o ;[13,2]
||[!A1] AND .L1 A_i, 7, A_i ;[13,2]
|| MPYHL .M1 A_X3X2, A_c7c5, A_X3c5 ;[13,2]
|| MPYHL .M2 B_X5X4, B_c3c1, B_X5c1 ;[13,2]
; Kernel execute packet h_loop_5 (first branch delay slot of h_loop_4).
;   Store:       A_x4t is written through A_o_ptr only when A1 is zero; the
;                pointer is then post-decremented by one halfword.
;   Butterflies: h3a = X1c3 - X3c7,  p1 = P0 - P1,  x0 = g0 + h2,
;                g2b = X5c3 - X7c1.
;   Multiplies:  MPYHL (high x low) gives X3*c1; MPY (low x low) gives X6*c2.
h_loop_5:
[!A1] STH .D1T1 A_x4t, * A_o_ptr--[1] ;[24,1]
|| SUB .S1 A_X1c3, A_X3c7, A_h3a ;[14,2]
|| SUB .L1X A_P0, B_P1, A_p1 ;[14,2]
|| ADD .S2 B_g0, B_h2, B_x0 ;[14,2]
|| SUB .L2 B_X5c3, B_X7c1, B_g2b ;[14,2]
|| MPYHL .M1 A_X3X2, A_c3c1, A_X3c1 ;[14,2]
|| MPY .M2 B_X7X6, B_c6c2, B_X6c2 ;[14,2]
; Kernel execute packet h_loop_6.
;   Store:       B_x6t goes to halfword index +17 from A_o_ptr.
;   Butterflies: x7 = g0 - h2,  g2a = X1c7 - X3c5,  h3b = X5c1 + X7c5.
;   Scaling:     x0 is shifted right by trunc1 (B_x0t).
;   Multiplies:  MPYLH (low x high) gives X2*c6, reading B_c6c2 over the
;                cross path; MPYH (high x high) gives X7*c3.
h_loop_6:
STH .D1T2 B_x6t, *+A_o_ptr[17] ;[25,1]
|| SUB .D2 B_g0, B_h2, B_x7 ;[15,2]
|| SHR .S2 B_x0, trunc1, B_x0t ;[15,2]
|| SUB .S1 A_X1c7, A_X3c5, A_g2a ;[15,2]
|| ADD .L2 B_X5c1, B_X7c5, B_h3b ;[15,2]
|| MPYLH .M1X A_X3X2, B_c6c2, A_X2c6 ;[15,2]
|| MPYH .M2 B_X7X6, B_c3c1, B_X7c3 ;[15,2]
; A1 is incremented only while it is nonzero, so once it reaches zero it
; stays there and every [!A1]-predicated instruction in the loop becomes
; active. NOTE(review): presumably A1 is preloaded with a negative count so
; that stores are suppressed while the pipeline fills -- confirm in the prolog.
||[ A1] ADD .L1 A1, 1, A1
; Kernel execute packet h_loop_7.
;   Pointer fix: when A_i is zero, A_o_ptr is moved back by 28 words (SUBAW
;                scales its constant by the word size).
;                NOTE(review): A_i is wrapped modulo 8 in h_loop_4, so this
;                looks like a once-per-8-iterations rewind -- confirm.
;   Store:       B_x1t goes to halfword index -15 from B_o_ptr.
;   Scaling:     x7 is shifted right by trunc1 (B_x7t).
;   Butterflies: g3a = X1c5 - X3c1,  h3n = h3b - h3a,  g2 = g2a + g2b.
;   Multiplies:  MPYH (high x high) gives X1*c7; MPYLH (low x high) gives
;                X6*c6.
h_loop_7:
[!A_i]SUBAW .D1 A_o_ptr, 28, A_o_ptr ;[26,1]
|| STH .D2T2 B_x1t, *-B_o_ptr[15] ;[26,1]
|| SHR .S2 B_x7, trunc1, B_x7t ;[16,2]
|| SUB .L1 A_X1c5, A_X3c1, A_g3a ;[16,2]
|| SUB .L2X B_h3b, A_h3a, B_h3n ;[16,2]
|| ADD .S1X A_g2a, B_g2b, A_g2 ;[16,2]
|| MPYH .M1 A_X1X0, A_c7c5, A_X1c7 ;[ 6,3]
|| MPYLH .M2 B_X7X6, B_c6c2, B_X6c6 ;[ 6,3]
; Kernel execute packet h_loop_8.
;   Store:       A_x2t goes to halfword index -7 from B_o_ptr.
;   Butterflies: x3 = h0 + g2,  x4 = h0 - g2,  r1 = X2c6 - X6c2.
;   Extracts:    EXT pulls a signed bit field out of A_X1X0 and B_X5X4 using
;                the constants kq_a / kq_b (defined outside this excerpt),
;                producing P0 and P1.
;                NOTE(review): presumably these isolate and pre-scale the
;                X0 and X4 halves -- confirm against the kq_a/kq_b values.
;   Multiplies:  MPY (low x low) gives X2*c2 via the cross path; MPYHL
;                (high x low) gives X7*c5.
h_loop_8:
STH .D2T1 A_x2t, *-B_o_ptr[7] ;[27,1]
|| ADD .L1 A_h0, A_g2, A_x3 ;[17,2]
|| SUB .D1 A_h0, A_g2, A_x4 ;[17,2]
|| SUB .L2X A_X2c6, B_X6c2, B_r1 ;[17,2]
|| EXT .S1 A_X1X0, kq_a, kq_b, A_P0 ;[ 7,3]
|| EXT .S2 B_X5X4, kq_a, kq_b, B_P1 ;[ 7,3]
|| MPY .M1X A_X3X2, B_c6c2, A_X2c2 ;[ 7,3]
|| MPYHL .M2 B_X7X6, B_c7c5, B_X7c5 ;[ 7,3]
; Kernel execute packet h_loop_9 -- fifth and last delay slot of the branch
; issued in h_loop_4; control returns to h_loop after this packet when that
; branch was taken, otherwise execution falls through to h_epilog.
;   Pointer fix: when A_i is zero, B_o_ptr is moved back by 28 words.
;   Store:       B_x7t goes to halfword index +24 from A_o_ptr.
;   Butterflies: g1 = p1 + r1,  h1 = p1 - r1,  g3b = X5c7 + X7c3.
;   Bias:        256 is added to A_P0.
;                NOTE(review): looks like the rounding term for the later
;                right shift by trunc1 -- confirm against trunc1's value.
;   Multiplies:  MPYHL (high x low) gives X1*c1; MPYH (high x high) gives
;                X7*c7.
h_loop_9:
[!A_i]SUBAW .D2 B_o_ptr, 28, B_o_ptr ;[28,1]
|| STH .D1T2 B_x7t, *+A_o_ptr[24] ;[18,2]
|| ADD .S2X A_p1, B_r1, B_g1 ;[18,2]
|| SUB .L1X A_p1, B_r1, A_h1 ;[18,2]
|| ADD .L2 B_X5c7, B_X7c3, B_g3b ;[18,2]
|| ADDK .S1 256, A_P0 ;[ 8,3]
|| MPYHL .M1 A_X1X0, A_c3c1, A_X1c1 ;[ 8,3]
|| MPYH .M2 B_X7X6, B_c7c5, B_X7c7 ;[ 8,3]
; ============================ PIPE LOOP EPILOG ==============================
; Epilog of the horizontal-pass loop: finishes the work tagged [19..23] for
; the iteration still in flight when the kernel exits (every tag here is ,3).
; Unlike the kernel, the stores are unpredicated and never post-modify the
; output pointers, so their indices differ from the kernel's by one where the
; kernel had already decremented the pointer (e.g. -16 here vs -15 in
; h_loop_7, +0 here vs the --[1] form in h_loop_1).
h_epilog:
; Cycle 19: x1 = g1 - h3n, g3 = g3a + g3b, scale x3, store x0.
SUB .L2 B_g1, B_h3n, B_x1 ;[19,3]
|| STH .D2T2 B_x0t, *-B_o_ptr[24] ;[19,3]
|| SHR .S1 A_x3, trunc1, A_x3t ;[19,3]
|| ADD .L1X A_g3a, B_g3b, A_g3 ;[19,3]
; Cycle 20: x6 = g1 + h3n, x2 = h1 + g3, x5 = h1 - g3, store x3.
ADD .L2 B_g1, B_h3n, B_x6 ;[20,3]
|| STH .D2T1 A_x3t, *+B_o_ptr[0] ;[20,3]
|| ADD .S1 A_h1, A_g3, A_x2 ;[20,3]
|| SUB .D1 A_h1, A_g3, A_x5 ;[20,3]
;-
; Cycle 21: scale x5 and x1.
SHR .S1 A_x5, trunc1, A_x5t ;[21,3]
|| SHR .S2 B_x1, trunc1, B_x1t ;[21,3]
; Cycle 22: scale x6 and x2; the x1 store tagged [26,3] is issued early here,
; one cycle after B_x1t was produced.
SHR .S2 B_x6, trunc1, B_x6t ;[22,3]
|| SHR .S1 A_x2, trunc1, A_x2t ;[22,3]
|| STH .D2T2 B_x1t, *-B_o_ptr[16] ;[26,3]
; Cycle 23: store x5, scale x4. The remaining stores (x4, x6, x2) are issued
; by the interloop code that follows.
STH .D1T1 A_x5t, *+A_o_ptr[8] ;[23,3]
|| SHR .S1 A_x4, trunc1, A_x4t ;[23,3]
* ========================================================================= *
* Interloop code: Performs remaining epilog from horizontal pass, and *
* begins setup of the vertical pass. *
* *
* To save some time between loops, I start performing pointer fixups *
* and constant initializations in the epilog of the horizontal pass *
* loop. The horizontal pass works from the bottom of the IDCT list and *
* ends at the top, whereas the vertical pass works from the top of the *
* list and ends at the bottom. As a result, the displacement between *
* the pointer settings required by the two loops is fixed, regardless *
* of the number of IDCTs processed, since the two loops' pointers *
* always meet at the top of the list. *
* *
* The vertical loop needs a new repacking of the cosine terms: c6c3 and *
* c2c1. By playing around w/ how the cosine terms are packed, *
* I was able to save two whole registers in the vertical loop and thus *
* fit into the register file. I do this repacking partly here, and *
* partly in the vertical loop's prolog. *
* ========================================================================= *
; Interloop code: issues the last three horizontal-pass stores (x4, x6, x2)
; while starting the register setup for the vertical pass.
; Store x4 at the current A_o_ptr position.
STH .D1T1 A_x4t, *+A_o_ptr[0] ;[24,3]
;-
; Store x6, and in parallel advance both input pointers by fixed byte counts
; (ADDK adds a signed 16-bit constant in place) to the positions the vertical
; loop expects.
STH .D1T2 B_x6t, *+A_o_ptr[16] ;[25,3]
|| ADDK .S1 168, A_i_ptr ; Fixup for vert loop
|| ADDK .S2 156, B_i_ptr ; Fixup for vert loop
; A_c6c3 is named here, ahead of the vertical loop's own symbol table,
; because the SHR below already writes it.
.asg A15, A_c6c3 ; Symbolic name from vert loop
; Store x2. In parallel, shifting A_c3c1 right by 16 moves its upper halfword
; (c3) into the lower halfword of A_c6c3, and the IRP control register is
; copied into B0.
; NOTE(review): the "Get SP" remark implies the stack pointer was stashed in
; IRP earlier in the routine -- confirm in the code preceding this excerpt.
STH .D2T1 A_x2t, *-B_o_ptr[8] ;[27,3]
|| SHR .S1 A_c3c1, 16, A_c6c3 ; Set up new cosine constant
|| MVC .S2 IRP, B0 ; Get SP so we can unspill A_o
; ============================================================================
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR VERT LOOP ================
; Symbol-to-register map for the vertical-pass loop. Each .asg binds a
; symbolic name to a physical register for the code that follows, replacing
; any binding the horizontal pass used for the same name.
; Several symbols deliberately share one register in this table, e.g.
;   A4 : A_c1c4, A_X6c2          A8 : A_X5X4, A_p1
;   B0 : B_X2c2, B_g0, B_X3c1, B_X3c3, B_X3c5
;   B9 : B_rnd, B_X3c7           A3 : A_X6c6, A_h0, A_X5c1
; NOTE(review): shared registers are only safe if the symbols' live ranges do
; not overlap in the vertical-loop schedule -- verify there when editing.
; NOTE(review): the table may continue beyond the last line of this excerpt.
.asg A14, A_i_ptr ; Input pointer #1
.asg B15, B_i_ptr ; Input pointer #2
.asg A11, A_o_ptr ; Output pointer #1
.asg B11, B_o_ptr ; Output pointer #2
.asg B13, B_c7c5 ; Cosine terms c7, c5 (packed)
.asg A13, A_c7c5 ; Cosine terms c7, c5 (packed)
.asg A15, A_c6c3 ; Cosine terms c6, c3 (packed)
.asg B12, B_c2c1 ; Cosine terms c2, c1 (packed)
.asg A4, A_c1c4 ; Cosine term c1, c4 (alternates)
.asg A2, A_o ; Outer loop counter
.asg B2, B_i ; Inner loop counter
.asg A12, A_X7X6 ; Incoming coefs X7, X6 (packed)
.asg A8, A_X5X4 ; Incoming coefs X5, X4 (packed)
.asg B10, B_X3X2 ; Incoming coefs X3, X2 (packed)
.asg B14, B_X1X0 ; Incoming coefs X1, X0 (packed)
.asg B9, B_rnd ; Rounding value applied to P0
.asg B1, B_P0_t ; Node P0, temporary pre-rounding
.asg B5, B_P0 ; Rounded value of Node P0
.asg A7, A_P1 ; Node P1 in signal flow graph
.asg B0, B_X2c2 ; X2 * c2
.asg B4, B_X2c6 ; X2 * c6
.asg A4, A_X6c2 ; X6 * c2
.asg A3, A_X6c6 ; X6 * c6
.asg A5, A_p0 ; Node p0 in signal flow graph
.asg A8, A_p1 ; Node p1 in signal flow graph
.asg B4, B_r1 ; Node r1 in signal flow graph
.asg B3, B_r0 ; Node r0 in signal flow graph
.asg B0, B_g0 ; Node g0 in signal flow graph
.asg A1, A_g1 ; Node g1 in signal flow graph
.asg B3, B_h1 ; Node h1 in signal flow graph
.asg A3, A_h0 ; Node h0 in signal flow graph
.asg B5, B_X1c1 ; X1 * c1
.asg B1, B_X1c3 ; X1 * c3
.asg B3, B_X1c5 ; X1 * c5
.asg B8, B_X1c7 ; X1 * c7
.asg B0, B_X3c1 ; X3 * c1
.asg B0, B_X3c3 ; X3 * c3
.asg B0, B_X3c5 ; X3 * c5
.asg B9, B_X3c7 ; X3 * c7
.asg A3, A_X5c1 ; X5 * c1
.asg A1, A_X5c3 ; X5 * c3
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -