* fft16x32.asm  (excerpt; the text below starts part-way through an execute packet)
|| NORM .L2 B_n, B_radix2 ;
|| MV .S1X B_SP, A_SP ; Twin Stack Pointer
* ---------------------------------------------------------------------- *
* Set-up ahead of the software-pipelined loop.
* The two parallel instructions just above finish an execute packet whose
* first instruction is not visible here: they place NORM(B_n) in B_radix2
* and copy the stack pointer to the A side so both data paths can store.
*
* This packet: save A13:A12 and B_ret, form A_fft_jmp_3 = B_fft_jmp >> 4,
* copy the trip count B_i to B_i_temp, preset A_radix_temp = 4 and reduce
* B_radix2 to its bit 0.
* B_ret uses word index 3 (byte offset 12); the STDW of B11:B10 in the next
* packet uses doubleword index 3 (byte offset 24), so the two slots differ.
* ---------------------------------------------------------------------- *
STDW .D1T1 A13:A12, *+A_SP[6]
|| STW .D2T2 B_ret, *+B_SP[3] ; return
|| SHRU .S1X B_fft_jmp, 4, A_fft_jmp_3
|| MV .S2 B_i, B_i_temp ;
|| MVK .L1 4, A_radix_temp ;
|| AND .L2 B_radix2, 1, B_radix2 ;
* Save A11:A10 and B11:B10.  When bit 0 of NORM(B_n) was set, A_radix_temp
* becomes 2 instead of 4.  B_x receives a copy of the pointer A_ptr_x and
* the trip count is reduced by one for SPLOOPD.
STDW .D1T1 A11:A10, *+A_SP[4] ; Save A11,A10
|| STDW .D2T2 B11:B10, *+B_SP[3] ; Save B11,B10
||[B_radix2] MVK.S1 2, A_radix_temp ; A23
|| MV .S2X A_ptr_x, B_x ; B4 ; Restore A_ptr_x in B_x
|| SUB .L2 B_i_temp, 1, B_i_temp ; Compensate for SPLOOPD
* Open the SPLOOPD (iteration interval 10), predicated on B_wh; B_wh is not
* written in the visible lines above, so its value comes from earlier code.
* Same packet: save A_ptr_x (word slot 1) and B_ptr_y (word slot 2), scale
* A_radix_temp by 4, load ILC with the trip count, and back B_x up by
* 16 bytes so the first pre-incrementing load *++B_x[2] (two doublewords)
* lands on the original pointer.
[B_wh] SPLOOPD 10
|| STW .D1T1 A_ptr_x, *+A_SP[1] ; Save A_ptr_x
|| STW .D2T2 B_ptr_y, *+B_SP[2] ; Save B_ptr_y
|| SHL .S1 A_radix_temp, 2, A_radix_temp ; Adjust A_radix;
|| ADD .L2 B_x, -16, B_x ;
|| MVC .S2 B_i_temp, ILC
* =========================== STAGE 0 ============================ *
* Loop cycles 1-10: fetch the data and twiddle doublewords for one pass of
* the loop body.  A tag [c,1] means cycle c of iteration 1.
* Instructions carrying ^ under SPMASK are one-time set-up that shares the
* execute packets of the loop body.  NOTE(review): per the C64x+ SPLOOP
* rules these should execute while the loop is being loaded and not be
* replayed from the loop buffer -- confirm against the CPU guide.
*
* Cycle 1: A_x takes B_x as it was before this packet's pre-increment;
* the B side loads its first doubleword through *++B_x[2].
* Set-up: save A15:A14, clear B_j, B_h2 = B_stride_temp >> 2.
SPMASK
|| MV .S1X B_x, A_x ;[ 1,1]
|| LDDW .D2T2 *++B_x[2], B_x_1:B_x_0 ;[ 1,1]
||^ STDW .D1T1 A15:A14, *+A_SP[2] ; Save A15,A14
||^ ZERO .L2 B_j
||^ SHRU .S2 B_stride_temp, 2, B_h2 ;
* Cycle 2 (set-up only): A_h2 = B_h2, B_stride = B_stride_temp,
* B_l1 = B_stride_temp >> 1, save B13:B12.
SPMASK
||^ MV .D1X B_h2, A_h2 ;
||^ MV .L2 B_stride_temp, B_stride
||^ SHRU .S2 B_stride_temp, 1, B_l1
||^ STDW .D2T2 B13:B12, *+B_SP[5] ;
* Cycle 3: A_x advances three doublewords, which leaves it one doubleword
* past the updated B_x, and loads from there; the B side loads at index
* B_h2 without moving B_x.
* Set-up: B_wh = (B_stride > A_radix_temp), unsigned; B_l2 = B_l1 + B_h2.
SPMASK
|| LDDW .D1T1 *++A_x[3], A_x_3:A_x_2 ;[ 3,1]
|| LDDW .D2T2 *B_x[B_h2], B_xh2_1i:B_xh2_0i ;[ 3,1]
||^ CMPGTU .L2X B_stride, A_radix_temp, B_wh
||^ ADD .S2 B_l1, B_h2, B_l2 ;
* Cycle 4: A_x_ = B_x (ROTL by 0 is a plain copy routed through .M1);
* the A side steps by A_h2 and loads, the B side loads at index B_l1.
* Set-up: A_fft_jmp_3 -= 3, A_radix = A_radix_temp.
SPMASK
|| ROTL .M1X B_x, 0, A_x_ ;[ 4,1]
|| LDDW .D1T1 *++A_x[A_h2], A_xh2_3i:A_xh2_2i ;[ 4,1]
|| LDDW .D2T2 *B_x[B_l1], B_xl1_1i:B_xl1_0i ;[ 4,1]
||^ SUB .S1 A_fft_jmp_3, 3, A_fft_jmp_3 ;
||^ MV .L1 A_radix_temp, A_radix
* Cycle 5: A_j = B_j + 1.
* Set-up: B_fft_jmp >>= 2; store A_radix in word slot 15 of the A-side frame.
SPMASK
|| ADD .S1X B_j, 1, A_j ;[ 5,1]
||^ SHRU .S2 B_fft_jmp, 2, B_fft_jmp ;
||^ STW .D1T1 A_radix, *+A_SP[15]
* Cycle 6: B_ifj = B_j - A_fft_jmp_3; it is zero when B_j has reached
* A_fft_jmp_3 and is used as a predicate in cycles 8 and 11.
* Set-up: A_tw_offset = B_fft_jmp.
SPMASK
|| SUB .S2X B_j, A_fft_jmp_3, B_ifj ;[ 6,1]
||^ MV .L1X B_fft_jmp, A_tw_offset
* Cycle 7: the A side steps by A_h2 and loads; the B side loads at index B_l2.
* Set-up: B_fft_jmp <<= 2.
SPMASK
|| LDDW .D1T1 *++A_x[A_h2], A_xl1_3i:A_xl1_2i ;[ 7,1]
|| LDDW .D2T2 *B_x[B_l2], B_xl2_1i:B_xl2_0i ;[ 7,1]
||^ SHL .S2 B_fft_jmp, 2, B_fft_jmp ;
* Cycle 8: when B_ifj == 0, advance B_x by B_fft_jmp.
* Set-up: B_w0 = A_w0.
SPMASK
||[!B_ifj] ADD .L2 B_x, B_fft_jmp, B_x ;[ 8,1]
||^ MV .D2X A_w0, B_w0
* Cycle 9: first butterfly add/sub on the B side (sum in B_xh0_0,
* difference in B_xl0_0).  Twiddle loads: A side at index A_j (the value
* before this packet's increment, i.e. B_j + 1), B side at index B_j.
* Set-up: B_stride >>= 2.
SPMASK
|| ADDSUB .L2 B_x_0, B_xl1_0i, B_xh0_0:B_xl0_0 ;[ 9,1]
|| ADD .S1 A_j, 1, A_j ;[ 9,1]
|| LDDW .D1T1 *A_w0[A_j], A_co11_si11:A_co30_si30 ;[ 9,1]
|| LDDW .D2T2 *B_w0[B_j], B_co20_si20:B_co10_si10 ;[ 9,1]
||^ SHRU .S2 B_stride, 2, B_stride ;
* Cycle 10: B_j += 3 (three twiddle doublewords are read per pass), second
* B-side add/sub, and the last A-side data load after stepping by A_h2.
ADD .S2 B_j, 3, B_j ;[10,1]
|| ADDSUB .L2 B_x_1, B_xl1_1i, B_xh1_0:B_xl1_0 ;[10,1]
|| LDDW .D1T1 *++A_x[A_h2], A_xl2_3i:A_xl2_2i ;[10,1]
* =========================== STAGE 1 ============================ *
* Loop cycles 11-20: remaining butterfly additions/subtractions on both
* sides and the start of the MPY2IR twiddle multiplies.  ADDSUB writes the
* sum to the first register of its destination pair and the difference to
* the second.
*
* Cycle 11: when B_ifj == 0, clear B_j by multiplying it by 0 on .M2.
[!B_ifj] MPYSU.M2 0, B_j, B_j ;[11,1]
* Cycle 12: B side xh2_1 +/- xl2_1; A side x_2 +/- xl1_2; third twiddle
* load on the A side at index A_j (now B_j + 2).
ADDSUB .L2 B_xh2_1i, B_xl2_1i, B_xh21_0:B_xl21_0;[12,1]
|| ADDSUB .L1 A_x_2, A_xl1_2i, A_xh0_1:A_xl0_1 ;[12,1]
|| LDDW .D1T1 *A_w0[A_j], A_co31_si31:A_co21_si21 ;[12,1]
* Cycle 13: B_yt0_0 = B_xh1_0 - B_xh21_0; B side xh2_0 +/- xl2_0;
* A side x_3 +/- xl1_3; A_x___ = A_x_ (copy through .M1).
SUB .S2 B_xh1_0, B_xh21_0, B_yt0_0 ;[13,1]
|| ADDSUB .L2 B_xh2_0i, B_xl2_0i, B_xh20_0:B_xl20_0;[13,1]
|| ADDSUB .L1 A_x_3, A_xl1_3i, A_xh1_1:A_xl1_1 ;[13,1]
|| ROTL .M1 A_x_, 0, A_x___ ;[13,1]
* Cycle 14: B_xt1_0/B_xt2_0 = B_xl0_0 +/- B_xl21_0; B_xt0_0 = B_xh0_0 -
* B_xh20_0; first B-side product (co20_si20 with B_yt0_0).
ADDSUB .L2 B_xl0_0, B_xl21_0, B_xt1_0:B_xt2_0 ;[14,1]
|| SUB .S2 B_xh0_0, B_xh20_0, B_xt0_0 ;[14,1]
|| MPY2IR .M2 B_co20_si20, B_yt0_0, B_pa:B_p8 ;[14,1]
* Cycle 15: B_yt2_0/B_yt1_0 = B_xl1_0 +/- B_xl20_0; untwiddled output
* B_x_0o = B_xh20_0 + B_xh0_0; A side xh2_2 +/- xl2_2; product of
* co10_si10 with B_xt1_0.
ADDSUB .L2 B_xl1_0, B_xl20_0, B_yt2_0:B_yt1_0 ;[15,1]
|| ADD .S2 B_xh20_0, B_xh0_0, B_x_0o ;[15,1]
|| ADDSUB .L1 A_xh2_2i, A_xl2_2i, A_xh20_1:A_xl20_1;[15,1]
|| MPY2IR .M2 B_co10_si10, B_xt1_0, B_p1:B_p3 ;[15,1]
* Cycle 16: A_xt0_1 = A_xh0_1 - A_xh20_1; untwiddled output
* B_x_1o = B_xh21_0 + B_xh1_0; A side xh2_3 +/- xl2_3; product of
* co20_si20 with B_xt0_0.
SUB .S1 A_xh0_1, A_xh20_1, A_xt0_1 ;[16,1]
|| ADD .L2 B_xh21_0, B_xh1_0, B_x_1o ;[16,1]
|| ADDSUB .L1 A_xh2_3i, A_xl2_3i, A_xh21_1:A_xl21_1;[16,1]
|| MPY2IR .M2 B_co20_si20, B_xt0_0, B_p9:B_pb ;[16,1]
* Cycle 17: A_xt1_1/A_xt2_1 = A_xl0_1 +/- A_xl21_1; A_yt0_1 = A_xh1_1 -
* A_xh21_1; one product on each side.
ADDSUB .L1 A_xl0_1, A_xl21_1, A_xt1_1:A_xt2_1 ;[17,1]
|| SUB .S1 A_xh1_1, A_xh21_1, A_yt0_1 ;[17,1]
|| MPY2IR .M1 A_co21_si21, A_xt0_1, A_pd:A_pf ;[17,1]
|| MPY2IR .M2 B_co10_si10, B_yt1_0, B_p2:B_p0 ;[17,1]
* Cycle 18: untwiddled A-side outputs A_x_3o and A_x_2o; product of
* co21_si21 with A_yt0_1; B_x__ receives the pointer held in A_x___ via a
* delayed move (MVD) on .M2.
ADD .L1 A_xh21_1, A_xh1_1, A_x_3o ;[18,1]
|| ADD .S1 A_xh20_1, A_xh0_1, A_x_2o ;[18,1]
|| MPY2IR .M1 A_co21_si21, A_yt0_1, A_pe:A_pc ;[18,1]
|| MVD .M2X A_x___, B_x__ ;[18,1]
* Cycle 19: copy the co30_si30 twiddle to the B side for the next cycle's
* multiply; A_yt2_1/A_yt1_1 = A_xl1_1 +/- A_xl20_1; product of co11_si11
* with A_xt1_1.
MV .S2X A_co30_si30, B_co30_si30 ;[19,1]
|| ADDSUB .L1 A_xl1_1, A_xl20_1, A_yt2_1:A_yt1_1 ;[19,1]
|| MPY2IR .M1 A_co11_si11, A_xt1_1, A_p5:A_p7 ;[19,1]
* Cycle 20: first combined output B_xl1_0o = B_p8 + B_p9; products of
* co30_si30 with B_xt2_0 and of co11_si11 with A_yt1_1.
ADD .D2 B_p8, B_p9, B_xl1_0o ;[20,1]
|| MPY2IR .M2 B_co30_si30, B_xt2_0, B_p11:B_p13 ;[20,1]
|| MPY2IR .M1 A_co11_si11, A_yt1_1, A_p6:A_p4 ;[20,1]
* =========================== STAGE 2 ============================ *
* Loop cycles 21-30: combine the MPY2IR products into output values and
* store them.  The store pointers B_x__ and A_x__ are copies of the input
* pointer taken at cycle 4, and the B side steps by B_h2 between stores, so
* the results appear to be written back over the input positions --
* NOTE(review): confirm the in-place intent against the routine's header.
*
* Cycle 21: B_xh2_1o = B_p2 - B_p3, B_xh2_0o = B_p0 + B_p1; A_x__ receives
* the pointer held in A_x___ via a delayed move (MVD) on .M1.
SUB .L2 B_p2, B_p3, B_xh2_1o ;[21,1]
|| ADD .S2 B_p0, B_p1, B_xh2_0o ;[21,1]
|| MVD .M1 A_x___, A_x__ ;[21,1]
* Cycle 22: store B_x_1o:B_x_0o at the B-side base position;
* A_xl1_3o = A_pe - A_pf, B_xl1_1o = B_pa - B_pb; product of co30_si30
* with B_yt2_0.
STDW .D2T2 B_x_1o:B_x_0o, *B_x__[0] ;[22,1]
|| SUB .S1 A_pe, A_pf, A_xl1_3o ;[22,1]
|| SUB .S2 B_pa, B_pb, B_xl1_1o ;[22,1]
|| MPY2IR .M2 B_co30_si30, B_yt2_0, B_p12:B_p10 ;[22,1]
* Cycle 23: A_xl1_2o = A_pc + A_pd.
ADD .S1 A_pc, A_pd, A_xl1_2o ;[23,1]
* Cycle 24: A_xh2_3o = A_p6 - A_p7, A_xh2_2o = A_p4 + A_p5.
SUB .L1 A_p6, A_p7, A_xh2_3o ;[24,1]
|| ADD .S1 A_p4, A_p5, A_xh2_2o ;[24,1]
* Cycle 25: B side steps by B_h2 and stores the xh2 outputs; A side steps
* one doubleword and stores A_x_3o:A_x_2o; product of co31_si31 with
* A_yt2_1.
STDW .D2T2 B_xh2_1o:B_xh2_0o, *++B_x__[B_h2] ;[25,1]
|| STDW .D1T1 A_x_3o:A_x_2o, *++A_x__[1] ;[25,1]
|| MPY2IR .M1 A_co31_si31, A_yt2_1, A_p16:A_p14 ;[25,1]
* Cycle 26: B side stores the xl1 outputs, A side stores the xh2 outputs
* (each after stepping by h2); product of co31_si31 with A_xt2_1.
STDW .D2T2 B_xl1_1o:B_xl1_0o, *++B_x__[B_h2] ;[26,1]
|| STDW .D1T1 A_xh2_3o:A_xh2_2o, *++A_x__[A_h2] ;[26,1]
|| MPY2IR .M1 A_co31_si31, A_xt2_1, A_p15:A_p17 ;[26,1]
* Cycle 27: B_xl2_1o = B_p12 - B_p13, B_xl2_0o = B_p10 + B_p11.
SUB .S2 B_p12, B_p13, B_xl2_1o ;[27,1]
|| ADD .L2 B_p10, B_p11, B_xl2_0o ;[27,1]
* Cycle 28: B side stores the xl2 outputs, A side stores the xl1 outputs.
STDW .D2T2 B_xl2_1o:B_xl2_0o, *++B_x__[B_h2] ;[28,1]
|| STDW .D1T1 A_xl1_3o:A_xl1_2o, *++A_x__[A_h2] ;[28,1]
* Cycle 29: no work is issued in this cycle.
NOP
* Cycle 30: A_xl2_3o = A_p16 - A_p17, A_xl2_2o = A_p14 + A_p15; these are
* stored by the next packet (tagged cycle 31).
SUB .S1 A_p16, A_p17, A_xl2_3o ;[30,1]
|| ADD .L1 A_p14, A_p15, A_xl2_2o ;[30,1]
* =========================== STAGE 3 ============================ *
* Cycle 31: last instruction of the loop body.  SPKERNEL closes the block
* opened by SPLOOPD; in parallel the A side steps by A_h2 and stores the
* final output pair A_xl2_3o:A_xl2_2o computed in cycle 30.
SPKERNEL 0, 0
|| STDW .D1T1 A_xl2_3o:A_xl2_2o, *++A_x__[A_h2] ;[31,1]
* =========================== END STAGE 3 ============================ *
LOOP_WHILE:
* ============ STAGES I,II,III (epilog) + Outer Loop ======================= *
* Outer-loop bookkeeping issued while the previous pass drains.  It
* recomputes the per-pass parameters from B_stride and keeps the old h2
* values (B_h2_old / A_h2_old) for the outstanding stores of the pass that
* is finishing.
*
* Reload B_x from word slot 1 of the frame, where A_ptr_x was saved.  The
* loaded value is first read by the ADD in the SPMASKR packet further down.
LDW .D2T2 *+B_SP[1], B_x ; Restore A_ptr_x in B_x
NOP
* B_fft_jmp = 6 * (low half of B_stride); clear B_j.
MPYSU .M2 6, B_stride, B_fft_jmp
|| ZERO .D2 B_j
* Keep the current B_h2 before it is recomputed below.
MV .D2 B_h2, B_h2_old ;
* A_fft_jmp_3 = B_fft_jmp >> 4 (3 is subtracted in the SPMASKR packet).
SHRU .S1X B_fft_jmp, 4, A_fft_jmp_3
* B_l1 = B_stride >> 1.
SHRU .S2 B_stride, 1, B_l1
* A_w0 = A_ptr_w + A_tw_offset halfwords (ADDAH scales the offset by 2).
ADDAH .D1 A_ptr_w, A_tw_offset, A_w0
* B_h2 = B_stride >> 2.
SHRU .S2 B_stride, 2, B_h2
* B_w0 = A_w0; B_l2 = B_l1 + B_h2.
MV .D2X A_w0, B_w0
|| ADD .L2 B_l1, B_h2, B_l2
* SPMASKR packet.  NOTE(review): SPMASKR is expected to trigger the reload
* of the buffered loop for the next pass -- confirm against the CPU guide.
* In parallel: A_fft_jmp_3 -= 3, B_fft_jmp >>= 2, and B_x is backed up by
* 16 bytes for the first pre-incrementing load, as in the initial set-up.
SPMASKR
|| SUB .D1 A_fft_jmp_3, 3, A_fft_jmp_3
|| SHRU .S2 B_fft_jmp, 2, B_fft_jmp
|| ADD .L2 B_x, -16, B_x
* ====== STAGE 0 (prolog) +STAGES II,III (epilog) + Outer Loop ====== *
* Hand-issued ^ stores for the pass that is finishing.  This one still uses
* A_h2, which is not overwritten until two packets later.
SPMASK
||^ STDW .D1T1 A_xl2_3o:A_xl2_2o, *++A_x__[A_h2] ;[0]
* A_tw_offset += B_fft_jmp (already shifted right by 2); keep the current
* A_h2 in A_h2_old before it is replaced.
ADD .L1X A_tw_offset, B_fft_jmp, A_tw_offset
|| MV .D1 A_h2, A_h2_old
* B_fft_jmp <<= 2; A_h2 takes the new B_h2.
SHL .S2 B_fft_jmp, 2, B_fft_jmp
|| MV .L1X B_h2, A_h2
NOP
* Remaining stores of the finishing pass step by the saved h2 values,
* because B_h2 and A_h2 now hold the next pass's values.
SPMASK
||^ STDW .D2T2 B_xh2_1o:B_xh2_0o, *++B_x__[B_h2_old] ;[4]
SPMASK
||^ STDW .D2T2 B_xl1_1o:B_xl1_0o, *++B_x__[B_h2_old] ;[5]
||^ STDW .D1T1 A_xh2_3o:A_xh2_2o, *++A_x__[A_h2_old] ;[26,1]
NOP
SPMASK
||^ STDW .D2T2 B_xl2_1o:B_xl2_0o, *++B_x__[B_h2_old] ;[7]
||^ STDW .D1T1 A_xl1_3o:A_xl1_2o, *++A_x__[A_h2_old] ;[28,1]
* B_stride <<= 2.  NOTE(review): the set-up before the first pass shifts
* B_stride right by 2; the consumer of this left shift lies beyond the
* visible lines -- confirm the direction is intended.
SHL .S2 B_stride, 2, B_stride ;
NOP
* ====== STAGE 0,I (prolog) +STAGES III (epilog) + Outer Loop ====== *
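* ---------------------------------------------------------------------- *
* Non-source text captured from the code-viewer web page (not part of the
* assembly listing).  Keyboard-shortcut help:
*   Copy code ............ Ctrl + C
*   Search code .......... Ctrl + F
*   Full-screen mode ..... F11
*   Switch theme ......... Ctrl + Shift + D
*   Show shortcuts ....... ?
*   Increase font size ... Ctrl + =
*   Decrease font size ... Ctrl + -
* ---------------------------------------------------------------------- *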