?? fft.asm
字號:
* in the lower half, and the imaginary part is in the upper half. *
* The flow breaks in case of "xl0" and "xl1" because in this case *
* the real part needs to be combined with the imaginary part because *
* of the multiplication by "j". This requires a packed quantity like *
* "xl21xl20" to be rotated as "xl20xl21" so that it can be combined *
* using add2's and sub2's. Hence the natural version of C code *
* shown below is transformed using packed data processing as shown: *
* *
* xl0 = x[2 * i0 ] - x[2 * i2 ]; *
* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *
* xl20 = x[2 * i1 ] - x[2 * i3 ]; *
* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *
* *
* xt1 = xl0 + xl21; *
* yt2 = xl1 + xl20; *
* xt2 = xl0 - xl21; *
* yt1 = xl1 - xl20; *
* *
* xl1_xl0 = _sub2(x_1_x_0, xl1_1_xl1_0) *
* xl21_xl20 = _sub2(xh2_1_xh2_0, xl2_1_xl2_0) *
* xl20_xl21 = _rotl(xl21_xl20, 16) *
* *
* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) *
* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) *
* *
* Also notice that xt1, yt1 end up on separate words, these need to *
* be packed together to take advantage of the packed twiddle *
* factors that have been loaded. In order for this to be achieved *
* they are re-aligned as follows: *
* *
* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) *
* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) *
* *
* The packed words "yt1_xt1" allows the loaded "sc" twiddle factor *
* to be used for the complex multiplies. The complex multiply *
* is implemented on the C64x+ using _cmpyr1. *
* *
* (X + jY) ( C - j S) = (XC + YS) + j (YC - XS). *
* *
* The actual twiddle factors for the FFT are cosine, - sine. The *
* twiddle factors stored in the table are cosine and sine, hence *
* the sign of the "sine" term is comprehended during multipli- *
* cation as shown above. *
* *
* *
* ASSUMPTIONS *
* *
* The size of the FFT, n, must be a power of 4 and greater than *
* or equal to 16 and less than 32768. *
* *
* The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a *
* double-word boundary for the "optimized" implementations. *
* *
* The input and output data are complex, with the real/imaginary *
* components stored in adjacent locations in the array. The real *
* components are stored at even array indices, and the imaginary *
* components are stored at odd array indices. *
* *
* C CODE *
* *
* *
* NOTES *
* *
* *
* CYCLES *
* *
* cycles = 0.75*nx*log4(nx) + 38 *
* For nx = 1024, cycles = 3878 *
* *
* CODESIZE *
* *
* 704 bytes *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2005 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
* ======================================================================== *
* ======================================================================== *
* Entry point of the FFT routine. The original line had three statements
* fused together (".text .global _fft_fft:"); they are separated here:
*   .text          - place the routine in the code section
*   .global _fft   - export the symbol so it can be linked from outside
*   _fft:          - entry label (C prototype in this file: void fft(...))
        .text
        .global _fft
_fft:
* ======================================================================== *
* ======================================================================== *
******************* SYMBOLIC REGISTER ASSIGNMENTS: SETUP *********************
* ===================== LOOP 1 SYMBOLIC REGISTER ASSIGNMENTS ============== *
* ------------------------------------------------------------------------ *
*  Substitution symbols: each .asg binds a readable name to one physical   *
*  register. Several names are bound to the same register (for example     *
*  A2, A3, A18, B0, B20, B21); the order of the definitions is unchanged.  *
* ------------------------------------------------------------------------ *
*
* ---- A-side register file ---------------------------------------------- *
        .asg            A0,         A_h2
        .asg            A1,         A_ifj
        .asg            A2,         A_xl20_1_xl21_1
        .asg            A2,         A_xh2_1_0
        .asg            A3,         A_xh2_3_2
        .asg            A3,         A_x_h2_0_1
        .asg            A3,         A_x_h2_2_3
        .asg            A4,         A_w0
        .asg            A4,         A_ptr_w
        .asg            A5,         A_c2
        .asg            A6,         A_ptr_x
        .asg            A7,         A_w1
        .asg            A8,         A_x
        .asg            A9,         A_l1
        .asg            A10,        A_shr2
        .asg            A11,        A_fft_jmp
        .asg            A16,        A_l2
        .asg            A17,        A_fft_jmp_3
        .asg            A18,        A_yt0_1_xt0_1
        .asg            A18,        A_xl20_0_xl21_0
        .asg            A18,        A_xh2_1_xh2_0
        .asg            A19,        A_xh2_3_xh2_2
        .asg            A20,        A_fft_jmp1          ; 20, 21 <-> 2,3
        .asg            A21,        A_j
        .asg            A22,        A_xl1_1_0
        .asg            A23,        A_xl1_3_2
        .asg            A22,        A_xl2_1_xl2_0
        .asg            A23,        A_xl2_3_xl2_2
* Packed twiddle-factor pairs loaded on the A side
        .asg            A24,        A_co10_si10
        .asg            A25,        A_co11_si11
        .asg            A26,        A_co20_si20
        .asg            A27,        A_co21_si21
        .asg            A27,        A_yt0_0_xt0_0
        .asg            A28,        A_xl21_0_xl20_0
        .asg            A29,        A_xh21_0_xh20_0
        .asg            A30,        A_xl21_1_xl20_1
        .asg            A31,        A_xh21_1_xh20_1
*
* ---- B-side register file ---------------------------------------------- *
        .asg            B0,         B_x_1_x_0
        .asg            B1,         B_x_3_x_2
        .asg            B0,         B_yt1_1_xt2_1
        .asg            B1,         B_yt2_1_xt1_1
        .asg            B0,         B_x_l2_0_1
        .asg            B0,         B_x_l2_2_3
        .asg            B1,         B_j
        .asg            B2,         B_wh
        .asg            B4,         B_n
        .asg            B5,         B_x
        .asg            B6,         B_ptr_y
        .asg            B7,         B_fft_jmp
        .asg            B8,         B_w2
        .asg            B9,         B_h2
        .asg            B10,        B_i
        .asg            B16,        B_l2
        .asg            B17,        B_l1
        .asg            B18,        B_l2_
        .asg            B19,        B_x_
        .asg            B20,        B_xt2_1_yt2_1
        .asg            B20,        B_yt1_0_xt2_0
        .asg            B20,        B_xt2_0_yt2_0
        .asg            B21,        B_xt1_1_yt1_1
        .asg            B21,        B_xt1_0_yt1_0
        .asg            B21,        B_yt2_0_xt1_0
        .asg            B22,        B_xl1_0_xl0_0
        .asg            B22,        B_xl2_1_0
        .asg            B23,        B_xl2_3_2
        .asg            B23,        B_xh1_0_xh0_0
* Packed twiddle-factor pair loaded on the B side
        .asg            B24,        B_co30_si30
        .asg            B25,        B_co31_si31
        .asg            B26,        B_xl1_1_xl1_0
        .asg            B27,        B_xl1_3_xl1_2
        .asg            B28,        B_xl1_1_xl0_1
        .asg            B29,        B_xh1_1_xh0_1
        .asg            B30,        B_x_1o_x_0o
        .asg            B31,        B_x_3o_x_2o
* Names used by the setup code ahead of the loop; they share B20/B21 with
* the butterfly temporaries defined above.
        .asg            B20,        B_log4n
        .asg            B21,        B_thone
* Stack pointer
        .asg            B15,        B_SP
* ========================================================================= *
* void fft(short * w, int nx, short * x, short * y);
* ========================================================================= *
STW .D2T2 B10, *B_SP--[2] ; Reserve stack, Save B10
|| LMBD .L2 1, B_n, B_log4n ; 31-log4n
|| MVK .S2 31, B_thone
STDW .D2T1 A11:A10, *+B_SP[0] ; Save A11:A10
|| SUB .L2 B_thone, B_log4n, B_log4n ; log4n
SHR .S2 B_log4n, 1, B_log4n
SHRU .S2 B_n, 3, B_i ;[ 2,1]
SUB .L2 B_i, 1, B_i ;[ 4,1]
|| MVC .S2 B_i, RILC ;['reload' inner loop counter
|| SHRU .S1X B_n, 1, A_h2 ;[ 1,1] low 1
|| MVK .L1 2, A_c2 ;[ 2,1]
[B_wh] SPLOOPD 6
|| MVC .S2 B_i, ILC ;[ 7,1]
|| ADD .S1X A_h2, B_n, A_l2 ;bott 3
|| SUB .L2 B_log4n, 2, B_wh ;log4n-2
* ================================ LOOP STAGE I =========================== *
SPMASK
||^ SSHVR .M1 A_l2, A_c2, A_fft_jmp ;prolog
||^ SHRU .S2 B_n, 2, B_l1 ;[ 1,1]
||^ SUB .L2X A_ptr_x, 8, B_x ;[ 7,1] 1st time only
||^ ZERO .D1 A_j ;prolog
ADD .S2 B_x, 8, B_x ;[ 1,1]
SPMASK
|| LDDW .D2T2 *B_x[B_l1], B_xl1_3_xl1_2:B_xl1_1_xl1_0 ;[ 2,1]
||^ SSHVR .M1 A_l2, A_c2, A_fft_jmp1 ;prolog
||^ SHRU .S2X A_h2, 2, B_h2 ;prolog
||^ SHRU .S1 A_h2, 2, A_h2 ;prolog
SPMASK
|| LDDW .D2T1 *B_x[B_h2], A_xh2_3_xh2_2:A_xh2_1_xh2_0 ;[ 3,1]
||^ SHRU .S2X A_l2, 2, B_l2 ;prolog
||^ MPY .M2X 2, A_l2, B_fft_jmp ;prolog
||^ ADD .L1 A_w0, 8, A_w1 ;prolog
||^ SHRU .S1 A_l2, 2, A_l2 ;prolog
SPMASK
|| LDDW .D2T1 *B_x[B_l2], A_xl2_3_xl2_2:A_xl2_1_xl2_0 ;[ 4,1]
||^ SUB .L1 A_fft_jmp1, 3, A_fft_jmp_3 ;prolog
||^ SHRU .S1 A_fft_jmp1, 2, A_fft_jmp1 ;prolog
SUB .L1 A_fft_jmp_3, A_j, A_ifj ;[ 5,1]
|| LDDW .D2T2 *B_x[0], B_x_3_x_2:B_x_1_x_0 ;[ 5,1]
|| ROTL .M2X A_j, 0, B_j ;[ 5,1]
* ================================ STAGE II =============================== *
SPMASK
||[!A_ifj]ADD .D2 B_x, B_fft_jmp, B_x ;[ 6,1]
|| MVD .M2 B_x, B_x_ ;[ 6,1]
|| LDDW .D1T1 *A_w1[A_j], A_co21_si21:A_co20_si20 ;[ 6,1]
||^ ADD .L2X A_w1, 8, B_w2 ;prolog
||^ SSHVR .M1X B_n, A_c2, A_l1 ;prolog
||^ MVKL .S1 020000000h, A_shr2 ;prolog
ADD .S1 A_j, 3, A_j ;[ 7,1] was 3
|| LDDW .D2T2 *B_w2[B_j], B_co31_si31:B_co30_si30 ;[ 7,1]
|| LDDW .D1T1 *A_w0[A_j], A_co11_si11:A_co10_si10 ;[ 7,1]
SPMASK
||[!A_ifj]ZERO .L1 A_j ;[ 8,1]
||^ SHRU .S2X A_l2, 2, B_l2_ ;prolog
||^ MVKH .S1 020000000h, A_shr2 ;prolog
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -