?? fft16x32.asm
字號:
* The flow breaks in case of "xl0" and "xl1" because in this case *
* the real part needs to be combined with the imaginary part because *
* of the multiplication by "j". This requires a packed quantity like *
* "xl21xl20" to be rotated as "xl20xl21" so that it can be combined *
* using add2's and sub2's. Hence the natural version of C code *
* shown below is transformed using packed data processing as shown: *
* *
* xl0 = x[2 * i0 ] - x[2 * i2 ]; *
* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; *
* xl20 = x[2 * i1 ] - x[2 * i3 ]; *
* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; *
* *
* xt1 = xl0 + xl21; *
* yt2 = xl1 + xl20; *
* xt2 = xl0 - xl21; *
* yt1 = xl1 - xl20; *
* *
* xl1_xl0 = _sub2(x01_x00, x21_x20) *
* xl21_xl20 = _sub2(x11_x10, x31_x30) *
* xl20_xl21 = _rotl(xl21_xl20, 16) *
* *
* yt2_xt1 = _add2(xl1_xl0, xl20_xl21) *
* yt1_xt2 = _sub2(xl1_xl0, xl20_xl21) *
* *
* Also notice that xt1 and yt1 end up in separate words; these need *
* to be packed together to take advantage of the packed twiddle *
* factors that have been loaded. To achieve this they are *
* re-aligned as follows: *
* *
* yt1_xt1 = _packhl2(yt1_xt2, yt2_xt1) *
* yt2_xt2 = _packhl2(yt2_xt1, yt1_xt2) *
* *
* The packed word "yt1_xt1" allows the loaded "sc" twiddle factor *
* to be used for the complex multiplies. The real part of the *
* complex multiply is implemented using _dotp2. The imaginary *
* part of the complex multiply is implemented using _dotpn2 *
* after the twiddle factors are swizzled within the half word. *
* *
* (X + jY) (C - jS) = (XC + YS) + j (YC - XS). *
* *
* The actual twiddle factors for the FFT are cosine, -sine. The *
* twiddle factors stored in the table are cosine and sine, hence *
* the sign of the "sine" term is comprehended during the *
* multiplication as shown above. *
* *
* *
* ASSUMPTIONS *
* *
* The size of the FFT, n, must be a power of 4 and greater than *
* or equal to 16 and less than 32768. *
* *
* The arrays 'x[]', 'y[]', and 'w[]' all must be aligned on a *
* double-word boundary for the "optimized" implementations. *
* *
* The input and output data are complex, with the real/imaginary *
* components stored in adjacent locations in the array. The real *
* components are stored at even array indices, and the imaginary *
* components are stored at odd array indices. *
* *
* C CODE *
* *
* *
* NOTES *
* *
* *
* CYCLES *
* *
* cycles = [10.25*N/8+10]*ceil[log4(N)-1]+6*N/4+81 *
* For nx = 512, cycles = 3513 *
* *
* CODESIZE *
* *
* 1056 bytes *
* ------------------------------------------------------------------------- *
* Copyright (c) 2005 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
* ======================================================================== *
* ======================================================================== *
.text
.global _fft16x32
; Entry point of the routine. The setup code that follows reads A4 (copied
; to A_ptr_w and A_w0), B4 (B_n), A6 (A_ptr_x) and B6 (B_ptr_y); presumably
; these are the w, n, x and y arguments -- confirm against the C prototype.
_fft16x32:
;*================== SYMBOLIC REGISTER ASSIGNMENTS: SETUP ====================*
; Names used by the entry/exit code for the stack pointer and return address.
; NOTE(review): A20 (A_SP) is also bound to A_pc, A_xl1_2o and A_p14 in the
; SPLOOP assignments, and B3 (B_ret) is also bound to B_x__ -- confirm that
; these lifetimes never overlap.
.asg B15, B_SP ; Stack pointer, B datapath
.asg A20, A_SP ; Stack pointer, A datapath
.asg B3, B_ret ; Return address
;*============================================================================*
; ====================== SYMBOLIC REGISTER ASSIGNMENTS: SPLOOP =======================
; Substitution symbols (.asg register, name) for the loop code. The A_/B_
; prefix of every name matches the register file of the register it is
; bound to. Many registers carry several names (for example B25 is both
; B_xl1_1i and B_xh1_0; A0 is A_j, A_xl2_2i and A_p7), so those names cannot
; hold different live values at the same time.
; NOTE(review): the loop body is outside this view -- confirm the lifetimes.
.asg B6, B_w0
.asg A7, A_w0
.asg B7, B_h2
.asg B8, B_l1
.asg B9, B_l2
.asg A8, A_h2
.asg A9, A_fft_jmp_3
.asg B1, B_fft_jmp
.asg B5, B_j
; co*/si* names: presumably packed cosine/sine twiddle values -- TODO confirm
.asg B11, B_co20_si20
.asg B10, B_co10_si10
.asg A0, A_j
.asg A23, A_co11_si11
.asg A22, A_co30_si30
.asg A31, A_co31_si31
.asg A30, A_co21_si21
.asg B4, B_x
.asg A4, A_x
.asg B17, B_x_1
.asg B16, B_x_0
.asg B19, B_xh2_1i
.asg B18, B_xh2_0i
.asg B25, B_xl1_1i
.asg B24, B_xl1_0i
.asg B31, B_xl2_1i
.asg B30, B_xl2_0i
.asg A11, A_x_3
.asg A10, A_x_2
.asg A27, A_xh2_3i
.asg A26, A_xh2_2i
.asg A17, A_xl1_3i
.asg A16, A_xl1_2i
.asg A1, A_xl2_3i
.asg A0, A_xl2_2i ; same register as A_j
.asg B21, B_xh0_0
.asg B20, B_xl0_0
.asg B25, B_xh1_0 ; same register as B_xl1_1i
.asg B24, B_xl1_0 ; same register as B_xl1_0i
.asg A13, A_xh0_1
.asg A12, A_xl0_1
.asg A17, A_xh1_1 ; same register as A_xl1_3i
.asg A16, A_xl1_1 ; same register as A_xl1_2i
.asg B29, B_xh21_0
.asg B28, B_xl21_0
.asg B23, B_xh20_0
.asg B22, B_xl20_0
.asg A19, A_xh20_1
.asg A18, A_xl20_1
.asg A25, A_xh21_1
.asg A24, A_xl21_1
.asg A6, A_x_
.asg A5, A_x___
.asg B3, B_x__ ; same register as B_ret
.asg A3, A_x__
.asg B0, B_ifj
.asg B28, B_x_0o
.asg B29, B_x_1o
.asg A24, A_x_2o
.asg A25, A_x_3o
.asg B30, B_yt0_0
.asg B0, B_xt0_0 ; same register as B_ifj
.asg A24, A_yt0_1
.asg A26, A_xt0_1
.asg B31, B_xt1_0
.asg B30, B_xt2_0 ; same register as B_yt0_0
.asg B23, B_yt2_0
.asg B22, B_yt1_0
.asg A29, A_xt1_1
.asg A28, A_xt2_1
.asg A19, A_yt2_1
.asg A18, A_yt1_1
.asg B17, B_p2
.asg B16, B_p0
.asg B11, B_p1 ; same register as B_co20_si20
.asg B10, B_p3 ; same register as B_co10_si10
.asg B16, B_xh2_0o
.asg B17, B_xh2_1o
.asg A11, A_p6
.asg A10, A_p4
.asg A1, A_p5
.asg A0, A_p7
.asg A10, A_xh2_2o
.asg A11, A_xh2_3o
.asg B27, B_pa
.asg B26, B_p8
.asg B13, B_p9
.asg B12, B_pb
.asg B26, B_xl1_0o
.asg B27, B_xl1_1o
.asg A21, A_pe
.asg A20, A_pc ; same register as A_SP
.asg A23, A_pd ; same register as A_co11_si11
.asg A22, A_pf ; same register as A_co30_si30
.asg A20, A_xl1_2o
.asg A21, A_xl1_3o
.asg B22, B_co30_si30
.asg B21, B_p12
.asg B20, B_p10
.asg B19, B_p11
.asg B18, B_p13
.asg B20, B_xl2_0o
.asg B21, B_xl2_1o
.asg A21, A_p16
.asg A20, A_p14
.asg A17, A_p15
.asg A16, A_p17
.asg A16, A_xl2_2o
.asg A17, A_xl2_3o
; variables not used in inner loop
; NOTE(review): B_n and A_ptr_x are each bound twice in this section (B16 then
; B4, A10 then A6). Presumably the later .asg replaces the earlier one, so
; code after this point sees B_n = B4 and A_ptr_x = A6 -- confirm with the
; assembler documentation. B2 carries three names: B_radix2, B_wh, B_h2_old.
.asg B2, B_radix2
.asg B16, B_n ; re-bound to B4 below
.asg B24, B_i
.asg A10, A_ptr_x ; re-bound to A6 below
.asg B20, B_stride_temp
.asg A23, A_radix_temp
.asg B25, B_i_temp
.asg B14, B_stride
.asg B2, B_wh ; same register as B_radix2
.asg A2, A_ptr_w
.asg A15, A_tw_offset
.asg A14, A_radix
;global variables saved on stack
.asg B4, B_n ; second binding of B_n
.asg A6, A_ptr_x ; second binding of A_ptr_x
.asg B6, B_ptr_y
.asg B2, B_h2_old ; same register as B_radix2 / B_wh
.asg A18, A_h2_old
; ============================================================================
; Stack frame. Listed contents: A10..A15, B10..B14, B3, A_ptr_x, B_ptr_y, B_n
; The first STW below post-decrements B_SP by 16 words.
; NOTE(review): the original comment said 14 words, the list above has 15
; entries and the code reserves 16 -- confirm against the remaining
; saves/restores, which are outside this view.
; First execute packet: reserve the frame and derive values from the arguments.
STW .D2T2 B14, *B_SP--[16] ; Reserve stack, save B14
|| MV .L1 A4, A_ptr_w ; A_ptr_w = A4
|| SHRU .S2 B_n, 3, B_i ; B_i = B_n >> 3 (logical shift)
|| MV .L2 B_n, B_stride_temp ; B_stride_temp = B_n
|| MV .D1 A4, A_w0 ; A_w0 = A4 (second copy of the same value)
|| MPYSU .M2 6, B_n, B_fft_jmp ; B_fft_jmp = 6 * B_n (16-bit multiply)
; Second execute packet: uses B_i written by the SHRU in the previous packet.
STW .D2T2 B_n, *+B_SP[0] ; Save B_n
|| MVC .S2 B_i, RILC ; RILC = B_i
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -