?? fast hadamard transform.txt
字號:
*===========================================================
*
* TEXAS INSTRUMENTS, INC.
*
* NAME: FHT (Fast Hadamard Transform)
*
* Revision Date: 04/01/98
* Author: Jelena Nikolic-Popovic, DSP Apps. Canada
*
* USAGE
* This routine is C-callable and can be called as:
*
* void fht(short *data)
*
* data = 64-point input vector on entry
* 64-point FHT of the input vector on exit
*
* C CODE
* This is the C equivalent of the assembly code
* without restrictions:
* Note that the assembly code is hand optimized
* and restrictions may apply.
*
* void fht(short *data)
* {
* short tmp,k,i,count,j,n;
*
* for (k=0;k<6;k++) {
* n=32>>k;
* for (i=0;i<n;i++) {
* tmp=data[i]+data[i+n];
* data[i+n]=data[i]-data[i+n];
* data[i]=tmp;
* }
* count=(2<<(k-1))-1;
* if (count>0) {
* for (j=1;j<=count;j++) {
* for (i=n*j*2;i<n*(j*2+1);i++) {
* tmp=data[i]+data[i+n];
* data[i+n]=data[i]-data[i+n];
* data[i]=tmp;
* }
* } // end of for j=1:count
* } // endif
* } // end of for k=0:5
* }
*
*
* DESCRIPTION
* The routine replaces the 64-point input data
* vector with its FHT (Fast Hadamard Transform).
* The FHT is an efficient way to multiply a vector
* by the Hadamard matrix. In IS-95, the size of
* input vector is 64 points and Hadamard matrix
* H6 is a 64 x 64 matrix defined recursively as:
*
* H0 = [1]
* H1 = H0 H0 = 1 1
* H0 -H0 1 -1
*
* ...
*
* H6 = H5 H5
* H5 -H5
*
*
* FHT reduces the vector * matrix multiplication
* task to butterfly computation, similar to
* Viterbi.
* For multiplication by H6, there are 6 stages of
* 32 butterflies each. From one stage to the next,
* the butterfly "span" is divided by 2 (from 32
* points in the first stage to 2 points in the
* last stage).
* The algorithm requires:
* 6 stages * 32 butterflies/stage * 2 op's/butterfly
* = 384 add/subtract operations
*
*
* TECHNIQUES
*
* The algorithm is optimized for speed of execution
* rather than code size.
*
* A potential bottleneck in the C6x implementation
* is in the requirement to store/load intermediate
* 64-point array after each stage.
* This was overcome by making following modification
* to the H6 - FHT algorithm:
*
* Multiplication of input vector d[64] by the H6
* matrix ( 64x64 ) is broken down in two
* multiplications by H5 (32x32 matrix) as follows:
* d[64] = [ d1[32] d2[32] ], i.e. split the input
* vector in two 32-point vectors
* H6*d[64] = H5*d1[32] + H5*d2[32]
* H5*d1[32] - H5*d2[32]
* First, H5*d1[32] is calculated using the butterfly
* method. Since there are only 32 intermediate
* results of type short, they can be stored in
* 16 registers - no need to store out after each
* stage and then load in for the next one.
* Next, H5*d1[32] will be stored out to memory and
* H5*d2[32] will be calculated in the same manner as
* H5*d1[32]. Finally, H6*d[64] is computed from
* H5*d1[32] and H5*d2[32] as shown above.
*
* ASSUMPTIONS
* - interrupts are disabled
* - the range of data in the input vector is such
* that the data in the output vector will not
* overflow 16 bits. Since there are 6 stages of
* add/sub's, this means that the range of input
* data should not be more than 10 bits.
* ( in the worst case).
* - input vector is aligned on 4-byte boundary.
*
* MEMORY NOTE
* no memory bank hits.
* CYCLES
* 191 cycles (including C calling overhead)
*
*===========================================================
.global _fht
.text
_fht:
; save context
sub.l1x b15,4,a7 ; copy stack pointer
|| stw.d2 b3,*B15--[2]
stw.d1 a10,*a7--[2] ; push a10
|| stw.d2 b10,*b15--[2]; push b10
stw.d1 a11,*A7--[2] ; push a11
|| stw.d2 b11,*b15--[2]; push b11
stw.d1 a12,*a7--[2] ; push a12
|| stw.d2 b12,*b15--[2]; push b12
stw.d1 a13,*a7--[2] ; push a13
|| stw.d2 b13,*b15--[2]; push b13
stw.d1 a14,*a7 ; push a14
|| stw.d2 b14,*b15 ; push b14
; intialization
mv.l1 a4,a14 ; a14 = &d[0]
|| mvk 32,a13 ; a13 = 32
|| mvk 32,b13 ; b13 = 32
mvk 2,b13 ; b13 = 00010002
|| mvk 0ffffh,a15 ; a15 = 0000ffff
|| add.l2 a14,b13,b14 ; b14 = &d(16)
mvklh 1,b13 ; b13 = 00010002
|| mvklh 0,a15 ; a15 = 0000ffff
; H5 loop computes H5*d1(32) and H5*d2(32)
; Notation : di(), where i=0,1,2,3,4,5,
; are the intermediate vectors after stage i
H5_loop:
ldw.d1 *a14++,b0 ; d0(0) and d0(1)
ldw.d2 *b14++,a0 ; d0(16) and d0(17)
|| ldw.d1 *a14++,b1 ; d0(2) and d0(3)
ldw.d2 *b14++,a1 ; d0(18) and d0(19)
|| ldw.d1 *a14++,b2 ; d0(4) and d0(5)
ldw.d2 *b14++,a2 ; d0(20) and d0(21)
|| ldw.d1 *a14++,b3 ; d0(6) and d0(7)
ldw.d2 *b14++,a3 ; d0(22) and d0(23)
|| ldw.d1 *a14++,b4 ; d0(8) and d0(9)
ldw.d2 *b14++,a4 ; d0(24) and d0(25)
|| ldw.d1 *a14++,b5 ; d0(10) and d0(11)
ldw.d2 *b14++,a5 ; d0(26) and d0(27)
|| add2.s1 a0,b0,a0 ; d1(0) and d1(1)
|| sub2.s2 b0,a0,b0 ; d1(16) and d1(17)
|| ldw.d1 *a14++,b6 ; d0(12) and d0(13)
ldw.d2 *b14++,a6 ; d0(28) and d0(29)
|| add2.s1 a1,b1,a1 ; d1(2) and d1(3)
|| sub2.s2 b1,a1,b1 ; d1(18 and d1(19)
|| ldw.d1 *a14++,b7 ; d0(14) and d0(15)
ldw.d2 *b14++,a7 ; d0(30) and d0(31)
|| add2.s1 a2,b2,a2 ; d1(4) and d1(5)
|| sub2.s2 b2,a2,b2 ; d1(20) and d1(21)
add2.s1 a3,b3,a3 ; d1(6) and d1(7)
|| sub2.s2 b3,a3,b3 ; d(22) and d(23)
add2.s1 a4,b4,a4 ; d1(8) and d1(9)
|| sub2.s2 b4,a4,b4 ; d1(24) and d1(25)
add2.s1 a5,b5,a5 ; d1(10) and d1(11)
|| sub2.s2 b5,a5,b5 ; d1(26) and d1(27)
add2.s1 a6,b6,a6 ; d1(12) and d1(13)
|| sub2.s2 b6,a6,b6 ; d1(28) and d1(29)
add2.s1 a7,b7,a7 ; d1(14) and d1(15)
|| sub2.s2 b7,a7,b7 ; d1(30) and d1(31)
; End of 1st stage
; Start of 2nd stage
sub2.s1 a0,a4,a8 ; d2(8) and d2(9)
|| sub2.s2 b0,b4,b8 ; d2(24) and d2(25)
add2.s1 a0,a4,a0 ; d2(0) and d2(1)
|| add2.s2 b0,b4,b0 ; d2(16) and d2(17)
sub2.s1 a2,a6,a4 ; d2(12) and d2(13)
|| sub2.s2 b2,b6,b4 ; d2(28) and d2(29)
add2.s1 a2,a6,a2 ; d2(4) and d2(5)
|| add2.s2 b2,b6,b2 ; d2(20) and d2(21)
sub2.s1 a1,a5,a6 ; d2(10) and d2(11)
|| sub2.s2 b1,b5,b6 ; d2(26) and d2(27)
add2.s1 a1,a5,a1 ; d2(2) and d2(3)
|| add2.s2 b1,b5,b1 ; d2(18) and d2(19)
sub2.s1 a3,a7,a5 ; d2(14) and d2(15)
|| sub2.s2 b3,b7,b5 ; d2(30) and d2(31)
add2.s1 a3,a7,a3 ; d2(6) and d2(7)
|| add2.s2 b3,b7,b3 ; d2(22) and d2(23)
; End of 2nd stage
; Start of 3rd stage
sub2.s1 a0,a2,a7 ; d3(4) and d3(5)
|| sub2.s2 b0,b2,b7 ; d3(20) and d3(21)
add2.s1 a0,a2,a0 ; d3(0) and d3(1)
|| add2.s2 b0,b2,b0 ; d3(16) and d3(17)
sub2.s1 a1,a3,a2 ; d3(6) and d3(7)
|| sub2.s2 b1,b3,b2 ; d3(22) and d3(23)
add2.s1 a1,a3,a1 ; d3(2) and d3(3)
|| add2.s2 b1,b3,b1 ; d3(18) and d3(19)
sub2.s1 a8,a4,a3 ; d3(12) and d3(13)
|| sub2.s2 b8,b4,b3 ; d3(28) and d3(29)
add2.s1 a8,a4,a4 ; d3(8 and d3(9)
|| add2.s2 b8,b4,b4 ; d3(24) and d3(25)
sub2.s1 a6,a5,a8 ; d3(14) and d3(15)
|| sub2.s2 b6,b5,b8 ; d3(30) and d3(31)
add2.s1 a6,a5,a5 ; d3(10) and d3(11)
|| add2.s2 b6,b5,b5 ; d3(26 and d3(27)
; End of 3nd stage
; Start of 4th stage ( pipelined with 5th stage)
sub2.s1 a0,a1,a6 ; d4(2) and d4(3)
|| sub2.s2 b0,b1,b6 ; d4(18) and d4(19)
add2.s1 a0,a1,a0 ; d4(0) and d4(1)
|| add2.s2 b0,b1,b0 ; d4(16) and d4(17)
|| sub.l1 a14,a13,a14 ; adjust store pointers
|| sub.l2 b14,a13,b14
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -