* Least mean square (LMS) adaptive filter: FIR with N coefficients and M output samples
* (title of the web page this listing was copied from; viewer UI label removed).
**===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* LMSFIR8
*
* Revision Date: 4/17/97
*
* USAGE This routine is C Callable and can be called as:
*
* void lmsfir8(short *x,short *h,short *y,int N,short *d,
* short ar,short M)
*
* x = input array
* h = coefficient array
* y = output array
* N = number of coefficients (MULTIPLE of 8 >= 8)
* d = desired output array
* ar = adaptive rate
* M = number of output samples
*
* If routine is not to be used as a C callable function
* then all instructions relating to stack should be removed.
* Refer to comments of individual instructions. You will also
* need to initialize values for all of the values passed as these
* are assumed to be in registers as defined by the calling
* convention of the compiler, (refer to the C compiler reference
* guide).
*
* restrictions.
*
* Note that the assembly code is hand optimized and restrictions
* may apply
*
* void lmsfir8(short *x,short *h,short *y,int N,short *d,
* short ar,short M)
* {
* int i,j;
* int sum;
* short error = 0;
*
* for (i = 0; i < M; i++) {
*
* for (j = 0; j < N; j++) {
* h[j] = h[j] + ((((ar*error)>>15)*x[i-1+j])>>15);
* }
*
* sum = 0;
* for (j = 0; j < N; j++) {
* sum += h[j] * x[i+j];
* }
*
* sum >>= 15;
* *y++ = sum;
*
* error = d[i] - sum;
*
* }
* }
*
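* DESCRIPTION
* Produces M output samples of an N-tap FIR filter whose coefficients
* are adapted with the LMS rule. As in the C code above, before each
* output is computed every coefficient h[j] is updated with the error
* of the previous output (d - sum), scaled by the adaptive rate ar and
* by the input sample x[i-1+j]; the error is zero for the first output.
*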
* TECHNIQUES
* The inner loop is unrolled eight times to allow update of
* previous stages coefficients to occur in the same inner loop.
*
* ASSUMPTIONS
* N MULTIPLE of 8
*
* MEMORY NOTE
* This code has no memory hits regardless of where x and h are
* located in memory. h must start on a word boundary
*
* CYCLES M * ((9/8) * N + 15) + 5
*
*===============================================================================
;-----------------------------------------------------------------------
; _lmsfir8 entry: section setup and register save.
;
; Registers A10-A15, B10-B15 and the return address B3 are written to a
; 13-word static save area named `stack` (slot index = word offset):
;   slot  0 A15   1 B15   2 A14   3 B14   4 A13   5 B13
;   slot  6 A12   7 B12   8 A11   9 B11  10 A10  11 B10  12 B3
; The epilogue after B_END reloads the same slots.
; B15 is saved because the setup code later overwrites it with the
; adaptive rate (MV B8,B15).
; NOTE(review): a single static save area implies the routine is not
; reentrant - confirm nothing can re-enter or interrupt it.
; Clobbers here: A0, B0 (save-area base address).
;-----------------------------------------------------------------------
.global _lmsfir8
.bss stack, 52
.text
_lmsfir8:
MVK .S1 stack, A0 ; A0 = low 16 bits of the save-area address
|| MVK .S2 stack, B0 ; B0 = low 16 bits of the save-area address
MVKH .S1 stack, A0 ; A0 = full save-area address (high half filled in)
|| MVKH .S2 stack, B0 ; B0 = full save-area address (high half filled in)
STW .D1 A15, *+A0[0] ; save A15 in slot 0
|| STW .D2 B15, *+B0[1] ; save B15 in slot 1
STW .D1 A14, *+A0[2] ; save A14 in slot 2
|| STW .D2 B14, *+B0[3] ; save B14 in slot 3
STW .D1 A13, *+A0[4] ; save A13 in slot 4
|| STW .D2 B13, *+B0[5] ; save B13 in slot 5
STW .D1 A12, *+A0[6] ; save A12 in slot 6
|| STW .D2 B12, *+B0[7] ; save B12 in slot 7
STW .D1 A11, *+A0[8] ; save A11 in slot 8
|| STW .D2 B11, *+B0[9] ; save B11 in slot 9
STW .D1 A10, *+A0[10] ; save A10 in slot 10
|| STW .D2 B10, *+B0[11] ; save B10 in slot 11
STW .D1 B3, *+A0[12] ; save return address (B3) in slot 12
*** BEGIN Benchmark Timing ***
;-----------------------------------------------------------------------
; Setup: copy the incoming arguments into working registers and switch
; A4/B4 to circular addressing.
;
; Arguments as read by this code:
;   A4 = x, B4 = h, A6 = y, B6 = N, A8 = d, B8 = ar, A10 = M
; State on entry to KERNEL:
;   A2  = M - 1 (outer-loop counter)      B14 = N
;   B5  = h                               A1  = h + 1 word
;   A4  = B4 = x (both circular, BK0)     B15 = ar
;   A3  = 0, B3 = 0 (so the first error is 0)
; AMR is modified here and is not written again anywhere in this listing.
;-----------------------------------------------------------------------
B_START
MV .D1 A10,A2 ; A2 = M, outer loop count
|| MV .D2 B6,B14 ; B14 = N, kept for the whole routine
|| LMBD .L2 1,B6,B6 ; B6 = position of the leftmost 1 bit in N, counted from the MSB
|| MVK .S2 32,B0 ; B0 = 32, used to turn the LMBD result into an exponent
MV .D2 B4,B5 ; B5 = h (coefficient pointer, B side)
|| SUB .L2 B0,B6,B6 ; B6 = 32 - lmbd: N where 2^(N+1) is the circular block size
|| ZERO .L1 A3 ; Init desired to zero so Error = 0
|| MVK .S2 0101h,B9 ; mode bits selecting circular addressing for A4 and B4
SHL .S2 B6,16,B6 ; move the block-size exponent up to bit 16 (BK0 field)
|| MV .L2X A4,B4 ; B4 = x (input buffer pointer, B side)
|| MV .L1X B5,A1 ; A1 = h (coefficient pointer, A side)
ADD .L2 B6,B9,B9 ; B9 = mode bits + BK0 block size
MVC .S2 B9,AMR ; Set A4,B4, circular addr mode w/ BK0
|| ZERO .L2 B3 ; Init output to zero so Error = 0
|| MV .D2 B8,B15 ; B15 = ar (adaptive rate); B15 was saved at entry
|| ADDAW .D1 A1,1,A1 ; A1 = h + 1 word, so the A side reads the odd coefficient words
||[A2] SUB A2,1,A2 ; Decrement outer loop counter M
;-----------------------------------------------------------------------
; KERNEL: start of one output sample (outer-loop body) and prolog of the
; software-pipelined inner loop.
;
; Computes error = desired - previous output (A3 - B3), scales it by ar
; (B15) into B6/A5, clears the accumulators y0..y3 (B11, B12, A10, A12),
; then processes the first group of 8 coefficients while already issuing
; loads and multiplies for the next group.
;
; Naming used in the comments:
;   phK  = (ar*error) * x(i+K-1), the update term for h(K)
;   pshK = phK reduced to a 16-bit half so two terms can be packed into
;          one word and added with ADD2
;   pyK  = x(i+K) * h(K), a filter product
; A leading '*' in a comment appears to mark an instruction that belongs
; to the next pipelined group (inferred from the matching lines in
; OUTLOOP - confirm).
;
; Registers are reused while loads and multiplies are still in flight,
; so the order and grouping of the execute packets must not be changed.
;-----------------------------------------------------------------------
KERNEL:
LDH .D1 *++A4[3],A0 ; x(i+3)
|| LDH .D2 *B4++,B2 ; x(i)
|| SUB .L2x A3,B3,B1 ; Error = Desired - Output
|| SHR .S2 B14,3,B0 ; B0 = Number of Coefs / 8 (inner loop counter)
LDH .D1 *--A4,A3 ; x(i+2)
|| LDH .D2 *B4++[4],B3 ; x(i+1)
|| SMPY .M2 B1,B15,B6 ; Error*ar
|| ZERO B11 ; initialize y0 to zero
ZERO A10 ; initialize y2 to zero
|| LDH .D1 *--A4[3],A11 ; x(i-1) Previous state
LDH .D1 *++A4[7],A13 ; x(i+6)
|| LDH .D2 *B4--,B1 ; x(i+5)
|| SHR .S2 B6,16,B6 ; Q15 format ar*error
LDW .D1 *A1++[2],A11 ; h(3) & h(2)
|| LDW .D2 *B5++[2],B10 ; h(1) & h(0)
|| MV .L1X B6,A5 ; A-side copy of Q15 ar*error
SMPY .M2 B6,B2,B10 ; ph1 = ar * error * x(i)
|| SMPY .M1 A5,A0,A7 ; ph4 = ar * error * x(i+3)
|| LDW .D1 *A1++[2],A15 ; h(7) & h(6)
|| LDW .D2 *B5++[2],B7 ; h(5) & h(4)
SMPY .M2 B6,B3,B12 ; ph2 = ar * error * x(i+1)
|| SMPY .M1 A5,A3,A7 ; ph3 = ar * error * x(i+2)
|| LDH .D2 *B4++[4],B8 ; x(i+4)
|| ADDAH .D1 A4,1,A4 ; *x pointer update
CLR .S2 B10,0,15,B9 ; psh1 = ph1 w/ lower 16 bits cleared
|| SHRU .S1 A7,16,A14 ; psh4 = ph4 >> 16
|| SMPY .M1 A5,A11,A7 ; ph0 = ar * error * x(i-1) (A11 still holds the previous-state sample)
SHRU .S2 B12,16,B8 ; psh2 = ph2 >> 16
|| CLR .S1 A7,0,15,A15 ; psh3 = ph3 w/ lower 16 bits cleared
|| LDH .D1 *A4++[4],A11 ; x(i+7)
SMPY .M2 B6,B1,B9 ; ph6 = ar * error * x(i+5)
|| SMPY .M1 A5,A13,A9 ; ph7 = ar * error * x(i+6)
|| ADD .L1X A15,B8,A12 ; ph3 & ph2 packed into one word
|| LDH .D1 *A4--,A0 ;* x(i+3)
|| LDH .D2 *B4++,B2 ;* x(i)
|| SHRU .S2X A7,16,B12 ; psh0 = ph0 >> 16
ADD2 .S1 A12,A11,A7 ; h(3) += ph3 & h(2) += ph2
|| LDH .D1 *A4++[4],A3 ;* x(i+2)
|| LDH .D2 *B4++[4],B3 ;* x(i+1)
|| ADD .L2 B9,B12,B13 ; ph1 & ph0 packed into one word
ADD2 .S2 B13,B10,B13 ; h(1) += ph1 & h(0) += ph0
|| SMPY .M2 B6,B8,B9 ; ph5 = ar * error * x(i+4)
|| CLR .S1 A9,0,15,A9 ; psh7 = ph7 w/ lower 16 bits cleared
|| STW .D1 A7,*-A1[4] ; store h(3) & h(2)
||[B0] ADD .L2 -1,B0,B0 ; dec loop counter
|| MPY .M1 A3,A7,A7 ; py2 = x(i+2) * h(2)
|| ZERO A12 ; initialize y3 to zero
|| ZERO B12 ; initialize y1 to zero
SHRU .S2 B9,16,B10 ; psh6 = ph6 >> 16
|| MPYLH .M1 A0,A7,A9 ; py3 = x(i+3) * h(3)
|| MPY .M2 B2,B13,B10 ; py0 = x(i) * h(0)
|| LDH .D1 *A4++,A13 ;* x(i+6)
|| LDH .D2 *B4--,B1 ;* x(i+5)
||[!B0] B END ; if # coefs is 8 skip over loop (taken after the next 5 packets)
ADD A7,A10,A10 ; y2 += py2,
|| CLR .S2 B9,0,15,B9 ; psh5 = ph5 w/ lower 16 bits cleared
|| MPYLH .M2 B3,B13,B10 ; py1 = x(i+1) * h(1)
|| ADD .L1X A9,B10,A7 ; ph7 & ph6 packed into one word
|| LDW .D1 *A1++[2],A11 ;* h(3) & h(2)
|| SMPY .M1 A5,A11,A7 ;* ph0 = ar * error * x(i-1) (= x(i+7) of the current group)
|| LDW .D2 *B5++[2],B10 ;* h(1) & h(0)
ADD B10,B11,B11 ; y0 += py0,
|| ADD A9,A12,A9 ; y3 += py3,
|| ADD2 .S1 A7,A15,A15 ; h(7) += ph7 & h(6) += ph6
|| ADD .L2X B9,A14,B9 ; ph5 & ph4 packed into one word
|| SMPY .M2 B6,B2,B10 ;* ph1 = ar * error * x(i)
|| SMPY .M1 A5,A0,A7 ;* ph4 = ar * error * x(i+3)
|| LDW .D1 *A1++[2],A15 ;* h(7) & h(6)
|| LDW .D2 *B5++[2],B7 ;* h(5) & h(4)
ADD B10,B12,B9 ; y1 += py1,
|| ADD2 .S2 B9,B7,B7 ; h(5) += ph5 & h(4) += ph4
|| SMPY .M2 B6,B3,B12 ;* ph2 = ar * error * x(i+1)
|| SMPY .M1 A5,A3,A7 ;* ph3 = ar * error * x(i+2)
|| SHRU .S1 A7,16,A12 ;* psh0 = ph0 >> 16
|| LDH .D2 *B4++[4],B8 ;* x(i+4)
|| LDH .D1 *A4++[4],A11 ;* x(i+7)
STW .D2 B13,*-B5[8] ; store h(1) & h(0)
|| STW .D1 A15,*-A1[6] ; store h(7) & h(6)
|| MPY .M1 A13,A15,A7 ; py6 = x(i+6) * h(6)
|| MPYLH .M2 B1,B7,B7 ; py5 = x(i+5) * h(5)
|| CLR .S2 B10,0,15,B10 ;* psh1 = ph1 w/ lower 16 bits cleared
|| SHRU .S1 A7,16,A14 ;* psh4 = ph4 >> 16
STW .D2 B7,*-B5[6] ; store h(5) & h(4)
|| MPYLH .M1 A11,A15,A7 ; py7 = x(i+7) * h(7)
|| MPY .M2 B8,B7,B8 ; py4 = x(i+4) * h(4)
|| SHRU .S2 B12,16,B8 ;* psh2 = ph2 >> 16
|| CLR .S1 A7,0,15,A15 ;* psh3 = ph3 w/ lower 16 bits cleared
|| ADD .L2X B10,A12,B13 ;* ph1 & ph0 packed into one word
;-----------------------------------------------------------------------
; OUTLOOP: steady-state body of the software-pipelined inner loop.
;
; Each pass finishes the accumulation for one group of 8 coefficients
; (unmarked comments), updates, stores and multiplies the next group
; (comments marked '*'), and issues loads and update multiplies for the
; group after that (comments marked '**'). The markers are inferred from
; the comment pattern - confirm.
;
; B0 counts the remaining groups; the branch back to OUTLOOP is issued
; in the fourth packet and the last five packets of the body execute in
; its delay slots. Execution falls through to END when B0 reaches 0.
; Packet order and register assignments are timing-critical; do not
; reorder.
;-----------------------------------------------------------------------
OUTLOOP:
ADD B7,B9,B12 ; y1 += py5,
|| ADD A7,A10,A10 ; y2 += py6,
|| SMPY .M2 B6,B1,B9 ;* ph6 = ar * error * x(i+5)
|| SMPY .M1 A5,A13,A9 ;* ph7 = ar * error * x(i+6)
|| ADD .L1X A15,B8,A12 ;* ph3 & ph2 packed into one word
|| LDH .D1 *A4--,A0 ;** x(i+3)
|| LDH .D2 *B4++,B2 ;** x(i)
ADD B8,B11,B11 ; y0 += py4,
|| ADD A7,A9,A12 ; y3 += py7,
|| ADD2 .S1 A12,A11,A7 ;* h(3) += ph3 & h(2) += ph2
|| LDH .D1 *A4++[4],A3 ;** x(i+2)
|| LDH .D2 *B4++[4],B3 ;** x(i+1)
ADD2 .S2 B13,B10,B13 ;* h(1) += ph1 & h(0) += ph0
|| SMPY .M2 B6,B8,B9 ;* ph5 = ar * error * x(i+4)
|| CLR .S1 A9,0,15,A9 ;* psh7 = ph7 w/ lower 16 bits cleared
|| STW .D1 A7,*-A1[4] ;* store h(3) & h(2)
||[B0] ADD .L2 -1,B0,B0 ;* dec loop counter
|| MPY .M1 A3,A7,A7 ;* py2 = x(i+2) * h(2)
[B0] B .S1 OUTLOOP ; loop while groups remain (taken after the next 5 packets)
|| SHRU .S2 B9,16,B10 ;* psh6 = ph6 >> 16
|| MPYLH .M1 A0,A7,A9 ;* py3 = x(i+3) * h(3)
|| MPY .M2 B2,B13,B10 ;* py0 = x(i) * h(0)
|| LDH .D1 *A4++,A13 ;** x(i+6)
|| LDH .D2 *B4--,B1 ;** x(i+5)
ADD A7,A10,A10 ;* y2 += py2,
|| CLR .S2 B9,0,15,B9 ;* psh5 = ph5 w/ lower 16 bits cleared
|| MPYLH .M2 B3,B13,B10 ;* py1 = x(i+1) * h(1)
|| ADD .L1X A9,B10,A7 ;* ph7 & ph6 packed into one word
|| LDW .D1 *A1++[2],A11 ;** h(3) & h(2)
|| SMPY .M1 A5,A11,A7 ;** ph0 = ar * error * x(i-1) (= x(i+7) of the preceding group)
|| LDW .D2 *B5++[2],B10 ;** h(1) & h(0)
ADD B10,B11,B11 ;* y0 += py0,
|| ADD A9,A12,A9 ;* y3 += py3,
|| ADD2 .S1 A7,A15,A15 ;* h(7) += ph7 & h(6) += ph6
|| ADD .L2X B9,A14,B9 ;* ph5 & ph4 packed into one word
|| SMPY .M2 B6,B2,B10 ;** ph1 = ar * error * x(i)
|| SMPY .M1 A5,A0,A7 ;** ph4 = ar * error * x(i+3)
|| LDW .D1 *A1++[2],A15 ;** h(7) & h(6)
|| LDW .D2 *B5++[2],B7 ;** h(5) & h(4)
ADD B10,B12,B9 ;* y1 += py1,
|| ADD2 .S2 B9,B7,B7 ;* h(5) += ph5 & h(4) += ph4
|| SMPY .M2 B6,B3,B12 ;** ph2 = ar * error * x(i+1)
|| SMPY .M1 A5,A3,A7 ;** ph3 = ar * error * x(i+2)
|| SHRU .S1 A7,16,A12 ;** psh0 = ph0 >> 16
|| LDH .D2 *B4++[4],B8 ;** x(i+4)
|| LDH .D1 *A4++[4],A11 ;** x(i+7)
STW .D2 B13,*-B5[8] ;* store h(1) & h(0)
|| STW .D1 A15,*-A1[6] ;* store h(7) & h(6)
|| MPY .M1 A13,A15,A7 ;* py6 = x(i+6) * h(6)
|| MPYLH .M2 B1,B7,B7 ;* py5 = x(i+5) * h(5)
|| CLR .S2 B10,0,15,B10 ;** psh1 = ph1 w/ lower 16 bits cleared
|| SHRU .S1 A7,16,A14 ;** psh4 = ph4 >> 16
STW .D2 B7,*-B5[6] ;* store h(5) & h(4)
|| MPYLH .M1 A11,A15,A7 ;* py7 = x(i+7) * h(7)
|| MPY .M2 B8,B7,B8 ;* py4 = x(i+4) * h(4)
|| SHRU .S2 B12,16,B8 ;** psh2 = ph2 >> 16
|| CLR .S1 A7,0,15,A15 ;** psh3 = ph3 w/ lower 16 bits cleared
|| ADD .L2X B10,A12,B13 ;** ph1 & ph0 packed into one word
;-----------------------------------------------------------------------
; END: tail of one output sample.
;
; Adds the last products into y0..y3, sums them, shifts right by 15 and
; stores the 16-bit result through A6 (*y++). In parallel it loads the
; next desired sample through A8 (*d++) and rewinds the pointers:
;   B5 -= N halfwords, then -= 4 words   -> B5 back at h
;   A1  = B5 + 1 word
;   B4 -= N halfwords, then -= 7 halfwords; the loads of one pass moved
;        it forward N+8 halfwords, so the net step is +1 sample
;   A4  = B4
; The branch back to KERNEL is issued in the first packet; the five
; packets that follow it are its delay slots, so all of END executes on
; every pass. When A2 is 0 execution falls through to B_END.
;-----------------------------------------------------------------------
END:
ADD .L2 B7,B9,B12 ; y1 += py5,
|| ADD .L1 A7,A10,A10 ; y2 += py6,
||[A2] B .S2 KERNEL ; next output sample while A2 != 0
|| SUBAH .D2 B5,B14,B5 ; rewind *h pointer by N halfwords
|| LDH .D1 *A8++,A3 ; Load Desired (*d++) for the next error
ADD B8,B11,B11 ; y0 += py4,
|| ADD A7,A9,A12 ; y3 += py7,
|| SUBAH .D2 B4,B14,B4 ; rewind *x pointer by N halfwords
||[A2] SUB A2,1,A2 ; Decrement outer loop counter M
ADD B11,B12,B12 ; y01 = y0 + y1
|| ADD A10,A12,A12 ; y23 = y2 + y3
|| SUBAW .D2 B5,4,B5 ; rewind *h pointer by 4 more words (look-ahead loads)
ADD A12,B12,B3 ; sum = y0 + y1 + y2 + y3
|| MV .L1X B5,A1 ; copy *h pointer to the A side
|| SUBAH .D2 B4,7,B4 ; rewind *x pointer by 7 more halfwords (net +1 sample)
SHR .S2 B3,15,B3 ; sum >>= 15
|| ADDAW .D1 A1,1,A1 ; A1 = h + 1 word, as on first entry to KERNEL
STH .D1 B3,*A6++ ; Store output to *y++
|| MV .L1X B4,A4 ; copy *x pointer to the A side
B_END:
*** END Benchmark Timing ***
;-----------------------------------------------------------------------
; Epilogue (outside the timed region).
;
;   1. Reset AMR to 0. The setup code put A4 and B4 into circular
;      addressing mode; C code expects linear addressing when an
;      assembly routine returns, so the mode must be undone here.
;      ZERO and MVC occupy unit slots that were free in the existing
;      packets, so no cycles are added.
;   2. Reload the return address and A10-A15 / B10-B15 from the static
;      save area `stack`, using the same slot numbers as the entry code:
;        slot  0 A15   1 B15   2 A14   3 B14   4 A13   5 B13
;        slot  6 A12   7 B12   8 A11   9 B11  10 A10  11 B10  12 B3
;   3. Return through B3. B3 is loaded five packets before the branch
;      reads it, and the last load packet plus NOP 4 fill the branch's
;      five delay slots, so every load has completed on return.
;
; Only A8/B8 are used as base registers here; they are not affected by
; AMR. Clobbers: A8, B8, B0.
;-----------------------------------------------------------------------
        MVK     .S1     stack, A8           ; A8 = low 16 bits of the save-area address
||      MVK     .S2     stack, B8           ; B8 = low 16 bits of the save-area address
||      ZERO    .L2     B0                  ; B0 = 0, the linear-addressing value for AMR
        MVKH    .S1     stack, A8           ; A8 = full save-area address
||      MVKH    .S2     stack, B8           ; B8 = full save-area address
        LDW     .D1     *+A8[12], B3        ; reload return address from slot 12
||      MVC     .S2     B0, AMR             ; AMR = 0: A4/B4 back to linear addressing
        LDW     .D1     *+A8[0], A15        ; restore A15 from slot 0
||      LDW     .D2     *+B8[1], B15        ; restore B15 from slot 1
        LDW     .D1     *+A8[2], A14        ; restore A14 from slot 2
||      LDW     .D2     *+B8[3], B14        ; restore B14 from slot 3
        LDW     .D1     *+A8[4], A13        ; restore A13 from slot 4
||      LDW     .D2     *+B8[5], B13        ; restore B13 from slot 5
        LDW     .D1     *+A8[6], A12        ; restore A12 from slot 6
||      LDW     .D2     *+B8[7], B12        ; restore B12 from slot 7
        LDW     .D1     *+A8[8], A11        ; restore A11 from slot 8
||      LDW     .D2     *+B8[9], B11        ; restore B11 from slot 9
||      B       .S2     B3                  ; return; B3 load has completed by now
        LDW     .D1     *+A8[10], A10       ; restore A10 from slot 10 (delay slot 1)
||      LDW     .D2     *+B8[11], B10       ; restore B10 from slot 11 (delay slot 1)
        NOP     4                           ; delay slots 2-5; outstanding loads land
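* ---------------------------------------------------------------------
* The lines below are not part of the program. They are the keyboard
* shortcut legend of the web code viewer this listing was copied from
* (originally in Chinese, mis-encoded in the copy):
*   Keyboard shortcuts
*   Copy code             Ctrl + C
*   Search code           Ctrl + F
*   Full-screen mode      F11
*   Switch theme          Ctrl + Shift + D
*   Show shortcuts        ?
*   Increase font size    Ctrl + =
*   Decrease font size    Ctrl + -
* ---------------------------------------------------------------------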