?? fir filter coefficients x4 output samples x2.txt
字號:
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* FIR4
*
* Revision Date: 4/17/97
*
* USAGE This routine is C Callable and can be called as:
*
* void fir4(short *x, short *h, short *y, int N, int M)
*
* x = input array
* h = coefficient array
* y = output array
* N = number of coefficients (MULTIPLE of 4 >= 8)
* M = number of output samples (M EVEN >= 2)
*
* If routine is not to be used as a C callable function
* then all instructions relating to stack should be removed.
* Refer to comments of individual instructions. You will also
* need to initialize values for all of the values passed as these
* are assumed to be in registers as defined by the calling
* convention of the compiler, (refer to the C compiler reference
* guide).
*
* C Code This is the C equivalent of the Assembly Code without
* restrictions.
*
* Note that the assembly code is hand optimized and restrictions
* may apply
*
* void fir4(short x[], short h[], short y[], int N, int M)
* {
* int i, j, sum;
*
* for (j = 0; j < M; j++) {
* sum = 0;
* for (i = 0; i < N; i++)
* sum += x[i + j] * h[i];
* y[j] = sum >> 15;
* }
* }
*
* DESCRIPTION
* This FIR assumes the number of filter coefficients is a multiple
* of 4 and the number of output samples is a multiple of 2. It
* operates on 16-bit data with a 32-bit accumulate. This
* routine has no memory hits regardless of where x, h, and y
* arrays are located in memory. The filter is M output samples
* and N coefficients. The assembly routine performs 2 output
* samples at a time.
*
*
* TECHNIQUES
* The inner loop is unrolled four times thus the number of
* filter coefficients must be a multiple of four. The outer
* loop is unrolled twice so the number of output samples must
* be a multiple of 2.
*
* If an odd number of output samples is needed or possible, the
* final store can either be removed or conditionally executed
* depending on whether M is even or odd. This code would have to
* be added to the existing code.
*
* The outer loop, like the inner loop, is software pipelined as
* well. e, o, and p in the comments of the individual
* instructions correspond to the epilogue, outer loop, and
* prologue respectively.
*
* Refer to FIR example in the optimizing assembly chapter of
* the programmer's guide for more information.
*
*
* ASSUMPTIONS
* N MULTIPLE of 4 >= 8
* M EVEN >= 2
*
*
* MEMORY NOTE
* This code has no memory hits regardless of where x and h are
* located in memory.
*
* CYCLES M*(N+8)/2+6
*
*===============================================================================
*-------------------------------------------------------------------------------
* _fir4 : 16-bit FIR, 4 taps per inner-loop pass, 2 output samples per
*         outer-loop pass.
*
* Registers on entry (as used by the code below):
*       A4 = x (input)     B4 = h (coefficients)    A6 = y (output)
*       B6 = N (taps)      A8 = M (output samples)  B3 = return address
*
* Register roles inside the routine:
*       A2      outer loop counter (M / 2), also used as branch predicate
*       B12     N / 4;  B2 = inner loop counter derived from it
*       A3      2*N + 10, byte count subtracted from A4 to rewind x
*       B10     2*N + 8,  byte count subtracted from B4 to rewind h
*       A4, B1  x read pointers (B1 starts two elements ahead of A4)
*       B4, A8  h read pointers (even / odd coefficients)
*       A6, B11 y write pointers (y[j] / y[j+1])
*       A9, B9  accumulators sum0 / sum1
*       B10-B12 are pushed through B15 on entry and popped before return.
*
* Comment prefixes: '*' marks how many iterations ahead a load or multiply is
* running; e = epilogue, o = outer loop, p = prologue of the next outer pass.
*
* NOTE(review): the loads run ahead of the multiplies and are never
* predicated, so the routine reads past the data it uses. Counting the
* post-increments, the highest elements touched are x[M+N+4] and h[N+3];
* confirm that this padding is readable wherever the arrays are placed.
*-------------------------------------------------------------------------------
        .global _fir4
        .text
_fir4:
        STW     .D2     B10,*B15--      ; push register (for c-callable func)

*** BEGIN Benchmark Timing ***
B_START:
        STW     .D2     B11,*B15--      ; push register (for c-callable func)
||      SHR     .S1     A8,1,A2         ; set up outer loop counter (M / 2)
||      SHL     .S2     B6,1,B10        ; 2*N: used to rst h pointer each outer loop

        STW     .D2     B12,*B15--      ; push register (for c-callable func)
||      ADD     .L1X    B10,10,A3       ; 2*N+10: used to rst x pointer each outer loop
||      ADD     .S2     B10,8,B10       ; 2*N+8: used to rst h pointer each outer loop
||      ADD     .L2X    A6,2,B11        ; set up pointer to y[1]

* Prologue of the first outer pass (same loads as the 'p' instructions that
* are interleaved with the epilogue further down).
        LDH     .D1     *A4++,B8        ; x0 = x[j]
||      ADD     .L2X    A4,4,B1         ; set up pointer to x[j+2]
||      ADD     .L1X    B4,2,A8         ; set up pointer to h[1] (M in A8 already consumed)
||      SHR     .S2     B6,2,B12        ; set up inner loop counter (N / 4)
||[A2]  SUB     .S1     A2,1,A2         ; decrement outer loop counter

        LDH     .D2     *B1++[2],B0     ; x2 = x[j+i+2]
||      LDH     .D1     *A4++[2],A0     ; x1 = x[j+i+1]

        LDH     .D1     *A8++[2],B6     ; h1 = h[i+1] (N in B6 already consumed)
||      LDH     .D2     *B4++[2],A1     ; h0 = h[i]

        LDH     .D1     *A4++[2],A5     ; x3 = x[j+i+3]
||      LDH     .D2     *B1++[2],B5     ; x0 = x[j+i+4]

* Outer loop entry: clear both accumulators and prime the inner-loop pipeline.
OUTLOOP:
        LDH     .D2     *B4++[2],A7     ; h2 = h[i+2]
||      LDH     .D1     *A8++[2],B8     ; h3 = h[i+3]
||      ZERO    .L1     A9              ; zero out sum0
||      ZERO    .L2     B9              ; zero out sum1

        LDH     .D2     *B1++[2],B0     ;* x2 = x[j+i+2]
||      LDH     .D1     *A4++[2],A0     ;* x1 = x[j+i+1]
||      SUB     .S2     B12,2,B2        ; set up inner loop counter (N/4 - 2)

        LDH     .D1     *A8++[2],B6     ;* h1 = h[i+1]
||      LDH     .D2     *B4++[2],A1     ;* h0 = h[i]

        MPY     .M1X    B8,A1,A0        ; x0 * h0 (B8 still holds x[j] here)
||      MPY     .M2X    A0,B6,B6        ; x1 * h1
||      LDH     .D1     *A4++[2],A5     ;* x3 = x[j+i+3]
||      LDH     .D2     *B1++[2],B5     ;* x0 = x[j+i+4]

  [B2]  B       .S1     LOOP            ; branch to loop
||      MPY     .M2     B0,B6,B7        ; x2 * h1
||      MPY     .M1     A0,A1,A1        ; x1 * h0
||      LDH     .D2     *B4++[2],A7     ;* h2 = h[i+2]
||      LDH     .D1     *A8++[2],B8     ;* h3 = h[i+3]
||[B2]  SUB     .S2     B2,1,B2         ;* decrement loop counter

        ADD     .L1     A0,A9,A9        ; sum0 += x0 * h0
||      MPY     .M2X    A5,B8,B8        ; x3 * h3
||      MPY     .M1X    B0,A7,A5        ; x2 * h2
||      LDH     .D2     *B1++[2],B0     ;** x2 = x[j+i+2]
||      LDH     .D1     *A4++[2],A0     ;** x1 = x[j+i+1]

* Inner loop kernel: four execute packets, four taps for each of the two
* output samples per pass.
LOOP:
        ADD     .L2X    A1,B9,B9        ; sum1 += x1 * h0
||      ADD     .L1X    B6,A9,A9        ; sum0 += x1 * h1
||      MPY     .M2     B5,B8,B7        ; x0 * h3
||      MPY     .M1     A5,A7,A7        ; x3 * h2
||      LDH     .D1     *A8++[2],B6     ;** h1 = h[i+1]
||      LDH     .D2     *B4++[2],A1     ;** h0 = h[i]

        ADD     .L2     B7,B9,B9        ; sum1 += x2 * h1
||      ADD     .L1     A5,A9,A9        ; sum0 += x2 * h2
||      MPY     .M1X    B5,A1,A0        ;* x0 * h0
||      MPY     .M2X    A0,B6,B6        ;* x1 * h1
||      LDH     .D1     *A4++[2],A5     ;** x3 = x[j+i+3]
||      LDH     .D2     *B1++[2],B5     ;** x0 = x[j+i+4]

        ADD     .L2X    A7,B9,B9        ; sum1 += x3 * h2
||      ADD     .L1X    B8,A9,A9        ; sum0 += x3 * h3
||[B2]  B       .S1     LOOP            ;* branch to loop
||      MPY     .M2     B0,B6,B7        ;* x2 * h1
||      MPY     .M1     A0,A1,A1        ;* x1 * h0
||      LDH     .D2     *B4++[2],A7     ;** h2 = h[i+2]
||      LDH     .D1     *A8++[2],B8     ;** h3 = h[i+3]
||[B2]  SUB     .S2     B2,1,B2         ;** decrement loop counter

        ADD     .L2     B7,B9,B9        ; sum1 += x0 * h3
||      ADD     .L1     A0,A9,A9        ;* sum0 += x0 * h0
||      MPY     .M2X    A5,B8,B8        ;* x3 * h3
||      MPY     .M1X    B0,A7,A5        ;* x2 * h2
||      LDH     .D2     *B1++[2],B0     ;*** x2 = x[j+i+2]
||      LDH     .D1     *A4++[2],A0     ;*** x1 = x[j+i+1]
                                        ; inner loop branch occurs here

* Epilogue of the inner loop, overlapped with the pointer rewind, the outer
* loop branch and the prologue loads of the next outer pass.
        ADD     .L2X    A1,B9,B9        ;e sum1 += x1 * h0
||      ADD     .L1X    B6,A9,A9        ;e sum0 += x1 * h1
||      MPY     .M2     B5,B8,B7        ;e x0 * h3
||      MPY     .M1     A5,A7,A7        ;e x3 * h2
||      SUB     .D1     A4,A3,A4        ;o reset x pointer to x[j]
||      SUB     .D2     B4,B10,B4       ;o reset h pointer to h[0]
||[A2]  B       .S1     OUTLOOP         ;o branch to outer loop

        ADD     .D2     B7,B9,B9        ;e sum1 += x2 * h1 (.L2 is taken below)
||      ADD     .L1     A5,A9,A9        ;e sum0 += x2 * h2
||      LDH     .D1     *A4++,B8        ;p x0 = x[j]
||      ADD     .L2X    A4,4,B1         ;p set up pointer to x[j+2]
||      ADD     .S1X    B4,2,A8         ;p set up pointer to h[1]

        ADD     .L2X    A7,B9,B9        ;e sum1 += x3 * h2
||      ADD     .L1X    B8,A9,A9        ;e sum0 += x3 * h3
||      LDH     .D2     *B1++[2],B0     ;p x2 = x[j+i+2]
||      LDH     .D1     *A4++[2],A0     ;p x1 = x[j+i+1]
||[A2]  SUB     .S1     A2,1,A2         ;o decrement outer loop counter

        ADD     .L2     B7,B9,B9        ;e sum1 += x0 * h3
||      SHR     .S1     A9,15,A9        ;e sum0 >> 15
||      LDH     .D1     *A8++[2],B6     ;p h1 = h[i+1]
||      LDH     .D2     *B4++[2],A1     ;p h0 = h[i]

        SHR     .S2     B9,15,B9        ;e sum1 >> 15
||      LDH     .D1     *A4++[2],A5     ;p x3 = x[j+i+3]
||      LDH     .D2     *B1++[2],B5     ;p x0 = x[j+i+4]

        STH     .D1     A9,*A6++[2]     ;e y[j] = sum0 >> 15
||      STH     .D2     B9,*B11++[2]    ;e y[j+1] = sum1 >> 15
                                        ; outer loop branch occurs here
B_END:
*** END Benchmark Timing ***

* Restore the saved registers in reverse push order; the return branch is
* issued early so the last pop and the NOP fill its delay slots.
        LDW     .D2     *++B15,B12      ; pop register (for c-callable func)

        LDW     .D2     *++B15,B11      ; pop register (for c-callable func)
||      B       .S2     B3              ; return

        LDW     .D2     *++B15,B10      ; pop register (for c-callable func)

        NOP             4
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -