* File: fir filter coefficients x8 output samples x2.txt
* (web code-viewer residue: "Font size:" control)
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* FIR
*
* Revision Date: 2/3/97
*
* USAGE This routine is C Callable and can be called as:
*
* void fir8(short *x, short *h, short *y, int N, int M)
*
* x = input array
* h = coefficient array
* y = output array
* N = number of coefficients (MULTIPLE of 8 >= 8)
* M = number of output samples (EVEN >= 2)
*
* If routine is not to be used as a C callable function
* then all instructions relating to stack should be removed.
* Refer to comments of individual instructions. You will also
* need to initialize values for all of the values passed as these
* are assumed to be in registers as defined by the calling
* convention of the compiler, (refer to the C compiler reference
* guide).
*
* C Code This is the C equivalent of the Assembly Code without
* restrictions. Note that the assembly code is hand optimized and
* restrictions may apply
*
* void fir8(short x[], short h[], short y[], int N, int M)
* {
* int i, j, sum;
*
* for (j = 0; j < M; j++) {
* sum = 0;
* for (i = 0; i < N; i++)
* sum += x[i + j] * h[i];
* y[j] = sum >> 15;
* }
* }
*
* DESCRIPTION
* This FIR assumes the number of filter coefficients is a multiple
* of 8 and the number of output samples is a multiple of 2. It
* operates on 16-bit data with a 32-bit accumulate. This
* routine has no memory hits regardless of where x, h, and y
* arrays are located in memory. The filter is M output samples
* and N coefficients. The assembly routine performs 2 output
* samples at a time.
*
*
* TECHNIQUES
* The inner loop is unrolled eight times thus the number of
* filter coefficients must be a multiple of eight. The outer
* loop is unrolled twice so the number of output samples must
* be a multiple of 2.
*
* If an odd number of output samples is needed or possible, the
* final store can either be removed or conditionally executed
* depending on whether M is even or odd. This code would have to
* be added to the existing code.
*
* The outer loop is conditionally executed in parallel with the
* inner loop. This allows for a zero overhead outer loop.
*
* Refer to FIR example in the optimizing assembly chapter of
* the programmer's guide for more information.
*
*
* ASSUMPTIONS
* N MULTIPLE of 8 >= 8
* M EVEN >= 2
*
*
* MEMORY NOTE
* This code has no memory hits regardless of where x and h are
* located in memory.
*
* CYCLES M*N/2+13
*
*===============================================================================
;-------------------------------------------------------------------------------
; _fir8 -- C-callable FIR filter:
;          void fir8(short *x, short *h, short *y, int N, int M)
;
; Registers on entry, as read by the setup code below:
;   A4  = x (input array)            B4 = h (coefficient array)
;   A6  = y (output array)           B6 = N (number of coefficients)
;   A8  = M (number of output samples)
;   B3  = return address (target of the final branch)
;   B15 = stack pointer
;
; A10-A15 and B10-B14 (11 registers) are pushed before they are used and
; popped again before returning.
;
; Loop control registers set up before LOOP:
;   A5 = N/8, reload value for the pointer-reset and store counters
;   A1 = pointer-reset counter; x and h pointers are rewound when it is 0
;   A2 = store counter; the output pair is shifted and stored when it is 0
;   B0 = total pass counter, (N/8)*(M/2); the loop branch is taken while != 0
;   B2 = priming count (starts at 2); B0 is decremented only once B2 is 0
;
; In the loop comments, sum0/Asum0 (y[j]) is accumulated on the A side and
; sum1/Bsum1 (y[j+1]) on the B side.  The '*' / '**' marks in the comments
; appear to flag instructions that belong to a later iteration of the
; software pipeline -- NOTE(review): confirm against the programmer's guide.
;-------------------------------------------------------------------------------
.global _fir8
.text
_fir8:
; Prologue: A15 is saved first and then reused as a second stack pointer on
; the A side, so each following execute packet pushes one B-side and one
; A-side register in parallel (.D2 through B15, .D1 through A15).
STW .D2 A15,*B15-- ; push register (for c-callable func)
|| SUB .L1X B15,8,A15 ; copy stack pointer to A reg file
STW .D2 B14,*B15--[2] ; push register (for c-callable func)
|| STW .D1 A14,*A15--[2] ; push register (for c-callable func)
STW .D2 B13,*B15--[2] ; push register (for c-callable func)
|| STW .D1 A13,*A15--[2] ; push register (for c-callable func)
STW .D2 B12,*B15--[2] ; push register (for c-callable func)
|| STW .D1 A12,*A15--[2] ; push register (for c-callable func)
; Pointer and counter setup, overlapped with the remaining register pushes.
*** BEGIN Benchmark Timing ***
B_START
MPY .M2 B6,2,B5 ; B5 = 2*N bytes (16*N/8); used to reset x and h ptrs
|| SHR .S1X B6,3,A5 ; set pointer reset lp cntr (N/8)
|| ADD .L2X A6,2,B6 ; point to y[j+1]
SHR .S2X A8,1,B0 ; M/2
|| STW .D2 B11,*B15--[2] ; push register (for c-callable func)
|| STW .D1 A11,*A15--[2] ; push register (for c-callable func)
|| ZERO .S1 A2 ; clear store lp cntr (loaded with N/8+1 during priming)
MV .L1X B4,A0 ; point to h[0] & h[1]
|| ADD .S2 B4,4,B14 ; point to h[2] & h[3]
|| MV .L2X A4,B1 ; point to x[j] & x[j+1]
|| ADD .S1 A4,4,A4 ; point to x[j+2] & x[j+3]
|| STW .D2 B10,*B15--[2] ; push register (for c-callable func)
|| STW .D1 A10,*A15--[2] ; push register (for c-callable func)
LDW .D1 *A4++[2],B9 ; x[j+i+2] & x[j+i+3]
|| LDW .D2 *B1++[2],A10 ; x[j+i+0] & x[j+i+1]
|| MV .L1 A5,A1 ; set pointer reset lp cntr (N/8)
|| MPY .M2X A5,B0,B0 ; set up loop counter ((N/8)*(M/2))
|| SUB .S1X B5,4,A3 ; A3 = 2*N-4 bytes (16*N/8-4); used to reset x and h ptrs
|| ZERO .L2 B11 ; zero out initial accumulator
|| MVK .S2 2,B2 ; initialize loop priming count
; Software-pipelined kernel.  Each pass loads eight coefficients
; (h[i+0]..h[i+7]) and the matching x samples and accumulates products for
; two adjacent outputs.  Instruction order, execute-packet grouping and
; functional-unit assignment are all significant -- do not reorder.
LOOP:
[!A2] SHR .S1 A10,15,A12 ; (Asum0 >> 15)
|| MPYH .M2 B7,B9,B13 ; p03 = h[i+3]*x[j+i+3]
||[A2] ADD .L1 A7,A10,A7 ; sum0(p00) = p00 + sum0
|| MPYHL .M1X B7,A11,A10 ; p13 = h[i+3]*x[j+i+4]
|| ADD .L2X A14,B4,B7 ; sum1 += p11
|| LDW .D2 *B14++[2],B7 ;* h[i+2] & h[i+3]
|| LDW .D1 *A0++[2],A8 ;* h[i+0] & h[i+1]
||[B2] SUB .S2 B2,1,B2 ; dec loop priming count
ADD .L1 A10,A7,A13 ; sum0 += p01
|| MPYHL .M2X A9,B10,B12 ; p15 = h[i+5]*x[j+i+6]
|| MPYLH .M1 A9,A11,A10 ; p14 = h[i+4]*x[j+i+5]
|| ADD .L2 B13,B7,B7 ; sum1 += p12
|| LDW .D2 *B1++[2],A11 ;* x[j+i+4] & x[j+i+5]
|| LDW .D1 *A4++[2],B10 ;* x[j+i+6] & x[j+i+7]
||[A1] SUB .S1 A1,1,A1 ;* dec pointer reset lp cntr
||[!B2] SUB .S2 B0,1,B0 ; dec outer lp cntr
[B0] B .S2 LOOP ; loop while pass counter B0 is nonzero
|| MPY .M1 A9,A11,A11 ; p04 = h[i+4]*x[j+i+4]
|| ADD .L1X B9,A13,A13 ; sum0 += p02
|| MPYLH .M2 B8,B10,B13 ; p16 = h[i+6]*x[j+i+7]
|| ADD .L2X A10,B7,B7 ; sum1 += p13
|| LDW .D1 *A0++[2],A9 ;* h[i+4] & h[i+5]
|| LDW .D2 *B14++[2],B8 ;* h[i+6] & h[i+7]
||[!A1] SUB .S1 A4,A3,A4 ;* reset x ptr
MPY .M2 B8,B10,B11 ; p06 = h[i+6]*x[j+i+6]
|| MPYH .M1 A9,A11,A11 ; p05 = h[i+5]*x[j+i+5]
|| ADD .L1X B13,A13,A9 ; sum0 += p03
|| ADD .L2X A10,B7,B7 ; sum1 += p14
||[!A1] SUB .S2 B1,B5,B1 ;* reset x ptr
||[!A1] SUB .S1 A0,A3,A0 ;* reset h ptr
|| LDH .D2 *B1,A8 ;* x[j+i+8]
||[B2] ADD .D1 A5,1,A2 ; set store lp cntr (N/8+1)
[!A2] MV .S1 A5,A2 ; reset store lp cntr (N/8)
|| MPYH .M2 B8,B10,B13 ; p07 = h[i+7]*x[j+i+7]
|| ADD .L1 A11,A9,A9 ; sum0 += p04
|| MPYHL .M1X B8,A8,A9 ; p17 = h[i+7]*x[j+i+8]
|| ADD .S2 B12,B7,B10 ; sum1 += p15
||[!A2] STH .D2 B11,*B6++[2] ; y[j+1] = (Bsum1 >> 15)
||[!A2] STH .D1 A12,*A6++[2] ; y[j] = (Asum0 >> 15)
|| ADD .L2X A10,0,B8 ;* move to other reg file
ADD .L1 A11,A9,A12 ; sum0 += p05
|| ADD .L2 B13,B10,B8 ; sum1 += p16
|| MPYLH .M2X A8,B8,B4 ;* p10 = h[i+0]*x[j+i+1]
||[!A1] SUB .D2 B14,B5,B14 ;* reset h ptr
|| MPYHL .M1X A8,B9,A14 ;* p11 = h[i+1]*x[j+i+2]
||[!A1] ADD .S2 B1,4,B1 ;* reset x ptr
||[!A1] SUB .S1 A0,4,A0 ;* reset h ptr
[!B2] ADD .L2X A9,B8,B11 ; sum1 += p17
|| ADD .L1X B11,A12,A12 ; sum0 += p06
|| MPY .M1 A8,A10,A7 ;* p00 = h[i+0]*x[j+i+0]
|| MPYLH .M2 B7,B9,B13 ;* p12 = h[i+2]*x[j+i+3]
||[A2] SUB .D1 A2,1,A2 ;* dec store lp cntr
||[!A1] MV .S1 A5,A1 ;* reset pointer reset lp cntr (N/8)
||[B2] ZERO .D2 B11 ; zero out initial accumulator
[!B2] ADD .L1X B13,A12,A10 ; sum0 += p07
||[!A2] SHR .S2 B11,15,B11 ;* (Bsum1 >> 15)
|| MPY .M2 B7,B9,B9 ;* p02 = h[i+2]*x[j+i+2]
|| MPYH .M1 A8,A10,A10 ;* p01 = h[i+1]*x[j+i+1]
||[A2] ADD .L2 B4,B11,B4 ;* sum1(p10) = p10 + sum1
|| LDW .D1 *A4++[2],B9 ;** x[j+i+2] & x[j+i+3]
|| LDW .D2 *B1++[2],A10 ;** x[j+i+0] & x[j+i+1]
||[B2] ZERO .S1 A10 ; zero out initial accumulator
; Loop ends here
; Store the last output pair; the first register pop is overlapped with it.
SHR .S1 A10,15,A12 ; (Asum0 >> 15)
|| LDW .D2 *++B15,A10 ; pop register (for c-callable func)
|| MV .L1X B15,A15 ; move stack pointer to A reg file (NOTE(review): the pop offsets below only line up if this reads B15 before the parallel pre-increment)
STH .D2 B11,*B6++[2] ; y[j+1] = (Bsum1 >> 15)
|| STH .D1 A12,*A6++[2] ; y[j] = (Asum0 >> 15)
B_END:
*** END Benchmark Timing ***
; Epilogue: pop the saved registers two at a time, in the reverse order of
; the pushes, with the return branch issued alongside the pops.
LDW .D1 *++A15[2],B10 ; pop register (for c-callable func)
|| LDW .D2 *++B15[2],A11 ; pop register (for c-callable func)
LDW .D1 *++A15[2],B11 ; pop register (for c-callable func)
|| LDW .D2 *++B15[2],A12 ; pop register (for c-callable func)
LDW .D1 *++A15[2],B12 ; pop register (for c-callable func)
|| LDW .D2 *++B15[2],A13 ; pop register (for c-callable func)
LDW .D1 *++A15[2],B13 ; pop register (for c-callable func)
|| LDW .D2 *++B15[2],A14 ; pop register (for c-callable func)
|| B .S2 B3 ; return
LDW .D1 *++A15[2],B14 ; pop register (for c-callable func)
|| LDW .D2 *++B15[2],A15 ; pop register (for c-callable func)
NOP 4 ; wait out the remaining branch delay slots
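* (web code-viewer residue, not part of the program) Keyboard shortcuts:
*   Copy code            Ctrl + C
*   Search code          Ctrl + F
*   Full-screen mode     F11
*   Toggle theme         Ctrl + Shift + D
*   Show shortcuts       ?
*   Increase font size   Ctrl + =
*   Decrease font size   Ctrl + -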