?? finite impulse response filter.txt
字號:
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* FIRCIRC
*
* Revision Date: 03/27/97
*
* USAGE This routine is C Callable and can be called as:
*
* void fircirc(short y[], short x[], int n, short h[], int s,
* int m, int size, int index)
*
* y = output array
* x = input array
* n = number of coefficients (MULTIPLE of 4 >= 4)
* h = coefficient array
* s = output scaling factor
* m = number of inputs (MULTIPLE of 2 >= 2)
* size = Block Size Factor for Circular Addressing (Block Size
* = 2^(size + 1))
* index = Initial Index
*
* If routine is not to be used as a C callable function
* then all instructions relating to stack should be removed.
* Refer to comments of individual instructions. You will also
* need to initialize values for all of the values passed as these
* are assumed to be in registers as defined by the calling
* convention of the compiler, (refer to the C compiler reference
* guide).
*
* C CODE This is the C equivalent of the assembly code without
* restrictions. Note that the assembly code is hand optimized and
* restrictions may apply.
*
* void fircirc(short y[], short x[], int n, short h[], int s,
* int m, int size, int index)
* {
* int i, j;
* Long40 y0;
* Long40 round = (Long40) 1 << (s - 1);
* for (j = 0; j < m; j++) {
* y0 = round;
* for (i = 0; i < n; i++)
* y0 += x[(i + j + index) % (1 << size)] * h[i];
* y[j] = y0 >> s;
* }
* }
*
* DESCRIPTION
* The fircirc performs a Finite Impulse Response filter using
* circular addressing w/ initial index and output scaling. It
* operates on 16-bit data with a 40-bit accumulate. The final
* output is scaled down by the scaling factor s. The scaling
* factor s is normally set to 24 to give a 16 bit output. The FIR
* assumes the number of filter coefficients is a multiple of 4 and
* the number of output samples is a multiple of 2. This routine
* has no memory hits regardless of where x, h, and y arrays are
* located in memory. The filter has M input samples and N
* coefficients. The assembly routine performs 2 output samples at
* a time. The Block Size of the Circular Buffer given in Bytes is
* 2^(SIZE + 1).
*
* TECHNIQUES
* The inner loop is unrolled four times thus the number of
* filter coefficients must be a multiple of four. The outer
* loop is unrolled twice so the number of output samples must
* be a multiple of 2.
*
* If an odd number of output samples is needed or possible, the
* final store can either be removed or conditionally executed
* depending on whether M is even or odd. This code would have to
* be added to the existing code.
*
* The outer loop, like the inner loop, is software pipelined as
* well. e, o, and p in the comments of the individual
* instructions correspond to the epilogue, outer loop, and
* prologue respectively.
*
* Refer to FIR example in the optimizing assembly chapter of
* the programmer's guide for more information.
*
* ASSUMPTIONS
* N MULTIPLE of 4 >= 4
* M EVEN >= 2
*
* MEMORY NOTE
* This code has no memory hits regardless of where x and h are
* located in memory.
*
* CYCLES M*(N + 11)/2 + 13
*
*===============================================================================
*-------------------------------------------------------------------------------
* _fircirc -- C-callable FIR filter using circular addressing on the x array.
*
* Register arguments on entry, as consumed by the code below:
*   A4  = y        B4  = x
*   A6  = n        B6  = h
*   A8  = s        B8  = m
*   A10 = size     B10 = index
*   B3  = return address (parked in A0 for the duration of the routine)
*
* Stack: B10-B14, A10-A13 and A15 are pushed in the prologue and popped in
* the epilogue.
*
* AMR: the prologue programs AMR so that B4 and A7 (the two x pointers) use
* circular addressing with the block size taken from 'size'.  The epilogue
* writes AMR back to 0 before returning, because the TI C6000 C run-time
* convention expects AMR to be 0 when control returns to compiled code.
* The two instructions that do this sit on otherwise idle units of existing
* epilogue execute packets, so the cycle count is unchanged.
*
* The kernel (LOOP) and outer loop (OUTLOOP) are hand software-pipelined;
* instruction order and parallel grouping there are timing-critical and must
* not be rearranged.
*-------------------------------------------------------------------------------
        .global _fircirc
        .text
_fircirc:
        STW     .D2     B10, *B15--             ; push B10 on the stack
||      MV      .L1X    B15, A1                 ; copy stack pointer

        STW     .D2     A10, *B15--[2]          ; push A10 on the stack
||      STW     .D1     B11, *--A1[2]           ; push B11 on the stack

        STW     .D2     A11, *B15--[2]          ; push A11 on the stack
||      STW     .D1     B12, *--A1[2]           ; push B12 on the stack
*** BEGIN Benchmark Timing ***
B_START
        B       .S1     OUTLOOP                 ; enter outer loop after the 5 setup packets below
||      ADD     .D1     6, A6, A10              ; n + 6 half array reset
||      SHL     .S2X    A10, 16, B0             ; set circular block size (reads size before A10 is overwritten)
||      ADDAH   .D2     B4, B10, B4             ; x += index
||      MV      .L1X    B3, A0                  ; copy return address
||      MV      .L2     B8, B1                  ; move m

        SHR     .S1     A6, 2, A3               ; n / 4
||      MV      .L2X    A10, B10                ; copy array reset
||      SET     .S2     B0, 8, 8, B0            ; set B4 (x) in circular mode
||      ADD     .L1X    2, B6, A5               ; copy h
||      STW     .D2     A12, *B15--[2]          ; push A12 on the stack
||      STW     .D1     B13, *--A1[2]           ; push B13 on the stack

        ADD     .L1X    2, B4, A7               ; copy x
||      ADD     .L2     B10, 2, B14             ; array reset
||      SET     .S2     B0, 6, 6, B0            ; set A7 (x) in circular mode
||      STW     .D2     A13, *B15--[2]          ; push A13 on the stack
||      STW     .D1     B14, *--A1[2]           ; push B14 on the stack (old value; B14 is rewritten in this packet)

        ADDAH   .D1     A5, A10, A5             ; compensate for first pass
||      ADDAH   .D2     B6, B10, B5             ; compensate for first pass
||      MVC     .S2     B0, AMR                 ; setup AMR (cleared again in the epilogue)

        ADDAH   .D1     A7, A10, A7             ; compensate for first pass
||      ADDAH   .D2     B4, B14, B4             ; compensate for first pass
||      MVK     .S2     1, B2                   ; setup j loop priming

        ADD     .L2X    2, A4, B11              ; copy y
||      STW     .D2     A15, *B15--             ; push A15 on the stack

* Inner-loop kernel: 4 execute packets per pass.  A1 is the priming flag
* (accumulates are suppressed while it is non-zero), A2 is the inner loop
* counter and B0 is the flushing count.
LOOP:                                           ; LOOP BEGINS HERE
  [!A1] ADD     .L2X    A9, B13:B12,B13:B12     ; y1 += p00, i=0
||[!A1] ADD     .L1X    B9, A13:A12,A13:A12     ; y0 += p01, i=0
||      MPY     .M2     B3, B7, B6              ; p11 = x1 * h1, i=1
||      MPY     .M1     A6, A11, A11            ; p00 = x0 * h0, i=1
||      LDH     .D1     *++A5[2], B9            ;** h1 = *h++, i=0
||      LDH     .D2     *++B5[2], A9            ;** h0 = *h++, i=0

  [!A1] ADD     .L2     B6, B13:B12,B13:B12     ; y1 += p11, i=0
||[!A1] ADD     .L1     A6, A13:A12,A13:A12     ; y0 += p10, i=1
||      MPY     .M1X    B3, A9, A15             ;* p10 = x1 * h0, i=0
||      MPY     .M2X    A15, B9, B9             ;* p01 = x0 * h1, i=0
||      LDH     .D2     *++B4[2], B3            ;** x1 = *x++, i=1
||      LDH     .D1     *++A7[2], A6            ;** x0 = *x++, i=1
||[A2]  SUB     .S1     A2, 1, A2               ; i++

  [A2]  B       .S1     LOOP                    ;* for i
||[!A1] ADD     .L2X    A11, B13:B12,B13:B12    ; y1 += p00, i=1
||[!A1] ADD     .L1X    B7, A13:A12,A13:A12     ; y0 += p01, i=1
||      MPY     .M2     B8, B9, B6              ;* p11 = x1 * h1, i=0
||      MPY     .M1     A15, A9, A9             ;* p00 = x0 * h0, i=0
||      LDH     .D1     *++A5[2], B7            ;** h1 = *h++, i=1
||      LDH     .D2     *++B5[2], A11           ;** h0 = *h++ i=1
||[B0]  SUB     .S2     B0, 1, B0               ; decrement flushing count

  [!A1] ADD     .L2     B6, B13:B12,B13:B12     ; y1 += p11, i=1
||[B0]  ADD     .L1     A15, A13:A12,A13:A12    ;* y0 += p10, i=0
||      MPY     .M2X    A6, B7, B7              ;* p01 = x0 * h1, i=1
||      MPY     .M1X    B8, A11, A6             ;* p10 = x1 * h0, i=1
||      LDH     .D2     *++B4[2], B8            ;*** x1 = *x++, i=0
||      LDH     .D1     *++A7[2], A15           ;*** x0 = *x++, i=0
||[A1]  SUB     .S1     A1, 1, A1               ; decrement priming
                                                ; inner loop branch occurs here

* Outer loop: e = epilogue of the previous output pair, o = outer-loop
* bookkeeping, p = prologue of the next inner loop.  B1 holds the number of
* outputs still to produce; B2 is non-zero only on the first entry, which
* suppresses the stores before any result exists.
OUTLOOP:
        LDH     .D2     *--B4[B14], B3          ;p x1 = *x++, i=1

  [B1]  B       .S2     LOOP                    ;p for i
||      LDH     .D2     *++B4[2], B8            ;p x1 = *x++, i=0
||      LDH     .D1     *--A7[A10], A15         ;p x0 = *x++, i=0
||      MV      .L2X    A8, B6                  ;o copy s
||      SUB     .L1     A8, 1, A1               ;o s - 1
||      SHR     .S1     A13:A12,A8, A13:A12     ;e y0 >>= s

        SHR     .S2     B13:B12,B6, B13:B12     ;e y1 >>= s
||      LDH     .D1     *--A5[A10], B9          ;p h1 = *h++, i=0
||      LDH     .D2     *--B5[B10], A9          ;p h0 = *h++, i=0

  [!B2] STH     .D1     A12, *A4++[2]           ;e y[0] = y0
||[!B2] STH     .D2     B12, *B11++[2]          ;e y[1] = y1
||      MVK     .S1     1, A12                  ;o \ round = (Long40) 1
||      ZERO    .L1     A13                     ;o /
||      ZERO    .L2     B2                      ;o clear j loop priming

        LDH     .D2     *++B4[2], B3            ;p x1 = *x++, i=1
||      LDH     .D1     *++A7[2], A6            ;p x0 = *x++, i=1
||      SHL     .S1     A13:A12,A1, A13:A12     ;o y0 = round = (Long40) 1<<(s-1)
||      ADD     .L2X    1, A3, B0               ;p setup flushing count

  [B1]  B       .S2     LOOP                    ;p for i
||      LDH     .D1     *++A5[2], B7            ;p h1 = *h++, i=1
||      LDH     .D2     *++B5[2], A11           ;p h0 = *h++ i=1
||      MV      .L2X    A13, B13                ;o y1 = round (high part of the pair)

        LDH     .D2     *++B4[2], B8            ;p* x1 = *x++, i=0
||      LDH     .D1     *++A7[2], A15           ;p* x0 = *x++, i=0
||      MV      .L2X    A12, B12                ;o y1 = round (low part of the pair)
||[B1]  SUB     .S2     B1, 2, B1               ;p j++
||      MV      .L1     A3, A2                  ;p i < n
||      MVK     .S1     1, A1                   ;p i loop priming
B_END:
*** END Benchmark Timing ***
* Epilogue: reached by fall-through once B1 is 0 and neither outer-loop
* branch is taken.  Pops the saved registers, clears AMR and returns.
END:    LDW     .D2     *++B15, A15             ; pop A15 off the stack
||      MV      .L1X    B15, A1                 ; copy stack pointer
||      ZERO    .L2     B0                      ; B0 (flushing count) is no longer needed; build 0 for AMR

        LDW     .D1     *++A1[3], A13           ; pop A13 off the stack
||      LDW     .D2     *++B15, B14             ; pop B14 off the stack
||      MVC     .S2     B0, AMR                 ; AMR = 0: leave circular mode before returning to C

        LDW     .D1     *++A1[2], A12           ; pop A12 off the stack
||      LDW     .D2     *++B15[2], B13          ; pop B13 off the stack

        LDW     .D1     *++A1[2], A11           ; pop A11 off the stack
||      LDW     .D2     *++B15[2], B12          ; pop B12 off the stack
||      MV      .L2X    A0, B3                  ; move return address

        LDW     .D1     *++A1[2], A10           ; pop A10 off the stack
||      LDW     .D2     *++B15[2], B11          ; pop B11 off the stack
||      B       .S2     B3                      ; return; next two packets fill the 5 delay slots

        LDW     .D2     *++B15[2], B10          ; pop B10 off the stack

        NOP             4
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -