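* File: "performs autocorrelation of a 16-bit vector.txt"
* (The line above and a "Font size:" label appear to be residue of a web code
* viewer; they are kept as comments so that the file remains assemblable.)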
*===============================================================================
*
* TEXAS INSTRUMENTS, INC.
*
* AUTOCORRELATION
*
* Revision Date: 04/16/97
*
* USAGE This routine is C Callable and can be called as:
*
* void autcor(short ac[], short sd[], int N, int M)
*
* ac[] --- Resulting array of autocorrelation
* sd[] --- Input array of autocorrelation
* N --- Length of Input array vector (sd[]) - M (MULTIPLE of 8)
* M --- Length of autocorrelation (MULTIPLE of 2)
*
* If routine is not to be used as a C callable function then
* you need to initialize values for all of the values passed
* as these are assumed to be in registers as defined by the
* calling convention of the compiler, (refer to the C compiler
* reference guide).
*
* C CODE
* This is the C equivalent of the assembly code. Note that
* the assembly code is hand optimized and restrictions may
* apply.
*
* void autcor(short ac[],short sd[], int N, int M)
* {
* int i,k,sum;
*
* for (i = 0; i < M; i++){
* sum = 0;
* for (k = M; k < N+M; k++)
* sum += sd[k] * sd[k-i];
* ac[i] = (sum >> 15);
* }
* }
*
*
* DESCRIPTION
*
* This routine performs the autocorrelation of the input array sd.
* It is assumed that the length of the input array, sd, is a
* multiple of 8 and the length of the output array, ac, is a
* multiple of 2. The assembly routine performs 2 output samples
* at a time. This is typically used in vselp code.
*
* TECHNIQUES
*
* The inner loop is unrolled eight times thus the length of
* the input array must be a multiple of eight. The outer
* loop is unrolled twice so the length of output array must
* be a multiple of 2.
*
* The outer loop is conditionally executed in parallel with the
* inner loop. This allows for a zero overhead outer loop.
*
* ASSUMPTIONS
*
* N is a multiple of 8
* M is a multiple of 2
* sd[0] is on a word boundary
*
* MEMORY NOTE
*
* One memory hit occurs every fourth outer loop cycle (2Nth inner
* loop cycle) or M/4 times.
*
* CYCLES
*
* (N/2)*M + 16 + M/4
*
*================================================================================
********* ASSEMBLY CODE: *******************
*-------------------------------------------------------------------------------
* _autcor -- void autcor(short ac[], short sd[], int N, int M)
*
* Registers on entry, as consumed by the code below:
*   A4 = ac, B4 = sd, A6 = N, B6 = M, B3 = return address
*
* Saved registers and their stack slots (byte offsets from B15 at entry):
*     0 B10    -4 A10    -8 B11   -12 A11   -16 B12   -20 A12
*   -24 B14   -28 A14   -32 B13   -36 A13   -44 A15
* B15 is lowered by 32 bytes in the prologue and raised again in the epilogue.
*
* Save-order note: A14 is written by the SHL at B_START and B14 by the ADD in
* the following execute packet.  Both must be stored no later than the packet
* that first writes them (all instructions of one execute packet read their
* operands before any result of that packet is written).  Storing them later,
* after they had been overwritten, would save and "restore" N*2 and ac+2
* instead of the caller's values.  A13 and B13 are first written later, so
* they take the later slots.
*
* NOTE(review): the frame reaches 12 bytes below the final B15 (the A15 and
* A13 slots).  Confirm that nothing (e.g. an interrupt handler) can use the
* stack while this routine runs.
*-------------------------------------------------------------------------------
        .global _autcor
        .text
_autcor:
        SUB     .L1x    B15,4,A9            ; A9 = second push pointer (A side)
        STW     .D1     A10,*A9--[2]        ; push A10 on stack
||      STW     .D2     B10,*B15--[2]       ; push B10 on stack
        STW     .D1     A11,*A9--[2]        ; push A11 on stack
||      STW     .D2     B11,*B15--[2]       ; push B11 on stack
        STW     .D1     A12,*A9--[2]        ; push A12 on stack
||      STW     .D2     B12,*B15--[2]       ; push B12 on stack
*** BEGIN Benchmark Timing ***
B_START:
        SHL     .S1     A6,1,A14            ; for inner lp setup; N Hwords
||      MPY     .M2X    A6,B6,B0            ; N*M
||      STW     .D1     A14,*A9--[2]        ; push A14 (old value: same packet as the SHL)
||      STW     .D2     B14,*B15--[2]       ; push B14 (before the ADD below overwrites it)
        ADD     .L2X    2,A4,B14            ; a[i+1] other reg file
||      ADDAH   .D2     B4,B6,B3            ; sd[k] & sd[k+1]
||      MV      .L1X    B3,A13              ; store return pointer
||      STW     .D1     A13,*A9--[2]        ; push A13 (old value: same packet as the MV)
        MV      .L1X    B3,A3               ; sd[k] & sd[k+1]
||      SUB     .L2     B3,4,B4             ; sd[k-i-2] & sd[k-i-1]
||      SHR     .S1     A6,3,A0             ; for inner lp setup; N/8
||      STW     .D1     A15,*A9             ; push A15 on stack
||      STW     .D2     B13,*B15            ; push B13 on stack (B13 not yet written)
        LDW     .D1     *A3++[2],A5         ; sd[k] & sd[k+1]
||      MV      .L1     A3,A15              ; sd[k-i] & sd[k-i+1]
||      MV      .S1     A0,A1               ; Set inner loop count
        LDW     .D2     *B4++[2],B5         ; sd[k-i-2]& sd[k-i-1]
||      LDW     .D1     *A15++[2],A6        ; sd[k-i]& sd[k-i+1]
||      SHL     .S2X    A6,1,B1             ; for inner lp setup; N Hwords
||      MPY     .M1     A8,0,A8             ; initialize to zero
||      MPY     .M2     B9,0,B9             ; initialize to zero
||      MVK     .S1     1,A2                ; initialize count
||      ZERO    .L2     B2                  ; initialize to zero
||      ADD     .L1     A0,-1,A0            ; A0 = N/8 - 1
        LDW     .D2     *++B3[1],B6         ; sd[k+2] & sd[k+3]
||      MVK     .S1     1,A2                ; initialize priming cnt
||      MPY     .M1     A7,0,A7             ; initialize to zero
||      MPY     .M2     B11,0,B11           ; initialize to zero
||      ZERO    .D1     A5                  ; initialize to zero
||      ZERO    .L2     B6                  ; initialize to zero
||      SUB     .L1X    0,B10,A11           ; initialize to zero (1st cycle 1st)
||      SUB     .S2     0,B8,B12            ; initialize to zero (1st cycle 1st)
* Software-pipelined kernel: eight execute packets per pass.  The branch back
* to LOOP1 is issued in the third packet; the five packets that follow it fill
* its delay slots.  B0 (N*M, shifted right by 4 on the first pass) counts the
* passes.
LOOP1:
        ADD     .L1X    B10,A11,A11         ; tmp_aca2 +s= p2a2
|| [A2] ADD     .L2     B8,B12,B8           ; tmp_acb0 +s= p2b0
||      MPY     .M1     A5,A6,A9            ; p2a4 = sd[k+4]*sd[k-i+4]
||      MPYLH   .M2X    A5,B7,B8            ; p2b4 = sd[k+4]*sd[k-i+3]
||      LDW     .D2     *B4++[2],B7         ;*sd[k-i+2]&sd[k-i+3]
||[!A2] SHR     .S2     B12,15,B13          ; tmp_acb >> 15
        ADD     .L1X    B11,A11,A11         ; tmp_aca3 +s= p2a3
||      ADD     .L2X    A7,B8,B12           ; tmp_acb1 +s= p2b1
||      MPY     .M2     B6,B7,B10           ; p2a6 = sd[k+6]*sd[k-i+6]
||      MPYHL   .M1     A5,A6,A7            ; p2b5 = sd[k+5]*sd[k-i+4]
||      LDW     .D1     *A3++[2],A5         ;* sd[k+4] & sd[k+5]
||[!A1] ADD     .S1     A0,1,A1             ; reset inner lp cntr
||[!B2] SHR     .S2     B0,4,B0             ; M*N/16
||[!B2] ADDAW   .D2     B3,2,B3             ; sd[k+2] & sd[k+3]
        ADD     .L2X    A8,B12,B12          ; tmp_acb2 +s= p2b2
||      ADD     .L1     A9,A11,A9           ; tmp_aca4 +s= p2a4
||      MPYH    .M2     B6,B7,B11           ; p2a7 = sd[k+7]*sd[k-i+7]
||      MPYLH   .M1X    B6,A6,A8            ; p2b6 = sd[k+6]*sd[k-i+5]
||[!A2] STH     .D2     B13,*B14++[2]       ; ac[i+1] = (tmp_acb >> 15)
||      LDW     .D1     *A15++[2],A6        ;*sd[k-i+4]& sd[k-i+5]
|| [B0] B       .S2     LOOP1               ; Branch inner most loop
|| [A1] ADD     .S1     -1,A1,A1            ; dec lp cntr
        ADD     .L2     B9,B12,B12          ; tmp_acb3 +s= p2b3
|| [B2] ADD     .L1     A10,A9,A11          ; tmp_aca5 +s= p2a5
||      MPYHL   .M2     B6,B7,B9            ; p2b7 = sd[k+7]*sd[k-i+6]
||      MPYH    .M1     A5,A6,A10           ;* p2a1 = sd[k+1]*sd[k-i+1]
||      LDW     .D2     *B3++[2],B6         ;* sd[k+6] & sd[k+7]
|| [B2] SUB     .S1     A1,A0,A2            ; dec lp cntr
||[!A2] STH     .D1     A12,*A4++[2]        ; ac[i] = (tmp_aca >> 15)
        ADD     .L1X    B10,A11,A11         ; tmp_aca6 +s= p2a6
||      ADD     .L2     B8,B12,B8           ; tmp_acb4 +s= p2b4
||      MPY     .M1     A5,A6,A9            ;* p2a0 = sd[k]*sd[k-i]
||      MPYLH   .M2X    A5,B5,B8            ;* p2b0 = sd[k]*sd[k-i-1]
||      LDW     .D2     *B4,B7              ;*sd[k-i+6]&sd[k-i+7]
||[!A1] SUB     .S2     B4,B1,B4            ; reset ptr
||[!A1] SUB     .S1     A15,A14,A15         ; reset ptr
||[!A1] SUB     .D1     A3,A14,A3           ; reset ptr
        ADD     .L1X    B11,A11,A11         ; tmp_aca7 +s= p2a7
||      ADD     .L2X    A7,B8,B12           ; tmp_acb5 +s= p2b5
||      MPY     .M2     B6,B7,B10           ;* p2a2 = sd[k+2]*sd[k-i+2]
||      MPYHL   .M1     A5,A6,A7            ;* p2b1 = sd[k+1]*sd[k-i]
||      LDW     .D1     *A3++[2],A5         ;** sd[k] & sd[k+1]
||[!A1] SUB     .D2     B3,B1,B3            ; reset ptr
||[!A1] SUB     .S2     B4,4,B4             ; reset ptr
||[!A1] SUB     .S1     A15,4,A15           ; reset ptr
        ADD     .L2X    A8,B12,B12          ; tmp_acb6 +s= p2b6
|| [A2] ADD     .L1     A9,A11,A9           ;* tmp_aca0 +s= p2a0
||      MPYH    .M2     B6,B7,B11           ;* p2a3 = sd[k+3]*sd[k-i+3]
||      MPYLH   .M1X    B6,A6,A8            ;* p2b2 = sd[k+2]*sd[k-i+1]
||      LDW     .D2     *B4++[2],B5         ;**sd[k-i-2]&sd[k-i-1]
||      LDW     .D1     *A15++[2],A6        ;**sd[k-i]& sd[k-i+1]
||[!A2] SHR     .S1     A11,15,A12          ; tmp_aca >> 15
||[!B2] MVK     .S2     1,B2                ; B2 = 1 after the first pass
        ADD     .L2     B9,B12,B12          ; tmp_acb7 +s= p2b7
||      ADD     .L1     A10,A9,A11          ;* tmp_aca1 +s= p2a1
||      MPYHL   .M2     B6,B7,B9            ;* p2b3 = sd[k+3]*sd[k-i+2]
||      MPYH    .M1     A5,A6,A10           ;* p2a5 = sd[k+5]*sd[k-i+5]
||      LDW     .D2     *B3++[2],B6         ;** sd[k+2] & sd[k+3]
|| [B0] ADD     .S2     -1,B0,B0            ; dec outer lp cntr
; LOOP1 ENDS HERE
        SHR     .S2     B12,15,B13          ; tmp_acb >> 15
        STH     .D1     A12,*A4             ; ac[i] = (tmp_aca >> 15)
||      STH     .D2     B13,*B14            ; ac[i+1] = (tmp_acb >> 15)
||      SUB     .L1x    B15,12,A4           ; A4 = address of the A15 slot
B_END:
*** END Benchmark Timing ***
* Pops mirror the push order above.  The return branch is issued with the last
* B-side pop; the final load and NOP 4 fill its five delay slots.
        LDW     .D1     *A4++[2], A15       ; pop A15 off stack
||      LDW     .D2     *B15++[2], B13      ; pop B13 off stack
        LDW     .D1     *A4++[2], A13       ; pop A13 (arrives after the MV below has read A13)
||      LDW     .D2     *B15++[2], B14      ; pop B14 off stack
        LDW     .D1     *A4++[2], A14       ; pop A14 off stack
||      LDW     .D2     *B15++[2], B12      ; pop B12 off stack
||      MV      .L2x    A13,B3              ; get return pointer
        LDW     .D1     *A4++[2], A12       ; pop A12 off stack
||      LDW     .D2     *B15++[2], B11      ; pop B11 off stack
        LDW     .D1     *A4++[2], A11       ; pop A11 off stack
||      LDW     .D2     *B15, B10           ; pop B10 off stack
||      B       .S2     B3                  ; Return from call
        LDW     .D1     *A4,A10             ; pop A10
        NOP     4
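* ---- The lines below appear to be residue of a web code viewer ----
* Keyboard shortcuts:
*   Copy code            Ctrl + C
*   Search code          Ctrl + F
*   Full-screen mode     F11
*   Toggle theme         Ctrl + Shift + D
*   Show shortcuts       ?
*   Increase font size   Ctrl + =
*   Decrease font size   Ctrl + -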