* File: vecsum.asm
* ======================================================================== *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* vecsum -- vecsum *
* *
* *
* REVISION DATE *
* 13-Apr-2005 *
* *
* USAGE *
* *
* This routine is C callable, and has the following C prototype: *
* *
* void vecsum *
* ( *
* const short *restrict aData, // First input vector // *
* const short *restrict bData, // Second input vector // *
* int count, // Length of vectors // *
* short *restrict cData // Output vector // *
* ); *
* *
* *
* DESCRIPTION *
* *
* The "vecsum" program computes the element-wise sum of two input *
* vectors, writing the result to a third vector. *
* *
* void vecsum *
* ( *
* const short *restrict aData, // First input vector // *
* const short *restrict bData, // Second input vector // *
* int count, // Length of vectors // *
* short *restrict cData // Output vector // *
* ) *
* { *
* int i; *
* for (i=0; i < count; i++) *
* { *
* cData[i] = aData[i] + bData[i]; *
* } *
* } *
* *
* aData: pointer to one vector *
* bData: ptr to second vector *
* count: number of samples *
* cData: sum of aData and bData elements *
* *
* The above C code is a general implementation without *
* restrictions. The assembly code may have some restrictions, as *
* noted below. *
* *
* *
* TECHNIQUES *
* *
* The code is unrolled 8 times to enable full memory bandwidth to *
* be utilized. Double word loads and stores are used. *
* More specific optimised c implementation would look like this: *
* *
* void vecsum *
* ( *
* const short *restrict aData, // First input vector // *
* const short *restrict bData, // Second input vector // *
* int count, // Length of vectors // *
* short *restrict cData // Output vector // *
* ) *
* { *
* int i; *
* short * cData_o, * cData_e; *
* *
* cData_e = cData; *
* cData_o = cData + 4; *
* *
* for (i = 0; i < count; i+=8) *
* { *
* cData_e[0] = aData[0] + bData[0]; *
* cData_e[1] = aData[1] + bData[1]; *
* cData_e[2] = aData[2] + bData[2]; *
* cData_e[3] = aData[3] + bData[3]; *
* aData += 4; *
* bData += 4; *
* cData_e += 8; *
* cData_o[0] = aData[0] + bData[0]; *
* cData_o[1] = aData[1] + bData[1]; *
* cData_o[2] = aData[2] + bData[2]; *
* cData_o[3] = aData[3] + bData[3]; *
* aData += 4; *
* bData += 4; *
* cData_o += 8; *
* } *
* } *
* *
* *
* ASSUMPTIONS *
* *
* The input length is a multiple of 8 and greater than 0. *
* *
* The input and output vectors are stored on double word *
* aligned boundaries. *
* *
* *
* MEMORY NOTE *
* *
* To avoid bank conflicts, the input arrays 'aData' and 'bData' *
* must be offset by 4 half-words (8 bytes). *
* *
* The code is ENDIAN NEUTRAL. *
* *
* *
* CODESIZE *
* *
* 64 bytes *
* *
* *
* CYCLES *
* *
* cycles = 3*(count/8) + 10 *
* For count = 256, cycles = 106 *
* *
* ------------------------------------------------------------------------- *
* Copyright (c) 2005 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
* ======================================================================== *
* ======================================================================== *
************************ SYMBOLIC REGISTER ASSIGNMENTS ***********************
* Substitution symbols used by the vecsum loop. Names prefixed A_ live in    *
* the A register file, names prefixed B_ in the B register file.             *
* Directives are indented: a field starting in column 1 is read as a label.  *
*                                                                            *
* Pointers and loop count                                                    *
        .asg            A4,         A_aData     ; read pointer, first input vector
        .asg            B4,         B_bData     ; read pointer, second input vector
        .asg            B20,        B_count     ; loop count value moved into ILC
        .asg            B6,         B_cData     ; write pointer for sums 3..0 of each group
                                                ;   (cData itself; "cData_e" in the header)
        .asg            A5,         A_cData     ; write pointer for sums 7..4 of each group
                                                ;   (cData + 8 bytes; "cData_o" in the header)
* First double word of each group: elements 3..0                             *
        .asg            A19,        A_d32       ; aData elements 3, 2
        .asg            A18,        A_d10       ; aData elements 1, 0
        .asg            B17,        B_d32       ; bData elements 3, 2
        .asg            B16,        B_d10       ; bData elements 1, 0
        .asg            A20,        A_s10       ; sums 1, 0
        .asg            A21,        A_s32       ; sums 3, 2
* Second double word of each group: elements 7..4                            *
        .asg            A17,        A_d76       ; aData elements 7, 6
        .asg            A16,        A_d54       ; aData elements 5, 4
        .asg            B19,        B_d76       ; bData elements 7, 6
        .asg            B18,        B_d54       ; bData elements 5, 4
* The sums below reuse the registers of B_d54 / B_d76, so each ADD2 result   *
* overwrites its own bData operand.                                          *
        .asg            B18,        B_s54       ; sums 5, 4 (same register as B_d54)
        .asg            B19,        B_s76       ; sums 7, 6 (same register as B_d76)
* ======================================================================== *
*  _vecsum -- element-wise sum of two vectors of 16-bit values             *
*                                                                          *
*  Entry point for the C prototype shown in the file header.               *
*    In:   A_aData, B_bData  input pointers (read with LDDW)               *
*          A6                element count; overwritten with count/8       *
*          B_cData           output pointer (written with STDW)            *
*    Out:  sums stored through B_cData (elements 0..3 of each group of 8)  *
*          and A_cData (elements 4..7 of each group of 8)                  *
*    Exit: branch to the address held in B3                                *
*                                                                          *
*  Each loop iteration handles 8 elements: two double-word loads per       *
*  input, four ADD2 packed 16-bit adds, two double-word stores.            *
* ======================================================================== *
        .text
        .global _vecsum
_vecsum:
* ======================================================================== *
* ======================================================================== *
        SHR     .S1     A6, 3, A6                       ; A6 = count / 8 (8 elements per iteration)
        SUB     .L2X    A6, 2, B_count                  ; B_count = count/8 - 2, written to ILC below
                                                        ; NOTE(review): this is -1 when count == 8;
                                                        ; confirm the minimum supported count
        SPLOOPD 3                                       ; loop body follows; operand 3 matches the
                                                        ; 3*(count/8) term in the header cycle count
||      MVC     .S2     B_count, ILC                    ; loop count register <- B_count
*****************************************************************************
        LDDW    .D2T2   *B_bData++, B_d32:B_d10         ; load b i+0-3, advance one double word
||      LDDW    .D1T1   *A_aData++, A_d32:A_d10         ; load a i+0-3, advance one double word
        LDDW    .D2T2   *B_bData++, B_d76:B_d54         ; load b i+4-7
||      LDDW    .D1T1   *A_aData++, A_d76:A_d54         ; load a i+4-7
        SPMASK                                          ; the ^-marked ADD below is masked
||^     ADD     .D1X    B_cData, 8, A_cData             ; A_cData = cData + 8 bytes (4 halfwords):
                                                        ; second store pointer (per TI SPLOOP docs a
                                                        ; masked instruction is not replayed by the loop)
        NOP             2                               ; wait for the first LDDW results
        ADD2    .L1X    A_d32, B_d32, A_s32             ; sum i+2, sum i+3
        ADD2    .S2X    A_d54, B_d54, B_s54             ; sum i+5, sum i+4 (overwrites B_d54)
||      ADD2    .L1X    A_d10, B_d10, A_s10             ; sum i+1, sum i+0
        ADD2    .L2X    B_d76, A_d76, B_s76             ; sum i+6, sum i+7 (overwrites B_d76)
        SPKERNEL 0, 1                                   ; end of the SPLOOP body
||      STDW    .D1T2   B_s76:B_s54, *A_cData++[2]      ; store sums i+4-7, advance 2 double words
||      STDW    .D2T1   A_s32:A_s10, *B_cData++[2]      ; store sums i+0-3, advance 2 double words
*****************************************************************************
**** E0, C0 **** .S2X, .L1X *************************************************
**** E0, C1 **** .L2X *******************************************************
        BNOP    .S2     B3, 5                           ; branch to B3; the 5 delay slots are NOPs
        .end
* ======================================================================== *
* End of file: vecsum.asm *
* ------------------------------------------------------------------------ *
* Copyright (C) 2005 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ======================================================================== *
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -