?? conv2d3x3_gen.asm
字號:
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : conv2d3x3_gen.asm
Label name : __conv2d3x3_gen
Version : 1.3
Change History :
Version Date Author Comments
1.3 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.2 11/13/2002 Swarnalatha Tested with VDSP++ 3.0
on ADSP-21535 Rev. 0.2
1.1 02/12/2002 Nishanth Modified to match
silicon cycle count
1.0 05/08/2001 Nishanth Original
Description : This function does two dimensional circular convolution of a
given sequence with 3 x 3 matrix.
In this implementation circular convolution of two matrices
`a` and `b` is calculated.
The dimension of 'a' is row x col and that of 'b' is 3 x 3.
The dimension of the output matrix c will be row x col.
The first two columns of outputs are calculated separately as
these ones require circular buffering of each row.
The whole implementation is for fract16 input and output.
The format of representation is 1.15 format.
Assumptions : 1. The minimum input matrix size is 3 x 3.
2. in[] and out[] should be aligned to a 2 byte boundary.
3. mask[] should be aligned to a 4 byte boundary.
4. in[] and mask[] should be in different minibanks.
5. in[] and out[] should be in different minibanks.
6. mask[] and stack should be in different minibanks.
Prototype : void _conv2d3x3_gen(
fract16 in[],
// (i) : Pointer to the input matrix.
short row,
// (i) : Number of rows of input matrix.
short col,
// (i) : Number of columns of input matrix.
fract16 mask[],
// (i) : Pointer to 3x3 mask.
fract16 out[])
// (o) : Pointer to the output matrix.
Registers used : A0, R0-R3,R7, I0-I3, B0-B3, M0-M3, L0-L3, P0-P2, LC0,LC1
Performance :
Code Size : 296 Bytes.
Cycle Count : 9 * row * col + 31 * row + 35
(9 cycles/pixel in core; 21 cycles/pixel for first
two columns)
209 cycles for an input matrix size of 3 x 3.
415 cycles for an input matrix size of 5 x 5.
505 cycles for an input matrix size of 5 x 7.
2835 cycles for an input matrix size of 16 x 16.
*******************************************************************************/
.section L1_code;
.global __conv2d3x3_gen;
.align 8;

// -----------------------------------------------------------------------------
// __conv2d3x3_gen : 2-D circular convolution of a row x col fract16 matrix with
//                   a 3 x 3 fract16 mask.
//
// Inputs (as consumed by the code below):
//   R0        = address of in[]   (row x col 16-bit samples)
//   R1        = row
//   R2        = col
//   [SP+12]   = address of mask[] on entry (read as [SP+16] after R7 is pushed)
//   [SP+16]   = address of out[]  on entry (read as [SP+20] after R7 is pushed)
//
// Coefficient naming used in the comments: h0..h8 are the mask words read from
// the END of the mask backwards, i.e. hK = mask[8-K]. The mask is walked as a
// 20-byte circular buffer with 32-bit loads:
//   [mask+16] -> (dummy : h0)   [mask+12] -> (h1 : h2)   [mask+8] -> (h3 : h4)
//   [mask+4]  -> (h5 : h6)      [mask+0]  -> (h7 : h8)      (shown as .H : .L)
// NOTE(review): the 32-bit load at mask+16 also reads the 2 bytes that follow
// mask[8]; confirm the caller's mask buffer makes that read safe.
//
// Sample naming: xRC is the sample in row R (0..2) and column C (0..2) of the
// current 3 x 3 input window; x0/x1/x2 are row 0 of the window. Because I0 is
// first moved back by two rows and the column pointer by two columns (both with
// circular wrap), the window for out[r][c] covers rows r-2..r and columns
// c-2..c of in[], modulo row / col.
//
// Phase 1 (FIRST_TWO_COLS): output columns 0 and 1 of every row, using I1 as a
//                           per-row circular pointer so the window wraps around
//                           the row.
// Phase 2 (ROW / COL)     : output columns 2..col-1 of every row, software
//                           pipelined (the first product and first two fetches
//                           of the next output are issued ahead of time).
//
// R7 is the only register saved and restored here. L0-L3 are cleared before
// returning (see the end of the routine).
// -----------------------------------------------------------------------------
__conv2d3x3_gen:
    [--SP] = R7;                // Save R7 (stack arguments move up by 4 bytes)
    P0 = R1;                    // P0 = row : outer loop count for both phases
    P1 = 2;                     // P1 = 2   : inner loop count of phase 1
    P2 = R2;                    // P2 = col (reduced to col - 2 below)
    I0 = R0;                    // I0 = running input pointer
    B0 = R0;                    // B0 = base of the circular buffer over in[]
    R3 = R1.L * R2.L (ISS2) || R7 = [SP+16];
                                // R3 = 2 * row * col (matrix size in bytes),
                                // R7 = address of mask
    I2 = R7;                    // I2 = running mask pointer
    B2 = R7;                    // B2 = base of the circular buffer over mask[]
    L2 = 20;                    // 20 bytes = 9 coefficients * 2 bytes
                                //            + one 2-byte dummy location
    L0 = R3;                    // in[]  is a circular buffer of 2*row*col bytes
    L3 = R3;                    // out[] is a circular buffer of 2*row*col bytes
    R2 = R2 << 1 || R7 = [SP+20] || R3 = [I2--];
                                // R2 = 2 * col, R7 = address of out[].
                                // The load result is not used; the post-
                                // decrement wraps I2 to mask+16 (end of mask).
    M3 = R2;                    // M3 = 2 * col : one row, in bytes
    L1 = R2;                    // I1 wraps inside a single row (2 * col bytes)
    B3 = R7;                    // B3 = base of the circular buffer over out[]
    I3 = R7;                    // I3 = running output pointer
    R1 = R2 << 1;
    M2 = R1;                    // M2 = 4 * col : two rows, in bytes
    R2 += -4;
    M0 = R2;                    // M0 = 2 * col - 4
    P2 += -2;                   // P2 = col - 2 : inner loop count of phase 2
    I0 -= M2 || R3 = [I2--];    // Step I0 back two rows (wraps inside in[]);
                                // R3 = (dummy : h0), I2 -> mask+12

    // ---- Phase 1: output columns 0 and 1 of every row -----------------------
    LSETUP (FIRST_TWO_COLS_ST, FIRST_TWO_COLS_END) LC0 = P0;
                                // One iteration per input row (count = row)
FIRST_TWO_COLS_ST:
    M1 = 4;                     // Column back-off for output column 0:
                                // 4 bytes = 2 samples
    LSETUP (CONV_ST, CONV_END) LC1 = P1;
                                // Two iterations: output column 0, then 1
CONV_ST:
    B1 = I0;                    // Circular row buffer starts at window row 0
    I1 = B1;
    I1 -= M1;                   // Back off M1 bytes, wrapping to the row end
    R1.L = W[I1++];             // Fetch x0
    A0 = R1.L * R3.L || R1.L = W[I1++] || R3 = [I2--];
                                // A0 = x0*h0, fetch x1, R3 = (h1 : h2)
    A0 += R1.L * R3.H || R1.L = W[I1++] || I0 += M3;
                                // A0 += x1*h1, fetch x2, I0 -> window row 1
    B1 = I0;                    // Re-base the row buffer on window row 1
    I1 = B1;
    I1 -= M1;                   // Same column back-off in the new row
    A0 += R1.L * R3.L || R1.L = W[I1++] || R3 = [I2--];
                                // A0 += x2*h2, fetch x10, R3 = (h3 : h4)
    A0 += R1.L * R3.H || R1.L = W[I1++] || I0 += M3;
                                // A0 += x10*h3, fetch x11, I0 -> window row 2
    A0 += R1.L * R3.L || R1.L = W[I1++] || R7 = [I2--];
                                // A0 += x11*h4, fetch x12 (I1 is still in
                                // window row 1), R7 = (h5 : h6).
                                // R7 keeps this value for all of phase 2.
    B1 = I0;                    // Re-base the row buffer on window row 2
    I1 = B1;
    I1 -= M1;                   // Same column back-off in the new row
    A0 += R1.L * R7.H || R1.L = W[I1++];
                                // A0 += x12*h5, fetch x20
    A0 += R1.L * R7.L || R1.L = W[I1++] || R3 = [I2--];
                                // A0 += x20*h6, fetch x21, R3 = (h7 : h8);
                                // I2 wraps from mask+0 to mask+16
    A0 += R1.L * R3.H || R1.L = W[I1++];
                                // A0 += x21*h7, fetch x22
    R0.L = (A0+=R1.L*R3.L) || I0 -= M2 || R3 = [I2--];
                                // A0 += x22*h8 and extract the 16-bit result,
                                // I0 back to window row 0,
                                // R3 = (dummy : h0), I2 -> mask+12
    M1 = 2;                     // Column back-off for output column 1:
                                // 2 bytes = 1 sample
CONV_END:
    W[I3++] = R0.L;             // Store the output sample
    I0 += M3;                   // Advance the window by one input row
FIRST_TWO_COLS_END:
    I3 += M0;                   // Skip the remaining col - 2 outputs of the row

    // ---- Phase 2: output columns 2 .. col-1 of every row --------------------
    // On entry: I0 and I3 have wrapped back to their phase-1 starting points,
    // R3 = (dummy : h0), R7 = (h5 : h6), I2 = mask+12.
    R1 = M2;                    // 4 * col
    R1 += 2;
    M1 = R1;                    // M1 = 4 * col + 2 : from x22 back to the next x0
    M2 = -8;                    // I2 step that returns it from mask+16 to mask+8
    R1.L = W[I0++] || R2 = [I2--];
                                // Fetch x0, R2 = (h1 : h2) (kept for all of
                                // phase 2), I2 -> mask+8
    LSETUP (ROW_ST,ROW_END1) LC0 = P0;
                                // One iteration per input row (count = row)
ROW_ST:
    A0 = R1.L * R3.L || R1.L = W[I0++] || I3 += 4;
                                // A0 = x0*h0, fetch x1,
                                // skip the two outputs written in phase 1
    LSETUP (COL_ST, COL_END) LC1 = P2;
                                // One iteration per remaining output column
                                // (count = col - 2)
COL_ST:
    A0 += R1.L * R2.H || R1.L = W[I0] || I0 += M0;
                                // A0 += x1*h1, fetch x2, I0 -> x10
    A0 += R1.L * R2.L || R1.L = W[I0++] || R3 = [I2--];
                                // A0 += x2*h2, fetch x10, R3 = (h3 : h4)
    A0 += R1.L * R3.H || R1.L = W[I0++] || I2 -= 4;
                                // A0 += x10*h3, fetch x11,
                                // step I2 over (h5 : h6), already held in R7
    A0 += R1.L * R3.L || R1.L = W[I0] || I0 += M0;
                                // A0 += x11*h4, fetch x12, I0 -> x20
    A0 += R1.L * R7.H || R1.L = W[I0++];
                                // A0 += x12*h5, fetch x20
    A0 += R1.L * R7.L || R1.L = W[I0++] || R3 = [I2--];
                                // A0 += x20*h6, fetch x21, R3 = (h7 : h8);
                                // I2 wraps from mask+0 to mask+16
    A0 += R1.L * R3.H || R1.L = W[I0] || I0 -= M1;
                                // A0 += x21*h7, fetch x22,
                                // I0 -> x0 of the next window (one column right)
    R0.L=(A0 += R1.L * R3.L) || R1.L = W[I0++] || R3 = [I2++M2];
                                // A0 += x22*h8 and extract the 16-bit result,
                                // fetch next x0, R3 = (dummy : h0), I2 -> mask+8
COL_END:
    A0 = R1.L * R3.L || R1.L = W[I0++] || W[I3++] = R0.L;
                                // Start next output: A0 = x0*h0, fetch x1;
                                // store the output just completed
ROW_END1:
    R1.L=W[I0++];               // Fetch x0 for the first output of the next row

    // The C run-time model expects L0-L3 to be zero whenever control returns to
    // compiled code; leaving them set would make the caller's I0-I3 post-modify
    // accesses wrap as circular buffers. These four instructions are not
    // included in the Code Size / Cycle Count figures of the file header.
    L0 = 0;
    L1 = 0;
    L2 = 0;
    L3 = 0;

    R7 = [SP++];                // Restore R7
    RTS;
    NOP;                        // Avoids one stall if LINK or UNLINK happens to
                                // be the next instruction after RTS in memory.
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -