?? cxmatmul.cpp
字號:
else if( d_size.width*sizeof(d_data[0]) <= 1600 ) \
{ \
for( i = 0; i < drows; i++, _a_data += a_step0, \
_c_data += c_step0, \
d_data += d_step ) \
{ \
a_data = _a_data, c_data = _c_data; \
\
if( a_buf ) \
{ \
for( k = 0; k < n; k++ ) \
a_buf[k] = a_data[a_step1*k]; \
a_data = a_buf; \
} \
\
for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 ) \
{ \
const arrtype* b = _b_data + j; \
worktype s0(0), s1(0), s2(0), s3(0); \
\
for( k = 0; k < n; k++, b += b_step ) \
{ \
worktype a(a_data[k]); \
s0 += a * b[0]; s1 += a * b[1]; \
s2 += a * b[2]; s3 += a * b[3]; \
} \
\
if( !c_data ) \
{ \
d_data[j] = arrtype(s0*alpha); \
d_data[j+1] = arrtype(s1*alpha); \
d_data[j+2] = arrtype(s2*alpha); \
d_data[j+3] = arrtype(s3*alpha); \
} \
else \
{ \
s0 = s0*alpha; s1 = s1*alpha; \
s2 = s2*alpha; s3 = s3*alpha; \
d_data[j] = arrtype(s0 + c_data[0]*beta); \
d_data[j+1] = arrtype(s1 + c_data[c_step1]*beta); \
d_data[j+2] = arrtype(s2 + c_data[c_step1*2]*beta); \
d_data[j+3] = arrtype(s3 + c_data[c_step1*3]*beta); \
} \
} \
\
for( ; j < m; j++, c_data += c_step1 ) \
{ \
const arrtype* b = _b_data + j; \
worktype s0(0); \
\
for( k = 0; k < n; k++, b += b_step ) \
s0 += worktype(a_data[k]) * b[0]; \
\
s0 = s0*alpha; \
if( !c_data ) \
d_data[j] = arrtype(s0); \
else \
d_data[j] = arrtype(s0 + c_data[0]*beta); \
} \
} \
} \
else \
{ \
worktype* d_buf = (worktype*)cvAlloc(m*sizeof(d_buf[0])); \
\
for( i = 0; i < drows; i++, _a_data += a_step0, \
_c_data += c_step0, \
d_data += d_step ) \
{ \
a_data = _a_data; \
b_data = _b_data; \
c_data = _c_data; \
\
if( a_buf ) \
{ \
for( k = 0; k < n; k++ ) \
a_buf[k] = _a_data[a_step1*k]; \
a_data = a_buf; \
} \
\
for( j = 0; j < m; j++ ) \
d_buf[j] = worktype(0); \
\
for( k = 0; k < n; k++, b_data += b_step ) \
{ \
worktype al(a_data[k]); \
\
for( j = 0; j <= m - 4; j += 4 ) \
{ \
worktype t0 = d_buf[j] + b_data[j]*al; \
worktype t1 = d_buf[j+1] + b_data[j+1]*al; \
d_buf[j] = t0; \
d_buf[j+1] = t1; \
t0 = d_buf[j+2] + b_data[j+2]*al; \
t1 = d_buf[j+3] + b_data[j+3]*al; \
d_buf[j+2] = t0; \
d_buf[j+3] = t1; \
} \
\
for( ; j < m; j++ ) \
d_buf[j] += b_data[j]*al; \
} \
\
if( !c_data ) \
for( j = 0; j < m; j++ ) \
d_data[j] = arrtype(d_buf[j]*alpha); \
else \
for( j = 0; j < m; j++, c_data += c_step1 ) \
{ \
worktype t = d_buf[j]*alpha; \
d_data[j] = arrtype(t + c_data[0]*beta); \
} \
} \
} \
return CV_OK; \
}
/*
 * ICV_DEF_GEMM_BLOCK_MUL( flavor, arrtype, worktype )
 *
 * Generates icvGEMMBlockMul_<flavor>(), which multiplies one block of A by
 * one block of B and writes (or accumulates) the products into a
 * worktype-typed destination block.
 *
 * Parameters of the generated function:
 *   a_data, a_step - first operand and its row step in bytes
 *   b_data, b_step - second operand and its row step in bytes
 *   d_data, d_step - destination (worktype elements) and its row step in bytes
 *   a_size         - size of the A block as stored; the inner dimension n is
 *                    a_size.width, or a_size.height when CV_GEMM_A_T is set
 *   d_size         - size of the destination block (height rows, width columns)
 *   flags          - CV_GEMM_A_T: A is read transposed;
 *                    CV_GEMM_B_T: B is read transposed;
 *                    bit value 16: add the products to the values already
 *                    stored in d_data instead of overwriting them
 *
 * Returns CV_OK on success, or CV_OUTOFMEM_ERR when the temporary row buffer
 * needed for a transposed A cannot be allocated.
 *
 * The temporary row buffer is heap-allocated with cvAlloc(), so it is released
 * with cvFree() before returning.
 */
#define ICV_DEF_GEMM_BLOCK_MUL( flavor, arrtype, worktype ) \
static CvStatus CV_STDCALL \
icvGEMMBlockMul_##flavor( const arrtype* a_data, size_t a_step, \
                          const arrtype* b_data, size_t b_step, \
                          worktype* d_data, size_t d_step, \
                          CvSize a_size, CvSize d_size, int flags ) \
{ \
    int i, j, k, n = a_size.width, m = d_size.width; \
    const arrtype *_a_data = a_data, *_b_data = b_data; \
    arrtype* a_buf = 0; \
    size_t a_step0, a_step1, t_step; \
    int do_acc = flags & 16; \
    \
    /* convert byte steps into element steps */ \
    a_step /= sizeof(a_data[0]); \
    b_step /= sizeof(b_data[0]); \
    d_step /= sizeof(d_data[0]); \
    \
    a_step0 = a_step; \
    a_step1 = 1; \
    \
    if( flags & CV_GEMM_A_T ) \
    { \
        /* transposed A: swap the row/element strides and gather each */ \
        /* logical row into a contiguous temporary buffer */ \
        CV_SWAP( a_step0, a_step1, t_step ); \
        n = a_size.height; \
        a_buf = (arrtype*)cvAlloc(n*sizeof(a_data[0])); \
        if( !a_buf ) \
            return CV_OUTOFMEM_ERR; \
    } \
    \
    if( flags & CV_GEMM_B_T ) \
    { \
        /* second operand is transposed: each output element is a dot */ \
        /* product of a row of A and a row of B */ \
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, \
                                            d_data += d_step ) \
        { \
            a_data = _a_data; b_data = _b_data; \
            \
            if( a_buf ) \
            { \
                for( k = 0; k < n; k++ ) \
                    a_buf[k] = a_data[a_step1*k]; \
                a_data = a_buf; \
            } \
            \
            for( j = 0; j < d_size.width; j++, b_data += b_step ) \
            { \
                /* two partial sums; s0 also carries the accumulated value */ \
                worktype s0 = do_acc ? d_data[j]:worktype(0), s1(0); \
                for( k = 0; k <= n - 2; k += 2 ) \
                { \
                    s0 += worktype(a_data[k])*b_data[k]; \
                    s1 += worktype(a_data[k+1])*b_data[k+1]; \
                } \
                \
                for( ; k < n; k++ ) \
                    s0 += worktype(a_data[k])*b_data[k]; \
                \
                d_data[j] = s0 + s1; \
            } \
        } \
    } \
    else \
    { \
        for( i = 0; i < d_size.height; i++, _a_data += a_step0, \
                                            d_data += d_step ) \
        { \
            a_data = _a_data, b_data = _b_data; \
            \
            if( a_buf ) \
            { \
                for( k = 0; k < n; k++ ) \
                    a_buf[k] = a_data[a_step1*k]; \
                a_data = a_buf; \
            } \
            \
            /* four output columns per pass */ \
            for( j = 0; j <= m - 4; j += 4 ) \
            { \
                worktype s0, s1, s2, s3; \
                const arrtype* b = b_data + j; \
                \
                if( do_acc ) \
                { \
                    s0 = d_data[j]; s1 = d_data[j+1]; \
                    s2 = d_data[j+2]; s3 = d_data[j+3]; \
                } \
                else \
                    s0 = s1 = s2 = s3 = worktype(0); \
                \
                for( k = 0; k < n; k++, b += b_step ) \
                { \
                    worktype a(a_data[k]); \
                    s0 += a * b[0]; s1 += a * b[1]; \
                    s2 += a * b[2]; s3 += a * b[3]; \
                } \
                \
                d_data[j] = s0; d_data[j+1] = s1; \
                d_data[j+2] = s2; d_data[j+3] = s3; \
            } \
            \
            /* remaining (m % 4) columns */ \
            for( ; j < m; j++ ) \
            { \
                const arrtype* b = b_data + j; \
                worktype s0 = do_acc ? d_data[j] : worktype(0); \
                \
                for( k = 0; k < n; k++, b += b_step ) \
                    s0 += worktype(a_data[k]) * b[0]; \
                \
                d_data[j] = s0; \
            } \
        } \
    } \
    \
    /* release the temporary row buffer (allocated only for transposed A) */ \
    if( a_buf ) \
        cvFree( &a_buf ); \
    \
    return CV_OK; \
}
#define ICV_DEF_GEMM_STORE( flavor, arrtype, worktype ) \
static CvStatus CV_STDCALL \
icvGEMMStore_##flavor( const arrtype* c_data, size_t c_step, \
const worktype* d_buf, size_t d_buf_step, \
arrtype* d_data, size_t d_step, CvSize d_size,\
double alpha, double beta, int flags ) \
{ \
const arrtype* _c_data = c_data; \
int j; \
size_t c_step0, c_step1; \
\
c_step /= sizeof(c_data[0]); \
d_buf_step /= sizeof(d_buf[0]); \
d_step /= sizeof(d_data[0]); \
\
if( !c_data ) \
c_step0 = c_step1 = 0; \
else if( !(flags & CV_GEMM_C_T) ) \
c_step0 = c_step, c_step1 = 1; \
else \
c_step0 = 1, c_step1 = c_step; \
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -