?? programcu.cu
字號:
//////////////////////////////////////////////////////////////////////////////////////////
#define MULT_TBLOCK_DIMX 128
#define MULT_TBLOCK_DIMY 1
#define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
#define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY)
texture<uint4, 1, cudaReadModeElementType> texDes1;
texture<uint4, 1, cudaReadModeElementType> texDes2;
void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp)
{
int idx01 = (blockIdx.y * MULT_BLOCK_DIMY), idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
int read_idx1 = idx01 * 8 + threadIdx.x, read_idx2 = idx2 * 8;
int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
///////////////////////////////////////////////////////////////
//Load feature descriptors
///////////////////////////////////////////////////////////////
#if MULT_BLOCK_DIMY == 16
uint4 v = tex1Dfetch(texDes1, read_idx1);
data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w;
#elif MULT_BLOCK_DIMY == 8
if(threadIdx.x < 64)
{
uint4 v = tex1Dfetch(texDes1, read_idx1);
data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w;
}
#else
#error
#endif
__syncthreads();
///
if(idx2 >= num2) return;
///////////////////////////////////////////////////////////////////////////
//compare descriptors
int results[MULT_BLOCK_DIMY];
#pragma unroll
for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0;
#pragma unroll
for(int i = 0; i < 8; ++i)
{
uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
unsigned char* p2 = (unsigned char*)(&v);
#pragma unroll
for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
{
unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4));
results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1])
+ IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3])
+ IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5])
+ IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7])
+ IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9])
+ IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11])
+ IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13])
+ IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15]));
}
}
int dst_idx = IMUL(idx1, num2) + idx2;
if(d_temp)
{
int3 cmp_result = make_int3(0, -1, 0);
#pragma unroll
for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
{
if(idx1 + i < num1)
{
cmp_result = results[i] > cmp_result.x?
make_int3(results[i], idx1 + i, cmp_result.x) :
make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
d_result[dst_idx + IMUL(i, num2)] = results[i];
}
}
d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
}else
{
#pragma unroll
for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
{
if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
}
}
}
void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
{
int num1 = des1->GetImgWidth() / 8;
int num2 = des2->GetImgWidth() / 8;
dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
texDot->InitTexture( num2,num1);
if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32);
des1->BindTexture(texDes1);
des2->BindTexture(texDes2);
MultiplyDescriptor_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
(texCRT? (int3*)texCRT->_cuData : NULL));
ProgramCU::CheckErrorCUDA("MultiplyDescriptor");
}
texture<float, 1, cudaReadModeElementType> texLoc1;
texture<float2, 1, cudaReadModeElementType> texLoc2;
struct Matrix33{float mat[3][3];};
void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp,
Matrix33 H, float hdistmax, Matrix33 F, float fdistmax)
{
int idx01 = (blockIdx.y * MULT_BLOCK_DIMY);
int idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
int idx1 = idx01 + threadIdx.y;
int idx2 = idx02 + threadIdx.x;
__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
__shared__ float loc1[MULT_BLOCK_DIMY * 2];
int read_idx1 = idx01 * 8 + threadIdx.x ;
int read_idx2 = idx2 * 8;
int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
#if MULT_BLOCK_DIMY == 16
uint4 v = tex1Dfetch(texDes1, read_idx1);
data1[cache_idx1] = v.x;
data1[cache_idx1+1] = v.y;
data1[cache_idx1+2] = v.z;
data1[cache_idx1+3] = v.w;
#elif MULT_BLOCK_DIMY == 8
if(threadIdx.x < 64)
{
uint4 v = tex1Dfetch(texDes1, read_idx1);
data1[cache_idx1] = v.x;
data1[cache_idx1+1] = v.y;
data1[cache_idx1+2] = v.z;
data1[cache_idx1+3] = v.w;
}
#else
#error
#endif
__syncthreads();
if(threadIdx.x < MULT_BLOCK_DIMY * 2)
{
loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x);
}
__syncthreads();
if(idx2 >= num2) return;
int results[MULT_BLOCK_DIMY];
/////////////////////////////////////////////////////////////////////////////////////////////
//geometric verification
/////////////////////////////////////////////////////////////////////////////////////////////
int good_count = 0;
float2 loc2 = tex1Dfetch(texLoc2, idx2);
#pragma unroll
for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
{
if(idx1 + i < num1)
{
float* loci = loc1 + i * 2;
float locx = loci[0], locy = loci[1];
//homography
float x[3], diff[2];
x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2];
x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2];
x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2];
diff[0] = fabs(FDIV(x[0], x[2]) - loc2.x);
diff[1] = fabs(FDIV(x[1], x[2]) - loc2.y);
if(diff[0] < hdistmax && diff[1] < hdistmax)
{
//check fundamental matrix
float fx1[3], ftx2[3], x2fx1, se;
fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2];
fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2];
//fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];
ftx2[0] = F.mat[0][0] * locx + F.mat[1][0] * locy + F.mat[2][0];
ftx2[1] = F.mat[0][1] * locx + F.mat[1][1] * locy + F.mat[2][1];
ftx2[2] = F.mat[0][2] * locx + F.mat[1][2] * locy + F.mat[2][2];
x2fx1 = locx * ftx2[0] + locy * ftx2[1] + ftx2[2];
se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]);
results[i] = se < fdistmax? 0: -262144;
}else
{
results[i] = -262144;
}
}else
{
results[i] = -262144;
}
good_count += (results[i] >=0);
}
/////////////////////////////////////////////////////////////////////////////////////////////
///compare feature descriptors anyway
/////////////////////////////////////////////////////////////////////////////////////////////
if(good_count > 0)
{
#pragma unroll
for(int i = 0; i < 8; ++i)
{
uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
unsigned char* p2 = (unsigned char*)(&v);
#pragma unroll
for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
{
unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4));
results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1])
+ IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3])
+ IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5])
+ IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7])
+ IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9])
+ IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11])
+ IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13])
+ IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15]));
}
}
}
int dst_idx = IMUL(idx1, num2) + idx2;
if(d_temp)
{
int3 cmp_result = make_int3(0, -1, 0);
#pragma unroll
for(int i= 0; i < MULT_BLOCK_DIMY; ++i)
{
if(idx1 + i < num1)
{
cmp_result = results[i] > cmp_result.x?
make_int3(results[i], idx1 + i, cmp_result.x) :
make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
}else
{
break;
}
}
d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;
}else
{
#pragma unroll
for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
{
if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
else break;
}
}
}
void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT,
float H[3][3], float hdistmax, float F[3][3], float fdistmax)
{
int num1 = des1->GetImgWidth() / 8;
int num2 = des2->GetImgWidth() / 8;
Matrix33 MatF, MatH;
//copy the matrix
memcpy(MatF.mat, F, 9 * sizeof(float));
memcpy(MatH.mat, H, 9 * sizeof(float));
//thread blocks
dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
//intermediate results
texDot->InitTexture( num2,num1);
if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3);
loc1->BindTexture(texLoc1);
loc2->BindTexture(texLoc2);
des1->BindTexture(texDes1);
des2->BindTexture(texDes2);
MultiplyDescriptorG_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
(texCRT? (int3*)texCRT->_cuData : NULL),
MatH, hdistmax, MatF, fdistmax);
}
texture<int, 1, cudaReadModeElementType> texDOT;
#define ROWMATCH_BLOCK_WIDTH 32
#define ROWMATCH_BLOCK_HEIGHT 1
void __global__ RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax)
{
#if ROWMATCH_BLOCK_HEIGHT == 1
__shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
__shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
__shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
int row = blockIdx.y;
#else
__shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
__shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
__shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
int* dotmax = x_dotmax[threadIdx.y];
int* dotnxt = x_dotnxt[threadIdx.y];
int* dotidx = x_dotidx[threadIdx.y];
int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
#endif
int base_address = IMUL(row , num2);
int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1;
for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH)
{
if(threadIdx.x + i < num2)
{
int v = tex1Dfetch(texDOT, base_address + threadIdx.x + i);//d_dot[base_address + threadIdx.x + i];//
bool test = v > t_dotmax;
t_dotnxt = test? t_dotmax : max(t_dotnxt, v);
t_dotidx = test? (threadIdx.x + i) : t_dotidx;
t_dotmax = test? v: t_dotmax;
}
__syncthreads();
}
dotmax[threadIdx.x] = t_dotmax;
dotnxt[threadIdx.x] = t_dotnxt;
dotidx[threadIdx.x] = t_dotidx;
__syncthreads();
#pragma unroll
for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2)
{
if(threadIdx.x < step)
{
int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
bool test = v2 > v1;
dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2);
dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x];
dotmax[threadIdx.x] = test? v2 : v1;
}
__syncthreads();
}
if(threadIdx.x == 0)
{
float dist = acos(min(dotmax[0] * 0.000003814697265625f, 1.0));
float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0));
//float ratio = dist / distn;
d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//? : -1;
}
}
void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax)
{
int num1 = texDot->GetImgHeight();
int num2 = texDot->GetImgWidth();
dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);
texDot->BindTexture(texDOT);
RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
(int*)texMatch->_cuData, num2, distmax, ratiomax);
}
#define COLMATCH_BLOCK_WIDTH 32
//texture<int3, 1, cudaReadModeElementType> texCT;
void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)
{
int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
if(col >= num2) return;
int3 result = d_crt[col];//tex1Dfetch(texCT, col);
int read_idx = col + num2;
for(int i = 1; i < height; ++i, read_idx += num2)
{
int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
result = result.x < temp.x?
make_int3(temp.x, temp.y, max(result.x, temp.z)) :
make_int3(result.x, result.y, max(result.z, temp.x));
}
float dist = acos(min(result.x * 0.000003814697265625f, 1.0));
float distn = acos(min(result.z * 0.000003814697265625f, 1.0));
//float ratio = dist / distn;
d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//? : -1;
}
void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax)
{
int height = texCRT->GetImgHeight();
int num2 = texCRT->GetImgWidth();
//texCRT->BindTexture(texCT);
dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH); dim3 block(COLMATCH_BLOCK_WIDTH);
ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);
}
#endif
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -