?? programcu.cu
字號:
float vxn = tex1Dfetch(texC, index + 1);
float vxp = tex1Dfetch(texC, index - 1);
float vyp = tex1Dfetch(texC, index - width);
float vyn = tex1Dfetch(texC, index + width);
float dx = vxn - vxp, dy = vyn - vyp;
float grd = 0.5f * sqrt(dx * dx + dy * dy);
float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));
d_got[index] = make_float2(grd, rot);
}
}
void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height)
{
int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
if(col < width && row < height)
{
int index = IMUL(row, width) + col;
float vp = tex1Dfetch(texP, index);
float v = tex1Dfetch(texC, index);
d_dog[index] = v - vp;
}
}
void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
{
int width = gus->GetImgWidth(), height = gus->GetImgHeight();
dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);
dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);
gus->BindTexture(texC);
(gus -1)->BindTexture(texP);
if(got->_cuData)
ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);
else
ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);
}
#define KEY_BLOCK_LOG_DIMX 3
#define KEY_BLOCK_LOG_DIMY 3
#define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX)
#define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY)
//4/5, 3/2 -> 33
//4, 1 -> 45
//4, 0 -> 60
#define READ_CMP_DOG_DATA(datai, tex, idx) \
datai[0] = tex1Dfetch(tex, idx - 1);\
datai[1] = tex1Dfetch(tex, idx);\
datai[2] = tex1Dfetch(tex, idx + 1);\
if(v > nmax)\
{\
nmax = max(nmax, datai[0]);\
nmax = max(nmax, datai[1]);\
nmax = max(nmax, datai[2]);\
if(v < nmax) goto key_finish;\
}else\
{\
nmin = min(nmin, datai[0]);\
nmin = min(nmin, datai[1]);\
nmin = min(nmin, datai[2]);\
if(v > nmin) goto key_finish;\
}
void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax,
float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization)
{
float data[3][3], v;
float datap[3][3], datan[3][3];
int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1;
int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1;
int index = IMUL(row, width) + col;
int idx[3] ={index - width, index, index + width};
int in_image =0;
float nmax, nmin, result = 0.0f;
float dx = 0, dy = 0, ds = 0;
bool offset_test_passed = true;
if(row < rowmax && col < colmax)
{
in_image = 1;
data[1][1] = v = tex1Dfetch(texC, idx[1]);
if(fabs(v) <= dog_threshold0) goto key_finish;
data[1][0] = tex1Dfetch(texC, idx[1] - 1);
data[1][2] = tex1Dfetch(texC, idx[1] + 1);
nmax = max(data[1][0], data[1][2]);
nmin = min(data[1][0], data[1][2]);
if(v <=nmax && v >= nmin) goto key_finish;
//if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
READ_CMP_DOG_DATA(data[0], texC, idx[0]);
READ_CMP_DOG_DATA(data[2], texC, idx[2]);
//edge supression
float vx2 = v * 2.0f;
float fxx = data[1][0] + data[1][2] - vx2;
float fyy = data[0][1] + data[2][1] - vx2;
float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]);
float temp1 = fxx * fyy - fxy * fxy;
float temp2 = (fxx + fyy) * (fxx + fyy);
if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish;
//read the previous level
READ_CMP_DOG_DATA(datap[0], texP, idx[0]);
READ_CMP_DOG_DATA(datap[1], texP, idx[1]);
READ_CMP_DOG_DATA(datap[2], texP, idx[2]);
//read the next level
READ_CMP_DOG_DATA(datan[0], texN, idx[0]);
READ_CMP_DOG_DATA(datan[1], texN, idx[1]);
READ_CMP_DOG_DATA(datan[2], texN, idx[2]);
if(subpixel_localization)
{
//subpixel localization
float fx = 0.5f * (data[1][2] - data[1][0]);
float fy = 0.5f * (data[2][1] - data[0][1]);
float fs = 0.5f * (datan[1][1] - datap[1][1]);
float fss = (datan[1][1] + datap[1][1] - vx2);
float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]);
float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]);
//need to solve dx, dy, ds;
// |-fx| | fxx fxy fxs | |dx|
// |-fy| = | fxy fyy fys | * |dy|
// |-fs| | fxs fys fss | |ds|
float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
float maxa = max(max(A0.x, A1.x), A2.x);
if(maxa >= 1e-10)
{
if(maxa == A1.x)
{
float4 TEMP = A1; A1 = A0; A0 = TEMP;
}else if(maxa == A2.x)
{
float4 TEMP = A2; A2 = A0; A0 = TEMP;
}
A0.y /= A0.x; A0.z /= A0.x; A0.w/= A0.x;
A1.y -= A1.x * A0.y; A1.z -= A1.x * A0.z; A1.w -= A1.x * A0.w;
A2.y -= A2.x * A0.y; A2.z -= A2.x * A0.z; A2.w -= A2.x * A0.w;
if(abs(A2.y) > abs(A1.y))
{
float4 TEMP = A2; A2 = A1; A1 = TEMP;
}
if(abs(A1.y) >= 1e-10)
{
A1.z /= A1.y; A1.w /= A1.y;
A2.z -= A2.y * A1.z; A2.w -= A2.y * A1.w;
if(abs(A2.z) >= 1e-10)
{
ds = A2.w / A2.z;
dy = A1.w - ds * A1.z;
dx = A0.w - ds * A0.z - dy * A0.y;
offset_test_passed =
fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold
&&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f;
}
}
}
}
if(offset_test_passed) result = v > nmax ? 1.0 : -1.0;
}
key_finish:
if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
}
void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge)
{
int width = dog->GetImgWidth(), height = dog->GetImgHeight();
float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog;
CuTexImage* dogp = dog - 1;
CuTexImage* dogn = dog + 1;
dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);
dogp->BindTexture(texP);
dog ->BindTexture(texC);
dogn->BindTexture(texN);
Tedge = (Tedge+1)*(Tedge+1)/Tedge;
ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width, width -1, height -1,
Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);
}
#define HIST_INIT_WIDTH 128
void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height)
{
int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
if(row < height && col < wd)
{
int hidx = IMUL(row, wd) + col;
int scol = col << 2;
int sidx = IMUL(row, ws) + scol;
int v[4] = {0, 0, 0, 0};
if(row > 0 && row < height -1)
{
#pragma unroll
for(int i = 0; i < 4 ; ++i, ++scol)
{
float4 temp = tex1Dfetch(texDataF4, sidx +i);
v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;
}
}
hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
}
}
void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
{
int ws = key->GetImgWidth(), hs = key->GetImgHeight();
int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd);
dim3 block(HIST_INIT_WIDTH, 1);
key->BindTexture(texDataF4);
InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd);
//CheckHistInit(key, hist);
}
void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height)
{
int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
if(row < height && col < wd)
{
int hidx = IMUL(row, wd) + col;
int scol = col << 2;
int sidx = IMUL(row, ws) + scol;
int v[4] = {0, 0, 0, 0};
#pragma unroll
for(int i = 0; i < 4 && scol < ws; ++i, ++scol)
{
int4 temp = tex1Dfetch(texDataI4, sidx + i);
v[i] = temp.x + temp.y + temp.z + temp.w;
}
d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
}
}
void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
{
int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
int temp = (int)floor(logf(float(wd * 2/ 3)) / logf(2.0f));
const int wi = min(7, max(temp , 0));
hist1->BindTexture(texDataI4);
const int BW = 1 << wi, BH = 1 << (7 - wi);
dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH);
dim3 block(BW, BH);
ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd);
}
#define LISTGEN_BLOCK_DIM 128
void __global__ ListGen_Kernel(int4* d_list, int width)
{
int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
int4 pos = tex1Dfetch(texDataList, idx1);
int idx2 = IMUL(pos.y, width) + pos.x;
int4 temp = tex1Dfetch(texDataI4, idx2);
int sum1 = temp.x + temp.y;
int sum2 = sum1 + temp.z;
pos.x <<= 2;
if(pos.z >= sum2)
{
pos.x += 3;
pos.z -= sum2;
}else if(pos.z >= sum1)
{
pos.x += 2;
pos.z -= sum1;
}else if(pos.z >= temp.x)
{
pos.x += 1;
pos.z -= temp.x;
}
d_list[idx1] = pos;
}
//input list (x, y) (x, y) ....
void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
{
int len = list->GetImgWidth();
list->BindTexture(texDataList);
hist->BindTexture(texDataI4);
dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
dim3 block(LISTGEN_BLOCK_DIM);
ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, hist->GetImgWidth());
}
void __global__ ComputeOrientation_Kernel(float4* d_list,
int list_len,
int width, int height,
float sigma, float sigma_step,
float gaussian_factor, float sample_factor,
int num_orientation,
int existing_keypoint,
int subpixel,
int keepsign)
{
const float ten_degree_per_radius = 5.7295779513082320876798154814105;
const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105;
int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
if(idx >= list_len) return;
float4 key;
if(existing_keypoint)
{
key = tex1Dfetch(texDataF4, idx);
}else
{
int4 ikey = tex1Dfetch(texDataList, idx);
key.x = ikey.x + 0.5f;
key.y = ikey.y + 0.5f;
key.z = sigma;
if(subpixel || keepsign)
{
float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x);
if(subpixel)
{
key.x += offset.y;
key.y += offset.z;
key.z *= pow(sigma_step, offset.w);
}
if(keepsign) key.z *= offset.x;
}
}
if(num_orientation == 0)
{
key.w = 0;
d_list[idx] = key;
return;
}
float vote[37];
float gsigma = key.z * gaussian_factor;
float win = fabs(key.z) * sample_factor;
float dist_threshold = win * win + 0.5;
float factor = -0.5f / (gsigma * gsigma);
float xmin = max(1.5f, floor(key.x - win) + 0.5f);
float ymin = max(1.5f, floor(key.y - win) + 0.5f);
float xmax = min(width - 1.5f, floor(key.x + win) + 0.5f);
float ymax = min(height -1.5f, floor(key.y + win) + 0.5f);
#pragma unroll
for(int i = 0; i < 36; ++i) vote[i] = 0.0f;
for(float y = ymin; y <= ymax; y += 1.0f)
{
for(float x = xmin; x <= xmax; x += 1.0f)
{
float dx = x - key.x;
float dy = y - key.y;
float sq_dist = dx * dx + dy * dy;
if(sq_dist >= dist_threshold) continue;
float2 got = tex2D(texDataF2, x, y);
float weight = got.x * exp(sq_dist * factor);
float fidx = floor(got.y * ten_degree_per_radius);
int oidx = fidx;
if(oidx < 0) oidx += 36;
vote[oidx] += weight;
}
}
//filter the vote
const float one_third = 1.0 /3.0;
#pragma unroll
for(int i = 0; i < 6; ++i)
{
vote[36] = vote[0];
float pre = vote[35];
#pragma unroll
for(int j = 0; j < 36; ++j)
{
float temp = one_third * (pre + vote[j] + vote[j + 1]);
pre = vote[j]; vote[j] = temp;
}
}
vote[36] = vote[0];
if(num_orientation == 1 || existing_keypoint)
{
int index_max = 0;
float max_vote = vote[0];
#pragma unroll
for(int i = 1; i < 36; ++i)
{
index_max = vote[i] > max_vote? i : index_max;
max_vote = max(max_vote, vote[i]);
}
float pre = vote[index_max == 0? 35 : index_max -1];
float next = vote[index_max + 1];
float weight = max_vote;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -