?? programcu.cu
字號:
////////////////////////////////////////////////////////////////////////////
// File: ProgramCU.cu
// Author: Changchang Wu
// Description : implementation of ProgramCU and all CUDA kernels
//
// Copyright (c) 2007 University of North Carolina at Chapel Hill
// All Rights Reserved
//
// Permission to use, copy, modify and distribute this software and its
// documentation for educational, research and non-profit purposes, without
// fee, and without a written agreement is hereby granted, provided that the
// above copyright notice and the following paragraph appear in all copies.
//
// The University of North Carolina at Chapel Hill make no representations
// about the suitability of this software for any purpose. It is provided
// 'as is' without express or implied warranty.
//
// Please send BUG REPORTS to ccwu@cs.unc.edu
//
////////////////////////////////////////////////////////////////////////////
#if defined(CUDA_SIFTGPU_ENABLED)
#include "GL/glew.h"
#include <iostream>
#include <algorithm>
using namespace std;
#include "CuTexImage.h"
#include "ProgramCU.h"
#include "GlobalUtil.h"
//Standard block size
#define BLOCK_DIM 16
#define BLOCK_LOG_DIM 4
#define IMUL(X,Y) __mul24(X,Y)
//#define FDIV(X,Y) ((X)/(Y))
#define FDIV(X,Y) __fdividef(X,Y)
//filter kernel
#define KERNEL_MAX_WIDTH 33
//#define MAX_THREAD_PER_BLOCK 512 = 16 * 32
//////////////////////////////larger block gives better performance
#define FILTERV_TILE_WIDTH 16
#define FILTERV_TILE_HEIGHT 128
#define FILTERV_TBLK_HEIGHT 32
////////////////////////////
#define FILTERH_TILE_WIDTH 128
__device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
texture<float, 1, cudaReadModeElementType> texData;
texture<float2, 2, cudaReadModeElementType> texDataF2;
texture<float4, 1, cudaReadModeElementType> texDataF4;
texture<int4, 1, cudaReadModeElementType> texDataI4;
texture<int4, 1, cudaReadModeElementType> texDataList;
//template<int i> __device__ float Conv(float *data) { return Conv<i-1>(data) + data[i]*d_kernel[i];}
//template<> __device__ float Conv<0>(float *data) { return data[0] * d_kernel[0]; }
//////////////////////////////////////////////////////////////
template<int FW> __global__ void FilterH( float* d_result, int width)
{
const int HALF_WIDTH = FW >> 1;
const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1;
const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH;
__shared__ float data[CACHE_WIDTH];
const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH);
const int col = bcol + threadIdx.x;
const int index_min = IMUL(blockIdx.y, width);
const int index_max = index_min + width - 1;
int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;
int cache_index = threadIdx.x;
float value = 0;
#pragma unroll
for(int j = 0; j < CACHE_COUNT; ++j)
{
if(cache_index < CACHE_WIDTH)
{
int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
data[cache_index] = tex1Dfetch(texData,fetch_index);
src_index += FILTERH_TILE_WIDTH;
cache_index += FILTERH_TILE_WIDTH;
}
}
__syncthreads();
if(col >= width) return;
#pragma unroll
for(int i = 0; i < FW; ++i)
{
value += (data[threadIdx.x + i]* d_kernel[i]);
}
// value = Conv<FW-1>(data + threadIdx.x);
d_result[index_min + col] = value;
}
////////////////////////////////////////////////////////////////////
template<int FW> __global__ void FilterV(float* d_result, int width, int height)
{
const int HALF_WIDTH = FW >> 1;
const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;
const int TEMP = CACHE_WIDTH & 0xf;
//add some extra space to avoid bank conflict
#if FILTERV_TILE_WIDTH == 16
//make the stride 16 * n +/- 1
const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP;
#elif FILTERV_TILE_WIDTH == 8
//make the stride 16 * n +/- 2
const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP);
#elif FILTERV_TILE_WIDTH == 4
//make the stride 16 * n +/- 4
const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP);
#else
#error
#endif
const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA;
const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_TBLK_HEIGHT - 1) / FILTERV_TBLK_HEIGHT;
const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_TBLK_HEIGHT -1) / FILTERV_TBLK_HEIGHT;
__shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT);
const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x;
const int row_first = row_block_first - HALF_WIDTH;
const int data_index_max = IMUL(height - 1, width) + col;
const int cache_col_start = threadIdx.y;
const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH);
int cache_index = cache_col_start + cache_row_start;
int data_index = IMUL(row_first + cache_col_start, width) + col;
if(col < width)
{
#pragma unroll
for(int i = 0; i < CACHE_COUNT; ++i)
{
if(cache_col_start < CACHE_WIDTH - i * FILTERV_TBLK_HEIGHT)
{
int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
data[cache_index + i * FILTERV_TBLK_HEIGHT] = tex1Dfetch(texData,fetch_index);
data_index += IMUL(FILTERV_TBLK_HEIGHT, width);
}
}
}
__syncthreads();
if(col >= width) return;
int row = row_block_first + threadIdx.y;
int index_start = cache_row_start + threadIdx.y;
#pragma unroll
for(int i = 0; i < WRITE_COUNT; ++i,
row += FILTERV_TBLK_HEIGHT, index_start += FILTERV_TBLK_HEIGHT)
{
if(row < height)
{
int index_dest = IMUL(row, width) + col;
float value = 0;
#pragma unroll
for(int i = 0; i < FW; ++i)
{
value += (data[index_start + i] * d_kernel[i]);
}
d_result[index_dest] = value;
}
}
}
template<int LOG_SCALE> __global__ void UpsampleKernel(float* d_result, int width)
{
const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);
const float INV_SCALE = 1.0f / (float(SCALE));
int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
if(col >= width) return;
int row = blockIdx.y >> LOG_SCALE;
int index = row * width + col;
int dst_row = blockIdx.y;
int dst_idx= (width * dst_row + col) * SCALE;
int helper = blockIdx.y & SCALE_MASK;
if (helper)
{
float v11 = tex1Dfetch(texData, index);
float v12 = tex1Dfetch(texData, index + 1);
index += width;
float v21 = tex1Dfetch(texData, index);
float v22 = tex1Dfetch(texData, index + 1);
float w1 = INV_SCALE * helper, w2 = 1.0 - w1;
float v1 = (v21 * w1 + w2 * v11);
float v2 = (v22 * w1 + w2 * v12);
d_result[dst_idx] = v1;
#pragma unroll
for(int i = 1; i < SCALE; ++i)
{
const float r2 = i * INV_SCALE;
const float r1 = 1.0f - r2;
d_result[dst_idx +i] = v1 * r1 + v2 * r2;
}
}else
{
float v1 = tex1Dfetch(texData, index);
float v2 = tex1Dfetch(texData, index + 1);
d_result[dst_idx] = v1;
#pragma unroll
for(int i = 1; i < SCALE; ++i)
{
const float r2 = i * INV_SCALE;
const float r1 = 1.0f - r2;
d_result[dst_idx +i] = v1 * r1 + v2 * r2;
}
}
}
////////////////////////////////////////////////////////////////////////////////////////
void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)
{
int width = src->GetImgWidth(), height = src->GetImgHeight();
src->BindTexture(texData);
dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
dim3 block(FILTERH_TILE_WIDTH);
switch(log_scale)
{
case 1 : UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width); break;
case 2 : UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width); break;
case 3 : UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width); break;
default: break;
}
}
template<int LOG_SCALE> __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width)
{
const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
if(dst_col >= dst_width) return;
const int src_col = min((dst_col << LOG_SCALE), (src_width - 1));
const int dst_row = blockIdx.y;
const int src_row = blockIdx.y << LOG_SCALE;
const int src_idx = IMUL(src_row, src_width) + src_col;
const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
d_result[dst_idx] = tex1Dfetch(texData, src_idx);
}
__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale)
{
const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
if(dst_col >= dst_width) return;
const int src_col = min((dst_col << log_scale), (src_width - 1));
const int dst_row = blockIdx.y;
const int src_row = blockIdx.y << log_scale;
const int src_idx = IMUL(src_row, src_width) + src_col;
const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
d_result[dst_idx] = tex1Dfetch(texData, src_idx);
}
void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)
{
int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
src->BindTexture(texData);
dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
dim3 block(FILTERH_TILE_WIDTH);
switch(log_scale)
{
case 1 : DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break;
case 2 : DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break;
case 3 : DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break;
default: DownsampleKernel <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale);
}
}
__global__ void ChannelReduce_Kernel(float* d_result)
{
int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
d_result[index] = tex1Dfetch(texData, index*4);
}
__global__ void ChannelReduce_Convert_Kernel(float* d_result)
{
int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
float4 rgba = tex1Dfetch(texDataF4, index);
d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;
}
void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb)
{
int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
dim3 block(FILTERH_TILE_WIDTH);
if(convert_rgb)
{
src->BindTexture(texDataF4);
ChannelReduce_Convert_Kernel<<<grid, block>>>((float*)dst->_cuData);
}else
{
src->BindTexture(texData);
ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData);
}
}
void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)
{
int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
width = 2*sz + 1;
//filter size truncation
if(width > KERNEL_MAX_WIDTH)
{
//std::cout<<"Filter truncated "<<width<<"->"<<KERNEL_MAX_WIDTH<<endl;
sz = KERNEL_MAX_WIDTH >> 1;
width =KERNEL_MAX_WIDTH;
}
float rv = 1.0f/(sigma*sigma), v, ksum =0;
// pre-compute filter
for( i = -sz ; i <= sz ; ++i)
{
kernel[i+sz] = v = exp(-0.5f * i * i *rv) ;
ksum += v;
}
//normalize the kernel
rv = 1.0f/ksum;
for(i = 0; i< width ;i++) kernel[i]*=rv;
}
template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf)
{
int width = src->GetImgWidth(), height = src->GetImgHeight();
//horizontal filtering
src->BindTexture(texData);
dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
dim3 blockh(FILTERH_TILE_WIDTH);
FilterH<FW><<<gridh, blockh>>>((float*)buf->_cuData, width);
CheckErrorCUDA("FilterH");
///vertical filtering
buf->BindTexture(texData);
dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT);
dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_TBLK_HEIGHT);
FilterV<FW><<<gridv, blockv>>>((float*)dst->_cuData, width, height);
CheckErrorCUDA("FilterV");
}
//////////////////////////////////////////////////////////////////////
// tested on 2048x1500 image, the time on pyramid construction is
// -pack cg version : 18ms
// -unpack cg version : 49 ms
// -cuda version: 28 ms
void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma)
{
float filter_kernel[KERNEL_MAX_WIDTH]; int width;
CreateFilterKernel(sigma, filter_kernel, width);
cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice);
switch(width)
{
case 5: FilterImage< 5>(dst, src, buf); break;
case 7: FilterImage< 7>(dst, src, buf); break;
case 9: FilterImage< 9>(dst, src, buf); break;
case 11: FilterImage<11>(dst, src, buf); break;
case 13: FilterImage<13>(dst, src, buf); break;
case 15: FilterImage<15>(dst, src, buf); break;
case 17: FilterImage<17>(dst, src, buf); break;
case 19: FilterImage<19>(dst, src, buf); break;
case 21: FilterImage<21>(dst, src, buf); break;
case 23: FilterImage<23>(dst, src, buf); break;
case 25: FilterImage<25>(dst, src, buf); break;
case 27: FilterImage<27>(dst, src, buf); break;
case 29: FilterImage<29>(dst, src, buf); break;
case 31: FilterImage<31>(dst, src, buf); break;
case 33: FilterImage<33>(dst, src, buf); break;
default: break;
}
}
#define DOG_BLOCK_DIMX 128
#define DOG_BLOCK_DIMY 1
#define DOG_BLOCK_LOG_DIMX 7
#define DOG_BLOCK_LOG_DIMY 0
texture<float, 1, cudaReadModeElementType> texC;
texture<float, 1, cudaReadModeElementType> texP;
texture<float, 1, cudaReadModeElementType> texN;
void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height)
{
int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
if(col < width && row < height)
{
int index = IMUL(row, width) + col;
float vp = tex1Dfetch(texP, index);
float v = tex1Dfetch(texC, index);
d_dog[index] = v - vp;
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -