?? programcu.cu

?? SiftGPU is an implementation of SIFT [1] for GPU. SiftGPU processes pixels parallely to build Gaussi
?? CU
?? 第 1 頁 / 共 4 頁
字號:
12 3 4 下一頁
////////////////////////////////////////////////////////////////////////////
//	File:		ProgramCU.cu
//	Author:		Changchang Wu
//	Description : implementation of ProgramCU and all CUDA kernels
//
//	Copyright (c) 2007 University of North Carolina at Chapel Hill
//	All Rights Reserved
//
//	Permission to use, copy, modify and distribute this software and its
//	documentation for educational, research and non-profit purposes, without
//	fee, and without a written agreement is hereby granted, provided that the
//	above copyright notice and the following paragraph appear in all copies.
//	
//	The University of North Carolina at Chapel Hill make no representations
//	about the suitability of this software for any purpose. It is provided
//	'as is' without express or implied warranty. 
//
//	Please send BUG REPORTS to ccwu@cs.unc.edu
//
////////////////////////////////////////////////////////////////////////////

#if defined(CUDA_SIFTGPU_ENABLED)

#include "GL/glew.h"
#include <iostream>
#include <algorithm>
using namespace std;

#include "CuTexImage.h"
#include "ProgramCU.h"
#include "GlobalUtil.h"

//Standard block size
// BLOCK_LOG_DIM is log2(BLOCK_DIM): 2^4 == 16.
// Neither macro is referenced in the visible portion of this file.
#define BLOCK_DIM 16
#define BLOCK_LOG_DIM 4
// Integer multiply through the __mul24 intrinsic. __mul24 only uses the low
// 24 bits of each operand, so products are correct only while both factors
// fit in 24 bits.
#define IMUL(X,Y) __mul24(X,Y)
//#define FDIV(X,Y) ((X)/(Y))
// Float division through the __fdividef fast-math intrinsic (trades accuracy
// for speed); the exact variant is kept commented out above.
#define FDIV(X,Y) __fdividef(X,Y)
//filter kernel
// Capacity of d_kernel; CreateFilterKernel truncates wider filters to this.
#define KERNEL_MAX_WIDTH 33


//#define MAX_THREAD_PER_BLOCK 512 = 16 * 32
//////////////////////////////larger block gives better performance
// FilterV tiling: one block covers FILTERV_TILE_WIDTH columns by
// FILTERV_TILE_HEIGHT rows and is launched with
// FILTERV_TILE_WIDTH x FILTERV_TBLK_HEIGHT threads (see FilterImage<FW>),
// so each thread handles several rows.
// FilterV only supports FILTERV_TILE_WIDTH values of 16, 8 and 4 (#error otherwise).
#define FILTERV_TILE_WIDTH 16
#define FILTERV_TILE_HEIGHT 128
#define FILTERV_TBLK_HEIGHT 32
////////////////////////////
// Columns per block in FilterH. It is also the 1D block size used by the
// up/down-sampling and channel-reduce launches in this file.
#define FILTERH_TILE_WIDTH 128


// Filter taps in constant memory. FilterImage uploads them with
// cudaMemcpyToSymbol before launching FilterH / FilterV.
__device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
// Texture references. Host code binds a CuTexImage with BindTexture(...)
// before a launch and kernels read through tex1Dfetch.
// texData   : single-channel float input of the filter / sampling kernels.
// texDataF4 : float4 input of ChannelReduce_Convert_Kernel.
// texDataF2, texDataI4, texDataList : not referenced in the visible portion
// of this file.
texture<float, 1, cudaReadModeElementType> texData;
texture<float2, 2, cudaReadModeElementType> texDataF2;
texture<float4, 1, cudaReadModeElementType> texDataF4;
texture<int4, 1, cudaReadModeElementType> texDataI4;
texture<int4, 1, cudaReadModeElementType> texDataList;

//template<int i>	 __device__ float Conv(float *data)		{    return Conv<i-1>(data) + data[i]*d_kernel[i];}
//template<>		__device__ float Conv<0>(float *data)	{    return data[0] * d_kernel[0];					}

  
//////////////////////////////////////////////////////////////
// Horizontal pass of the separable filter.
//   FW       : number of filter taps taken from d_kernel (compile-time)
//   d_result : output buffer, written at row * width + col
//   width    : pixels per image row
// One block processes FILTERH_TILE_WIDTH consecutive columns of image row
// blockIdx.y. The input is read through texData; taps that would fall left or
// right of the row reuse the row's first / last pixel.
template<int FW> __global__ void FilterH( float* d_result, int width)
{
	const int RADIUS     = FW >> 1;
	// the tile itself plus the FW - 1 extra pixels the taps reach into
	const int TILE_SPAN  = FILTERH_TILE_WIDTH + FW - 1;
	const int LOAD_STEPS = 2 + (TILE_SPAN - 2) / FILTERH_TILE_WIDTH;
	__shared__ float data[TILE_SPAN];

	const int tile_first_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH);
	const int row_first      = IMUL(blockIdx.y, width);
	const int row_last       = row_first + width - 1;

	// Stage the span in shared memory; every thread advances by one tile
	// width per step until its slot leaves the span.
	int src  = row_first + tile_first_col + threadIdx.x - RADIUS;
	int slot = threadIdx.x;
#pragma unroll
	for(int step = 0; step < LOAD_STEPS; ++step)
	{
		if(slot >= TILE_SPAN) continue;

		// keep the fetch inside the current row
		int clamped = src;
		if(src < row_first)     clamped = row_first;
		else if(src > row_last) clamped = row_last;

		data[slot] = tex1Dfetch(texData, clamped);
		src  += FILTERH_TILE_WIDTH;
		slot += FILTERH_TILE_WIDTH;
	}
	// all threads (including those past the row end) must reach the barrier
	__syncthreads();

	const int col = tile_first_col + threadIdx.x;
	if(col >= width) return;

	float sum = 0;
#pragma unroll
	for(int tap = 0; tap < FW; ++tap)
	{
		sum += (data[threadIdx.x + tap] * d_kernel[tap]);
	}
	d_result[row_first + col] = sum;
}



////////////////////////////////////////////////////////////////////
// Vertical pass of the separable filter.
//   FW       : number of filter taps taken from d_kernel (compile-time)
//   d_result : output buffer, written at row * width + col
//   width    : pixels per image row
//   height   : number of image rows
// One block covers FILTERV_TILE_WIDTH columns by FILTERV_TILE_HEIGHT rows and
// is launched with FILTERV_TILE_WIDTH x FILTERV_TBLK_HEIGHT threads (see
// FilterImage<FW>), so every thread loads and writes several rows.
// Input is read through texData; rows above / below the image reuse the
// first / last row of the same column.
template<int  FW>  __global__ void FilterV(float* d_result, int width, int height)
{
	const int HALF_WIDTH = FW >> 1;
	// rows needed per column: the tile plus the FW - 1 rows the taps reach into
	const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;
	const int TEMP = CACHE_WIDTH & 0xf;
//add some extra space to avoid bank conflict
#if FILTERV_TILE_WIDTH == 16
	//make the stride 16 * n +/- 1
	const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP;
#elif FILTERV_TILE_WIDTH == 8
	//make the stride 16 * n +/- 2
	const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP);
#elif FILTERV_TILE_WIDTH == 4
	//make the stride 16 * n +/- 4
	const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP);
#else
#error
#endif
	// padded per-column stride in shared memory
	const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA;
	// passes of FILTERV_TBLK_HEIGHT rows needed to load / to write one tile
	const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_TBLK_HEIGHT - 1) / FILTERV_TBLK_HEIGHT;
	const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_TBLK_HEIGHT -1) / FILTERV_TBLK_HEIGHT;
	// each column (threadIdx.x) owns a contiguous strip of CACHE_TRUE_WIDTH floats
	__shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
	const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT);
	const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x;
	// first row to cache; negative for the top tile
	const int row_first = row_block_first - HALF_WIDTH;
	// linear index of this column's pixel in the last image row
	const int data_index_max = IMUL(height - 1, width) + col;
	const int cache_col_start = threadIdx.y;	
	const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH);
	int cache_index = cache_col_start + cache_row_start;
	int data_index = IMUL(row_first + cache_col_start, width) + col;

	if(col < width) 
	{
#pragma unroll
		for(int i = 0; i < CACHE_COUNT; ++i)
		{
			// the last pass may cover fewer than FILTERV_TBLK_HEIGHT rows
			if(cache_col_start < CACHE_WIDTH - i * FILTERV_TBLK_HEIGHT) 
			{
				// col is this column's index in row 0, so the clamp pins the
				// fetch to the first / last row of the same column
				int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
				data[cache_index + i * FILTERV_TBLK_HEIGHT] = tex1Dfetch(texData,fetch_index);
				data_index += IMUL(FILTERV_TBLK_HEIGHT, width);
			}
		}
	}
	// kept outside the col < width test so every thread of the block reaches it
	__syncthreads();
	
	if(col >= width) return;

	int row = row_block_first + threadIdx.y;
	int index_start = cache_row_start + threadIdx.y;
#pragma unroll
	for(int i = 0; i < WRITE_COUNT;		++i, 
			row += FILTERV_TBLK_HEIGHT, index_start += FILTERV_TBLK_HEIGHT)
	{
		if(row < height)
		{
			int index_dest = IMUL(row, width) + col;
			float value = 0;
#pragma unroll
			// NOTE: this i shadows the outer pass counter; here it is the tap index
			for(int i = 0; i < FW; ++i)
			{
				value += (data[index_start + i] * d_kernel[i]);
			}
			d_result[index_dest] = value;
		}
	}
}


// Bilinear upsampling by SCALE = 2^LOG_SCALE in both directions.
//   d_result : output buffer; each output row holds width * SCALE floats
//   width    : pixels per SOURCE row
// Launch layout (see ProgramCU::SampleImageU): 1D blocks of
// FILTERH_TILE_WIDTH threads, grid.x tiles the source columns, and grid.y has
// one block row per OUTPUT row, i.e. gridDim.y == source_height << LOG_SCALE.
// Each thread reads source pixel (row, col) and its right / lower neighbours
// through texData and writes SCALE consecutive output pixels.
//
// Neighbour fetches are clamped to the image: at the last column the right
// neighbour is the pixel itself, and at the last source row the lower
// neighbour is the pixel itself. Without the clamp, index + 1 would read the
// first pixel of the next row and index + width would read past the image.
template<int LOG_SCALE> __global__ void UpsampleKernel(float* d_result, int width)
{
	const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);
	const float INV_SCALE = 1.0f / (float(SCALE));
	int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
	if(col >= width) return;

	// source height recovered from the launch layout described above
	const int src_height = gridDim.y >> LOG_SCALE;
	int row = blockIdx.y >> LOG_SCALE; 
	int index = row * width + col;
	int dst_row = blockIdx.y;
	int dst_idx= (width * dst_row + col) * SCALE;
	// vertical sub-position of this output row inside the source row (0 .. SCALE-1)
	int helper = blockIdx.y & SCALE_MASK; 

	// offsets to the right / lower neighbour, 0 when that neighbour is outside
	const int right = (col + 1 < width) ? 1 : 0;
	const int down  = (row + 1 < src_height) ? width : 0;

	float v1 = tex1Dfetch(texData, index);
	float v2 = tex1Dfetch(texData, index + right);
	if (helper)
	{
		// blend with the row below; skipped when the output row coincides
		// with a source row, which also saves the two extra fetches
		float v21 = tex1Dfetch(texData, index + down);
		float v22 = tex1Dfetch(texData, index + down + right);
		// float literal: 1.0 would promote the subtraction to double
		float w1 = INV_SCALE * helper, w2 = 1.0f - w1;
		v1 = (v21 * w1  + w2 * v1);
		v2 = (v22 * w1  + w2 * v2);
	}

	// horizontal interpolation between the two (possibly blended) samples
	d_result[dst_idx] = v1;
#pragma unroll
	for(int i = 1; i < SCALE; ++i)
	{
		const float r2 = i * INV_SCALE;
		const float r1 = 1.0f - r2; 
		d_result[dst_idx +i] = v1 * r1 + v2 * r2;
	}
}
////////////////////////////////////////////////////////////////////////////////////////
// Upsamples src into dst by a factor of 2^log_scale using UpsampleKernel.
//   dst       : destination image; its _cuData buffer receives the result
//   src       : source image, bound to texData for the kernel
//   log_scale : log2 of the scale factor; only 1, 2 and 3 are instantiated
// The grid has one block row per OUTPUT row (height << log_scale), which
// UpsampleKernel relies on.
// An unsupported log_scale launches nothing and is reported on stderr instead
// of silently leaving dst unwritten. Launch errors are reported through
// CheckErrorCUDA, as FilterImage<FW> does.
void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)
{
	int width = src->GetImgWidth(), height = src->GetImgHeight();
	src->BindTexture(texData);
	dim3 grid((width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
	dim3 block(FILTERH_TILE_WIDTH);
	switch(log_scale)
	{
	case 1 : 	UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width);	break;
	case 2 : 	UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width);	break;
	case 3 : 	UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width);	break;
	default:
		std::cerr << "SampleImageU: unsupported log_scale " << log_scale << std::endl;
		return;
	}
	CheckErrorCUDA("SampleImageU");
}

// Point-sampled downsampling with a compile-time shift.
//   d_result  : output buffer, written at dst_row * dst_width + dst_col
//   src_width : pixels per source row
//   dst_width : pixels per destination row
// Output pixel (x, y) copies source pixel (x << LOG_SCALE, y << LOG_SCALE),
// read through texData. The source column is limited to src_width - 1; the
// source row is not limited here.
template<int LOG_SCALE> __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width)
{
	const int out_x = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
	if(out_x < dst_width)
	{
		// one block row per destination row
		const int out_y = blockIdx.y;
		const int in_y  = blockIdx.y << LOG_SCALE;
		const int in_x  = min((out_x << LOG_SCALE), (src_width - 1));

		const int read_at  = IMUL(in_y, src_width) + in_x;
		const int write_at = IMUL(dst_width, out_y) + out_x;
		d_result[write_at] = tex1Dfetch(texData, read_at);
	}
}

// Point-sampled downsampling with a run-time shift; same behaviour as the
// templated DownsampleKernel, for scale factors that are not instantiated.
//   d_result  : output buffer, written at dst_row * dst_width + dst_col
//   src_width : pixels per source row
//   dst_width : pixels per destination row
//   log_scale : log2 of the downsampling factor
// The source column is limited to src_width - 1; the source row is not
// limited here.
__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale)
{
	const int out_x = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
	if(out_x < dst_width)
	{
		// one block row per destination row
		const int out_y = blockIdx.y;
		const int in_y  = blockIdx.y << log_scale;
		const int in_x  = min((out_x << log_scale), (src_width - 1));

		const int read_at  = IMUL(in_y, src_width) + in_x;
		const int write_at = IMUL(dst_width, out_y) + out_x;
		d_result[write_at] = tex1Dfetch(texData, read_at);
	}
}

// Downsamples src into dst by point sampling every 2^log_scale-th pixel.
//   dst       : destination image; its width / height define the launch grid
//               and its _cuData buffer receives the result
//   src       : source image, bound to texData for the kernel
//   log_scale : log2 of the downsampling factor; 1, 2 and 3 use the templated
//               kernel, any other value uses the run-time-shift kernel
// Launch errors are reported through CheckErrorCUDA, as FilterImage<FW> does.
void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)
{
	int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;

	src->BindTexture(texData);
	// one block row per destination row, FILTERH_TILE_WIDTH columns per block
	dim3 grid((dst_width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
	dim3 block(FILTERH_TILE_WIDTH);
	switch(log_scale)
	{
	case 1 : 	DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);	break;
	case 2 :	DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);	break;
	case 3 : 	DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);	break;
	default:	DownsampleKernel    <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale);	break;
	}
	CheckErrorCUDA("SampleImageD");
}
// Keeps the first of every four consecutive floats of the source.
//   d_result : output buffer with at least count floats
//   count    : number of output pixels
// Launched as a 1D grid of FILTERH_TILE_WIDTH-thread blocks. The grid is
// rounded up to whole blocks, so threads past count must exit without
// touching d_result.
__global__ void ChannelReduce_Kernel(float* d_result, int count)
{
	int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
	if(index >= count) return;
	// texData addresses single floats, so pixel i starts at float 4 * i
	d_result[index] = tex1Dfetch(texData, index*4);
}

// Converts four-channel pixels to one value with the weights
// 0.299 / 0.587 / 0.114 on the x / y / z components; w is ignored.
//   d_result : output buffer with at least count floats
//   count    : number of output pixels
// Same launch layout and tail guard as ChannelReduce_Kernel.
__global__ void ChannelReduce_Convert_Kernel(float* d_result, int count)
{
	int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
	if(index >= count) return;
	float4 rgba = tex1Dfetch(texDataF4, index);
	d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;
}

// Reduces a four-channel image to a single channel.
//   dst         : destination image; its _cuData buffer receives the result
//   src         : four-channel source image
//   convert_rgb : non-zero -> weighted sum of the first three channels,
//                 zero     -> copy of the first channel
// The pixel count is src width times dst height, as in the original code.
// Launch errors are reported through CheckErrorCUDA, as FilterImage<FW> does.
void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb)
{
	int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
	const int count = width * height;

	// round up to whole blocks; the kernels guard the tail with count
	dim3 grid((count +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
	dim3 block(FILTERH_TILE_WIDTH);
	if(convert_rgb)
	{
		src->BindTexture(texDataF4);
		ChannelReduce_Convert_Kernel<<<grid, block>>>((float*)dst->_cuData, count);
	}else
	{
		src->BindTexture(texData);
		ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData, count);
	}
	CheckErrorCUDA("ReduceToSingleChannel");
}


// Builds a normalized 1D Gaussian filter on the host.
//   sigma  : standard deviation of the Gaussian
//   kernel : output array with room for KERNEL_MAX_WIDTH floats
//   width  : receives the number of taps written (always odd, >= 1,
//            <= KERNEL_MAX_WIDTH)
// The half size is ceil(GlobalUtil::_FilterWidthFactor * sigma - 0.5); wider
// filters are truncated to KERNEL_MAX_WIDTH taps before normalization.
// A sigma that is not strictly positive (including NaN) yields the one-tap
// identity filter {1}. The previous code produced NaN taps or a negative
// width for such input, and callers use width as a copy size.
void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)
{
	// written as !(sigma > 0) so that NaN takes this path too
	if(!(sigma > 0))
	{
		kernel[0] = 1.0f;
		width = 1;
		return;
	}

	int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
	// a negative half size would give a negative width and an empty loop
	if(sz < 0) sz = 0;
	width = 2*sz + 1;

	//filter size truncation
	if(width > KERNEL_MAX_WIDTH)
	{
		//std::cout<<"Filter truncated "<<width<<"->"<<KERNEL_MAX_WIDTH<<endl;
		sz = KERNEL_MAX_WIDTH >> 1;
		width =KERNEL_MAX_WIDTH;
	}

	float   rv = 1.0f/(sigma*sigma), v, ksum =0; 

	// pre-compute filter
	for( i = -sz ; i <= sz ; ++i) 
	{
		kernel[i+sz] =  v = exp(-0.5f * i * i *rv) ;
		ksum += v;
	}

	//normalize the kernel so the taps sum to 1
	rv = 1.0f/ksum;
	for(i = 0; i< width ;i++) kernel[i]*=rv;
}


// Runs the FW-tap separable filter currently stored in d_kernel:
// src --FilterH--> buf --FilterV--> dst.
//   dst : receives the filtered image
//   src : input image; its width / height size both launches
//   buf : intermediate image holding the horizontally filtered result
// Each launch is followed by CheckErrorCUDA.
template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf)
{
	const int img_w = src->GetImgWidth();
	const int img_h = src->GetImgHeight();

	// pass 1: filter along rows, one block row per image row
	{
		src->BindTexture(texData);
		const int tiles_x = (img_w +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH;
		dim3 grid(tiles_x, img_h);
		dim3 block(FILTERH_TILE_WIDTH);
		FilterH<FW><<<grid, block>>>((float*)buf->_cuData, img_w);
		CheckErrorCUDA("FilterH");
	}

	// pass 2: filter along columns, reading the pass-1 output
	{
		buf->BindTexture(texData);
		const int tiles_x = (img_w + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH;
		const int tiles_y = (img_h + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT;
		dim3 grid(tiles_x, tiles_y);
		dim3 block(FILTERV_TILE_WIDTH, FILTERV_TBLK_HEIGHT);
		FilterV<FW><<<grid, block>>>((float*)dst->_cuData, img_w, img_h); 
		CheckErrorCUDA("FilterV");
	}
}

//////////////////////////////////////////////////////////////////////
// tested on 2048x1500 image, the time on pyramid construction is
// -pack cg version : 18ms
// -unpack cg version : 49 ms
// -cuda version: 28 ms
// Gaussian-filters src into dst with standard deviation sigma.
//   dst   : receives the filtered image
//   src   : input image
//   buf   : intermediate image for the horizontal pass
//   sigma : standard deviation passed to CreateFilterKernel
// The taps are built on the host, uploaded to d_kernel, and the
// FilterImage<FW> instantiation matching the tap count is run. Only odd
// widths 5 .. 33 are instantiated.
// Failures are reported on stderr and dst is then left unwritten:
//   - a tap count outside 1 .. KERNEL_MAX_WIDTH (it is used as a copy size),
//   - a failed upload to d_kernel,
//   - a tap count with no instantiation.
void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma)
{
	float filter_kernel[KERNEL_MAX_WIDTH]; int width;
	CreateFilterKernel(sigma, filter_kernel, width);

	// width becomes a byte count below; never let it go negative or past d_kernel
	if(width < 1 || width > KERNEL_MAX_WIDTH)
	{
		std::cerr << "FilterImage: invalid filter width " << width
				  << " for sigma " << sigma << std::endl;
		return;
	}

	cudaError_t err = cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice);
	if(err != cudaSuccess)
	{
		// filtering with stale taps would silently produce a wrong image
		std::cerr << "FilterImage: cudaMemcpyToSymbol failed: "
				  << cudaGetErrorString(err) << std::endl;
		return;
	}

	switch(width)
	{
		case 5:		FilterImage< 5>(dst, src, buf);	break;
		case 7:		FilterImage< 7>(dst, src, buf);	break;
		case 9:		FilterImage< 9>(dst, src, buf);	break;
		case 11:	FilterImage<11>(dst, src, buf);	break;
		case 13:	FilterImage<13>(dst, src, buf);	break;
		case 15:	FilterImage<15>(dst, src, buf);	break;
		case 17:	FilterImage<17>(dst, src, buf);	break;
		case 19:	FilterImage<19>(dst, src, buf);	break;
		case 21:	FilterImage<21>(dst, src, buf);	break;
		case 23:	FilterImage<23>(dst, src, buf);	break;
		case 25:	FilterImage<25>(dst, src, buf);	break;
		case 27:	FilterImage<27>(dst, src, buf);	break;
		case 29:	FilterImage<29>(dst, src, buf);	break;
		case 31:	FilterImage<31>(dst, src, buf);	break;
		case 33:	FilterImage<33>(dst, src, buf);	break;
		default:
			// no kernel instantiated for this width (e.g. 1 or 3 taps)
			std::cerr << "FilterImage: unsupported filter width " << width
					  << " for sigma " << sigma << "; image not filtered" << std::endl;
			break;
	}

}



// Block dimensions for the DoG kernel and their base-2 logarithms
// (2^7 == 128, 2^0 == 1). ComputeDOG_Kernel uses the LOG values as shift
// amounts to turn block indices into pixel coordinates.
// NOTE(review): presumably the launch uses a DOG_BLOCK_DIMX x DOG_BLOCK_DIMY
// thread block; the launcher is not visible in this part of the file.
#define DOG_BLOCK_DIMX 128
#define DOG_BLOCK_DIMY 1
#define DOG_BLOCK_LOG_DIMX 7
#define DOG_BLOCK_LOG_DIMY 0

// Texture references read by ComputeDOG_Kernel with tex1Dfetch:
// the kernel writes d_dog = texC - texP at the same index.
// texN is not referenced in the visible portion of this file.
texture<float, 1, cudaReadModeElementType> texC;
texture<float, 1, cudaReadModeElementType> texP;
texture<float, 1, cudaReadModeElementType> texN;

void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height)
{
	int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
	int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
	if(col < width && row < height) 
	{
		int index = IMUL(row, width) + col;
		float vp = tex1Dfetch(texP, index);
		float v = tex1Dfetch(texC, index);
		d_dog[index] = v - vp;
12 3 4 下一頁
?? 文件大小 6821 K
?? 上傳用戶 autojacky
?? 所屬分類其他
??? 相關標簽

#SiftGPU #implementation #processes #parallely
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? programcu.cu

?? 快捷鍵說明