// marsscan.cu — GPU exclusive prefix-sum (scan) routines, in the style of
// the CUDA SDK "scan large array" sample (Mars GPU-MapReduce project).
unsigned int address = __mul24(blockIdx.x, (blockDim.x << 1)) + baseIndex + threadIdx.x;
__syncthreads();
// note two adds per thread
g_data[address] += uni;
g_data[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * uni;
}
// True when n has at most one bit set.
// Note: also reports true for n == 0 (standard behavior of this bit trick).
inline bool
isPowerOfTwo(int n)
{
    int lowBitsCleared = n & (n - 1); // clears the lowest set bit
    return lowBitsCleared == 0;
}
// Largest power of two <= n, for n > 0 (returns 0 for n == 0).
//
// Pure-integer implementation: repeatedly clearing the lowest set bit
// leaves only the highest set bit.  This fixes two defects of the old
// float-based versions (logb((float)n) / frexp):
//   * float rounding misbehaved for n >= 2^24 — e.g. n = (1<<25)-1 rounds
//     up to 1<<25 as a float, yielding a result LARGER than n;
//   * n <= 0 produced `1 << -1`, which is undefined behavior.
inline int
floorPow2(int n)
{
    // Smear off low bits until at most one bit remains.
    while (n & (n - 1))
        n &= n - 1;
    return n;
}
// Threads per block used by the scan kernels; each block processes
// 2 * BLOCK_SIZE elements (two per thread).
#define BLOCK_SIZE 256
// Per-recursion-level device buffers holding each level's per-block sums;
// allocated by preallocBlockSums(), released by deallocBlockSums().
int** g_scanBlockSums;
// Element capacity the current g_scanBlockSums allocation was sized for
// (0 means "not allocated").
unsigned int g_numEltsAllocated = 0;
// Number of entries (recursion levels) allocated in g_scanBlockSums.
unsigned int g_numLevelsAllocated = 0;
// Allocate the per-level device scratch buffers that
// prescanArrayRecursive() needs to scan up to maxNumElements elements.
// Each recursion level that produces more than one block gets a device
// buffer big enough for that level's per-block sums.
// Sets g_scanBlockSums, g_numEltsAllocated and g_numLevelsAllocated.
void preallocBlockSums(unsigned int maxNumElements)
{
    g_numEltsAllocated = maxNumElements;
    unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
    unsigned int eltsPerBlock = 2 * blockSize; // two elements per thread
    unsigned int numElts = maxNumElements;
    int level = 0;
    // First pass: count the recursion levels that need a buffer.
    // Exact integer ceil-div replaces the old (int)ceil(n / (2.f * bs)),
    // whose float rounding can misround once numElts exceeds 2^24.  The
    // integer form never yields fewer blocks than the float form, so the
    // buffers are always at least as large as the launches require.
    do
    {
        unsigned int numBlocks =
            (numElts + eltsPerBlock - 1) / eltsPerBlock;
        if (numBlocks < 1) numBlocks = 1;
        if (numBlocks > 1)
        {
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);
    g_scanBlockSums = (int**) malloc(level * sizeof(int*));
    g_numLevelsAllocated = level;
    // Second pass: allocate one device buffer per counted level.
    numElts = maxNumElements;
    level = 0;
    do
    {
        unsigned int numBlocks =
            (numElts + eltsPerBlock - 1) / eltsPerBlock;
        if (numBlocks < 1) numBlocks = 1;
        if (numBlocks > 1)
        {
            CUDA_SAFE_CALL(cudaMalloc((void**) &g_scanBlockSums[level++],
                                      numBlocks * sizeof(int)));
        }
        numElts = numBlocks;
    } while (numElts > 1);
    CUT_CHECK_ERROR("preallocBlockSums");
}
// Release every per-level device buffer created by preallocBlockSums()
// and reset all bookkeeping globals to the "not allocated" state.
void deallocBlockSums()
{
    for (unsigned int lvl = 0; lvl < g_numLevelsAllocated; ++lvl)
        cudaFree(g_scanBlockSums[lvl]);
    CUT_CHECK_ERROR("deallocBlockSums");
    // Free the host-side pointer table itself.
    free((void**)g_scanBlockSums);
    g_scanBlockSums = 0;
    g_numEltsAllocated = 0;
    g_numLevelsAllocated = 0;
}
// Ensure the scan scratch buffers can handle maxNumElements elements,
// (re)allocating only when none exist or the existing ones are too small.
//
// BUG FIX: the original comparison was inverted
// (g_numEltsAllocated > maxNumElements): it reallocated when the existing
// buffers were already big enough, and — worse — did NOTHING when they
// were too small, letting the scan kernels write past the allocated
// block-sums buffers.
void saven_initialPrefixSum(unsigned int maxNumElements)
{
    if (g_numEltsAllocated == 0)
    {
        preallocBlockSums(maxNumElements);
    }
    else if (g_numEltsAllocated < maxNumElements)
    {
        // Existing allocation too small: grow it.
        deallocBlockSums();
        preallocBlockSums(maxNumElements);
    }
}
// Recursively scan (prefix-sum) inArray[0..numElements) into outArray on
// the device, following the CUDA SDK "scan large array" structure: each
// block scans 2*blockSize elements, per-block totals go into
// g_scanBlockSums[level], those totals are scanned recursively, and the
// scanned totals are uniformly added back into each block's results.
// preallocBlockSums() must have been called with a capacity >= numElements
// before the level-0 entry.
void prescanArrayRecursive(int *outArray,
const int *inArray,
int numElements,
int level)
{
unsigned int blockSize = BLOCK_SIZE; // max size of the thread blocks
// Number of blocks, each covering 2*blockSize elements (float ceil-div).
unsigned int numBlocks =
max(1, (int)ceil((int)numElements / (2.f * blockSize)));
unsigned int numThreads;
if (numBlocks > 1)
numThreads = blockSize;
else if (isPowerOfTwo(numElements))
numThreads = numElements / 2;
else
numThreads = floorPow2(numElements);
unsigned int numEltsPerBlock = numThreads * 2;
// if this is a non-power-of-2 array, the last block will be non-full
// compute the smallest power of 2 able to compute its scan.
unsigned int numEltsLastBlock =
numElements - (numBlocks-1) * numEltsPerBlock;
unsigned int numThreadsLastBlock = max(1, numEltsLastBlock / 2);
unsigned int np2LastBlock = 0;
unsigned int sharedMemLastBlock = 0;
if (numEltsLastBlock != numEltsPerBlock)
{
// Last block is partial: it gets its own single-block launch below.
np2LastBlock = 1;
if(!isPowerOfTwo(numEltsLastBlock))
numThreadsLastBlock = floorPow2(numEltsLastBlock);
// extraSpace pads shared memory to avoid bank conflicts (NUM_BANKS is
// defined elsewhere in the project).
unsigned int extraSpace = (2 * numThreadsLastBlock) / NUM_BANKS;
sharedMemLastBlock =
sizeof(int) * (2 * numThreadsLastBlock + extraSpace);
}
// padding space is used to avoid shared memory bank conflicts
unsigned int extraSpace = numEltsPerBlock / NUM_BANKS;
unsigned int sharedMemSize =
sizeof(int) * (numEltsPerBlock + extraSpace);
#ifdef DEBUG
if (numBlocks > 1)
{
assert(g_numEltsAllocated >= numElements);
}
#endif
// setup execution parameters
// if NP2, we process the last block separately
dim3 grid(max(1, numBlocks - np2LastBlock), 1, 1);
dim3 threads(numThreads, 1, 1);
// make sure there are no CUDA errors before we start
CUT_CHECK_ERROR("prescanArrayRecursive before kernels");
// execute the scan
if (numBlocks > 1)
{
// Multi-block case: scan the full blocks, saving each block's total
// into g_scanBlockSums[level].  (Judging from the call sites, the
// template flags appear to be <storeBlockSums, isNP2> — confirm against
// the prescan kernel's definition.)
prescan<true, false><<< grid, threads, sharedMemSize >>>(outArray,
inArray,
g_scanBlockSums[level],
numThreads * 2, 0, 0);
CUT_CHECK_ERROR("prescanWithBlockSums");
if (np2LastBlock)
{
// Scan the partial (non-power-of-2) last block on its own.
prescan<true, true><<< 1, numThreadsLastBlock, sharedMemLastBlock >>>
(outArray, inArray, g_scanBlockSums[level], numEltsLastBlock,
numBlocks - 1, numElements - numEltsLastBlock);
CUT_CHECK_ERROR("prescanNP2WithBlockSums");
}
// After scanning all the sub-blocks, we are mostly done. But now we
// need to take all of the last values of the sub-blocks and scan those.
// This will give us a new value that must be added to each block to
// get the final results.
// recursive (CPU) call
prescanArrayRecursive(g_scanBlockSums[level],
g_scanBlockSums[level],
numBlocks,
level+1);
// Add each block's scanned total back into that block's elements.
uniformAdd<<< grid, threads >>>(outArray,
g_scanBlockSums[level],
numElements - numEltsLastBlock,
0, 0);
CUT_CHECK_ERROR("uniformAdd");
if (np2LastBlock)
{
// Same fix-up for the separately-scanned partial last block.
uniformAdd<<< 1, numThreadsLastBlock >>>(outArray,
g_scanBlockSums[level],
numEltsLastBlock,
numBlocks - 1,
numElements - numEltsLastBlock);
CUT_CHECK_ERROR("uniformAdd");
}
}
else if (isPowerOfTwo(numElements))
{
// Single full power-of-2 block: one launch, no block sums needed.
prescan<false, false><<< grid, threads, sharedMemSize >>>(outArray, inArray,
0, numThreads * 2, 0, 0);
CUT_CHECK_ERROR("prescan");
}
else
{
// Single non-power-of-2 block.
prescan<false, true><<< grid, threads, sharedMemSize >>>(outArray, inArray,
0, numElements, 0, 0);
CUT_CHECK_ERROR("prescanNP2");
}
}
// Prefix-sum numElements ints from inArray into outArray (both device
// pointers).  The scan appears to be exclusive — prefexSum() computes the
// grand total as out[n-1] + in[n-1].  Scratch buffers must already have
// been set up via preallocBlockSums().
void prescanArray(int *outArray, int *inArray, int numElements)
{
    // Start the multi-level scan at recursion level 0.
    prescanArrayRecursive(outArray, inArray, numElements, 0);
}
// Prefix-sum d_inArr into d_outArr (both device pointers, numRecords
// elements) and return the total of all inputs.  Because the scan is
// exclusive, the total equals out[n-1] + in[n-1].
// Requires numRecords >= 1.
//
// Fix: the original heap-allocated two single-int buffers just to receive
// the cudaMemcpy results (and leaked them if a CUDA_SAFE_CALL aborted in
// between); plain stack variables do the same job with no cleanup burden.
int prefexSum( int* d_inArr, int* d_outArr, int numRecords )
{
    preallocBlockSums(numRecords);
    prescanArray(d_outArr, d_inArr, numRecords);
    deallocBlockSums();

    // Copy back the last scanned output and the last input.  cudaMemcpy is
    // synchronous, so the scan kernels have completed before we read.
    int h_outLast = 0;
    CUDA_SAFE_CALL(cudaMemcpy(&h_outLast, d_outArr + numRecords - 1,
                              sizeof(int), cudaMemcpyDeviceToHost));
    int h_inLast = 0;
    CUDA_SAFE_CALL(cudaMemcpy(&h_inLast, d_inArr + numRecords - 1,
                              sizeof(int), cudaMemcpyDeviceToHost));
    return h_outLast + h_inLast;
}
// (Removed: web-page chrome — font-size and keyboard-shortcut text that was
// accidentally captured when this file was copied from a code-sharing site.)