// marssort.cu
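// ---------------------------------------------------------------------------
// This listing is a fragment: SHARED_MEM_INT2, NUM_THREADS_CHUNK, NUM_BLOCKS_CHUNK,
// NUM_BLOCK_PER_CHUNK_BITONIC_SORT, cmp_type_t, getCompareValue(), swap(),
// CUDA_SAFE_CALL, bitonicKernel, unitBitonicSortKernel and partBitonicSortKernel
// are defined earlier in marssort.cu and are not shown here.
// The definitions below are illustrative stand-ins, NOT the originals: they only
// sketch the convention the kernels assume, namely that cmp_type_t is an int4-like
// record index whose <x,y> pair is the <offset,length> of the record's key inside
// d_rawData (x == -1 marks a padding slot).
#if 0   // illustrative sketch only; the real definitions live elsewhere in marssort.cu
#define SHARED_MEM_INT2 512            // assumed value
typedef int4 cmp_type_t;               // assumed layout

__device__ inline void swap(cmp_type_t &a, cmp_type_t &b)
{
	cmp_type_t t = a; a = b; b = t;
}

__device__ inline int getCompareValue(void *d_rawData, cmp_type_t a, cmp_type_t b)
{
	// Padding (key offset == -1) compares smaller than any real record, so it
	// collects at the front of an ascending sort.
	if (a.x == -1) return (b.x == -1) ? 0 : -1;
	if (b.x == -1) return 1;
	const char *ka = (const char*)d_rawData + a.x;
	const char *kb = (const char*)d_rawData + b.x;
	int n = (a.y < b.y) ? a.y : b.y;   // compare up to the shorter key length
	for (int i = 0; i < n; i++)
		if (ka[i] != kb[i]) return (ka[i] < kb[i]) ? -1 : 1;
	return a.y - b.y;                  // the shorter key sorts first on a tie
}
#endif
// ---------------------------------------------------------------------------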
__global__ void bitonicSortSingleBlock_kernel(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int rLen, cmp_type_t *d_output)
{
__shared__ cmp_type_t bs_cmpbuf[SHARED_MEM_INT2];
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
// Copy input to shared mem; pad the unused slots with a -1 key so they sort to the front.
if(tid<rLen)
{
bs_cmpbuf[tid] = d_values[tid];
}
else
{
bs_cmpbuf[tid].x =-1;
}
__syncthreads();
// Parallel bitonic sort.
int compareValue=0;
for (int k = 2; k <= SHARED_MEM_INT2; k *= 2)
{
// Bitonic merge:
for (int j = k / 2; j>0; j /= 2)
{
int ixj = tid ^ j;
if (ixj > tid)
{
if ((tid & k) == 0)
{
// Ascending half of the bitonic merge: keep the smaller record at tid.
compareValue=getCompareValue(d_rawData, bs_cmpbuf[tid], bs_cmpbuf[ixj]);
if(compareValue>0)
{
swap(bs_cmpbuf[tid], bs_cmpbuf[ixj]);
}
}
else
{
// Descending half of the bitonic merge: keep the larger record at tid.
compareValue=getCompareValue(d_rawData, bs_cmpbuf[tid], bs_cmpbuf[ixj]);
if(compareValue<0)
{
swap(bs_cmpbuf[tid], bs_cmpbuf[ixj]);
}
}
}
__syncthreads();
}
}
// Write result: after the ascending sort, the rLen real records occupy the last
// rLen slots of shared memory (the -1 padding sorts to the front), so copy them
// back starting at d_output[0].
int startCopy=SHARED_MEM_INT2-rLen;
if(tid>=startCopy)
{
d_output[tid-startCopy]=bs_cmpbuf[tid];
}
}
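// bitonicSortMultipleBlocks_kernel: each thread block sorts one independent
// partition of d_values. Partition i (numbered from startBlock) spans
// [d_bound[2*i], d_bound[2*i+1]) and must contain at most SHARED_MEM_INT2
// records, since the whole partition is staged in shared memory; the sorted
// records are written back to the same range of d_output.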
__global__ void bitonicSortMultipleBlocks_kernel(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int* d_bound, int startBlock, int numBlock, cmp_type_t *d_output)
{
__shared__ int bs_pStart;
__shared__ int bs_pEnd;
__shared__ int bs_numElement;
__shared__ cmp_type_t bs_shared[SHARED_MEM_INT2];
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
//const int numThread=blockDim.x;
//const int resultID=(bx)*numThread+tid;
if(bid>=numBlock) return;
if(tid==0)
{
// The first thread loads this block's partition bounds from d_bound.
bs_pStart=d_bound[(bid+startBlock)<<1];
bs_pEnd=d_bound[((bid+startBlock)<<1)+1];
bs_numElement=bs_pEnd-bs_pStart;
}
__syncthreads();
// Copy this partition into shared mem; pad the unused slots with a -1 key so they sort to the front.
if(tid<bs_numElement)
{
bs_shared[tid] = d_values[tid+bs_pStart];
}
else
{
bs_shared[tid].x =-1;
}
__syncthreads();
// Parallel bitonic sort.
int compareValue=0;
for (int k = 2; k <= SHARED_MEM_INT2; k *= 2)
{
// Bitonic merge:
for (int j = k / 2; j>0; j /= 2)
{
int ixj = tid ^ j;
if (ixj > tid)
{
if ((tid & k) == 0)
{
// Ascending half of the bitonic merge: keep the smaller record at tid.
compareValue=getCompareValue(d_rawData, bs_shared[tid], bs_shared[ixj]);
if(compareValue>0)
{
swap(bs_shared[tid], bs_shared[ixj]);
}
}
else
{
// Descending half of the bitonic merge: keep the larger record at tid.
compareValue=getCompareValue(d_rawData, bs_shared[tid], bs_shared[ixj]);
if(compareValue<0)
{
swap(bs_shared[tid], bs_shared[ixj]);
}
}
}
__syncthreads();
}
}
// Write result: after the ascending sort, the bs_numElement real records occupy
// the last bs_numElement slots of shared memory (the -1 padding sorts to the
// front), so copy them back to this partition's range of d_output.
int startCopy=SHARED_MEM_INT2-bs_numElement;
if(tid>=startCopy)
{
d_output[bs_pStart+tid-startCopy]=bs_shared[tid];
}
}
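// initialize_kernel: one thread per element of the current chunk; writes the fill
// value into d_data[startPos+resultID] while that position is still inside [0, rLen).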
__global__ void initialize_kernel(cmp_type_t* d_data, int startPos, int rLen, cmp_type_t value)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
d_data[pos]=value;
}
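// bitonicSortMultipleBlocks: host wrapper. Launches the per-partition kernel in
// chunks of NUM_BLOCK_PER_CHUNK_BITONIC_SORT thread blocks so any number of
// partitions can be sorted with a fixed grid size.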
void bitonicSortMultipleBlocks(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int* d_bound, int numBlock, cmp_type_t * d_output)
{
int numThreadsPerBlock_x=SHARED_MEM_INT2;
int numThreadsPerBlock_y=1;
int numBlock_x=NUM_BLOCK_PER_CHUNK_BITONIC_SORT;
int numBlock_y=1;
int numChunk=numBlock/numBlock_x;
if(numBlock%numBlock_x!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*numBlock_x;
end=start+numBlock_x;
if(end>numBlock)
end=numBlock;
//printf("bitonicSortMultipleBlocks_kernel: %d, range, %d, %d\n", i, start, end);
bitonicSortMultipleBlocks_kernel<<<grid,thread>>>(d_rawData, totalLenInBytes, d_values, d_bound, start, end-start, d_output);
cudaThreadSynchronize();
}
// cudaThreadSynchronize();
}
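// bitonicSortSingleBlock: host wrapper that sorts the whole input with one thread
// block of SHARED_MEM_INT2 threads; bitonicSortGPU takes this path when
// rLen <= NUM_THREADS_CHUNK.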
void bitonicSortSingleBlock(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int rLen, cmp_type_t * d_output)
{
int numThreadsPerBlock_x=SHARED_MEM_INT2;
int numThreadsPerBlock_y=1;
int numBlock_x=1;
int numBlock_y=1;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
bitonicSortSingleBlock_kernel<<<grid,thread>>>(d_rawData, totalLenInBytes, d_values, rLen, d_output);
cudaThreadSynchronize();
}
void initialize(cmp_type_t *d_data, int rLen, cmp_type_t value)
{
int numThreadsPerBlock_x=512;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
initialize_kernel<<<grid,thread>>>(d_data, start, rLen, value);
}
cudaThreadSynchronize();
}
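// bitonicSortGPU: top-level entry point. rLen is rounded up to the next power of
// two (numRecordsR) and one of three paths is taken:
//   1. rLen <= NUM_THREADS_CHUNK: single-block shared-memory sort.
//   2. rLen <= 256*1024: the input is copied into a padded scratch buffer d_R
//      (extra slots filled with -1 keys) and sorted with a plain global-memory
//      bitonic network (bitonicKernel over all k/j stages).
//   3. larger inputs: each chunk is pre-sorted with unitBitonicSortKernel; then,
//      for every merge stage k, the large-stride steps run in bitonicKernel and
//      the remaining low-stride steps run per chunk in partBitonicSortKernel.
// Because the -1 padding sorts to the front, the last rLen entries of d_R hold the
// sorted result and are copied to d_Rout.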
void bitonicSortGPU(void* d_rawData, int totalLenInBytes, cmp_type_t* d_Rin, int rLen, void *d_Rout)
{
unsigned int numRecordsR;
unsigned int size = rLen;
unsigned int level = 0;
while( size != 1 )
{
size = size/2;
level++;
}
if( (1<<level) < rLen )
{
level++;
}
numRecordsR = (1<<level);
if(rLen<=NUM_THREADS_CHUNK)
{
bitonicSortSingleBlock((void*)d_rawData, totalLenInBytes, d_Rin, rLen, (cmp_type_t*)d_Rout);
}
else
if( rLen <= 256*1024 )
{
//unsigned int numRecordsR = rLen;
unsigned int numThreadsSort = NUM_THREADS_CHUNK;
if(numRecordsR<NUM_THREADS_CHUNK)
numRecordsR=NUM_THREADS_CHUNK;
unsigned int numBlocksXSort = numRecordsR/numThreadsSort;
unsigned int numBlocksYSort = 1;
dim3 gridSort( numBlocksXSort, numBlocksYSort );
unsigned int memSizeRecordsR = sizeof( cmp_type_t ) * numRecordsR;
//copy the <offset, length> pairs.
cmp_type_t* d_R;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_R, memSizeRecordsR) );
cmp_type_t tempValue;
tempValue.x=tempValue.y=-1;
initialize(d_R, numRecordsR, tempValue);
CUDA_SAFE_CALL( cudaMemcpy( d_R, d_Rin, rLen*sizeof(cmp_type_t), cudaMemcpyDeviceToDevice) );
for( int k = 2; k <= numRecordsR; k *= 2 )
{
for( int j = k/2; j > 0; j /= 2 )
{
bitonicKernel<<<gridSort, numThreadsSort>>>((void*)d_rawData, totalLenInBytes, d_R, numRecordsR, k, j);
}
}
CUDA_SAFE_CALL( cudaMemcpy( d_Rout, d_R+(numRecordsR-rLen), sizeof(cmp_type_t)*rLen, cudaMemcpyDeviceToDevice) );
cudaFree( d_R );
cudaThreadSynchronize();
}
else
{
unsigned int numThreadsSort = NUM_THREADS_CHUNK;
unsigned int numBlocksYSort = 1;
unsigned int numBlocksXSort = (numRecordsR/numThreadsSort)/numBlocksYSort;
if(numBlocksXSort>=(1<<16))
{
numBlocksXSort=(1<<15);
numBlocksYSort=(numRecordsR/numThreadsSort)/numBlocksXSort;
}
unsigned int numBlocksChunk = NUM_BLOCKS_CHUNK;
unsigned int numThreadsChunk = NUM_THREADS_CHUNK;
unsigned int chunkSize = numBlocksChunk*numThreadsChunk;
unsigned int numChunksR = numRecordsR/chunkSize;
dim3 gridSort( numBlocksXSort, numBlocksYSort );
unsigned int memSizeRecordsR = sizeof( cmp_type_t ) * numRecordsR;
cmp_type_t* d_R;
CUDA_SAFE_CALL( cudaMalloc( (void**) &d_R, memSizeRecordsR) );
cmp_type_t tempValue;
tempValue.x=tempValue.y=-1;
initialize(d_R, numRecordsR, tempValue);
CUDA_SAFE_CALL( cudaMemcpy( d_R, d_Rin, rLen*sizeof(cmp_type_t), cudaMemcpyDeviceToDevice) );
for( int chunkIdx = 0; chunkIdx < numChunksR; chunkIdx++ )
{
unitBitonicSortKernel<<< numBlocksChunk, numThreadsChunk>>>( (void*)d_rawData, totalLenInBytes, d_R, numRecordsR, chunkIdx );
}
int j;
for( int k = numThreadsChunk*2; k <= numRecordsR; k *= 2 )
{
for( j = k/2; j > numThreadsChunk/2; j /= 2 )
{
bitonicKernel<<<gridSort, numThreadsSort>>>( (void*)d_rawData, totalLenInBytes, d_R, numRecordsR, k, j);
}
for( int chunkIdx = 0; chunkIdx < numChunksR; chunkIdx++ )
{
partBitonicSortKernel<<< numBlocksChunk, numThreadsChunk>>>((void*)d_rawData, totalLenInBytes, d_R, numRecordsR, chunkIdx, k/numThreadsSort );
}
}
CUDA_SAFE_CALL( cudaMemcpy( d_Rout, d_R+(numRecordsR-rLen), sizeof(cmp_type_t)*rLen, cudaMemcpyDeviceToDevice) );
cudaFree( d_R );
cudaThreadSynchronize();
}
}
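// Illustrative host-side usage (a minimal sketch, not part of the original
// marssort.cu): d_rawData holds the packed record bytes, d_index holds one
// cmp_type_t index entry per record, and d_sortedIndex receives the sorted index.
// The function name sortRecordsExample and the buffer names are assumptions; only
// bitonicSortGPU, cmp_type_t and CUDA_SAFE_CALL come from this file.
void sortRecordsExample(void* d_rawData, int totalLenInBytes, cmp_type_t* d_index, int rLen)
{
	cmp_type_t* d_sortedIndex;
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_sortedIndex, rLen*sizeof(cmp_type_t)));
	bitonicSortGPU(d_rawData, totalLenInBytes, d_index, rLen, (void*)d_sortedIndex);
	// d_sortedIndex[0..rLen) now lists the records of d_rawData in ascending key order.
	cudaFree(d_sortedIndex);
}
// The kernels below read or overwrite halves of each record index: getIntYArray_kernel
// extracts the .y field of an int2 array, getXYArray/getZWArray copy the <x,y> or
// <z,w> pair of each cmp_type_t into an int2 array (presumably the key and value
// <offset,length> pairs), and setXYArray/setZWArray write those pairs back.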
__global__ void getIntYArray_kernel(int2* d_input, int startPos, int rLen, int* d_output)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
int2 value=d_input[pos];
d_output[pos]=value.y;
}
}
__global__ void getXYArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_output)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
cmp_type_t value=d_input[pos];
d_output[pos].x=value.x;
d_output[pos].y=value.y;
}
}
__global__ void getZWArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_output)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
cmp_type_t value=d_input[pos];
d_output[pos].x=value.z;
d_output[pos].y=value.w;
}
}
__global__ void setXYArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_value)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
cmp_type_t value=d_input[pos];
value.x=d_value[pos].x;
value.y=d_value[pos].y;
d_input[pos]=value;
}
}
__global__ void setZWArray_kernel(cmp_type_t* d_input, int startPos, int rLen, int2* d_value)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
// Overwrite the <z,w> half of the record index with the given int2 pair.
cmp_type_t value=d_input[pos];
value.z=d_value[pos].x;
value.w=d_value[pos].y;
d_input[pos]=value;
}
}