?? marssort.cu
字號:
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
cmp_type_t value=d_input[pos];
value.z=d_value[pos].x;
value.w=d_value[pos].y;
d_input[pos]=value;
}
}
void getIntYArray(int2 *d_data, int rLen, int* d_output)
{
int numThreadsPerBlock_x=512;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
getIntYArray_kernel<<<grid,thread>>>(d_data, start, rLen, d_output);
}
cudaThreadSynchronize();
}
void getXYArray(cmp_type_t *d_data, int rLen, int2* d_output)
{
int numThreadsPerBlock_x=512;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
getXYArray_kernel<<<grid,thread>>>(d_data, start, rLen, d_output);
}
cudaThreadSynchronize();
}
void getZWArray(cmp_type_t *d_data, int rLen, int2* d_output)
{
int numThreadsPerBlock_x=512;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
getZWArray_kernel<<<grid,thread>>>(d_data, start, rLen, d_output);
}
cudaThreadSynchronize();
}
void setXYArray(cmp_type_t *d_data, int rLen, int2* d_value)
{
int numThreadsPerBlock_x=512;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
setXYArray_kernel<<<grid,thread>>>(d_data, start, rLen, d_value);
}
cudaThreadSynchronize();
}
void setZWArray(cmp_type_t *d_data, int rLen, int2* d_value)
{
int numThreadsPerBlock_x=512;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
setZWArray_kernel<<<grid,thread>>>(d_data, start, rLen, d_value);
}
cudaThreadSynchronize();
}
__global__ void copyChunks_kernel(void *d_source, int startPos, int2* d_Rin, int rLen, int *d_sum, void *d_dest)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
int2 value=d_Rin[pos];
int offset=value.x;
int size=value.y;
int startWritePos=d_sum[pos];
int i=0;
char *source=(char*)d_source;
char *dest=(char*)d_dest;
for(i=0;i<size;i++)
{
dest[i+startWritePos]=source[i+offset];
}
value.x=startWritePos;
d_Rin[pos]=value;
}
}
__global__ void getChunkBoundary_kernel(void* d_rawData, int startPos, cmp_type_t *d_Rin,
int rLen, int* d_startArray)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
int result=0;
if(pos==0)//the start position
{
result=1;
}
else
{
cmp_type_t cur=d_Rin[pos];
cmp_type_t left=d_Rin[pos-1];
if(getCompareValue(d_rawData, cur, left)!=0)
{
result=1;
}
}
d_startArray[pos]=result;
}
}
__global__ void setBoundaryInt2_kernel(int* d_boundary, int startPos, int numKey, int rLen,
int2* d_boundaryRange)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<numKey)
{
int2 flag;
flag.x=d_boundary[pos];
if((pos+1)!=numKey)
flag.y=d_boundary[pos+1];
else
flag.y=rLen;
d_boundaryRange[pos]=flag;
}
}
__global__ void writeBoundary_kernel(int startPos, int rLen, int* d_startArray,
int* d_startSumArray, int* d_bounary)
{
const int by = blockIdx.y;
const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
const int bid=bx+by*gridDim.x;
const int numThread=blockDim.x;
const int resultID=(bid)*numThread+tid;
int pos=startPos+resultID;
if(pos<rLen)
{
int flag=d_startArray[pos];
int writePos=d_startSumArray[pos];
if(flag==1)
d_bounary[writePos]=pos;
}
}
void copyChunks(void *d_source, int2* d_Rin, int rLen, void *d_dest)
{
//extract the size information for each chunk
int* d_size;
CUDA_SAFE_CALL( cudaMalloc( (void**) (&d_size), sizeof(int)*rLen) );
getIntYArray(d_Rin, rLen, d_size);
//compute the prefix sum for the output positions.
int* d_sum;
CUDA_SAFE_CALL( cudaMalloc( (void**) (&d_sum), sizeof(int)*rLen) );
saven_initialPrefixSum(rLen);
prescanArray(d_sum,d_size,rLen);
cudaFree(d_size);
//output
int numThreadsPerBlock_x=128;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
copyChunks_kernel<<<grid,thread>>>(d_source, start, d_Rin, rLen, d_sum, d_dest);
}
cudaThreadSynchronize();
cudaFree(d_sum);
}
//return the number of chunks.
int getChunkBoundary(void *d_source, cmp_type_t* d_Rin, int rLen, int2 ** h_outputKeyListRange)
{
int resultNumChunks=0;
//get the chunk boundary[start of chunk0, start of chunk 1, ...]
int* d_startArray;
CUDA_SAFE_CALL( cudaMalloc( (void**) (&d_startArray), sizeof(int)*rLen) );
int numThreadsPerBlock_x=512;
int numThreadsPerBlock_y=1;
int numBlock_x=512;
int numBlock_y=1;
int chunkSize=numBlock_x*numThreadsPerBlock_x;
int numChunk=rLen/chunkSize;
if(rLen%chunkSize!=0)
numChunk++;
dim3 thread( numThreadsPerBlock_x, numThreadsPerBlock_y, 1);
dim3 grid( numBlock_x, numBlock_y , 1);
int i=0;
int start=0;
int end=0;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
getChunkBoundary_kernel<<<grid,thread>>>(d_source, start, d_Rin, rLen, d_startArray);
}
cudaThreadSynchronize();
//prefix sum for write positions.
int* d_startSumArray;
CUDA_SAFE_CALL( cudaMalloc( (void**) (&d_startSumArray), sizeof(int)*rLen) );
saven_initialPrefixSum(rLen);
prescanArray(d_startSumArray,d_startArray,rLen);
//gpuPrint(d_startSumArray, rLen, "d_startSumArray");
int lastValue=0;
int partialSum=0;
CUDA_SAFE_CALL( cudaMemcpy( &lastValue, d_startArray+(rLen-1), sizeof(int), cudaMemcpyDeviceToHost) );
//gpuPrint(d_startArray, rLen, "d_startArray");
CUDA_SAFE_CALL( cudaMemcpy( &partialSum, d_startSumArray+(rLen-1), sizeof(int), cudaMemcpyDeviceToHost) );
//gpuPrint(d_startSumArray, rLen, "d_startSumArray");
resultNumChunks=lastValue+partialSum;
int* d_boundary;//[start of chunk0, start of chunk 1, ...]
CUDA_SAFE_CALL( cudaMalloc( (void**) (&d_boundary), sizeof(int)*resultNumChunks) );
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>rLen)
end=rLen;
writeBoundary_kernel<<<grid,thread>>>(start, rLen, d_startArray,
d_startSumArray, d_boundary);
}
cudaFree(d_startArray);
cudaFree(d_startSumArray);
//set the int2 boundary.
int2 *d_outputKeyListRange;
CUDA_SAFE_CALL( cudaMalloc( (void**) (&d_outputKeyListRange), sizeof(int2)*resultNumChunks) );
numChunk=resultNumChunks/chunkSize;
if(resultNumChunks%chunkSize!=0)
numChunk++;
for(i=0;i<numChunk;i++)
{
start=i*chunkSize;
end=start+chunkSize;
if(end>resultNumChunks)
end=resultNumChunks;
setBoundaryInt2_kernel<<<grid,thread>>>(d_boundary, start, resultNumChunks, rLen, d_outputKeyListRange);
}
cudaThreadSynchronize();
*h_outputKeyListRange=(int2*)BenMalloc(sizeof(int2)*resultNumChunks);
CUDA_SAFE_CALL( cudaMemcpy( *h_outputKeyListRange, d_outputKeyListRange, sizeof(int2)*resultNumChunks, cudaMemcpyDeviceToHost) );
cudaFree(d_boundary);
cudaFree(d_outputKeyListRange);
return resultNumChunks;
}
int GPUBitonicSortMem (void * d_inputKeyArray, int totalKeySize, void * d_inputValArray, int totalValueSize,
cmp_type_t * d_inputPointerArray, int rLen,
void * d_outputKeyArray, void * d_outputValArray,
cmp_type_t * d_outputPointerArray, int2 ** h_outputKeyListRange
)
{
saven_initialPrefixSum(rLen);
//array_startTime(1);
int numDistinctKey=0;
int totalLenInBytes=-1;
bitonicSortGPU(d_inputKeyArray, totalLenInBytes, d_inputPointerArray, rLen, d_outputPointerArray);
//array_endTime("sort", 1);
//!we first scatter the values and then the keys. so that we can reuse d_PA.
int2 *d_PA;
CUDA_SAFE_CALL( cudaMalloc( (void**) (&d_PA), sizeof(int2)*rLen) );
//scatter the values.
if(d_inputValArray!=NULL)
{
getZWArray(d_outputPointerArray, rLen, d_PA);
copyChunks(d_inputValArray, d_PA, rLen, d_outputValArray);
setZWArray(d_outputPointerArray, rLen, d_PA);
}
//scatter the keys.
if(d_inputKeyArray!=NULL)
{
getXYArray(d_outputPointerArray, rLen, d_PA);
copyChunks(d_inputKeyArray, d_PA, rLen, d_outputKeyArray);
setXYArray(d_outputPointerArray, rLen, d_PA);
}
//find the boudary for each key.
numDistinctKey=getChunkBoundary(d_outputKeyArray, d_outputPointerArray, rLen, h_outputKeyListRange);
return numDistinctKey;
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -