?? marssort.cu
字號:
size_t chunkSize = chunkRecCount / (sortInfo->realChunkCount+1);
size_t *recOffsets = (size_t*)BenMalloc(sizeof(size_t)*sortInfo->realChunkCount);
size_t totalKeyCount = 0;
for (int i = 0; i < sortInfo->realChunkCount; i++)
{
//initialize small chunk
GetSmallSortChunk(file, smallChunks, sortInfo->chunks,
chunkSize, bigChunkBitmap, i, recOffsets);
totalKeyCount += sortInfo->chunks[i].diffKeyCount;
}
ChunkInfo_t *outputChunk = (ChunkInfo_t*)BenMalloc(sizeof(ChunkInfo_t));
SortRec_t *minRec = (SortRec_t*)BenMalloc(sizeof(SortRec_t));
char flag = 0;
size_t which = 0;
size_t only_one = 0;
size_t finishedCount = 0;
size_t actualRecCount = 0;
SortRec_t tmpRec;
while (1)
{
finishedCount = 0;
for (int i = 0; i < sortInfo->realChunkCount; i++)
{
if (bigChunkBitmap[i] == 0)
{
finishedCount++;
continue;
}
else
{
only_one = i;
//run out small chunk?
if (smallChunks[i].smallCursor >= smallChunks[i].diffKeyCount)
{
BenFree((char**)&smallChunks[i].keys, smallChunks[i].keySize);
BenFree((char**)&smallChunks[i].vals, smallChunks[i].valSize);
BenFree((char**)&smallChunks[i].index, smallChunks[i].indexSize);
BenFree((char**)&smallChunks[i].keyListRange, smallChunks[i].rangeSize);
if(!GetSmallSortChunk(file, smallChunks, sortInfo->chunks,
chunkSize, bigChunkBitmap, i, recOffsets))
continue;
smallChunks[i].smallCursor = 0;
}
if (flag == 0)
{
GetSortRec(minRec, smallChunks, i, recOffsets);
flag = 1;
which = i;
continue;
}
//fetch record from small chunk
GetSortRec(&tmpRec, smallChunks, i, recOffsets);
//compare
if (cpu_compare(minRec->key, minRec->keySize,
tmpRec.key, tmpRec.keySize) <= 0)
{
continue;
}
else
{
BenMemcpy(minRec, &tmpRec, sizeof(SortRec_t));
which = i;
}
}//if -- else --
}//for
if (finishedCount != (sortInfo->realChunkCount-1))
{
smallChunks[which].smallCursor++;
flag = 0;
}
else
{
break;
}
actualRecCount++;
BenLog("*****************%d in %d chunk, \n", *(int*)minRec->key, which);
//output records
OutputAndGroupRecs(outputChunk, tmpfile, minRec, chunkSize);
}//while(1)
BenLog("actualRecCount:%d,totalKeyCount:%d\n", actualRecCount, totalKeyCount);
//reminder
if (actualRecCount < totalKeyCount)
{
do
{
if (smallChunks[only_one].smallCursor >=
smallChunks[only_one].diffKeyCount)
{
if(!GetSmallSortChunk(file, smallChunks, sortInfo->chunks,
chunkSize, bigChunkBitmap, only_one, recOffsets))
break;
smallChunks[only_one].smallCursor = 0;
}
for (int i = smallChunks[only_one].smallCursor;
i < smallChunks[only_one].diffKeyCount;
i++)
{
actualRecCount++;
GetSortRec(&tmpRec, smallChunks, only_one, recOffsets);
smallChunks[only_one].smallCursor++;
BenLog("*****************%d in %d chunk, \n", *(int*)minRec->key, which);
OutputAndGroupRecs(outputChunk, tmpfile, &tmpRec, chunkSize);
}
BenFree((char**)&smallChunks[only_one].keys, smallChunks[only_one].keySize);
BenFree((char**)&smallChunks[only_one].vals, smallChunks[only_one].valSize);
BenFree((char**)&smallChunks[only_one].index, smallChunks[only_one].indexSize);
BenFree((char**)&smallChunks[only_one].keyListRange, smallChunks[only_one].rangeSize);
} while(smallChunks[only_one].cursor < sortInfo->chunks[only_one].diffKeyCount);
}//if (actualRecCount < totalKeyCount)
BenLog("actualRecCount:%d,totalKeyCount:%d,groupOffset:%d,totalRecCount:%d\n",
actualRecCount, totalKeyCount, groupOffset, totalRecCount);
FlushSortChunk(outputChunk, tmpfile);
*totalOutputRecCount = totalRecCount;
*totalOutputDiffKeyCount = totalGroupCount;
BenFree((char**)&recOffsets, sizeof(size_t)*sortInfo->realChunkCount);
BenFree((char**)&minRec, sizeof(SortRec_t));
BenFree((char**)&outputChunk, sizeof(ChunkInfo_t));
BenFree((char**)&bigChunkBitmap, sortInfo->realChunkCount);
BenFree((char**)&smallChunks,
sizeof(SortChunk_t)*sortInfo->realChunkCount);
//LeaveFunc("MergeSortFile");
}
//==============================================================
//GPU sort
//==============================================================
#define NUM_BLOCK_PER_CHUNK_BITONIC_SORT 512//b256
#define SHARED_MEM_INT2 256
#define NUM_BLOCKS_CHUNK 512
#define NUM_THREADS_CHUNK 256//(256)
#define CHUNK_SIZE (NUM_BLOCKS_CHUNK*NUM_THREADS_CHUNK)
#define NUM_CHUNKS_R (NUM_RECORDS_R/CHUNK_SIZE)
__device__ int getCompareValue(void *d_rawData, cmp_type_t value1, cmp_type_t value2)
{
int compareValue=0;
int v1=value1.x;
int v2=value2.x;
if((v1==-1) || (v2==-1))
{
if(v1==v2)
compareValue=0;
else
if(v1==-1)
compareValue=-1;
else
compareValue=1;
}
else
compareValue=gpu_compare((void*)(((char*)d_rawData)+v1), value1.y, (void*)(((char*)d_rawData)+v2), value2.y);
return compareValue;
}
void * s_qsRawData=NULL;
__global__ void
partBitonicSortKernel( void* d_rawData, int totalLenInBytes,cmp_type_t* d_R, unsigned int numRecords, int chunkIdx, int unitSize)
{
__shared__ cmp_type_t shared[NUM_THREADS_CHUNK];
int tx = threadIdx.x;
int bx = blockIdx.x;
//load the data
int dataIdx = chunkIdx*CHUNK_SIZE+bx*blockDim.x+tx;
int unitIdx = ((NUM_BLOCKS_CHUNK*chunkIdx + bx)/unitSize)&1;
shared[tx] = d_R[dataIdx];
__syncthreads();
int ixj=0;
int a=0;
cmp_type_t temp1;
cmp_type_t temp2;
int k = NUM_THREADS_CHUNK;
if(unitIdx == 0)
{
for (int j = (k>>1); j>0; j =(j>>1))
{
ixj = tx ^ j;
//a = (shared[tx].y - shared[ixj].y);
temp1=shared[tx];
temp2= shared[ixj];
if (ixj > tx) {
//a=temp1.y-temp2.y;
//a=compareString((void*)(((char4*)d_rawData)+temp1.x),(void*)(((char4*)d_rawData)+temp2.x));
a=getCompareValue(d_rawData, temp1, temp2);
if ((tx & k) == 0) {
if ( (a>0)) {
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
else {
if ( (a<0)) {
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
}
__syncthreads();
}
}
else
{
for (int j = (k>>1); j>0; j =(j>>1))
{
ixj = tx ^ j;
temp1=shared[tx];
temp2= shared[ixj];
if (ixj > tx) {
//a=temp1.y-temp2.y;
//a=compareString((void*)(((char4*)d_rawData)+temp1.x),(void*)(((char4*)d_rawData)+temp2.x));
a=getCompareValue(d_rawData, temp1, temp2);
if ((tx & k) == 0) {
if( (a<0))
{
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
else {
if( (a>0))
{
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
}
__syncthreads();
}
}
d_R[dataIdx] = shared[tx];
}
__global__ void
unitBitonicSortKernel(void* d_rawData, int totalLenInBytes, cmp_type_t* d_R, unsigned int numRecords, int chunkIdx )
{
__shared__ cmp_type_t shared[NUM_THREADS_CHUNK];
int tx = threadIdx.x;
int bx = blockIdx.x;
int unitIdx = (NUM_BLOCKS_CHUNK*chunkIdx + bx)&1;
//load the data
int dataIdx = chunkIdx*CHUNK_SIZE+bx*blockDim.x+tx;
shared[tx] = d_R[dataIdx];
__syncthreads();
cmp_type_t temp1;
cmp_type_t temp2;
int ixj=0;
int a=0;
if(unitIdx == 0)
{
for (int k = 2; k <= NUM_THREADS_CHUNK; (k =k<<1))
{
// bitonic merge:
for (int j = (k>>1); j>0; (j=j>>1))
{
ixj = tx ^ j;
temp1=shared[tx];
temp2= shared[ixj];
if (ixj > tx) {
//a=temp1.y-temp2.y;
//a=compareString((void*)(((char4*)d_rawData)+temp1.x),(void*)(((char4*)d_rawData)+temp2.x));
a=getCompareValue(d_rawData, temp1, temp2);
if ((tx & k) == 0) {
if ( (a>0)) {
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
else {
if ( (a<0)) {
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
}
__syncthreads();
}
}
}
else
{
for (int k = 2; k <= NUM_THREADS_CHUNK; (k =k<<1))
{
// bitonic merge:
for (int j = (k>>1); j>0; (j=j>>1))
{
ixj = tx ^ j;
temp1=shared[tx];
temp2= shared[ixj];
if (ixj > tx) {
//a=temp1.y-temp2.y;
//a=compareString((void*)(((char4*)d_rawData)+temp1.x),(void*)(((char4*)d_rawData)+temp2.x));
a=getCompareValue(d_rawData, temp1, temp2);
if ((tx & k) == 0) {
if( (a<0))
{
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
else {
if( (a>0))
{
shared[tx]=temp2;
shared[ixj]=temp1;
}
}
}
__syncthreads();
}
}
}
d_R[dataIdx] = shared[tx];
}
__global__ void
bitonicKernel( void* d_rawData, int totalLenInBytes, cmp_type_t* d_R, unsigned int numRecords, int k, int j)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tid = threadIdx.x;
int dataIdx = by*gridDim.x*blockDim.x + bx*blockDim.x + tid;
int ixj = dataIdx^j;
if( ixj > dataIdx )
{
cmp_type_t tmpR = d_R[dataIdx];
cmp_type_t tmpIxj = d_R[ixj];
if( (dataIdx&k) == 0 )
{
//if( tmpR.y > tmpIxj.y )
//if(compareString((void*)(((char4*)d_rawData)+tmpR.x),(void*)(((char4*)d_rawData)+tmpIxj.x))==1)
if(getCompareValue(d_rawData, tmpR, tmpIxj)==1)
{
d_R[dataIdx] = tmpIxj;
d_R[ixj] = tmpR;
}
}
else
{
//if( tmpR.y < tmpIxj.y )
//if(compareString((void*)(((char4*)d_rawData)+tmpR.x),(void*)(((char4*)d_rawData)+tmpIxj.x))==-1)
if(getCompareValue(d_rawData, tmpR, tmpIxj)==-1)
{
d_R[dataIdx] = tmpIxj;
d_R[ixj] = tmpR;
}
}
}
}
__device__ inline void swap(cmp_type_t & a, cmp_type_t & b)
{
// Alternative swap doesn't use a temporary register:
// a ^= b;
// b ^= a;
// a ^= b;
cmp_type_t tmp = a;
a = b;
b = tmp;
}
__global__ void bitonicSortSingleBlock_kernel(void* d_rawData, int totalLenInBytes, cmp_type_t * d_values, int rLen, cmp_type_t* d_output)
{
__shared__ cmp_type_t bs_cmpbuf[SHARED_MEM_INT2];
//const int by = blockIdx.y;
//const int bx = blockIdx.x;
const int tx = threadIdx.x;
const int ty = threadIdx.y;
const int tid=tx+ty*blockDim.x;
//const int bid=bx+by*gridDim.x;
//const int numThread=blockDim.x;
//const int resultID=(bx)*numThread+tid;
if(tid<rLen)
{
bs_cmpbuf[tid] = d_values[tid];
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -