?? marsgpulib.cu
字號(hào):
/**
*This is the source code for Mars, a MapReduce framework on graphics
*processors.
*Author: Wenbin Fang (HKUST), Bingsheng He (HKUST)
*Mentor: Naga K. Govindaraju (Microsoft Corp.), Qiong Luo (HKUST), Tuyong
*Wang (Sina.com).
*If you have any question on the code, please contact us at {saven,
*wenbin, luo}@cse.ust.hk.
*The copyright is held by HKUST. Mars is provided "as is" without any
*guarantees of any kind.
*/
#undef __CPU_MAP__
#undef __CPU_REDUCE__
#undef __COMPARE__
#define __GPU_MAP__
#define __GPU_REDUCE__
#include "MarsInc.h"
#include "MarsInc/MarsConfig.h"
#include "MarsGPUEmit.cu"
//------------------------------------------------------
//get key/value in a record
//------------------------------------------------------
__device__ char *gpuGetRecordFromBuf(char *buf,
int4 *offsetSizes, int index, char type,
size_t keyOffset, size_t valOffset)
{
int offset = ((type == 0)?(offsetSizes[index].x-keyOffset):
(offsetSizes[index].z-valOffset));
return (buf + offset);
}
//------------------------------------------------------
//efficiently copy data in gpu kernel function
//------------------------------------------------------
__device__ void copyData(char4 *des, char4 *src, size_t size)
{
//if data size is less than 4, simply copy it
if (size < 4)
{
for (int i = 0; i < size; i++)
des[i] = src[i];
return;
}
//if data size is greater than 4, use vector
int size4 = size >> 2;
char *pDes = NULL;
char *pSrc = NULL;
for (int i = 0; i < size4; i++)
{
pDes = (char*)&des[i];
pSrc = (char*)&src[i];
for (int j = 0; j < 4; j++)
pDes[j] = pSrc[j];
}
int remainder = size & 3;
if (remainder > 0)
{
pDes += 4;
pSrc += 4;
for (int i = 0; i < remainder; i++)
pDes[i] = pSrc[i];
}
}
__device__ char *gpuGetVal(void *vals, int4 *index, size_t valStartIndex,
size_t valCount, size_t i)
{
if (i >= valCount) return NULL;
char *val = (char*)vals+index[valStartIndex+i].z - index[valStartIndex].z;
//BenLog("--val:%d--", *(int*)val);
return val;
}
__device__ size_t gpuGetValSize(void *vals, int4 *index, size_t valStartIndex,
size_t valCount, size_t i)
{
if (i >= valCount) return NULL;
return index[valStartIndex+i].w;
}
//----------------------------------------------------
//called by
//----------------------------------------------------
__device__ void gpuEmitInterCount(size_t keySize,
size_t valSize,
size_t* interKeysSizePerTask,
size_t* interValsSizePerTask,
size_t* interCountPerTask)
{
size_t index = (blockIdx.x * blockDim.x + threadIdx.x);
interKeysSizePerTask[index] += keySize;
interValsSizePerTask[index] += valSize;
interCountPerTask[index]++;
}
//-------------------------------------------------------
//gpu kernel function, called by StartGPUMap
//-------------------------------------------------------
__global__ void gpuMapCount(char* inputKeys,
char* inputVals,
int4* inputOffsetSizes,
size_t* interKeysSizePerTask,
size_t* interValsSizePerTask,
size_t* interCountPerTask,
size_t recordNum,
size_t recordsPerTask,
size_t taskNum,
size_t keyOffset,
size_t valOffset)
{
size_t index = (blockIdx.x * blockDim.x + threadIdx.x);
for (int i = 0; i <= recordsPerTask; i++)
{
int cindex = i*taskNum+index;
if (cindex >= recordNum) return;
char *key = gpuGetRecordFromBuf(inputKeys,
inputOffsetSizes, cindex, 0, keyOffset, valOffset);
char *val = gpuGetRecordFromBuf(inputVals,
inputOffsetSizes, cindex, 1, keyOffset, valOffset);
gpu_map_count(key,
val,
inputOffsetSizes[cindex].y,
inputOffsetSizes[cindex].w,
interKeysSizePerTask,
interValsSizePerTask,
interCountPerTask);
}
}
//-------------------------------------------------------
//gpu kernel function, called by StartGPUMap
//-------------------------------------------------------
__global__ void gpuMap(char* inputKeys,
char* inputVals,
int4* inputOffsetSizes,
size_t* psKeySizes,
size_t* psValSizes,
size_t* psCounts,
int2* keyValOffsets,
char* interKeys,
char* interVals,
int4* interOffsetSizes,
size_t* curIndex,
size_t recordNum,
size_t recordsPerTask,
size_t taskNum,
size_t keyOffset,
size_t valOffset)
{
size_t index = (blockIdx.x * blockDim.x + threadIdx.x);
interOffsetSizes[psCounts[index]].x = psKeySizes[index];
interOffsetSizes[psCounts[index]].z = psValSizes[index];
for (int i = 0; i <= recordsPerTask; i++)
{
int cindex = i*taskNum+index;
if (cindex >= recordNum) return;
char *key = gpuGetRecordFromBuf(inputKeys,
inputOffsetSizes, cindex, 0, keyOffset, valOffset);
char *val = gpuGetRecordFromBuf(inputVals,
inputOffsetSizes, cindex, 1, keyOffset, valOffset);
gpu_map(key,
val,
inputOffsetSizes[cindex].y,
inputOffsetSizes[cindex].w,
psKeySizes,
psValSizes,
psCounts,
keyValOffsets,
interKeys,
interVals,
interOffsetSizes,
curIndex);
}
}
//----------------------------------------------------
//main gpu map procedure
//----------------------------------------------------
void StartGPUMap(Schedule_t *sched, char mode)
{
//D_ENTER_FUNC("StartGPUMap");
//EnterFunc("StartGPUMap");
BEN_ASSERT(sched != NULL);
//-------------------------------------------------------
//get map input data
//-------------------------------------------------------
//!!!need to be small chunk!!!!
size_t inputRecCount = sched->inputSmallChunk.recCount;
size_t inputKeySize = sched->inputSmallChunk.keySize;
size_t inputValSize = sched->inputSmallChunk.valSize;
size_t inputKeyOffset = sched->inputSmallChunk.keyOffset;
size_t inputValOffset = sched->inputSmallChunk.valOffset;
if (inputRecCount <= 0) return;
char *inputKeys = sched->inputSmallChunk.keys;
char *inputVals = sched->inputSmallChunk.vals;
int4 *inputIndex = sched->inputSmallChunk.index;
//!!!
//----------------------------------------------
//determine the number of threads to run
//----------------------------------------------
size_t gridDim = sched->gpuMapGridDim;
size_t blockDim = sched->gpuMapBlockDim;
size_t sharedMemSize = sched->gpuMapSharedMemSize;
size_t threadNum = sched->gpuMapGridDim*sched->gpuMapBlockDim;
size_t recPerThread = inputRecCount / threadNum;
if (0 == recPerThread)
recPerThread = 1;
//-------------------------------------------------------
//upload map input data onto device memory
//-------------------------------------------------------
char* d_inputKeys = D_MALLOC(inputKeySize);
D_MEMCPY_H2D(d_inputKeys, inputKeys, inputKeySize);
char* d_inputVals = D_MALLOC(inputValSize);
D_MEMCPY_H2D(d_inputVals, inputVals, inputValSize);
int4* d_inputIndex = (int4*)D_MALLOC(sizeof(int4)*inputRecCount);
D_MEMCPY_H2D(d_inputIndex, inputIndex, sizeof(int4)*inputRecCount);
//----------------------------------------------
//calculate intermediate data keys'buf size
// and values' buf size
//----------------------------------------------
size_t* d_interKeySizePerThread = (size_t*)D_MALLOC(sizeof(size_t)*threadNum);
size_t* d_interValSizePerThread = (size_t*)D_MALLOC(sizeof(size_t)*threadNum);
size_t* d_interCountPerThread = (size_t*)D_MALLOC(sizeof(size_t)*threadNum);
gpuMapCount<<<gridDim, blockDim, sharedMemSize>>>(d_inputKeys,
d_inputVals,
d_inputIndex,
d_interKeySizePerThread,
d_interValSizePerThread,
d_interCountPerThread,
inputRecCount,
recPerThread,
threadNum,
inputKeyOffset,
inputValOffset);
//----------------------------------------------
//do prefix sum
//----------------------------------------------
size_t *d_psKeySizes = (size_t*)D_MALLOC(sizeof(size_t)*threadNum);
size_t allKeySize = prefexSum((int*)d_interKeySizePerThread, (int*)d_psKeySizes, threadNum);
size_t *d_psValSizes = (size_t*)D_MALLOC(sizeof(size_t)*threadNum);
size_t allValSize = prefexSum((int*)d_interValSizePerThread, (int*)d_psValSizes, threadNum);
size_t *d_psCounts = (size_t*)D_MALLOC(sizeof(size_t)*threadNum);
size_t allCounts = prefexSum((int*)d_interCountPerThread, (int*)d_psCounts, threadNum);
//-----------------------------------------------
//allocate intermediate memory on device memory
//-----------------------------------------------
char* d_interKeys = NULL;
char* d_interVals = NULL;
int4* d_interIndex = NULL;
int2* d_keyValOffsets = NULL;
size_t* d_curIndex = NULL;
if (allCounts == 0)
goto GPU_MAP_EXIT;
d_interKeys = D_MALLOC(allKeySize);
d_interVals = D_MALLOC(allValSize);
d_interIndex = (int4*)D_MALLOC(sizeof(int4)*allCounts);
//--------------------------------------------------
// start map
//--------------------------------------------------
d_keyValOffsets = (int2*)D_MALLOC(sizeof(int2)*threadNum);
d_curIndex = (size_t*)D_MALLOC(sizeof(size_t)*threadNum);
gpuMap<<<gridDim, blockDim, sharedMemSize>>>(d_inputKeys,
d_inputVals,
d_inputIndex,
d_psKeySizes,
d_psValSizes,
d_psCounts,
d_keyValOffsets,
d_interKeys,
d_interVals,
d_interIndex,
d_curIndex,
inputRecCount,
recPerThread,
threadNum,
inputKeyOffset,
inputValOffset);
sched->outputSmallChunk.keys = d_interKeys;
sched->outputSmallChunk.vals = d_interVals;
sched->outputSmallChunk.index = d_interIndex;
sched->outputSmallChunk.keySize = allKeySize;
sched->outputSmallChunk.valSize = allValSize;
sched->outputSmallChunk.indexSize = allCounts*sizeof(int4);
sched->outputSmallChunk.rangeSize = 0;//sizeof(int2)*interDiffKeyCount;
sched->outputSmallChunk.recCount = allCounts;
//PrintRecords(&(sched->outputSmallChunks[cur]), NULL, INT, INT, 100);
//---------------------------------------------
//clean
//---------------------------------------------
GPU_MAP_EXIT:
D_FREE(d_interKeySizePerThread, sizeof(size_t)*threadNum);
D_FREE(d_interValSizePerThread, sizeof(size_t)*threadNum);
D_FREE(d_interCountPerThread, sizeof(size_t)*threadNum);
// D_FREE(d_interKeys, allKeySize);
// D_FREE(d_interVals, allValSize);
// D_FREE(d_interIndex, sizeof(int4)*allCounts);
D_FREE(d_keyValOffsets, sizeof(int2)*threadNum);
D_FREE(d_curIndex, sizeof(size_t)*threadNum);
D_FREE(d_inputKeys, inputKeySize);
D_FREE(d_inputVals, inputValSize);
D_FREE(d_inputIndex, sizeof(int4)*inputRecCount);
D_FREE(d_psKeySizes, sizeof(size_t)*threadNum);
D_FREE(d_psValSizes, sizeof(size_t)*threadNum);
D_FREE(d_psCounts, sizeof(size_t)*threadNum);
//D_LEAVE_FUNC("StartGPUMap");
//LeaveFunc("StartGPUMap");
}
void StartGPUSort_cpu(Schedule_t *sched, char mode)
{
char *d_interKeys = sched->outputSmallChunk.keys;
char *d_interVals = sched->outputSmallChunk.vals;
int4 *d_interIndex = (int4*)sched->outputSmallChunk.index;
size_t allKeySize = sched->outputSmallChunk.keySize;
size_t allValSize = sched->outputSmallChunk.valSize;
size_t allCounts = sched->outputSmallChunk.recCount;
char *interKeys = (char*)BenMalloc(allKeySize);
char *interVals = (char*)BenMalloc(allValSize);
int4 *interIndex = (int4*)BenMalloc(sizeof(int4)*allCounts);
D_MEMCPY_D2H(interKeys, d_interKeys, allKeySize);
D_MEMCPY_D2H(interVals, d_interVals, allValSize);
D_MEMCPY_D2H(interIndex, d_interIndex, sizeof(int4)*allCounts);
sched->outputSmallChunk.keys = interKeys;
sched->outputSmallChunk.vals = interVals;
sched->outputSmallChunk.index = interIndex;
if (mode & MAP_SORT || mode & MAP_SORT_REDUCE)
{
if (mode & USE_FILE)
sched->outputSmallChunk.keyOffset = 0;
QuickSortMem(&sched->outputSmallChunk);
GroupByMem(&sched->outputSmallChunk);
sched->outputSmallChunk.rangeSize = sched->outputSmallChunk.diffKeyCount*sizeof(int2);
}
D_FREE(d_interKeys, allKeySize);
D_FREE(d_interVals, allValSize);
D_FREE(d_interIndex, sizeof(int4)*allCounts);
}
?? 快捷鍵說明
復(fù)制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號(hào)
Ctrl + =
減小字號(hào)
Ctrl + -