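// File: 采用塊并行化.cu ("using block-level parallelism")
// (Code-viewer residue: "字號:" = font-size control; not part of the program.)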
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
// Number of input elements that are squared and summed (1048576 = 2^20).
#define DATA_SIZE 1048576
// Threads per block passed to the sumOfSquares launch in main.
#define THREAD_NUM 512
// Number of blocks passed to the sumOfSquares launch in main.
#define BLOCK_NUM 36
// Host-side input buffer: filled by GenerateNumber in main, then copied to the GPU.
int data[DATA_SIZE];
/*
 * Kernel: every launched thread accumulates the sum of squares of a strided
 * subset of num[0 .. DATA_SIZE-1] and stores its partial sum in result.
 *
 * Expected launch: 1-D grid of 1-D blocks. The thread with block index `bid`
 * and thread index `tid` starts at element bid*blockDim.x + tid and advances
 * by gridDim.x*blockDim.x, so on each iteration neighbouring threads of a
 * block read neighbouring elements. The launch geometry is taken from
 * blockDim/gridDim, so the partial sums are correct for any 1-D launch.
 *
 * Parameters:
 *   num    - input values, at least DATA_SIZE ints.
 *   result - output, one partial sum per launched thread
 *            (gridDim.x*blockDim.x ints), written at bid*blockDim.x + tid.
 *   time   - output, 2*gridDim.x entries: time[bid] receives clock() read by
 *            thread 0 of block `bid` before its loop, time[bid + gridDim.x]
 *            receives clock() read by the same thread after it stored its
 *            partial sum.
 *
 * Note: there is no barrier before the second clock() read, so the two
 * timestamps bracket the work of thread 0 only, not of the whole block.
 * No shared memory is used.
 */
__global__ static void sumOfSquares(int* num, int* result, clock_t* time)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int threadsPerBlock = blockDim.x;
    const int blockCount = gridDim.x;
    const int globalId = bid * threadsPerBlock + tid;   // this thread's slot and first element
    const int stride = blockCount * threadsPerBlock;    // total number of launched threads
    int sum = 0;

    // Start timestamp for this block (thread 0 only).
    if (tid == 0) time[bid] = clock();

    for (int i = globalId; i < DATA_SIZE; i += stride) {
        sum += num[i] * num[i];
    }
    result[globalId] = sum;

    // End timestamp for this block, stored in the second half of `time`.
    if (tid == 0) time[bid + blockCount] = clock();
}
/*
 * Fill number[0 .. size-1] with pseudo-random values in the range 0..9,
 * drawing one rand() value per element in ascending index order.
 * Does nothing when size is zero or negative.
 */
void GenerateNumber(int* number, int size)
{
    int* cursor = number;
    int remaining = size;
    while (remaining > 0) {
        *cursor = rand() % 10;
        ++cursor;
        --remaining;
    }
}
/* Print a diagnostic and exit if a CUDA runtime call did not return cudaSuccess. */
static void checkCudaOrExit(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Program entry point.
 *
 * Fills the host buffer `data` with random digits, copies it to the GPU,
 * launches sumOfSquares with BLOCK_NUM blocks of THREAD_NUM threads, copies
 * the per-thread partial sums and per-block clock() stamps back, and prints:
 *   - the total of the GPU partial sums,
 *   - the span between the earliest start stamp and the latest end stamp,
 *   - the same sum of squares computed on the CPU for comparison.
 *
 * Every CUDA runtime call and the kernel launch are checked; on failure the
 * program prints the CUDA error string to stderr and exits with EXIT_FAILURE
 * instead of reporting results computed from undefined buffers.
 */
int main(int argc, char** argv)
{
    int *gpudata = NULL, *result = NULL;
    clock_t *time = NULL;

    GenerateNumber(data, DATA_SIZE);

    checkCudaOrExit(cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE),
                    "cudaMalloc(gpudata)");
    checkCudaOrExit(cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM * BLOCK_NUM),
                    "cudaMalloc(result)");
    checkCudaOrExit(cudaMalloc((void**)&time, sizeof(clock_t) * BLOCK_NUM * 2),
                    "cudaMalloc(time)");
    checkCudaOrExit(cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice),
                    "cudaMemcpy(data -> gpudata)");

    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, 0>>>(gpudata, result, time);
    // A kernel launch returns no status; launch-configuration errors are
    // reported through cudaGetLastError().
    checkCudaOrExit(cudaGetLastError(), "sumOfSquares launch");

    // static: keeps the 72 KB staging buffer off the stack.
    static int sum[THREAD_NUM * BLOCK_NUM];
    clock_t time_used[BLOCK_NUM * 2];

    // The copies below are blocking, so they also wait for the kernel and
    // report any error raised while it was executing.
    checkCudaOrExit(cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM,
                               cudaMemcpyDeviceToHost),
                    "cudaMemcpy(result -> sum)");
    checkCudaOrExit(cudaMemcpy(time_used, time, sizeof(clock_t) * BLOCK_NUM * 2,
                               cudaMemcpyDeviceToHost),
                    "cudaMemcpy(time -> time_used)");

    checkCudaOrExit(cudaFree(gpudata), "cudaFree(gpudata)");
    checkCudaOrExit(cudaFree(result), "cudaFree(result)");
    checkCudaOrExit(cudaFree(time), "cudaFree(time)");

    // Reduce the per-thread partial sums on the host.
    int Final_Sum = 0;
    for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++) {
        Final_Sum += sum[i];
    }

    // First half of time_used holds per-block start stamps, second half the
    // per-block end stamps; report earliest start to latest end.
    clock_t min_time = time_used[0];
    clock_t max_time = time_used[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (min_time > time_used[i]) min_time = time_used[i];
        if (max_time < time_used[i + BLOCK_NUM]) max_time = time_used[i + BLOCK_NUM];
    }

    printf("sum:%d \n", Final_Sum);
    // Cast so the argument matches %ld regardless of how clock_t is defined.
    printf("time_used:%ld \n", (long)(max_time - min_time));

    // CPU reference over the same input.
    Final_Sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        Final_Sum += data[i] * data[i];
    }
    printf("sum (CPU): %d\n", Final_Sum);

    CUT_EXIT(argc, argv);
}
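// --- Code-viewer keyboard shortcuts (web page residue, not part of the program) ---
// Copy code:           Ctrl + C
// Search code:         Ctrl + F
// Full-screen mode:    F11
// Toggle theme:        Ctrl + Shift + D
// Show shortcuts:      ?
// Increase font size:  Ctrl + =
// Decrease font size:  Ctrl + -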