?? backup_simplestreams.cu
字號:
*ob++ = *ib++;
ib = inblock; fb = fret; /* R[i]=L[i] XOR f(R[i-1],key) */
*ob++ = *ib++ ^ *fb++;
*ob++ = *ib++ ^ *fb++;
*ob++ = *ib++ ^ *fb++;
*ob++ = *ib++ ^ *fb++;
}
__device__ void endes(unsigned char *inblock, unsigned char *outblock) /* encrypt 64-bit inblock */
{
unsigned char iters[17][8]; /* workspace for each iteration */
unsigned char swap[8]; /* place to interchange L and R */
register int i;
register unsigned char *s, *t;
permute(inblock,iperm,iters[0]);/* apply initial permutation */
for (i=0; i<16; i++) /* 16 churning operations */
iter(i,iters[i],iters[i+1]);
/* don't re-copy to save space */
s = swap; t = &iters[16][4]; /* interchange left */
*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
t = &iters[16][0]; /* and right */
*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
permute(swap,fperm,outblock); /* apply final permutation */
}
__device__ void dedes(unsigned char *inblock, unsigned char *outblock) /* decrypt 64-bit inblock */
{ unsigned char iters[17][8]; /* workspace for each iteration */
unsigned char swap[8]; /* place to interchange L and R */
register int i;
register unsigned char *s, *t;
permute(inblock,iperm,iters[0]);/* apply initial permutation */
for (i=0; i<16; i++) /* 16 churning operations */
iter(15-i,iters[i],iters[i+1]);
/* reverse order from encrypting*/
s = swap; t = &iters[16][4]; /* interchange left */
*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
t = &iters[16][0]; /* and right */
*s++ = *t++; *s++ = *t++; *s++ = *t++; *s++ = *t++;
permute(swap,fperm,outblock); /* apply final permutation */
}
__global__ void kernel_endes(unsigned char *in, unsigned char *out, int size)
{
int start= (THREAD_NUM*blockIdx.x) + threadIdx.x;
int idx = start;
__syncthreads();
do{
endes(in+idx, out+idx);
__syncthreads();
idx+= (THREAD_NUM<<THREAD_BLK_NUM_LOG2<<3);
} while (idx < size);
}
__global__ void kernel_dedes(unsigned char *in, unsigned char *out, int size)
{
int start= (THREAD_NUM*blockIdx.x) + threadIdx.x;
int idx = start;
__syncthreads();
do{
dedes(in+idx, out+idx);
__syncthreads();
idx+= (THREAD_NUM<<THREAD_BLK_NUM_LOG2<<3);
} while (idx < size);
}
__host__ void cpy_const_data_from_host_to_device()
{
cudaMemcpyToSymbol(iperm, iperm_host, sizeof(iperm));
cudaMemcpyToSymbol(fperm, fperm_host, sizeof(fperm));
cudaMemcpyToSymbol(s, s_host, sizeof(s));
cudaMemcpyToSymbol(p32, p32_host, sizeof(p32));
cudaMemcpyToSymbol(kn, kn_host, sizeof(kn));
}
void generate_input(unsigned char* input, int num)
{
for (int i=0; i<num; i++)
input[i]= 'a' + (i%26);
}
int correct_data(unsigned char *input, unsigned char *out, const int n)
{
for(int i = 0; i < n; i++)
if(input[i] != 'a' + (i%26))
return 0;
return 1;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
__device__ int get_value(const int c);
unsigned int get_value_host(const int c);
__global__ void init_array(int *values, int *out, int size)
{
int i= (THREAD_NUM*blockIdx.x) + threadIdx.x;
do {
// 1) Copy input to .
int v = values[i];
int o =0;
// 2) Synchronization
__syncthreads();
// 3) Process
o = get_value(v);
// 5) Write the result to device memory.
//values[i] = o;
out[i] =o;
// 4) Synchronization again
__syncthreads();
i+= (THREAD_NUM<<THREAD_BLK_NUM_LOG2);//(THREAD_NUM)*THREAD_BLK_NUM;
} while (i < size );
}
__device__ int get_value(const int in)
{
unsigned int out=in;
for (int loop=0; loop<REPEAT_TIME; loop++)
{
out=(out*out) & 0xFFFF;
}
return out;
}
unsigned int get_value_host(const int in)
{
unsigned int out=in;
for (int loop=0; loop<REPEAT_TIME; loop++)
{
out=(out*out) & 0xFFFF;
}
return out;
}
*/
int test();
int main(int argc, unsigned char *argv[])
{
int nstreams = 8; // number of streams for CUDA calls
int nbytes = NUM * 8; // number of data bytes
dim3 threads, blocks; // kernel launch configuration
float elapsed_time=0, time_memcpy=0, time_kernel=0; // timing variables
test();
unsigned char keyx[9]= "cxskq";
desinit(keyx); /* set up tables for DES */
// check the compute capability of the device
int num_devices=0;
CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
if(0==num_devices)
{
printf("your system does not have a CUDA capable device\n");
return 1;
}
cudaDeviceProp device_properties;
CUDA_SAFE_CALL( cudaGetDeviceProperties(&device_properties, 0) );
if( (1 == device_properties.major) && (device_properties.minor < 1))
printf("%s does not have compute capability 1.1 or later\n\n", device_properties.name);
printf("////////////////////////////////////////////////\n");
printf("64-bit_array_size=%d(M) (total size=%d (M))thread_blk_num=%d\n", NUM/1024/1024, 2*NUM/1024/1024, THREAD_BLK_NUM);
// allocate host
unsigned char *a = 0; // pointer to the array data in host memory
unsigned char *out=0;
// allocate host memory (pinned is required for achieve asynchronicity)
CUDA_SAFE_CALL( cudaMallocHost((void**)&a, nbytes) );
memset(a,0,nbytes);
CUDA_SAFE_CALL( cudaMallocHost((void**)&out, nbytes) );
memset(out,0,nbytes);
// allocate and initialize an array of stream handles
cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
for(int i = 0; i < nstreams; i++)
CUDA_SAFE_CALL( cudaStreamCreate(&(streams[i])) );
// create CUDA event handles
cudaEvent_t start_event, stop_event;
CUDA_SAFE_CALL( cudaEventCreate(&start_event) );
CUDA_SAFE_CALL( cudaEventCreate(&stop_event) );
// allocate device memory
unsigned char *d_a = 0, *d_o = 0; // pointers to data and init value in the device memory
CUDA_SAFE_CALL( cudaMalloc((void**)&d_a, nbytes) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_o, nbytes) );
cudaMemset(d_a, 0, nbytes);
cudaMemset(d_o, 0, nbytes);
/*
// time memcopy from device
cudaEventRecord(start_event, 0); // record in stream-0, to ensure that all previous CUDA calls have completed
CUDA_SAFE_CALL( cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice) );
cudaMemcpyAsync(out, d_o, nbytes, cudaMemcpyDeviceToHost, streams[0]);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event); // block until the event is actually recorded
CUDA_SAFE_CALL( cudaEventElapsedTime(&time_memcpy, start_event, stop_event) );
printf("2-way memcopy :\t%.2f\n", time_memcpy);
// time kernel
threads=dim3(THREAD_NUM, 1);
blocks=dim3(THREAD_BLK_NUM, 1);
cudaEventRecord(start_event, 0);
cpy_const_data_from_host_to_device();
kernel_endes<<<blocks, threads, SHARED_MEM_SIZE, streams[0]>>>(d_a, d_o, nbytes/8);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
CUDA_SAFE_CALL( cudaEventElapsedTime(&time_kernel, start_event, stop_event) );
printf("kernel computing: %.2f\n", time_kernel);
//////////////////////////////////////////////////////////////////////
// time non-streamed execution for reference
threads=dim3(THREAD_NUM, 1);
blocks=dim3(1, 1);
memset(a,0,nbytes);
cudaEventRecord(start_event, 0);
CUDA_SAFE_CALL( cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice) );
cpy_const_data_from_host_to_device();
kernel_endes<<<blocks, threads,SHARED_MEM_SIZE>>>(d_a, d_o, nbytes/8);
cudaMemcpy(out, d_o, nbytes, cudaMemcpyDeviceToHost);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
printf("Time to 2-way memcpy & execute:\n");
printf("non-streamed:\t%.2f (%.2f expected = %.2f + %.2f)\n", elapsed_time, time_kernel + time_memcpy,
time_kernel, time_memcpy);
*/
//////////////////////////////////////////////////////////////////////
// time execution with nstreams streams
threads=dim3(THREAD_NUM,1);
//blocks=dim3(n/(nstreams*threads.x),1);
blocks=dim3(THREAD_BLK_NUM,1);
//memset(a, c, nbytes); // set host memory bits to all 1s, for testing correctness
//cudaMemset(d_a, 0, nbytes); // set device memory to all 0s, for testing correctness
generate_input(a,nbytes);
clock_t start_clock_gpu, end_clock_gpu;
start_clock_gpu = clock();
cudaMemset(d_a, 0, nbytes);
cudaMemset(d_o, 0, nbytes);
cudaEventRecord(start_event, 0);
// asynchronoously launch nstreams memcopies. Note that memcopy in stream x will only
// commence executing when all previous CUDA calls in stream x have completed
for(int i = 0; i < nstreams; i++)
cudaMemcpyAsync(d_a + i * nbytes / nstreams, a + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);
cpy_const_data_from_host_to_device();
// asynchronously launch nstreams kernels, each operating on its own portion of data
for(int i = 0; i < nstreams; i++)
kernel_endes<<<blocks, threads, SHARED_MEM_SIZE, streams[i]>>>(d_a + i * nbytes / nstreams, d_o + i * nbytes / nstreams, nbytes/nstreams);
// asynchronoously launch nstreams memcopies. Note that memcopy in stream x will only
// commence executing when all previous CUDA calls in stream x have completed
for(int i = 0; i < nstreams; i++)
cudaMemcpyAsync(out + i * nbytes / nstreams, d_o + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);
printf("Encoding ... \n" );
for (int i=0; i<80; i++)
printf("i=%d in=%x out=%x correct=%d\n", i, a[i] & 0xFF, out[i]& 0xFF, 0 );
memset(a,0,nbytes);
/// Decode
// asynchronoously launch nstreams memcopies. Note that memcopy in stream x will only
// commence executing when all previous CUDA calls in stream x have completed
for(int i = 0; i < nstreams; i++)
cudaMemcpyAsync(d_o + i * nbytes / nstreams, out + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyHostToDevice, streams[i]);
cpy_const_data_from_host_to_device();
// asynchronously launch nstreams kernels, each operating on its own portion of data
for(int i = 0; i < nstreams; i++)
kernel_dedes<<<blocks, threads, SHARED_MEM_SIZE, streams[i]>>>(d_o + i * nbytes / nstreams, d_a + i * nbytes / nstreams, nbytes/nstreams);
// asynchronoously launch nstreams memcopies. Note that memcopy in stream x will only
// commence executing when all previous CUDA calls in stream x have completed
for(int i = 0; i < nstreams; i++)
cudaMemcpyAsync(a + i * nbytes / nstreams, d_a + i * nbytes / nstreams, nbytes / nstreams, cudaMemcpyDeviceToHost, streams[i]);
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
end_clock_gpu = clock();
printf("%d streams:\t%.2f (%.2f expected = %.2f + %.2f/%d)\n", nstreams, elapsed_time , time_kernel + time_memcpy / nstreams,
time_kernel, time_memcpy,nstreams );
printf("Time taken by GPU using clock()= %d ms\n", end_clock_gpu - start_clock_gpu);
printf("Decoding ... \n" );
for (int i=0; i<80; i++)
printf("i=%d in=%x out=%x correct=%d\n", i, a[i] & 0xFF, out[i] & 0xFF, 0 );
// check whether the output is correct
printf("------------VERIFY USING CPU-------------------\n");
//for (int i=0; i<80; i++)
// printf("i=%d in=%x out=%x correct=%x\n", i, a[i], out[i], get_value_host(a[i]));
clock_t start_clock_cpu, end_clock_cpu;
start_clock_cpu = clock();
int verification = correct_data(a, out, nbytes);
end_clock_cpu = clock();
if(verification)
printf("Test PASSED\n");
else
printf("Test FAILED\n");
printf("Time taken by single threaded CPU using clock()= %d ms\n", end_clock_cpu - start_clock_cpu);
float faster_time = (end_clock_gpu - start_clock_gpu)<1 ? 1: (end_clock_cpu - start_clock_cpu) / (end_clock_gpu - start_clock_gpu);
printf("GPU is %.2f (%d/%d) times faster than CPU.\n", faster_time,
(end_clock_cpu - start_clock_cpu) , (end_clock_gpu - start_clock_gpu));
// release resources
for(int i = 0; i < nstreams; i++)
cudaStreamDestroy(streams[i]);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
//cudaFree(d_c);
CUT_EXIT(argc, argv);
return 0;
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -