1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
| #include <stdio.h>
#include <stdlib.h>
/* Number of threads per block used for the kernel launch. */
#define BLOCK_SIZE 8
/* Number of float elements processed (78 cubed). The product is wrapped in
 * parentheses so it expands as a single operand inside larger expressions
 * such as `x % ARRAY_SIZE` or `x / ARRAY_SIZE`. */
#define ARRAY_SIZE (78*78*78)
/* Holds the cudaError_t status that callers store before invoking CUERROR(). */
cudaError_t CUerr;

/*
 * Report the error currently held in CUerr, if any.
 *
 * str: context message printed ahead of the CUDA error description.
 *      Declared const so that string literals can be passed legally.
 *
 * Returns 1 after printing "<str>, <CUDA error string>" to stderr when
 * CUerr differs from cudaSuccess; returns 0 (printing nothing) otherwise.
 */
inline int CUERROR(const char *str){
    if(CUerr != cudaSuccess){
        fprintf(stderr, "%s, %s\n", str, cudaGetErrorString(CUerr));
        return 1;
    }
    return 0;
}
/*
 * CUALLOC(var, size): call cudaMalloc for `size` bytes and store the
 * resulting pointer in `var`. The call's status is saved in CUerr; if
 * CUERROR() reports a failure the process exits with status 1.
 */
#define CUALLOC(var, size) do{                                  \
    CUerr = cudaMalloc((void**)&var, size);                     \
    if(CUERROR("CUDA: can't allocate memory")) exit(1);         \
}while(0)
/*
 * CUMOV2HOST(dest, src, size): cudaMemcpy `size` bytes from `src` to `dest`
 * with cudaMemcpyDeviceToHost. The call's status is saved in CUerr; if
 * CUERROR() reports a failure the process exits with status 1.
 */
#define CUMOV2HOST(dest, src, size) do{                             \
    CUerr = cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost);    \
    if(CUERROR("CUDA: can't copy data to host")) exit(1);           \
}while(0)
/*
 * CUMOV2DEV(dest, src, size): cudaMemcpy `size` bytes from `src` to `dest`
 * with cudaMemcpyHostToDevice. The call's status is saved in CUerr; if
 * CUERROR() reports a failure the process exits with status 1.
 */
#define CUMOV2DEV(dest, src, size) do{                              \
    CUerr = cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice);    \
    if(CUERROR("CUDA: can't copy data to device")) exit(1);         \
}while(0)
/*
 * CUFREE(var): pass `var` to cudaFree and then reset `var` to NULL.
 * The status of cudaFree is saved in CUerr and reported through CUERROR()
 * instead of being discarded. A failure is only reported, never fatal, so
 * cleanup code keeps running.
 */
#define CUFREE(var) do{                                         \
    CUerr = cudaFree(var);                                      \
    (void)CUERROR("CUDA: can't free memory");                   \
    (var) = NULL;                                               \
}while(0)
/*
 * Store each element's own index, converted to float, into data.
 *
 * Indexing uses only the x dimension: every thread handles the single
 * element at blockIdx.x * blockDim.x + threadIdx.x. `data` must be
 * writable from device code and hold at least ARRAY_SIZE floats.
 * Threads whose index is at or beyond ARRAY_SIZE do nothing, so the
 * launch may contain more threads than elements.
 */
__global__ void testKernel(float *data) {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    // Valid indices are 0 .. ARRAY_SIZE-1, so ix == ARRAY_SIZE must be
    // rejected too; otherwise it writes one element past the buffer.
    if (ix < ARRAY_SIZE)
        data[ix] = (float)ix;
}
/*
 * Allocate ARRAY_SIZE floats on the device, run testKernel over them,
 * copy the result back to the host and print one value per line.
 *
 * Exits with status 1 if the host allocation or any CUDA call fails;
 * returns 0 otherwise. Command-line arguments are not used.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    const int size = ARRAY_SIZE;
    // Byte counts are kept in size_t, the type malloc/cudaMalloc/cudaMemcpy take.
    const size_t bytes = (size_t)size * sizeof(float);

    float *devData = NULL;
    float *outData = (float*)malloc(bytes);
    if(!outData){
        fprintf(stderr, "Can't allocate outData\n");
        exit(1);
    }
    CUALLOC(devData, bytes);

    // Ceil-divide so a partially filled last block still covers the tail.
    dim3 blockSize(BLOCK_SIZE);
    dim3 gridSize((size + BLOCK_SIZE - 1) / BLOCK_SIZE);

    CUerr = cudaMemset(devData, 0, bytes);
    if(CUERROR("CUDA: can't clear device memory")) exit(1);

    testKernel<<<gridSize, blockSize>>>(devData);
    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    CUerr = cudaGetLastError();
    if(CUERROR("CUDA: can't launch kernel")) exit(1);
    // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize();
    // its status reports faults raised while the kernel was running.
    CUerr = cudaDeviceSynchronize();
    if(CUERROR("CUDA: kernel execution failed")) exit(1);

    CUMOV2HOST(outData, devData, bytes);
    CUFREE(devData);

    for(int i = 0; i != size; i++)
        printf("%g\n", outData[i]);

    free(outData);
    return 0;
}
|