1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < N.
 *
 * Each thread handles one element whose flat index is
 * threadIdx.x + blockIdx.x * blockDim.x; only the x components of the
 * block and thread indices are used.
 *
 * A, B : input arrays, read at index i
 * C    : output array, written at index i
 * N    : number of elements; threads whose index is >= N do nothing
 */
__global__ void addFromGPU(float *A, float *B, float *C, const int N)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    const int id = tid + bid * blockDim.x;

    // Guard the tail: the launch may contain more threads than elements.
    if (id < N)
    {
        C[id] = A[id] + B[id];
    }
}
// Forward declarations of the host helpers used by main().
// initialData: fills addr[0 .. elemCount-1] with values derived from rand().
// setGPU: queries the CUDA device count and selects device 0, exiting on failure.
void initialData(float *addr, int elemCount); void setGPU();
/* Returns true when `status` is cudaSuccess; otherwise prints which step
 * failed, together with the CUDA error string, to stderr and returns false. */
static bool cudaCallSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two 512-element float vectors on the GPU and prints the first results.
 *
 * Steps: select the device, allocate and zero host and device buffers, fill
 * the two inputs with pseudo-random values (fixed seed 666), copy them to the
 * device, launch addFromGPU, copy the result back and print up to 10 rows.
 *
 * Returns 0 on success. Exits with -1 if host allocation fails and returns -1
 * if any CUDA call fails; all buffers are released on every path.
 */
int main(void)
{
    setGPU();

    const int iElemCount = 512;
    const size_t stBytesCount = (size_t)iElemCount * sizeof(float);

    // Host buffers.
    float *fpHost_A = (float *)malloc(stBytesCount);
    float *fpHost_B = (float *)malloc(stBytesCount);
    float *fpHost_C = (float *)malloc(stBytesCount);
    if (fpHost_A == NULL || fpHost_B == NULL || fpHost_C == NULL)
    {
        printf("Fail to allocate host memory!\n");
        free(fpHost_A); // free(NULL) is a no-op, so partial allocations are safe
        free(fpHost_B);
        free(fpHost_C);
        exit(-1);
    }
    memset(fpHost_A, 0, stBytesCount);
    memset(fpHost_B, 0, stBytesCount);
    memset(fpHost_C, 0, stBytesCount);

    // Device buffers. Start from NULL so the pointers hold a defined value
    // even when cudaMalloc fails, and so cudaFree below is always safe.
    float *fpDevice_A = NULL;
    float *fpDevice_B = NULL;
    float *fpDevice_C = NULL;
    bool ok = cudaCallSucceeded(cudaMalloc((void **)&fpDevice_A, stBytesCount), "cudaMalloc(A)")
           && cudaCallSucceeded(cudaMalloc((void **)&fpDevice_B, stBytesCount), "cudaMalloc(B)")
           && cudaCallSucceeded(cudaMalloc((void **)&fpDevice_C, stBytesCount), "cudaMalloc(C)");
    if (!ok)
    {
        printf("fail to allocate memory\n");
    }

    ok = ok
      && cudaCallSucceeded(cudaMemset(fpDevice_A, 0, stBytesCount), "cudaMemset(A)")
      && cudaCallSucceeded(cudaMemset(fpDevice_B, 0, stBytesCount), "cudaMemset(B)")
      && cudaCallSucceeded(cudaMemset(fpDevice_C, 0, stBytesCount), "cudaMemset(C)");

    if (ok)
    {
        // Fixed seed keeps the generated inputs reproducible between runs.
        srand(666);
        initialData(fpHost_A, iElemCount);
        initialData(fpHost_B, iElemCount);
    }

    ok = ok
      && cudaCallSucceeded(cudaMemcpy(fpDevice_A, fpHost_A, stBytesCount, cudaMemcpyHostToDevice), "cudaMemcpy(A, host to device)")
      && cudaCallSucceeded(cudaMemcpy(fpDevice_B, fpHost_B, stBytesCount, cudaMemcpyHostToDevice), "cudaMemcpy(B, host to device)")
      && cudaCallSucceeded(cudaMemcpy(fpDevice_C, fpHost_C, stBytesCount, cudaMemcpyHostToDevice), "cudaMemcpy(C, host to device)");

    if (ok)
    {
        const int iThreadsPerBlock = 32;
        dim3 block(iThreadsPerBlock);
        // Round the block count up; for 512 elements and 32 threads per
        // block this is exactly 16 blocks.
        dim3 grid((iElemCount + iThreadsPerBlock - 1) / iThreadsPerBlock);

        addFromGPU<<<grid, block>>>(fpDevice_A, fpDevice_B, fpDevice_C, iElemCount);

        // A kernel launch returns no status; launch errors are read back
        // with cudaGetLastError(), and the copy below reports any error
        // raised while the kernel was running.
        ok = cudaCallSucceeded(cudaGetLastError(), "addFromGPU launch")
          && cudaCallSucceeded(cudaMemcpy(fpHost_C, fpDevice_C, stBytesCount, cudaMemcpyDeviceToHost), "cudaMemcpy(C, device to host)");
    }

    if (ok)
    {
        // Print at most the first 10 rows, never more than were computed.
        const int iPrintCount = iElemCount < 10 ? iElemCount : 10;
        for (int i = 0; i < iPrintCount; i++)
        {
            printf("idx=%2d\tmatrix_A:%.2f\tmatrix_B:%.2f\tresult=%.2f\n", i+1, fpHost_A[i], fpHost_B[i], fpHost_C[i]);
        }
    }

    // Single cleanup path shared by success and failure.
    free(fpHost_A);
    free(fpHost_B);
    free(fpHost_C);
    cudaFree(fpDevice_A);
    cudaFree(fpDevice_B);
    cudaFree(fpDevice_C);

    cudaDeviceReset();
    return ok ? 0 : -1;
}
/*
 * Selects CUDA device 0 for the calling host thread.
 *
 * Queries the number of CUDA devices and prints it, then calls
 * cudaSetDevice(0). If the query fails, no device is present, or the device
 * cannot be selected, a message is printed and the process exits with -1.
 * When a CUDA call fails, its error string is also written to stderr.
 */
void setGPU()
{
    int iDeviceCount = 0;
    cudaError_t error = cudaGetDeviceCount(&iDeviceCount);

    if (error != cudaSuccess || iDeviceCount == 0)
    {
        printf("no compatible GPU found\n");
        if (error != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceCount: %s\n", cudaGetErrorString(error));
        }
        exit(-1);
    }
    printf("the count of GPU is %d\n", iDeviceCount);

    const int iDev = 0;
    error = cudaSetDevice(iDev);
    if (error != cudaSuccess)
    {
        printf("fail to set GPU 0 for computing\n");
        fprintf(stderr, "cudaSetDevice: %s\n", cudaGetErrorString(error));
        exit(-1);
    }
    // Trailing newline so this message does not run into later output.
    printf("set GPU 0 for computing\n");
}
/*
 * Fills addr[0 .. elemCount-1] with pseudo-random floats.
 *
 * Each element is the low 8 bits of one rand() call divided by 10, so every
 * value lies in [0.0, 25.5]. One rand() call is made per element, in index
 * order. Nothing is written when elemCount is zero or negative.
 */
void initialData(float *addr, int elemCount)
{
    int slot = 0;
    while (slot < elemCount)
    {
        const int lowByte = rand() & 0xFF;
        addr[slot] = (float)lowByte / 10.f;
        ++slot;
    }
}
|