```cuda
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "../tools/common.cuh"

// Device helper: element-wise addition of two floats
__device__ float add(const float x, const float y)
{
    return x + y;
}

// Kernel: each thread computes one element C[id] = A[id] + B[id]
__global__ void addFromGPU(float *A, float *B, float *C, const int N)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    const int id  = tid + bid * blockDim.x;

    if (id >= N) return;
    C[id] = add(A[id], B[id]);
}

// Fill a host array with random values in [0.0, 25.5)
void initialData(float *addr, int elemCount)
{
    for (int i = 0; i < elemCount; i++)
    {
        addr[i] = (float)(rand() & 0xFF) / 10.f;
    }
    return;
}

int main(void)
{
    // 1. Select a GPU (helper from common.cuh)
    setGPU();

    // 2. Allocate and zero host memory
    int iElemCount = 4096;
    size_t stBytesCount = iElemCount * sizeof(float);

    float *fpHost_A, *fpHost_B, *fpHost_C;
    fpHost_A = (float *)malloc(stBytesCount);
    fpHost_B = (float *)malloc(stBytesCount);
    fpHost_C = (float *)malloc(stBytesCount);
    if (fpHost_A != NULL && fpHost_B != NULL && fpHost_C != NULL)
    {
        memset(fpHost_A, 0, stBytesCount);
        memset(fpHost_B, 0, stBytesCount);
        memset(fpHost_C, 0, stBytesCount);
    }
    else
    {
        printf("Fail to allocate host memory!\n");
        exit(-1);
    }

    // 3. Allocate and zero device memory
    float *fpDevice_A, *fpDevice_B, *fpDevice_C;
    cudaMalloc((float **)&fpDevice_A, stBytesCount);
    cudaMalloc((float **)&fpDevice_B, stBytesCount);
    cudaMalloc((float **)&fpDevice_C, stBytesCount);
    if (fpDevice_A != NULL && fpDevice_B != NULL && fpDevice_C != NULL)
    {
        cudaMemset(fpDevice_A, 0, stBytesCount);
        cudaMemset(fpDevice_B, 0, stBytesCount);
        cudaMemset(fpDevice_C, 0, stBytesCount);
    }
    else
    {
        printf("Fail to allocate device memory!\n");
        free(fpHost_A);
        free(fpHost_B);
        free(fpHost_C);
        exit(-1);
    }

    // 4. Initialize host data and copy it to the device
    srand(666);
    initialData(fpHost_A, iElemCount);
    initialData(fpHost_B, iElemCount);

    cudaMemcpy(fpDevice_A, fpHost_A, stBytesCount, cudaMemcpyHostToDevice);
    cudaMemcpy(fpDevice_B, fpHost_B, stBytesCount, cudaMemcpyHostToDevice);
    cudaMemcpy(fpDevice_C, fpHost_C, stBytesCount, cudaMemcpyHostToDevice);

    // 5. Launch the kernel: block.x = 2048 deliberately exceeds the
    //    1024-threads-per-block limit, so the launch fails with
    //    cudaErrorInvalidConfiguration and ErrorCheck reports it
    dim3 block(2048);
    dim3 grid((iElemCount + block.x - 1) / block.x);

    addFromGPU<<<grid, block>>>(fpDevice_A, fpDevice_B, fpDevice_C, iElemCount);
    ErrorCheck(cudaGetLastError(), __FILE__, __LINE__);      // catches the launch error
    ErrorCheck(cudaDeviceSynchronize(), __FILE__, __LINE__); // catches errors during execution

    // 6. Copy the result back and print the first 10 elements
    cudaMemcpy(fpHost_C, fpDevice_C, stBytesCount, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 10; i++)
    {
        printf("idx=%2d\tmatrix_A:%.2f\tmatrix_B:%.2f\tresult=%.2f\n",
               i + 1, fpHost_A[i], fpHost_B[i], fpHost_C[i]);
    }

    // 7. Free host and device memory
    free(fpHost_A);
    free(fpHost_B);
    free(fpHost_C);
    cudaFree(fpDevice_A);
    cudaFree(fpDevice_B);
    cudaFree(fpDevice_C);

    cudaDeviceReset();
    return 0;
}
```
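The listing calls `setGPU()` and `ErrorCheck()` from `../tools/common.cuh`, which is not reproduced in this post. Below is a minimal sketch of what that header might contain, inferred only from the calls above and from the messages in the output further down; the real header may differ.

```cuda
// common.cuh -- a hypothetical reconstruction, not the original helper header
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Print details for any non-success CUDA error code, tagged with file and line.
inline cudaError_t ErrorCheck(cudaError_t error_code, const char *filename, int lineNumber)
{
    if (error_code != cudaSuccess)
    {
        printf("CUDA error:\ncode=%d, name=%s, description=%s\nfile=%s, line%d\n",
               error_code, cudaGetErrorName(error_code),
               cudaGetErrorString(error_code), filename, lineNumber);
    }
    return error_code;
}

// Select GPU 0 for computing (assumed behaviour, matching the printed messages).
inline void setGPU(void)
{
    int iDeviceCount = 0;
    if (ErrorCheck(cudaGetDeviceCount(&iDeviceCount), __FILE__, __LINE__) != cudaSuccess
        || iDeviceCount == 0)
    {
        printf("No CUDA-capable GPU found!\n");
        exit(-1);
    }
    printf("The count of GPUs is %d.\n", iDeviceCount);

    if (ErrorCheck(cudaSetDevice(0), __FILE__, __LINE__) != cudaSuccess)
    {
        printf("Fail to set GPU 0 for computing.\n");
        exit(-1);
    }
    printf("set GPU 0 for computing.\n");
}
```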
>>> output:
```
The count of GPUs is 1.
set GPU 0 for computing.
CUDA error:
code=9, name=cudaErrorInvalidConfiguration, description=invalid configuration argument
file=errorCheckKernel.cu, line95
idx= 1  matrix_A:0.90   matrix_B:10.90  result=0.00
idx= 2  matrix_A:19.00  matrix_B:4.00   result=0.00
idx= 3  matrix_A:15.80  matrix_B:15.00  result=0.00
idx= 4  matrix_A:5.00   matrix_B:3.30   result=0.00
idx= 5  matrix_A:11.10  matrix_B:4.20   result=0.00
idx= 6  matrix_A:23.50  matrix_B:18.60  result=0.00
idx= 7  matrix_A:20.90  matrix_B:4.50   result=0.00
idx= 8  matrix_A:23.40  matrix_B:17.70  result=0.00
idx= 9  matrix_A:16.90  matrix_B:18.40  result=0.00
idx=10  matrix_A:7.30   matrix_B:18.30  result=0.00
```
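The error is expected: `dim3 block(2048)` requests 2048 threads per block, but CUDA limits a block to 1024 threads (`cudaDeviceProp::maxThreadsPerBlock`), so the launch is rejected with `cudaErrorInvalidConfiguration`. `ErrorCheck(cudaGetLastError(), ...)` reports it, the kernel never runs, and `fpHost_C` keeps the zeros written by `cudaMemset`, which is why every `result` prints 0.00. A sketch of a legal configuration (any block size up to 1024 would remove the error):

```cuda
// Replace the launch configuration in main() with a legal block size.
dim3 block(512);                                  // 512 <= 1024, so the launch is valid
dim3 grid((iElemCount + block.x - 1) / block.x);  // 8 blocks cover all 4096 elements

addFromGPU<<<grid, block>>>(fpDevice_A, fpDevice_B, fpDevice_C, iElemCount);
ErrorCheck(cudaGetLastError(), __FILE__, __LINE__);       // now returns cudaSuccess
ErrorCheck(cudaDeviceSynchronize(), __FILE__, __LINE__);
```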