/**
 * Block-level integer sum reduction with 8-way unrolling and a
 * warp-level tail.
 *
 * Launch layout: 1-D grid of 1-D blocks. Block b consumes the
 * 8 * blockDim.x contiguous ints starting at g_idata[b * blockDim.x * 8],
 * so the grid needs ceil(n / (8 * blockDim.x)) blocks.
 *
 * Preconditions: blockDim.x is a power of two and a multiple of the warp
 * size (32). Requires CUDA 9+ for __syncwarp().
 *
 * @param g_idata  Input array of n ints in global memory. It is used as
 *                 scratch space and is MODIFIED in place.
 * @param g_odata  Output array with one int per block; block b stores its
 *                 partial sum in g_odata[b].
 * @param n        Number of valid elements in g_idata. Elements at index
 *                 >= n are never read and contribute 0.
 */
__global__ void reduceUnrollWarp8(int *g_idata, int *g_odata, unsigned int n)
{
    const unsigned int tid = threadIdx.x;

    // 64-bit index math: blockIdx.x * blockDim.x * 8 can exceed 32 bits.
    const size_t blockBase = static_cast<size_t>(blockIdx.x) * blockDim.x * 8;
    const size_t idx = blockBase + tid;
    const size_t total = n;

    // Start of the slice owned by this block.
    int *idata = g_idata + blockBase;

    // How many of the first blockDim.x slots of this block lie inside the
    // input. No thread returns early: every thread must reach every
    // barrier below, so out-of-range work is masked off instead.
    const size_t remaining = (total > blockBase) ? (total - blockBase) : 0;
    const unsigned int valid =
        (remaining < blockDim.x) ? static_cast<unsigned int>(remaining)
                                 : blockDim.x;

    // Unrolling 8: fold eight strided elements into this thread's slot.
    // Each load is bounds-checked so a partially filled last block still
    // contributes its in-range elements.
    if (idx < total) {
        int sum = 0;
#pragma unroll
        for (unsigned int j = 0; j < 8; ++j) {
            const size_t k = idx + static_cast<size_t>(j) * blockDim.x;
            if (k < total) {
                sum += g_idata[k];
            }
        }
        g_idata[idx] = sum;
    }
    __syncthreads();

    // In-place tree reduction in global memory down to 64 partials.
    for (unsigned int stride = blockDim.x / 2; stride > 32; stride >>= 1) {
        if (tid < stride && tid + stride < valid) {
            idata[tid] += idata[tid + stride];
        }
        // Synchronize within the block before the next halving step.
        __syncthreads();
    }

    // Final steps inside the first warp. The read and the write of each
    // step are separated by __syncwarp(): lanes are not guaranteed to run
    // in lockstep, so relying on volatile alone would be a data race.
    if (tid < 32) {
        volatile int *vmem = idata;
#pragma unroll
        for (unsigned int offset = 32; offset > 0; offset >>= 1) {
            const bool active = (tid + offset < valid);
            const int partner = active ? vmem[tid + offset] : 0;
            __syncwarp();
            if (active) {
                vmem[tid] += partner;
            }
            __syncwarp();
        }
    }

    // Write the result for this block to global memory.
    if (tid == 0) {
        g_odata[blockIdx.x] = (valid > 0) ? idata[0] : 0;
    }
}
__global__ voidreduceCompleteUnrollWarp8(int * g_idata,int * g_odata,unsignedint n) { //set thread ID unsignedint tid = threadIdx.x; unsignedint idx = blockDim.x*blockIdx.x*8+threadIdx.x; //boundary check if (tid >= n) return; //convert global data pointer to the int *idata = g_idata + blockIdx.x*blockDim.x*8; if(idx+7 * blockDim.x<n) { int a1=g_idata[idx]; int a2=g_idata[idx+blockDim.x]; int a3=g_idata[idx+2*blockDim.x]; int a4=g_idata[idx+3*blockDim.x]; int a5=g_idata[idx+4*blockDim.x]; int a6=g_idata[idx+5*blockDim.x]; int a7=g_idata[idx+6*blockDim.x]; int a8=g_idata[idx+7*blockDim.x]; g_idata[idx]=a1+a2+a3+a4+a5+a6+a7+a8;
} __syncthreads(); //in-place reduction in global memory if(blockDim.x>=1024 && tid <512) idata[tid]+=idata[tid+512]; __syncthreads(); if(blockDim.x>=512 && tid <256) idata[tid]+=idata[tid+256]; __syncthreads(); if(blockDim.x>=256 && tid <128) idata[tid]+=idata[tid+128]; __syncthreads(); if(blockDim.x>=128 && tid <64) idata[tid]+=idata[tid+64]; __syncthreads(); //write result for this block to global mem if(tid<32) { volatileint *vmem = idata; vmem[tid]+=vmem[tid+32]; vmem[tid]+=vmem[tid+16]; vmem[tid]+=vmem[tid+8]; vmem[tid]+=vmem[tid+4]; vmem[tid]+=vmem[tid+2]; vmem[tid]+=vmem[tid+1];