GPU & CUDA Labwork, Week 6. Bin ZHOU, USTC: Fall 2012
Lab Session
Goal: independently attempt CUDA program optimization
Tools: CUDA 4.2; Linux primarily, Windows as an alternative; GTX 640
Method: hands-on lab experiments
Lab Contents
1) Coalesced memory access experiment
2) Warp experiment
3) Bank conflict experiment
Illustrative pseudocode is provided.
Disclaimer
• The pseudocode is not guaranteed to work; it is only illustrative. Please write proper code yourself.
• After four weeks of lab work, students should already know basic CUDA programming well, so only minimal support will be given for basic questions.
Coalesced Memory Access Experiment
Step 1: implement array addition using pattern A.
A[N] = B[N] + C[N]; N > 32768 (pseudocode)

    // Pattern A: each thread handles STRIDE consecutive elements.
    #define STRIDE 8   // this is the "M" from the text; choose it yourself

    __global__ void kernelA(float *A, const float *B, const float *C)
    {
        int base   = blockIdx.x * blockDim.x * STRIDE;   // first element covered by this block
        int offset = threadIdx.x * STRIDE;               // first element of this thread
        for (int i = 0; i < STRIDE; i++)
            A[base + offset + i] = B[base + offset + i] + C[base + offset + i];
    }

Each thread handles M (= STRIDE) elements; choose the block size and the number of blocks accordingly.
Measure performance and record the time (a timing sketch follows below).
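A minimal host-side timing sketch for pattern A, assuming kernelA and STRIDE from the pseudocode above; the block and grid sizes are illustrative choices, and CUDA events are used for timing:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        const int threads = 256;                     // threads per block (assumed)
        const int blocks  = 64;                      // number of blocks (assumed)
        const int N = blocks * threads * STRIDE;     // total elements, > 32768

        float *dA, *dB, *dC;
        cudaMalloc(&dA, N * sizeof(float));
        cudaMalloc(&dB, N * sizeof(float));
        cudaMalloc(&dC, N * sizeof(float));

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start);
        kernelA<<<blocks, threads>>>(dA, dB, dC);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);      // elapsed kernel time in milliseconds
        printf("pattern A: %.3f ms\n", ms);

        cudaFree(dA);  cudaFree(dB);  cudaFree(dC);
        return 0;
    }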
Step 2: use pattern B.
A[N] = B[N] + C[N]; N > 32768 (pseudocode)

    __global__ void kernelB(float *A, const float *B, const float *C)
    {
        int base   = blockIdx.x * blockDim.x;   // first element covered by this block
        int offset = threadIdx.x;               // consecutive threads touch consecutive elements
        A[base + offset] = B[base + offset] + C[base + offset];
    }

Each thread handles one element; choose the block size and the number of blocks accordingly.
Measure performance and compare with pattern A.
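For the same N the two patterns need different grid sizes; a sketch, assuming N is a multiple of threads * STRIDE:

    const int threads = 256;
    const int blocksA = N / (threads * STRIDE);   // pattern A: STRIDE elements per thread
    const int blocksB = N / threads;              // pattern B: one element per thread
    kernelB<<<blocksB, threads>>>(dA, dB, dC);

Because pattern B lets consecutive threads read consecutive addresses, its accesses coalesce into few memory transactions, so it is expected to run noticeably faster than pattern A.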
Coalesced Memory Access (example)
• A float3 needs 12 bytes: float3 a = d_in[threadIdx.x]
• Each thread therefore performs 3 separate read operations
• The accesses of a warp fall on non-contiguous addresses
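A minimal sketch of this naive float3 access; the add-2-to-each-component work and the kernel name accessFloat3 are illustrative assumptions:

    // Naive version: each thread loads a 12-byte float3 straight from global memory,
    // so a warp's loads are split into several non-contiguous transactions.
    __global__ void accessFloat3(float3 *d_in, float3 *d_out)
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        float3 a = d_in[index];     // 12-byte element, typically split into smaller loads
        a.x += 2.0f;
        a.y += 2.0f;
        a.z += 2.0f;
        d_out[index] = a;           // likewise split on the store side
    }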
Coalesced Memory Access (example)
Step 3 is similar to the previous two steps, but the starting address begins at 512.
• Use shared memory to achieve coalescing
• With 256 threads per block, each block needs sizeof(float3) × 256 bytes of shared memory
• Each thread reads 3 floats
• Address offsets: 0, (threads per block), 2 × (threads per block)
Coalesced Memory Access (example)
• Address offsets per thread:
  • threadIdx.x
  • threadIdx.x + 256
  • threadIdx.x + 512
The concrete code is sketched below.
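A sketch of the shared-memory version using the offsets above; the block size of 256 and the add-2 per-component work are assumptions carried over from the earlier example:

    // Coalesced version: the block first reads its 256 float3 values (768 floats)
    // as plain floats, so consecutive threads read consecutive 4-byte words.
    __global__ void accessFloat3Shared(float3 *d_in, float3 *d_out)
    {
        __shared__ float s_data[256 * 3];               // sizeof(float3) * 256 bytes per block

        int block_base = blockIdx.x * blockDim.x * 3;   // first float owned by this block
        float *g_in  = (float *)d_in;
        float *g_out = (float *)d_out;

        // Three coalesced reads at offsets threadIdx.x, threadIdx.x + 256, threadIdx.x + 512.
        s_data[threadIdx.x]       = g_in[block_base + threadIdx.x];
        s_data[threadIdx.x + 256] = g_in[block_base + threadIdx.x + 256];
        s_data[threadIdx.x + 512] = g_in[block_base + threadIdx.x + 512];
        __syncthreads();

        // Each thread now works on its own float3 entirely out of shared memory.
        float3 a = ((float3 *)s_data)[threadIdx.x];
        a.x += 2.0f;  a.y += 2.0f;  a.z += 2.0f;
        ((float3 *)s_data)[threadIdx.x] = a;
        __syncthreads();

        // Three coalesced writes back to global memory.
        g_out[block_base + threadIdx.x]       = s_data[threadIdx.x];
        g_out[block_base + threadIdx.x + 256] = s_data[threadIdx.x + 256];
        g_out[block_base + threadIdx.x + 512] = s_data[threadIdx.x + 512];
    }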
Warp Experiment
Modify the addition code above so that odd-numbered threads do an addition and even-numbered threads do a subtraction, and insert a delay loop with M large enough.

    #define M 100000   // delay length; choose it large enough (the compiler may shorten the loop)

    __global__ void kernelDivergeThread(float *A, const float *B, const float *C)
    {
        int base   = blockIdx.x * blockDim.x;
        int offset = threadIdx.x;
        float k = 0.0f;
        if (threadIdx.x % 2 == 1)
        {
            for (int x = 0; x < M; x++)   // artificial delay
                k = k + 5;
            A[base + offset] = B[base + offset] + C[base + offset] + k;
        }
        else
        {
            for (int x = 0; x < M; x++)   // artificial delay
                k = k + 5;
            A[base + offset] = B[base + offset] + C[base + offset] - k;
        }
    }

Because odd and even threads sit in the same warp, the two branches are executed one after the other (warp divergence).
Now modify the code so that odd-numbered warps do the addition and even-numbered warps do the subtraction, again with a delay loop and M large enough.

    __global__ void kernelDivergeWarp(float *A, const float *B, const float *C)
    {
        int base   = blockIdx.x * blockDim.x;
        int offset = threadIdx.x;
        int warpId = threadIdx.x / 32;    // warp index within the block
        float k = 0.0f;
        if (warpId % 2 == 1)
        {
            for (int x = 0; x < M; x++)   // artificial delay
                k = k + 5;
            A[base + offset] = B[base + offset] + C[base + offset] + k;
        }
        else
        {
            for (int x = 0; x < M; x++)   // artificial delay
                k = k + 5;
            A[base + offset] = B[base + offset] + C[base + offset] - k;
        }
    }

Here the branch condition is uniform within each warp, so there is no divergence; compare the runtime with the previous kernel.
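To compare the two variants, the event-based timing from the coalescing experiment can be reused; a fragment-level sketch, assuming the buffers, events, and launch configuration from the earlier harness:

    float msThread = 0.0f, msWarp = 0.0f;

    cudaEventRecord(start);
    kernelDivergeThread<<<blocks, threads>>>(dA, dB, dC);   // divergence inside every warp
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msThread, start, stop);

    cudaEventRecord(start);
    kernelDivergeWarp<<<blocks, threads>>>(dA, dB, dC);     // branch uniform per warp
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msWarp, start, stop);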
Bank Conflict Experiment
A = Bᵀ (matrix transpose)

    __global__ void transposeNaive(float *odata, float *idata,
                                   int width, int height)
    {
        int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
        int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
        int index_in  = xIndex + width * yIndex;    // coalesced read
        int index_out = yIndex + height * xIndex;   // strided (uncoalesced) write
        odata[index_out] = idata[index_in];
    }
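A possible launch configuration for the transpose kernels; TILE_DIM = 16 (so 256 threads per block) and the requirement that width and height be multiples of TILE_DIM are assumptions:

    #define TILE_DIM 16   // tile size; must match the shared-memory tile in the kernels

    dim3 block(TILE_DIM, TILE_DIM);                  // one thread per tile element
    dim3 grid(width / TILE_DIM, height / TILE_DIM);  // one block per tile
    transposeNaive<<<grid, block>>>(d_odata, d_idata, width, height);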
Coalesced, but with bank conflicts

    __global__ void transposeCoalesced(float *odata, float *idata,
                                       int width, int height)
    {
        __shared__ float tile[TILE_DIM][TILE_DIM];

        int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
        int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
        int index_in = xIndex + yIndex * width;      // coalesced read

        xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
        yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
        int index_out = xIndex + yIndex * height;    // coalesced write

        tile[threadIdx.y][threadIdx.x] = idata[index_in];
        __syncthreads();
        odata[index_out] = tile[threadIdx.x][threadIdx.y];   // column read from the tile
    }

The global accesses are now coalesced, but reading tile[threadIdx.x][threadIdx.y] makes the threads of a warp read a column of the tile, so many of them hit the same shared-memory bank and the accesses are serialized.
Remove the Conflict
Pad the shared-memory tile by one extra column:

    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

The padding shifts each row of the tile into a different bank, so reading a column no longer causes bank conflicts.
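For reference, a sketch of the complete conflict-free kernel; it is identical to transposeCoalesced except for the padded tile (the kernel name follows the NVIDIA transpose sample):

    __global__ void transposeNoBankConflicts(float *odata, float *idata,
                                             int width, int height)
    {
        // The extra column of padding shifts each row into a different bank.
        __shared__ float tile[TILE_DIM][TILE_DIM + 1];

        int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
        int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
        int index_in = xIndex + yIndex * width;

        xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
        yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
        int index_out = xIndex + yIndex * height;

        tile[threadIdx.y][threadIdx.x] = idata[index_in];
        __syncthreads();
        odata[index_out] = tile[threadIdx.x][threadIdx.y];   // column read, now conflict-free
    }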