Introduction to CUDA Programming

Introduction to CUDA Programming Histograms and Sparse Array Multiplication Andreas Moshovos Winter 2009 Based on documents from: NVIDIA & Appendix A of the New P&H Book

Histogram • E.g., Given an image calculate this: • Distribution of values

Sequential Algorithm for (inti = 0; i < BIN_COUNT; i++) result[i] = 0; for (inti = 0; i < dataN; i++) result[data[i]]++; The challenge is that the write access pattern is data dependent No ordering of accesses to memory

Data Race • Thread 1 Thread 2 • X++ X++ • Start with X = 10 • X++ is really • tmp= Read x • Increment tmp • Write tmp into x • Thread 1 Thread 2 • Read 10 Read 10 • 10+1 = 11 10 + 1 = 11 • X = 11 X = 11 • X = 11 and not 12

Parallel Strategy • Distribute work across multiple blocks • Divide input data to blocks • Each block process its own portion • Multiple threads, one per image pixel? • #pixels / #threads per thread • Produces a partial histogram • Could produce multiple histograms • One per thread  no ordering problems here • Merge all partial histograms • Produces the final histogram

Data Structures and Access Patterns result[data[i]]++; • Data[]: • We control: Can be accessed sequentially • Each element accessed only once • Result[]: • Access is data-dependent • Each element may be accessed multiple times • Data[] in global memory • Result[] in shared memory

Sub-Histograms • How many sub-histograms can we fit in shared memory? • Input value range: 0-255, 1 byte • Each histogram needs 256 entries • How many bytes per entry? • That’s data dependent • Let’s assume 32-bits or 4 bytes • 16KB (shared memory) / (256 x 4) (histogram) • 16 sub-histograms at any given point of time • If one per thread then we have less than a warp • Let’s try one histogram per block • many threads per block • Ordering problem persists but within a block

Partial Histogram Data Structure • Array in shared memory • One per block • One column per possible pixel value

Algorithm Overview • Step 1: • Initialize partial histogram • Each thread: • s_Hist[index] = 0 • index += threads per block • Step 2: • Update histogram • Each thread: • read data[index] • update s_Hist[]  conflicts possible • index += Total number of threads • Step 3: • Update global histogram • Each thread • read s_hist[index] • update global histogram • index += threads per block

Simultaneous Updates? • Threads in a block: • update s_Hist[] • All threads: • update global histogram • Without special support this becomes: • register X = value of A • X ++ • A = register X • This is a read-modify-write sequence

The problem with simultaneous updates • What if we do each step individually • r10 = mem[100] 10 r100 = mem[100] 10 • r10++ 11 r100++ 11 • mem[100] = r10 11mem[100] = r100 11 • But we really wanted 12 • What if we had 32 threads running in parallel? • Start with 10 we would want: 10+32 • We may still get 11 • Need to think about this: • Special support: Atomic operations

Atomic Operations • Read-Modify-Write operations that are guaranteed to happen “atomically” • Produces the same result as if the sequence executed in isolation in time • Think of it as “serializing the execution” of all atomics • This is not what necessarily happens • This is how you should think about them

Atomic Operations • Supported both in Shared and Global memory • Example: • atomicAdd (pointer, value) • does: *pointer += value • Atomic Operations • Add, Sub, Inc, Dec • Exch, Min, Max, CAS • Bitwise: And, Or, Xor • Work with (unsigned) integers • Exch works with single FP as well

atomicExch, atomicMin, atomicMax, atomicCAS • atomicExch (pointer, value) • tmp = * pointer • *pointer = value • return tmp • atomicMin (pointer, value) (max is similar) • tmp = *pointer • if (*pointer < value) *pointer = value • return tmp • atomicCAS (pointer, value1, value2) • tmp = *pointer • if (*pointer == value1) *pointer = value2 • return tmp

atomicInc, atomicDec • atomicInc (pointer, value) • tmp = *pointer • if (*pointer < value) (*pointer)++ • else *pointer = 0 • return tmp • atomicDec (pointer, value) • tmp = *pointer • if (*pointer == 0 || *pointer > value) *pointer = value • else (*pointer)-- • return tmp • Allow for warp-around work queues

atomicAnd, atomicOr, atomicXOR • atomicAnd (pointer, value) • tmp = *pointer • *pointer = *pointer & value • return tmp • Others similar

CUDA Implementation - Declarations __global__ void histogram256Kernel (uint *d_Result, uint *d_Data, intdataN){ //Current global thread index const int globalTid= blockIdx.x * blockDim.x + threadIdx.x; //Total number of threads in the compute grid const int numThreads= blockDim.x * gridDim.x; __shared__ uints_Hist[BIN_COUNT];

Clear partial histogram buffer //Clear shared memory buffer for current block before processing for ( int pos = threadIdx.x; pos < BIN_COUNT; pos += blockDim.x) s_Hist[pos] = 0; __syncthreads(); // All threads finished clearing out the // histogram

Generate partial histogram for ( int pos = globalTid; pos < dataN; pos += numThreads){ uint data4 = d_Data[pos]; // coalesced // shared memory is word interleaved // read four pixels per thread with a load word atomicAdd (s_Hist + (data4 >> 0) & 0xFFU, 1); atomicAdd (s_Hist + (data4 >> 8) & 0xFFU, 1); atomicAdd (s_Hist + (data4 >> 16) & 0xFFU, 1); atomicAdd (s_Hist + (data4 >> 24) & 0xFFU, 1); // we are not using atomicInc which has a more // complex structure that atomicAdd } __syncthreads();

Merge partial histogram with global histogram for (int pos = threadIdx.x; pos < BIN_COUNT; pos += blockDim.x){ atomicAdd(d_Result + pos, s_Hist[pos]); // these operate on global memory }

Code overview __global__ void histogram256Kernel (uint *d_Result, uint *d_Data, intdataN){ const intglobalTid = blockIdx.x * blockDim.x + threadIdx.x; const intnumThreads = blockDim.x * gridDim.x; __shared__ uints_Hist[BIN_COUNT]; for (int pos = threadIdx.x; pos < BIN_COUNT; pos += blockDim.x) s_Hist[pos] = 0; __syncthreads (); for (int pos = globalTid; pos < dataN; pos += numThreads){ uint data4 = d_Data[pos]; // coalesced atomicAdd (s_Hist + (data4 >> 0) & 0xFFU, 1); atomicAdd (s_Hist + (data4 >> 8) & 0xFFU, 1); atomicAdd (s_Hist + (data4 >> 16) & 0xFFU, 1); atomicAdd (s_Hist + (data4 >> 24) & 0xFFU, 1); } __syncthreads(); for (int pos = threadIdx.x; pos < BIN_COUNT; pos += blockDim.x) atomicAdd(d_Result + pos, s_Hist[pos]); }

Discussion • s_Hist updates • Conflicts in shared memory • Data Dependent • 16-way conflicts possible and likely • Is there an alternative? • One histogram per thread? • Not enough shared memory • Load data in shared memory • Each thread produces a portion of the s_Hist that maps onto the same bank?

Warp Vote Functions • WARP, not block, grid, etc. ** WARP ** • int__all (int predicate); • evaluates predicate for all threads of the warp and returns non-zero (TRUE) if and only if predicate evaluates to non-zero (TRUE) for all of them. • int __any (int predicate); • evaluates predicate for all threads of the warp and returns non-zero (TRUE) if and only if predicate evaluates to non-zero (TRUE) for any of them.

Warp Vote Functions Example • Original code: for (int pos = threadIdx.x; pos < BIN_COUNT; pos += blockDim.x){ atomicAdd(d_Result + pos, s_Hist[pos]); • Modified w/ __any (): • Update only if there are non-zero values for (int pos = threadIdx.x; pos < BIN_COUNT; pos += blockDim.x){ if (__any (s_Hist[pos] != 0) ) atomicAdd(d_Result + pos, s_Hist[pos]); • Modified w/ __all (): for (int pos = threadIdx.x; pos < BIN_COUNT; pos += blockDim.x){ if (!__all (s_Hist[pos] == 0) ) atomicAdd(d_Result + pos, s_Hist[pos]);

Fence Primitives vs. Syncthreads __syncthreads() • Waits until: • all threads reach this point and • All threads must execute this  otherwise we have deadlock • all their globaland sharedmemory accesses made up to __syncthreads() are visible to all threads within the block. __threadfence_block() • Waits for all global and shared memory accesses made by the thread before the __threadfence_block() to become visible to: • All threads in the block. • Not all threads have to do this __threadfence() • Waits for all global and shared memory accesses made by the thread before the __threadfence() to become visible to: • All threads in the device for global memory • All threads in the block for shared memory • Not all threads have to execute this

Sparse Matrix Multiplication • Sparse Matrix N x N: • number of non-zero entries m is only a small fraction of the total • Representation goal: • store only non-zero entries • Typically: • m = O(N)

Compressed Sparse Row Representation • Av[]: • Array values in row-major order • Aj[]: • Column for corresponding Av[] entry • Ap[]: • row i extends from indexes Ap[i] to Ap[i+1] -1 in Av[] and Aj[]

Matrix x Vector: y = Ax x Ax 2 x 20 + 4 x 30 + 1 x 40

Single Row Multiply • Produces an entry of the result vector float multiply_row (uintrowsize, uint *Aj, // column indices for row float *Av, // nonzero entries for row float *xl // the RHS vector { float sum = 0; for (uint column=0; column<rowsize; column++) sum = Av[column] * x[ Aj[column] ] ; return sum; }

Serial Code void csrmul_serial (uint *Ap, uint *Aj, float *Av, uint num_rows, float *x, float *y) { for (uint row=0; row<num_rows; row++) { uint row_begin = Ap[row]; uint row_end = Ap[row + l]; y[row] = multiply_row ( row_end - row_begin, Aj+row_begin, Av+row_begin, x); } }

CUDA Strategy • Assume that there are many rows • One thread per row

CUDA Kernel __global__ void csrmul_kernel (uint *Ap, uint *Aj, float *Av, uint num_rows, float *x, float *y) uint row = blockIdx.x * blockDim.x + threadIdx.x; uint row_begin = Ap[row]; uint row_end = Ap[row+1]; y[row] = multiply_row (row_end – row_begin, Aj + row_begin, Av + row_begin, x); }

Discussion • We are not using shared memory • all are in global memory • Claim: • rows using x[i] will be rows near row i • Nature of computations using sparse matrices • Block processing rows i through j • cache x[i] through x[j] • load from shared memory if possible • Unroll multiply_row() • Fetch Ap[row+1] from adjacent thread

CSR multiplication using shared memory __global__ void csrmul_cached( uint *Ap, uint *Aj, float *Av, uintnum_rows,float *x, float *y) { __shared__ float cache[blocksize]; uintblock_begin = blockIdx.x * blockDim.x; uintblock_end = block_begin + blockDim.x; uint row = block_begin + threadIdx.x: if (row < num_rows) cache[threadIdx.x] = x[row]; __syncthreads(); if (row < num_rows) // for left over threads at the end { uintrow_begin = Ap[row]; uintrow_end = Ap[row + l] ; float sum = 0, x_j ; for (uintcol=row_begin; col < row_end; col++ ) { uint j = Aj[col]; // Fetch x_j from our cache when possible if (j >= block_begin&& j < block_end) x_j = cache [j – block_begin]; else x_j = x [j]; sum += Av[col] * x_j ; } y[row] = sum; } }

Introduction to CUDA Programming