1 / 9

Sample Code Segment (pg. 41 & 42)

Sample Code Segment (pg. 41 & 42). 1. #include "../common/book.h" 2. #define N 10 3. int main( void ) { 4. int a[N], b[N], c[N]; 5. int *dev_a, *dev_b, *dev_c; 6. // allocate the memory on the GPU 7. HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );

emma-levine
Download Presentation

Sample Code Segment (pg. 41 & 42)

An Image/Link below is provided (as is) to download the presentation. Download Policy: Content on the Website is provided to you AS IS for your information and personal use and may not be sold, licensed, or shared on other websites without obtaining consent from its author. Content is provided to you AS IS for your information and personal use only. Download the presentation by clicking this link. If for some reason you are not able to download a presentation, the publisher may have deleted the file from their server.

E N D

Presentation Transcript


  1. Sample Code Segment (pg. 41 & 42) • 1. #include "../common/book.h" • 2. #define N 10 • 3. int main( void ) { • 4. int a[N], b[N], c[N]; • 5. int *dev_a, *dev_b, *dev_c; • 6. // allocate the memory on the GPU • 7. HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) ); • 8. HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) ); • 9. HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) ); • // fill the arrays 'a' and 'b' on the CPU • 10. for (int i=0; i<N; i++) { • 11. a[i] = -i; • 12. b[i] = i * i; • 13.} • // copy the arrays 'a' and 'b' to the GPU • 14. HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ) ); • 15. HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ) ); • 16. add<<<N,1>>>( dev_a, dev_b, dev_c ); • // copy the array 'c' back from the GPU to the CPU • 17. HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int), • cudaMemcpyDeviceToHost ) ); • // display the results • 18. for (int i=0; i<N; i++) { • 19. printf( "%d + %d = %d\n", a[i], b[i], c[i] ); } • // free the memory allocated on the GPU • 20. cudaFree( dev_a ); • 21.cudaFree( dev_b ); • 22. cudaFree( dev_c ); • 23. return 0; • }

  2. Sample Code Illustration – time t1 3. int main( void ) { 4. int a[N], b[N], c[N]; 5. int *dev_a, *dev_b, *dev_c; 2. #define N 10 B C A Idle *dev_a *dev_c *dev_b

  3. Sample Code Illustration – time t1 7. HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) ); 8. HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) ); 9. HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) ); &dev_c &dev_a Idle &dev_b

  4. Sample Code Illustration – time t2 • 14. HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ) ); • 15. HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ) ); 10. for (int i=0; i<N; i++) { 11. a[i] = -i; 12. b[i] = i * i; &dev_a i = 0 i = 1 i = 2 i = 3 i = N a[0] 0 a[1] -1 a[2] -2 a[3] -3 a[n] -9 … a[2] -2 a[1] -1 a[0] 0 b[0] 0 * 0 b[1] 1 * 1 b[2] 2* 2 b[3] 3* 3 b[n] 9 * 9 … … a[3] -3 a[9] -9 &dev_b c[0] c[1] c[2] c[3] c[9] … Array c, currently unused b[1] 1 * 1 b[0] 0 * 0 b[2] 2 * 2 … b[3] 3 * 3 b[9] 9 * 9

  5. Sample Code Illustration – time t3 16. add<<<N,1>>>( dev_a, dev_b, dev_c ); &dev_a &dev_b &dev_a elements, added to &dev_b elements by each thread … … a[2] -2 a[1] -1 a[0] 0 a[3] -3 a[9] -9 -9+81 -1+1 -3+9 -2+4 0 + 0 … Idle a[9] + b[9] a[3] + b[3] a[0] + b[0] a[1] + b[1] a[2] + b[2] 10 total threads, one thread per operation &dev_c b[1] 1 * 1 b[0] 0 * 0 b[2] 2 * 2 c[2] 2 c[1] 0 c[0] 0 c[3] 6 c[9] 72 b[3] 3 * 3 b[9] 9 * 9 …

  6. Sample Code Illustration – time t4 17. HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ) ); dev_c c[0] c[1] c[2] c[3] c[9] … … c[0] 0 c[2] 2 c[1] 0 c[3] 6 c[9] 72

  7. o Sample Code Illustration – time t5 18. for (int i=0; i<N; i++) { 19. printf( "%d + %d = %d\n", a[i], b[i], c[i] ); } For i, being 0 through 9, since it is n large, which is 10. a[i] Added with b[i] Equals c[i] a[0] 0 a[1] -1 a[2] -2 a[3] -3 a[n] -9 … a[n] a[3] a[2] a[1] a[0] b[1] 1 b[2] 4 b[3] 9 b[n] 81 b[0] 0 … b[n] b[3] b[2] b[1] b[0] c[3] 6 c[n] 72 c[1] 0 c[2] 2 c[0] 0 … c[n] c[3] c[2] c[1] c[0]

  8. Sample Code Illustration – time t6 20. cudaFree( dev_a ); 21. cudaFree( dev_b ); 22. cudaFree( dev_c ); dev_b dev_a dev_c 23. return 0;

  9. Diagram Key &dev_c &dev_a CPUGPU Triangle: Represents Array Transparent Oval: Empty Pointer Oval: Represents Pointer Solid Oval: Referenced Pointer Oval [red square]: Individual Array Elements Inverted Triangle: Filled Array Circle [red square]: Pointer Element Size in Bytes Triangle [red square]: Pointer Element Size in Bytes Red Square: One Block Color and Shading Transparent: Empty Blue Tint: Array Color Solid Color: Full Yellow: Dynamic Memory Allocation Parallel Arrays: Identical Colors Dashed Black Border: Pointer ---------------------------------------------------------------------------------------------------------------------------------------------- Red Arrows: Directionality of Memory Transfer Between CPU and GPU, or GPU and CPU Squiggle Arrow: Single Thread … a[2] -2 a[1] -1 a[0] 0 A a[3] -3 a[9] -9 -9+81

More Related