GPU Study Group - Session 1 Exercise Code Files

Exercise 1.1: Environment Setup Commands

Basic CUDA Verification

# Check CUDA compiler version
nvcc --version
 
# Check GPU status and driver
nvidia-smi
 
# List available CUDA devices
nvidia-smi -L

Compile and Run deviceQuery (if available)

# Find CUDA samples (location may vary)
find /usr -name "deviceQuery" 2>/dev/null
# Or try:
# find /opt -name "deviceQuery" 2>/dev/null
 
# If samples are installed:
cd /usr/local/cuda/samples/1_Utilities/deviceQuery
make
./deviceQuery
 
# Alternative: Create simple device query
# (See deviceQuery.cu below if samples not available)

Exercise 1.2: Hello CUDA

Basic Version (hello_cuda.cu)

#include <cuda_runtime.h>
#include <stdio.h>
 
// Kernel: every launched thread prints its own coordinates.
// threadIdx.x / blockIdx.x are built-in per-thread indices; device-side
// printf is serialized and intended for demos/debugging only.
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", threadIdx.x, blockIdx.x);
}
 
// Host entry point: launch the kernel and check for errors.
// A kernel launch returns no status directly — launch-configuration errors
// surface via cudaGetLastError(), and in-kernel faults surface at the next
// synchronizing call, so both must be checked explicitly.
int main() {
    printf("Hello from CPU\n");
    
    // Launch kernel with 2 blocks, 4 threads per block (asynchronous)
    helloFromGPU<<<2, 4>>>();
    
    // Catch launch errors (e.g. invalid grid/block configuration)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    
    // Wait for GPU to finish; execution errors surface at this sync point
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    
    printf("Back to CPU\n");
    return 0;
}

Compilation and Execution

# Compile
nvcc hello_cuda.cu -o hello_cuda
 
# Run
./hello_cuda

Expected Output (GPU lines may appear in any order — thread/block execution order is not guaranteed)

Hello from CPU
Hello from GPU thread 0 in block 0
Hello from GPU thread 1 in block 0
Hello from GPU thread 2 in block 0
Hello from GPU thread 3 in block 0
Hello from GPU thread 0 in block 1
Hello from GPU thread 1 in block 1
Hello from GPU thread 2 in block 1
Hello from GPU thread 3 in block 1
Back to CPU

Variations to Try

// Variation 1: Different grid configuration
helloFromGPU<<<3, 2>>>();  // 3 blocks, 2 threads each
 
// Variation 2: Single block, more threads
helloFromGPU<<<1, 8>>>();  // 1 block, 8 threads
 
// Variation 3: Maximum threads per block (usually 1024)
helloFromGPU<<<1, 1024>>>(); // May hit hardware limits
 
// Variation 4: More detailed thread info
// Prints the full 3-D block and thread coordinates of each thread.
__global__ void detailedHello() {
    printf("Block [%d,%d,%d] Thread [%d,%d,%d]\n",
           blockIdx.x, blockIdx.y, blockIdx.z,
           threadIdx.x, threadIdx.y, threadIdx.z);
}

Exercise 1.3: Simple Device Query (if samples not available)

deviceQuery.cu (Simplified Version)

#include <cuda_runtime.h>
#include <stdio.h>
 
// Query and print the key properties of every CUDA device on the system.
// Both runtime calls are checked: if the driver/runtime is broken,
// deviceCount and prop would otherwise be read uninitialized.
int main() {
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    
    printf("Number of CUDA devices: %d\n\n", deviceCount);
    
    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        err = cudaGetDeviceProperties(&prop, i);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    i, cudaGetErrorString(err));
            return 1;
        }
        
        printf("Device %d: %s\n", i, prop.name);
        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
        // totalGlobalMem is in bytes; convert to GiB
        printf("  Total Global Memory: %.2f GB\n", 
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  Multiprocessors: %d\n", prop.multiProcessorCount);
        printf("  Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  Max Block Dimensions: [%d, %d, %d]\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  Max Grid Dimensions: [%d, %d, %d]\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("  Warp Size: %d\n", prop.warpSize);
        // memoryClockRate is reported in kHz
        printf("  Memory Clock Rate: %.2f MHz\n", prop.memoryClockRate / 1000.0);
        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
        // Peak bandwidth = 2 (DDR) * clock (kHz) * bus width (bytes) -> GB/s
        printf("  Peak Memory Bandwidth: %.2f GB/s\n\n",
               2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    }
    
    return 0;
}

Key Specifications to Discuss

Questions for participants:
1. What is your GPU's compute capability? (determines which CUDA features are available)
2. How much global memory? (determines max problem size)
3. How many multiprocessors? (affects parallelism level)
4. What's the max threads per block? (usually 1024)
5. What's the warp size? (should be 32 for NVIDIA GPUs)

Bonus: Error Checking Version

hello_cuda_safe.cu (With Error Checking)

#include <cuda_runtime.h>
#include <stdio.h>
 
// Error checking macro: wrap every CUDA runtime call.  Reports the failing
// file/line and the human-readable error string on stderr, then exits, so a
// sticky error cannot silently poison later calls.  The argument is
// parenthesized so comma expressions expand correctly.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while(0)
 
// Demo kernel: each thread reports its thread/block index via
// device-side printf (serialized; for demonstration only).
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", threadIdx.x, blockIdx.x);
}
 
// Host entry point demonstrating the disciplined launch pattern:
// launch -> cudaGetLastError() -> cudaDeviceSynchronize(), each checked.
int main() {
    printf("Hello from CPU\n");
    
    // Launch kernel: 2 blocks of 4 threads (asynchronous; returns no status)
    helloFromGPU<<<2, 4>>>();
    
    // Check for kernel launch errors
    // (bad launch configurations surface here, not from the launch itself)
    CUDA_CHECK(cudaGetLastError());
    
    // Wait for GPU to finish and check for errors
    // (faults during kernel execution surface at this synchronizing call)
    CUDA_CHECK(cudaDeviceSynchronize());
    
    printf("Back to CPU\n");
    return 0;
}

Google Colab Alternative

If participants don’t have local CUDA setup:

# Google Colab notebook cell 1: Check GPU
!nvidia-smi
 
# Cell 2: Write CUDA file
%%writefile hello_cuda.cu
#include <cuda_runtime.h>
#include <stdio.h>
 
// Minimal demo kernel: one printf per launched thread.
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", 
           threadIdx.x, blockIdx.x);
}
 
int main() {
    printf("Hello from CPU\n");
    // Asynchronous launch: 2 blocks x 4 threads
    helloFromGPU<<<2, 4>>>();
    // Block until the kernel finishes so its printf output is flushed
    // before the process exits
    cudaDeviceSynchronize();
    return 0;
}
 
# Cell 3: Compile and run
!nvcc hello_cuda.cu -o hello_cuda
!./hello_cuda

Troubleshooting Common Issues

Compilation Problems

# If nvcc not found
export PATH=/usr/local/cuda/bin:$PATH
 
# If architecture error
nvcc -arch=sm_50 hello_cuda.cu -o hello_cuda
 
# If linking problems
nvcc -lcudart hello_cuda.cu -o hello_cuda

Runtime Problems

# If no GPU detected
nvidia-smi  # Check if GPU is visible
 
# If driver issues
lsmod | grep nvidia  # Check if driver loaded
 
# If permission issues
ls -la /dev/nvidia*  # Check device permissions

Common Error Messages and Solutions

| Error                      | Likely Cause            | Solution                     |
|----------------------------|-------------------------|------------------------------|
| `nvcc: command not found`  | CUDA not in PATH        | Add to PATH or use full path |
| `no CUDA-capable device`   | No GPU or driver issue  | Check nvidia-smi             |
| `invalid device function`  | Wrong architecture      | Specify correct -arch flag   |
| `kernel launch timeout`    | Infinite loop in kernel | Check kernel logic           |