GPU Study Group - Session 1 Exercise Code Files

Exercise 1.1: Environment Setup Commands

Basic CUDA Verification

# Check CUDA compiler version
nvcc --version
 
# Check GPU status and driver
nvidia-smi
 
# List available CUDA devices
nvidia-smi -L

Compile and Run deviceQuery (if available)

# Find CUDA samples (location may vary)
find /usr -name "deviceQuery" 2>/dev/null
# Or try:
# find /opt -name "deviceQuery" 2>/dev/null
 
# If samples are installed:
cd /usr/local/cuda/samples/1_Utilities/deviceQuery
make
./deviceQuery
 
# Alternative: Create simple device query
# (See deviceQuery.cu below if samples not available)

Exercise 1.2: Hello CUDA

Basic Version (hello_cuda.cu)

#include <cuda_runtime.h>
#include <stdio.h>
 
// Kernel: every launched thread prints its own coordinates.
// threadIdx.x / blockIdx.x are built-in per-thread indices; device-side
// printf is serialized and intended for demos/debugging only.
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", threadIdx.x, blockIdx.x);
}
 
// Host entry point: launch the kernel and check for errors.
// A kernel launch returns no status directly — launch-configuration errors
// surface via cudaGetLastError(), and in-kernel faults surface at the next
// synchronizing call, so both must be checked explicitly.
int main() {
    printf("Hello from CPU\n");
    
    // Launch kernel with 2 blocks, 4 threads per block (asynchronous)
    helloFromGPU<<<2, 4>>>();
    
    // Catch launch errors (e.g. invalid grid/block configuration)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    
    // Wait for GPU to finish; execution errors surface at this sync point
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    
    printf("Back to CPU\n");
    return 0;
}

Compilation and Execution

# Compile
nvcc hello_cuda.cu -o hello_cuda
 
# Run
./hello_cuda

Expected Output (GPU lines may appear in any order — thread/block execution order is not guaranteed)

Hello from CPU
Hello from GPU thread 0 in block 0
Hello from GPU thread 1 in block 0
Hello from GPU thread 2 in block 0
Hello from GPU thread 3 in block 0
Hello from GPU thread 0 in block 1
Hello from GPU thread 1 in block 1
Hello from GPU thread 2 in block 1
Hello from GPU thread 3 in block 1
Back to CPU

Variations to Try

// Variation 1: Different grid configuration
helloFromGPU<<<3, 2>>>();  // 3 blocks, 2 threads each
 
// Variation 2: Single block, more threads
helloFromGPU<<<1, 8>>>();  // 1 block, 8 threads
 
// Variation 3: Maximum threads per block (usually 1024)
helloFromGPU<<<1, 1024>>>(); // May hit hardware limits
 
// Variation 4: More detailed thread info
// Prints the full 3-D block and thread coordinates of each thread.
__global__ void detailedHello() {
    printf("Block [%d,%d,%d] Thread [%d,%d,%d]\n",
           blockIdx.x, blockIdx.y, blockIdx.z,
           threadIdx.x, threadIdx.y, threadIdx.z);
}

Exercise 1.3: Simple Device Query (if samples not available)

deviceQuery.cu (Simplified Version)

#include <cuda_runtime.h>
#include <stdio.h>
 
// Query and print the key properties of every CUDA device on the system.
// Both runtime calls are checked: if the driver/runtime is broken,
// deviceCount and prop would otherwise be read uninitialized.
int main() {
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    
    printf("Number of CUDA devices: %d\n\n", deviceCount);
    
    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        err = cudaGetDeviceProperties(&prop, i);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    i, cudaGetErrorString(err));
            return 1;
        }
        
        printf("Device %d: %s\n", i, prop.name);
        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
        // totalGlobalMem is in bytes; convert to GiB
        printf("  Total Global Memory: %.2f GB\n", 
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  Multiprocessors: %d\n", prop.multiProcessorCount);
        printf("  Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  Max Block Dimensions: [%d, %d, %d]\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  Max Grid Dimensions: [%d, %d, %d]\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("  Warp Size: %d\n", prop.warpSize);
        // memoryClockRate is reported in kHz
        printf("  Memory Clock Rate: %.2f MHz\n", prop.memoryClockRate / 1000.0);
        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
        // Peak bandwidth = 2 (DDR) * clock (kHz) * bus width (bytes) -> GB/s
        printf("  Peak Memory Bandwidth: %.2f GB/s\n\n",
               2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    }
    
    return 0;
}

Key Specifications to Discuss

Questions for participants:
1. What is your GPU's compute capability? (determines which CUDA features are available)
2. How much global memory? (determines max problem size)
3. How many multiprocessors? (affects parallelism level)
4. What's the max threads per block? (usually 1024)
5. What's the warp size? (should be 32 for NVIDIA GPUs)

Bonus: Error Checking Version

hello_cuda_safe.cu (With Error Checking)

#include <cuda_runtime.h>
#include <stdio.h>
 
// Error checking macro: wrap every CUDA runtime call.  Reports the failing
// file/line and the human-readable error string on stderr, then exits, so a
// sticky error cannot silently poison later calls.  The argument is
// parenthesized so comma expressions expand correctly.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while(0)
 
// Demo kernel: each thread reports its thread/block index via
// device-side printf (serialized; for demonstration only).
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", threadIdx.x, blockIdx.x);
}
 
// Host entry point demonstrating the disciplined launch pattern:
// launch -> cudaGetLastError() -> cudaDeviceSynchronize(), each checked.
int main() {
    printf("Hello from CPU\n");
    
    // Launch kernel: 2 blocks of 4 threads (asynchronous; returns no status)
    helloFromGPU<<<2, 4>>>();
    
    // Check for kernel launch errors
    // (bad launch configurations surface here, not from the launch itself)
    CUDA_CHECK(cudaGetLastError());
    
    // Wait for GPU to finish and check for errors
    // (faults during kernel execution surface at this synchronizing call)
    CUDA_CHECK(cudaDeviceSynchronize());
    
    printf("Back to CPU\n");
    return 0;
}

Google Colab Alternative

If participants don’t have local CUDA setup:

# Google Colab notebook cell 1: Check GPU
!nvidia-smi
 
# Cell 2: Write CUDA file
%%writefile hello_cuda.cu
#include <cuda_runtime.h>
#include <stdio.h>
 
// Minimal demo kernel: one printf per launched thread.
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", 
           threadIdx.x, blockIdx.x);
}
 
int main() {
    printf("Hello from CPU\n");
    // Asynchronous launch: 2 blocks x 4 threads
    helloFromGPU<<<2, 4>>>();
    // Block until the kernel finishes so its printf output is flushed
    // before the process exits
    cudaDeviceSynchronize();
    return 0;
}
 
# Cell 3: Compile and run
!nvcc hello_cuda.cu -o hello_cuda
!./hello_cuda

Troubleshooting Common Issues

Compilation Problems

# If nvcc not found
export PATH=/usr/local/cuda/bin:$PATH
 
# If architecture error
nvcc -arch=sm_50 hello_cuda.cu -o hello_cuda
 
# If linking problems
nvcc -lcudart hello_cuda.cu -o hello_cuda

Runtime Problems

# If no GPU detected
nvidia-smi  # Check if GPU is visible
 
# If driver issues
lsmod | grep nvidia  # Check if driver loaded
 
# If permission issues
ls -la /dev/nvidia*  # Check device permissions

Common Error Messages and Solutions

| Error                      | Likely Cause            | Solution                     |
|----------------------------|-------------------------|------------------------------|
| `nvcc: command not found`  | CUDA not in PATH        | Add to PATH or use full path |
| `no CUDA-capable device`   | No GPU or driver issue  | Check nvidia-smi             |
| `invalid device function`  | Wrong architecture      | Specify correct -arch flag   |
| `kernel launch timeout`    | Infinite loop in kernel | Check kernel logic           |