# Check CUDA compiler version
nvcc --version

# Check GPU status and driver
nvidia-smi

# List available CUDA devices
nvidia-smi -L
Compile and Run deviceQuery (if available)
# Find CUDA samples (location may vary)
find /usr -name "deviceQuery" 2>/dev/null
# Or try:
# find /opt -name "deviceQuery" 2>/dev/null

# If samples are installed:
cd /usr/local/cuda/samples/1_Utilities/deviceQuery
make
./deviceQuery

# Alternative: Create simple device query
# (See deviceQuery.cu below if samples not available)
Exercise 1.2: Hello CUDA
Basic Version (hello_cuda.cu)
#include <cuda_runtime.h>
#include <stdio.h>

// Kernel: every GPU thread prints its own thread index and block index.
// Device-side printf is for demonstration/debugging only — it serializes output.
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", threadIdx.x, blockIdx.x);
}

int main() {
    printf("Hello from CPU\n");

    // Launch kernel with 2 blocks, 4 threads per block (8 prints total).
    helloFromGPU<<<2, 4>>>();

    // Kernel launches are asynchronous: block here until the GPU finishes
    // so its printf output is flushed before the program exits.
    cudaDeviceSynchronize();

    printf("Back to CPU\n");
    return 0;
}
Hello from CPU
Hello from GPU thread 0 in block 0
Hello from GPU thread 1 in block 0
Hello from GPU thread 2 in block 0
Hello from GPU thread 3 in block 0
Hello from GPU thread 0 in block 1
Hello from GPU thread 1 in block 1
Hello from GPU thread 2 in block 1
Hello from GPU thread 3 in block 1
Back to CPU

(Note: the relative order of the GPU output lines is not guaranteed — lines from different blocks may appear in a different order between runs.)
Variations to Try
// Variation 1: Different grid configuration
helloFromGPU<<<3, 2>>>();    // 3 blocks, 2 threads each

// Variation 2: Single block, more threads
helloFromGPU<<<1, 8>>>();    // 1 block, 8 threads

// Variation 3: Maximum threads per block (usually 1024)
helloFromGPU<<<1, 1024>>>(); // May hit hardware limits

// Variation 4: More detailed thread info
__global__ void detailedHello() {
    printf("Block [%d,%d,%d] Thread [%d,%d,%d]\n",
           blockIdx.x, blockIdx.y, blockIdx.z,
           threadIdx.x, threadIdx.y, threadIdx.z);
}
Exercise 1.3: Simple Device Query (if samples not available)
deviceQuery.cu (Simplified Version)
#include <cuda_runtime.h>
#include <stdio.h>

// Minimal device-query tool: prints the key hardware limits of every
// CUDA device visible to the runtime. Returns 0 on success, 1 if any
// CUDA runtime call fails.
int main() {
    int deviceCount = 0;
    // Check the return code: if no driver/GPU is present, the original
    // unchecked call would leave deviceCount uninitialized and the
    // program would print garbage instead of a useful diagnostic.
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
                cudaGetErrorString(err));
        return 1;
    }
    printf("Number of CUDA devices: %d\n\n", deviceCount);

    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        err = cudaGetDeviceProperties(&prop, i);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    i, cudaGetErrorString(err));
            return 1;
        }
        printf("Device %d: %s\n", i, prop.name);
        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
        printf("  Total Global Memory: %.2f GB\n",
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  Multiprocessors: %d\n", prop.multiProcessorCount);
        printf("  Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  Max Block Dimensions: [%d, %d, %d]\n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  Max Grid Dimensions: [%d, %d, %d]\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("  Warp Size: %d\n", prop.warpSize);
        printf("  Memory Clock Rate: %.2f MHz\n", prop.memoryClockRate / 1000.0);
        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
        // Peak bandwidth = 2 (DDR) * clock (kHz) * bus width (bytes) / 1e6 -> GB/s
        printf("  Peak Memory Bandwidth: %.2f GB/s\n\n",
               2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    }
    return 0;
}
Key Specifications to Discuss
Questions for participants:
1. What is your GPU's compute capability? (affects which CUDA features available)
2. How much global memory? (determines max problem size)
3. How many multiprocessors? (affects parallelism level)
4. What's the max threads per block? (usually 1024)
5. What's the warp size? (should be 32 for NVIDIA GPUs)
Bonus: Error Checking Version
hello_cuda_safe.cu (With Error Checking)
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>  // for exit() — was missing in the original

// Error checking macro: wraps any CUDA runtime call and aborts with
// file/line context on failure. Diagnostics go to stderr (the original
// wrote them to stdout, mixing them with program output).
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = call;                                       \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d - %s\n",             \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(1);                                                  \
        }                                                             \
    } while (0)

// Kernel: each GPU thread prints its thread and block index.
__global__ void helloFromGPU() {
    printf("Hello from GPU thread %d in block %d\n", threadIdx.x, blockIdx.x);
}

int main() {
    printf("Hello from CPU\n");

    // Launch kernel: 2 blocks of 4 threads.
    helloFromGPU<<<2, 4>>>();

    // Kernel launches don't return an error directly — bad launch
    // configurations surface via cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Wait for GPU to finish; in-kernel faults surface here.
    CUDA_CHECK(cudaDeviceSynchronize());

    printf("Back to CPU\n");
    return 0;
}
Google Colab Alternative
If participants don’t have local CUDA setup:
# Google Colab notebook cell 1: Check GPU!nvidia-smi# Cell 2: Write CUDA file%%writefile hello_cuda.cu#include <cuda_runtime.h>#include <stdio.h>__global__ void helloFromGPU() { printf("Hello from GPU thread %d in block %d\n", threadIdx.x, blockIdx.x);}int main() { printf("Hello from CPU\n"); helloFromGPU<<<2, 4>>>(); cudaDeviceSynchronize(); return 0;}# Cell 3: Compile and run!nvcc hello_cuda.cu -o hello_cuda!./hello_cuda
Troubleshooting Common Issues
Compilation Problems
# If nvcc not found
export PATH=/usr/local/cuda/bin:$PATH

# If architecture error
nvcc -arch=sm_50 hello_cuda.cu -o hello_cuda

# If linking problems
nvcc -lcudart hello_cuda.cu -o hello_cuda
Runtime Problems
# If no GPU detected
nvidia-smi              # Check if GPU is visible

# If driver issues
lsmod | grep nvidia     # Check if driver loaded

# If permission issues
ls -la /dev/nvidia*     # Check device permissions