1. deviceQuery
https://github.com/NVIDIA/cuda-samples
Clone:
git clone https://github.com/NVIDIA/cuda-samples.git
Install cmake:
sudo apt update
sudo apt install -y cmake build-essential git
cmake --version # the stock apt package is typically 3.22.x on Ubuntu 22.04 and 3.28.x on 24.04
Build:
cd cuda-samples && mkdir build && cd build
cmake .. -DCMAKE_BUILD_TYPE=Release # generates the project
cmake --build . --target deviceQuery -j$(nproc) # builds just this sample
Run:
cd ./Samples/1_Utilities/deviceQuery
./deviceQuery
2. Go over concepts
3. First look at the memory layout
git clone https://github.com/Infatoshi/cuda-course
cd cuda-course/05_Writing_your_First_Kernels/01\ CUDA\ Basics/
nvcc 01_idxing.cu -o 01_idxing && ./01_idxing
//-----------------------------------------------------------------------------
// File: whoami_annotated.cu
// Purpose: Illustrate CUDA grid-/block-/thread indexing by assigning each
// thread a unique “global person ID” in a 3-D apartment-complex
// analogy and printing it. The program shows how the built-in
// variables
// blockIdx, blockDim, gridDim, threadIdx
// can be combined to calculate
// * a linear block ID within the grid,
// * a linear thread ID within the block, and
// * a linear thread ID within the entire grid.
//
// Compile & run (compute capability ≥ 2.x):
// nvcc -o whoami whoami_annotated.cu && ./whoami | sort -n # sort for clarity
//-----------------------------------------------------------------------------
#include <cstdio>
///////////////////////////////////////////////////////////////////////////////
// GPU kernel
///////////////////////////////////////////////////////////////////////////////
// Prints one line per thread showing how its 3-D block and thread coordinates
// flatten into linear IDs.
//
// Launch layout: any 1-D, 2-D or 3-D grid and block shape. The kernel takes no
// arguments, uses no shared memory, and its only side effect is device-side
// printf (debugging aid; output order across threads is not defined).
//
// Analogy used in the comments below:
//   x = apartment on a floor, y = floor in a building, z = building in the city
//   (each block is an apartment, each thread is a resident of it).
__global__ void whoami() {
    // Flatten the block coordinates, x varying fastest. The nested form is
    // equivalent to  x + y * gridDim.x + z * gridDim.x * gridDim.y.
    const int block_id =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Flatten the thread coordinates inside this block the same way:
    // x + y * blockDim.x + z * blockDim.x * blockDim.y.
    const int thread_offset =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Number of residents per apartment, i.e. threads in one block.
    const int block_size = blockDim.x * blockDim.y * blockDim.z;

    // Every earlier block contributes block_size threads, so the first thread
    // of this block sits at block_id * block_size; add the in-block offset to
    // get an ID that is unique across the whole grid.
    const int global_id = block_id * block_size + thread_offset;

    // "%04d" zero-pads the global ID so that the lines sort and align cleanly.
    printf("%04d | Block(%d %d %d) = %3d | Thread(%d %d %d) = %3d\n",
           global_id,
           blockIdx.x, blockIdx.y, blockIdx.z, block_id,
           threadIdx.x, threadIdx.y, threadIdx.z, thread_offset);
}
///////////////////////////////////////////////////////////////////////////////
// Host (CPU) code
///////////////////////////////////////////////////////////////////////////////
// Host entry point: prints the expected launch geometry, launches whoami over
// a 2 x 3 x 4 grid of 4 x 4 x 4 blocks, and waits for the device to finish.
//
// Returns 0 on success and 1 if the launch or the kernel execution reported a
// CUDA error (the error string is written to stderr).
int main(int argc, char **argv) {
    //---------------------------------------------------------------------
    // 1. Define grid and block dimensions.
    //      Grid:  2 x 3 x 4 = 24 blocks
    //      Block: 4 x 4 x 4 = 64 threads (2 warps per block because 64 / 32 = 2)
    //---------------------------------------------------------------------
    const int b_x = 2, b_y = 3, b_z = 4;   // gridDim.{x,y,z}
    const int t_x = 4, t_y = 4, t_z = 4;   // blockDim.{x,y,z}

    //---------------------------------------------------------------------
    // 2. Sanity info printed on the host, so we know what to expect.
    //---------------------------------------------------------------------
    const int blocks_per_grid   = b_x * b_y * b_z;   // 24
    const int threads_per_block = t_x * t_y * t_z;   // 64
    printf("%d blocks/grid\n", blocks_per_grid);
    printf("%d threads/block\n", threads_per_block);
    printf("%d total threads\n\n", blocks_per_grid * threads_per_block);

    //---------------------------------------------------------------------
    // 3. Launch the kernel.
    //---------------------------------------------------------------------
    dim3 blocksPerGrid(b_x, b_y, b_z);       // <<<grid>>>
    dim3 threadsPerBlock(t_x, t_y, t_z);     // <<<block>>>
    whoami<<<blocksPerGrid, threadsPerBlock>>>();

    // A kernel launch returns nothing; an invalid configuration only shows up
    // through cudaGetLastError(), so query it right after the launch.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "whoami launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    //---------------------------------------------------------------------
    // 4. Wait for the GPU to finish before the program exits. Errors raised
    //    while the kernel was running are reported by this call, and the
    //    device-side printf buffer is flushed here as well.
    //---------------------------------------------------------------------
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "whoami execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
4. Vector add
https://leetgpu.com/challenges/vector-addition