1. deviceQuery

https://github.com/NVIDIA/cuda-samples

Clone:

git clone https://github.com/NVIDIA/cuda-samples.git

Install cmake:

sudo apt update
sudo apt install -y cmake build-essential git
cmake --version        # Ubuntu 24.04/22.04 usually ship CMake 3.27–3.30

Build:

cd cuda-samples
mkdir build && cd build
cmake .. -DCMAKE_BUILD_TYPE=Release              # generates the project
cmake --build . --target deviceQuery -j$(nproc)  # builds just this sample

Run:

cd ./Samples/1_Utilities/deviceQuery
./deviceQuery

2. Go over concepts

https://gpu-viz.alexxi.dev/

3. First look at the memory layout

https://github.com/Infatoshi/cuda-course/tree/master/05_Writing_your_First_Kernels/01%20CUDA%20Basics

git clone https://github.com/Infatoshi/cuda-course
cd cuda-course/05_Writing_your_First_Kernels/01\ CUDA\ Basics/
nvcc 01_idxing.cu -o 01_idxing && ./01_idxing
//-----------------------------------------------------------------------------
// File:    whoami_annotated.cu
// Purpose: Illustrate CUDA grid-/block-/thread indexing by assigning each
//          thread a unique “global person ID” in a 3-D apartment-complex
//          analogy and printing it.  The program shows how the built-in
//          variables
//              blockIdx, blockDim, gridDim, threadIdx
//          can be combined to calculate
//              * a linear block ID within the grid, and
//              * a linear thread ID within the block, and
//              * a linear thread ID within the entire grid.
//
// Compile & run (compute capability ≥ 2.x):
//     nvcc -o whoami whoami_annotated.cu && ./whoami | sort -n   # sort for clarity
//-----------------------------------------------------------------------------
#include <cstdio>
 
///////////////////////////////////////////////////////////////////////////////
// GPU kernel
///////////////////////////////////////////////////////////////////////////////
__global__ void whoami() {
    // Flatten the 3-D block coordinates (x fastest-varying, z slowest)
    // into a single linear block number within the grid — the "which
    // apartment in the city" part of the analogy.
    const int linear_block =
        blockIdx.z * gridDim.y * gridDim.x   // whole xy-slabs below us
        + blockIdx.y * gridDim.x             // whole rows below us in this slab
        + blockIdx.x;                        // position within the row

    // Size of one block in threads, and the global index of the first
    // thread belonging to this block.
    const int block_size      = blockDim.z * blockDim.y * blockDim.x;
    const int first_in_block  = linear_block * block_size;

    // Flatten the 3-D thread coordinates with the same x-fastest rule,
    // giving this thread's rank inside its own block.
    const int rank_in_block =
        threadIdx.z * blockDim.y * blockDim.x
        + threadIdx.y * blockDim.x
        + threadIdx.x;

    // Unique ID for this thread across the entire grid.
    const int grid_rank = first_in_block + rank_in_block;

    // Report the mapping; "%04d" zero-pads the global ID so sorted
    // output lines up in columns.
    printf("%04d | Block(%d %d %d) = %3d | Thread(%d %d %d) = %3d\n",
        grid_rank,
        blockIdx.x, blockIdx.y, blockIdx.z, linear_block,
        threadIdx.x, threadIdx.y, threadIdx.z, rank_in_block);
}
 
///////////////////////////////////////////////////////////////////////////////
// Host (CPU) code
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
    //---------------------------------------------------------------------
    // 1. Define launch geometry.
    //    Grid:  2 × 3 × 4  = 24 blocks
    //    Block: 4 × 4 × 4  = 64 threads  (2 warps per block because 64 / 32 = 2)
    //---------------------------------------------------------------------
    const int b_x = 2, b_y = 3, b_z = 4;         // gridDim.{x,y,z}
    const int t_x = 4, t_y = 4, t_z = 4;         // blockDim.{x,y,z}

    //---------------------------------------------------------------------
    // 2. Sanity info printed on the host, so we know what to expect.
    //---------------------------------------------------------------------
    const int blocks_per_grid   = b_x * b_y * b_z;     // 24
    const int threads_per_block = t_x * t_y * t_z;     // 64

    printf("%d blocks/grid\n",   blocks_per_grid);
    printf("%d threads/block\n", threads_per_block);
    printf("%d total threads\n\n", blocks_per_grid * threads_per_block);

    //---------------------------------------------------------------------
    // 3. Launch the kernel.
    //---------------------------------------------------------------------
    dim3 blocksPerGrid(b_x, b_y, b_z);   // <<<grid>>>
    dim3 threadsPerBlock(t_x, t_y, t_z); // <<<block>>>

    whoami<<<blocksPerGrid, threadsPerBlock>>>();

    //---------------------------------------------------------------------
    // 4. Check for errors. Kernel launches are asynchronous and do not
    //    return a status themselves:
    //      * bad launch configurations surface via cudaGetLastError()
    //        immediately after the launch;
    //      * faults during execution surface at the next synchronizing
    //        call, so the return value of cudaDeviceSynchronize() must
    //        be inspected too (it also makes the host wait for the GPU
    //        to finish before the program exits).
    //---------------------------------------------------------------------
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}
 

4. Vector add

https://leetgpu.com/challenges/vector-addition

5. Matrix multiplication

https://leetgpu.com/challenges/matrix-multiplication