1. deviceQuery

https://github.com/NVIDIA/cuda-samples

Clone:

git clone https://github.com/NVIDIA/cuda-samples.git

Install cmake:

sudo apt update
sudo apt install -y cmake build-essential git
cmake --version        # Ubuntu 24.04/22.04 usually ship CMake 3.27–3.30

Build:

cd cuda-samples
mkdir build && cd build
cmake .. -DCMAKE_BUILD_TYPE=Release              # generates the project
cmake --build . --target deviceQuery -j$(nproc)  # builds just this sample

Run:

cd ./Samples/1_Utilities/deviceQuery
./deviceQuery

2. Go over concepts

https://gpu-viz.alexxi.dev/

3. First look at the memory layout

https://github.com/Infatoshi/cuda-course/tree/master/05_Writing_your_First_Kernels/01%20CUDA%20Basics

git clone https://github.com/Infatoshi/cuda-course
cd cuda-course/05_Writing_your_First_Kernels/01\ CUDA\ Basics/
nvcc 01_idxing.cu -o 01_idxing && ./01_idxing
//-----------------------------------------------------------------------------
// File:    whoami_annotated.cu
// Purpose: Illustrate CUDA grid-/block-/thread indexing by assigning each
//          thread a unique “global person ID” in a 3-D apartment-complex
//          analogy and printing it.  The program shows how the built-in
//          variables
//              blockIdx, blockDim, gridDim, threadIdx
//          can be combined to calculate
//              * a linear block ID within the grid, and
//              * a linear thread ID within the block, and
//              * a linear thread ID within the entire grid.
//
// Compile & run (compute capability ≥ 2.x):
//     nvcc -o whoami whoami_annotated.cu && ./whoami | sort -n   # sort for clarity
//-----------------------------------------------------------------------------
#include <cstdio>
 
///////////////////////////////////////////////////////////////////////////////
// GPU kernel
///////////////////////////////////////////////////////////////////////////////
__global__ void whoami() {
    // Flatten the 3-D block coordinates (x fastest-varying, z slowest)
    // into a single linear block number within the grid — the "which
    // apartment in the city" part of the analogy.
    const int linear_block =
        blockIdx.z * gridDim.y * gridDim.x   // whole xy-slabs below us
        + blockIdx.y * gridDim.x             // whole rows below us in this slab
        + blockIdx.x;                        // position within the row

    // Size of one block in threads, and the global index of the first
    // thread belonging to this block.
    const int block_size      = blockDim.z * blockDim.y * blockDim.x;
    const int first_in_block  = linear_block * block_size;

    // Flatten the 3-D thread coordinates with the same x-fastest rule,
    // giving this thread's rank inside its own block.
    const int rank_in_block =
        threadIdx.z * blockDim.y * blockDim.x
        + threadIdx.y * blockDim.x
        + threadIdx.x;

    // Unique ID for this thread across the entire grid.
    const int grid_rank = first_in_block + rank_in_block;

    // Report the mapping; "%04d" zero-pads the global ID so sorted
    // output lines up in columns.
    printf("%04d | Block(%d %d %d) = %3d | Thread(%d %d %d) = %3d\n",
        grid_rank,
        blockIdx.x, blockIdx.y, blockIdx.z, linear_block,
        threadIdx.x, threadIdx.y, threadIdx.z, rank_in_block);
}
 
///////////////////////////////////////////////////////////////////////////////
// Host (CPU) code
///////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
    //---------------------------------------------------------------------
    // 1. Define launch geometry.
    //    Grid:  2 × 3 × 4  = 24 blocks
    //    Block: 4 × 4 × 4  = 64 threads  (2 warps per block because 64 / 32 = 2)
    //---------------------------------------------------------------------
    const int b_x = 2, b_y = 3, b_z = 4;         // gridDim.{x,y,z}
    const int t_x = 4, t_y = 4, t_z = 4;         // blockDim.{x,y,z}

    //---------------------------------------------------------------------
    // 2. Sanity info printed on the host, so we know what to expect.
    //---------------------------------------------------------------------
    const int blocks_per_grid   = b_x * b_y * b_z;     // 24
    const int threads_per_block = t_x * t_y * t_z;     // 64

    printf("%d blocks/grid\n",   blocks_per_grid);
    printf("%d threads/block\n", threads_per_block);
    printf("%d total threads\n\n", blocks_per_grid * threads_per_block);

    //---------------------------------------------------------------------
    // 3. Launch the kernel.
    //---------------------------------------------------------------------
    dim3 blocksPerGrid(b_x, b_y, b_z);   // <<<grid>>>
    dim3 threadsPerBlock(t_x, t_y, t_z); // <<<block>>>

    whoami<<<blocksPerGrid, threadsPerBlock>>>();

    //---------------------------------------------------------------------
    // 4. Check for errors. Kernel launches are asynchronous and do not
    //    return a status themselves:
    //      * bad launch configurations surface via cudaGetLastError()
    //        immediately after the launch;
    //      * faults during execution surface at the next synchronizing
    //        call, so the return value of cudaDeviceSynchronize() must
    //        be inspected too (it also makes the host wait for the GPU
    //        to finish before the program exits).
    //---------------------------------------------------------------------
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}
 

4. Vector add

https://leetgpu.com/challenges/vector-addition

5. Matrix multiplication

https://leetgpu.com/challenges/matrix-multiplication