We can use KernelAbstractions.jl to write backend agnostic code for XPU.
("XPU" = CPU, GPU, ...)
Backends
CPU()
CPU(; static=true)
CUDABackend()
Note that functions like
KernelAbstractions.zeros(backend, dtype, N)
automatically initialize in parallel under the hood!
The following AXPY variant is generic and runs on CPUs and GPUs (by NVIDIA, AMD, Intel, ...).
using KernelAbstractions
# Backend-agnostic AXPY kernel: computes y[I] = a * x[I] + y[I] element-wise.
# `@Const(x)` marks `x` as read-only inside the kernel, allowing the backend
# to apply read-only-memory optimizations.
@kernel function axpy_ka!(y, a, @Const(x))
# Linear global work-item index; one element is processed per work item.
I = @index(Global)
# Safe because the launch ndrange is sized to `size(y)` at the call sites.
@inbounds y[I] = a * x[I] + y[I]
end
axpy_ka! (generic function with 4 methods)
using BenchmarkTools
using Random
"""
    generate_input_data(backend::Backend; N, dtype, kwargs...)

Allocate and randomly initialize the AXPY inputs on `backend`: returns the
scalar coefficient `a` and two length-`N` vectors `x`, `y` of element type
`dtype`.
"""
function generate_input_data(backend::Backend; N, dtype, kwargs...)
    # Scalar coefficient, converted to the requested element type.
    coeff = dtype(3.141)
    # Allocate via KernelAbstractions so initialization happens on the
    # backend (in parallel), then fill in place with uniform random values.
    vec_x = KernelAbstractions.zeros(backend, dtype, N)
    vec_y = KernelAbstractions.zeros(backend, dtype, N)
    rand!(vec_x)
    rand!(vec_y)
    return coeff, vec_x, vec_y
end
"""
    measure_perf(backend::Backend; N=2^27, dtype=Float64, verbose=true, kwargs...)

Benchmark the KernelAbstractions AXPY variant `axpy_ka!` on `backend` and
return `(mem_rate, flop_rate)` in GB/s and GFLOP/s. Prints the results when
`verbose=true`.
"""
function measure_perf(backend::Backend; N=2^27, dtype=Float64, verbose=true, kwargs...)
    # input data
    a, x, y = generate_input_data(backend; N, dtype, kwargs...)
    # Instantiate the kernel ONCE, outside the timed region: kernel
    # construction is not part of the AXPY work, and including it inside
    # `@belapsed` (as the original did) pollutes the bandwidth numbers.
    kernel = axpy_ka!(backend, 1024, size(y))
    # time measurement: launch + device synchronization only
    t = @belapsed begin
        $kernel($y, $a, $x, ndrange = $(size(y)))
        KernelAbstractions.synchronize($backend)
    end evals=2 samples=10
    # compute memory bandwidth and flops:
    # AXPY touches 3 arrays' worth of data per element (read x, read y,
    # write y) and performs 2 flops per element (one mul, one add).
    bytes = 3 * sizeof(dtype) * N
    flops = 2 * N
    mem_rate = bytes * 1e-9 / t   # GB/s
    flop_rate = flops * 1e-9 / t  # GFLOP/s
    if verbose
        println("Dtype: $dtype")
        println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits=2))
        println("\tCompute (GFLOP/s): ", round(flop_rate; digits=2))
    end
    return mem_rate, flop_rate
end
measure_perf (generic function with 1 method)
using ThreadPinning
pinthreads(:numa)
measure_perf(CPU());
measure_perf(CPU(; static=true));
Dtype: Float64 Memory Bandwidth (GB/s): 143.11 Compute (GFLOP/s): 11.93 Dtype: Float64 Memory Bandwidth (GB/s): 375.95 Compute (GFLOP/s): 31.33
using CUDA
CUDA.device()
CuDevice(0): NVIDIA A100-SXM4-40GB
measure_perf(CUDABackend());
Dtype: Float64 Memory Bandwidth (GB/s): 1328.14 Compute (GFLOP/s): 110.68
CUBLAS.axpy!
using CUDA
using BenchmarkTools
"Computes AXPY using the built-in function `CUBLAS.axpy!` provided by NVIDIA"
function axpy_cublas!(y, a, x)
    n = length(x)
    # CUDA.@sync blocks until the device has finished the operation so that
    # timing measurements around this call are meaningful.
    return CUDA.@sync CUBLAS.axpy!(n, a, x, y)
end
"""
    measure_perf_cublas(; N=2^27, dtype=Float64, verbose=true, kwargs...)

Benchmark `axpy_cublas!` on the GPU and return `(mem_rate, flop_rate)` in
GB/s and GFLOP/s. Prints the results when `verbose=true`.
"""
function measure_perf_cublas(; N=2^27, dtype=Float64, verbose=true, kwargs...)
    # Input data: scalar coefficient plus two random device vectors.
    coeff = dtype(3.141)
    xs = CUDA.rand(dtype, N)
    ys = CUDA.rand(dtype, N)
    # Time the synchronized CUBLAS call.
    elapsed = @belapsed axpy_cublas!($ys, $coeff, $xs) evals=2 samples=10
    # AXPY moves 3 arrays' worth of data (read x, read y, write y) and
    # performs 2 flops (mul + add) per element.
    nbytes = 3 * sizeof(dtype) * N
    nflops = 2 * N
    mem_rate = nbytes * 1e-9 / elapsed   # GB/s
    flop_rate = nflops * 1e-9 / elapsed  # GFLOP/s
    if verbose
        println("Dtype: $dtype")
        println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits=2))
        println("\tCompute (GFLOP/s): ", round(flop_rate; digits=2))
    end
    return mem_rate, flop_rate
end
measure_perf_cublas (generic function with 1 method)
measure_perf_cublas();
Dtype: Float64 Memory Bandwidth (GB/s): 1332.96 Compute (GFLOP/s): 111.08
using CUDA
"Computes the AXPY via broadcasting"
function axpy_broadcast!(y, a, x)
    # `@.` dots every operation, fusing the element-wise multiply and add
    # into a single kernel; CUDA.@sync waits for the device to finish.
    CUDA.@sync @. y = a * x + y
    return nothing
end
"CUDA kernel for computing AXPY on the GPU"
function _axpy_cudakernel!(y, a, x)
    # Linear global thread index (1-based).
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Guard clause: the launch grid may contain more threads than elements.
    idx > length(y) && return nothing
    @inbounds y[idx] = a * x[idx] + y[idx]
    return nothing
end
"""
    axpy_cuda!(y, a, x; nthreads, nblocks)

Computes AXPY on the GPU using the custom CUDA kernel `_axpy_cudakernel!` above.

NOTE(fix): the parameters were previously named `(a, x, y)` even though every
caller passes `(y, a, x)` positionally (matching the kernel's signature); the
names are corrected here. Positional call semantics are unchanged, so existing
call sites keep working.
"""
function axpy_cuda!(y, a, x; nthreads, nblocks)
    # Launch with the requested configuration and wait for completion.
    CUDA.@sync @cuda(threads=nthreads,
                     blocks=nblocks,
                     _axpy_cudakernel!(y, a, x))
    return nothing
end;
Benchmark all variants:
using BenchmarkTools
using PrettyTables
"GFLOP/s of an AXPY over `len` elements completed in `t` seconds (2 flops/element: one mul, one add)."
function axpy_flops(t; len)
    return 2.0 * len * 1e-9 / t
end
"GB/s of an AXPY over `len` elements of type `dtype` completed in `t` seconds (3 accesses/element: read x, read y, write y)."
function axpy_bandwidth(t; dtype, len)
    return 3.0 * sizeof(dtype) * len * 1e-9 / t
end
# Benchmark all four AXPY variants (broadcast, custom CUDA kernel, CUBLAS,
# KernelAbstractions) on the GPU and print a comparison table of runtime,
# GFLOP/s, and GB/s.
function axpy_gpu_bench_all()
dtype = Float32
nthreads = 1024
nblocks = 500_000
len = nthreads * nblocks # vector length
a = convert(dtype, 3.1415)
# Device input vectors (all ones; the values don't affect the timing).
xgpu = CUDA.ones(dtype, len)
ygpu = CUDA.ones(dtype, len)
# Each timed call synchronizes internally (CUDA.@sync in the variants), so
# the measured times include full device execution.
t_broadcast_gpu = @belapsed axpy_broadcast!($ygpu, $a, $xgpu) samples=10 evals=2
t_cuda_kernel = @belapsed axpy_cuda!($ygpu, $a, $xgpu; nthreads = $nthreads,
nblocks = $nblocks) samples=10 evals=2
t_cublas = @belapsed axpy_cublas!($ygpu, $a, $xgpu) samples=10 evals=2
backend = CUDABackend()
# KernelAbstractions variant: construct the kernel, launch over the full
# vector, then synchronize the backend explicitly.
t_ka = @belapsed begin
kernel = axpy_ka!($backend, $nthreads, $(size(ygpu)))
kernel($ygpu, $a, $xgpu, ndrange = $(size(ygpu)))
KernelAbstractions.synchronize($backend)
end evals=2 samples=10
# Derive throughput metrics from the measured times.
times = [t_broadcast_gpu, t_cuda_kernel, t_cublas, t_ka]
flops = axpy_flops.(times; len)
bandwidths = axpy_bandwidth.(times; dtype, len)
labels = ["Broadcast", "CUDA kernel", "CUBLAS", "KernelAbstractions"]
# One row per variant: runtime converted to milliseconds for display.
data = hcat(labels, 1e3 .* times, flops, bandwidths)
pretty_table(data;
header = (["Variant", "Runtime", "FLOPS", "Bandwidth"],
["", "ms", "GFLOP/s", "GB/s"]))
println("Theoretical Memory Bandwidth of NVIDIA A100: 1555 GB/s")
return nothing
end;
axpy_gpu_bench_all()
┌────────────────────┬─────────┬─────────┬───────────┐ │ Variant │ Runtime │ FLOPS │ Bandwidth │ │ │ ms │ GFLOP/s │ GB/s │ ├────────────────────┼─────────┼─────────┼───────────┤ │ Broadcast │ 5.01772 │ 204.077 │ 1224.46 │ │ CUDA kernel │ 4.61014 │ 222.119 │ 1332.72 │ │ CUBLAS │ 4.59217 │ 222.988 │ 1337.93 │ │ KernelAbstractions │ 4.62176 │ 221.561 │ 1329.36 │ └────────────────────┴─────────┴─────────┴───────────┘ Theoretical Memory Bandwidth of NVIDIA A100: 1555 GB/s