We can use KernelAbstractions.jl to write backend agnostic code for XPU.
("XPU" = CPU, GPU, ...)
Backends
CPU()
CPU(; static=true)
CUDABackend()
Note that functions like
KernelAbstractions.zeros(backend, dtype, N)
automatically initialize in parallel under the hood!
The following AXPY variant is generic and runs on CPUs and GPUs (by NVIDIA, AMD, Intel, ...).
using KernelAbstractions
# Backend-agnostic AXPY kernel: computes y[I] = a * x[I] + y[I] element-wise.
# `@Const(x)` marks `x` as read-only inside the kernel, allowing the backend
# to apply read-only-memory optimizations.
@kernel function axpy_ka!(y, a, @Const(x))
# Linear global work-item index; one element is processed per work item.
I = @index(Global)
# Safe because the launch ndrange is sized to `size(y)` at the call sites.
@inbounds y[I] = a * x[I] + y[I]
end
axpy_ka! (generic function with 4 methods)
using BenchmarkTools
using Random
"""
    generate_input_data(backend::Backend; N, dtype, kwargs...)

Allocate and randomly initialize the AXPY inputs on `backend`: returns the
scalar coefficient `a` and two length-`N` vectors `x`, `y` of element type
`dtype`.
"""
function generate_input_data(backend::Backend; N, dtype, kwargs...)
    # Scalar coefficient, converted to the requested element type.
    coeff = dtype(3.141)
    # Allocate via KernelAbstractions so initialization happens on the
    # backend (in parallel), then fill in place with uniform random values.
    vec_x = KernelAbstractions.zeros(backend, dtype, N)
    vec_y = KernelAbstractions.zeros(backend, dtype, N)
    rand!(vec_x)
    rand!(vec_y)
    return coeff, vec_x, vec_y
end
"""
    measure_perf(backend::Backend; N=2^27, dtype=Float64, verbose=true, kwargs...)

Benchmark the KernelAbstractions AXPY variant `axpy_ka!` on `backend` and
return `(mem_rate, flop_rate)` in GB/s and GFLOP/s. Prints the results when
`verbose=true`.
"""
function measure_perf(backend::Backend; N=2^27, dtype=Float64, verbose=true, kwargs...)
    # input data
    a, x, y = generate_input_data(backend; N, dtype, kwargs...)
    # Instantiate the kernel ONCE, outside the timed region: kernel
    # construction is not part of the AXPY work, and including it inside
    # `@belapsed` (as the original did) pollutes the bandwidth numbers.
    kernel = axpy_ka!(backend, 1024, size(y))
    # time measurement: launch + device synchronization only
    t = @belapsed begin
        $kernel($y, $a, $x, ndrange = $(size(y)))
        KernelAbstractions.synchronize($backend)
    end evals=2 samples=10
    # compute memory bandwidth and flops:
    # AXPY touches 3 arrays' worth of data per element (read x, read y,
    # write y) and performs 2 flops per element (one mul, one add).
    bytes = 3 * sizeof(dtype) * N
    flops = 2 * N
    mem_rate = bytes * 1e-9 / t   # GB/s
    flop_rate = flops * 1e-9 / t  # GFLOP/s
    if verbose
        println("Dtype: $dtype")
        println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits=2))
        println("\tCompute (GFLOP/s): ", round(flop_rate; digits=2))
    end
    return mem_rate, flop_rate
end
measure_perf (generic function with 1 method)
using ThreadPinning
pinthreads(:numa)
measure_perf(CPU());
measure_perf(CPU(; static=true));
Dtype: Float64 Memory Bandwidth (GB/s): 143.11 Compute (GFLOP/s): 11.93 Dtype: Float64 Memory Bandwidth (GB/s): 375.95 Compute (GFLOP/s): 31.33
using CUDA
CUDA.device()
CuDevice(0): NVIDIA A100-SXM4-40GB
measure_perf(CUDABackend());
Dtype: Float64 Memory Bandwidth (GB/s): 1328.14 Compute (GFLOP/s): 110.68
CUBLAS.axpy!
using CUDA
using BenchmarkTools
"Computes AXPY using the built-in function `CUBLAS.axpy!` provided by NVIDIA"
function axpy_cublas!(y, a, x)
    n = length(x)
    # CUDA.@sync blocks until the device has finished the operation so that
    # timing measurements around this call are meaningful.
    return CUDA.@sync CUBLAS.axpy!(n, a, x, y)
end
"""
    measure_perf_cublas(; N=2^27, dtype=Float64, verbose=true, kwargs...)

Benchmark `axpy_cublas!` on the GPU and return `(mem_rate, flop_rate)` in
GB/s and GFLOP/s. Prints the results when `verbose=true`.
"""
function measure_perf_cublas(; N=2^27, dtype=Float64, verbose=true, kwargs...)
    # Input data: scalar coefficient plus two random device vectors.
    coeff = dtype(3.141)
    xs = CUDA.rand(dtype, N)
    ys = CUDA.rand(dtype, N)
    # Time the synchronized CUBLAS call.
    elapsed = @belapsed axpy_cublas!($ys, $coeff, $xs) evals=2 samples=10
    # AXPY moves 3 arrays' worth of data (read x, read y, write y) and
    # performs 2 flops (mul + add) per element.
    nbytes = 3 * sizeof(dtype) * N
    nflops = 2 * N
    mem_rate = nbytes * 1e-9 / elapsed   # GB/s
    flop_rate = nflops * 1e-9 / elapsed  # GFLOP/s
    if verbose
        println("Dtype: $dtype")
        println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits=2))
        println("\tCompute (GFLOP/s): ", round(flop_rate; digits=2))
    end
    return mem_rate, flop_rate
end
measure_perf_cublas (generic function with 1 method)
measure_perf_cublas();
Dtype: Float64 Memory Bandwidth (GB/s): 1332.96 Compute (GFLOP/s): 111.08
using CUDA
"Computes the AXPY via broadcasting"
function axpy_broadcast!(y, a, x)
    # `@.` dots every operation, fusing the element-wise multiply and add
    # into a single kernel; CUDA.@sync waits for the device to finish.
    CUDA.@sync @. y = a * x + y
    return nothing
end
"CUDA kernel for computing AXPY on the GPU"
function _axpy_cudakernel!(y, a, x)
    # Linear global thread index (1-based).
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Guard clause: the launch grid may contain more threads than elements.
    idx > length(y) && return nothing
    @inbounds y[idx] = a * x[idx] + y[idx]
    return nothing
end
"""
    axpy_cuda!(y, a, x; nthreads, nblocks)

Computes AXPY on the GPU using the custom CUDA kernel `_axpy_cudakernel!` above.

NOTE(fix): the parameters were previously named `(a, x, y)` even though every
caller passes `(y, a, x)` positionally (matching the kernel's signature); the
names are corrected here. Positional call semantics are unchanged, so existing
call sites keep working.
"""
function axpy_cuda!(y, a, x; nthreads, nblocks)
    # Launch with the requested configuration and wait for completion.
    CUDA.@sync @cuda(threads=nthreads,
                     blocks=nblocks,
                     _axpy_cudakernel!(y, a, x))
    return nothing
end;
Benchmark all variants:
using BenchmarkTools
using PrettyTables
"GFLOP/s of an AXPY over `len` elements completed in `t` seconds (2 flops/element: one mul, one add)."
function axpy_flops(t; len)
    return 2.0 * len * 1e-9 / t
end
"GB/s of an AXPY over `len` elements of type `dtype` completed in `t` seconds (3 accesses/element: read x, read y, write y)."
function axpy_bandwidth(t; dtype, len)
    return 3.0 * sizeof(dtype) * len * 1e-9 / t
end
# Benchmark all four AXPY variants (broadcast, custom CUDA kernel, CUBLAS,
# KernelAbstractions) on the GPU and print a comparison table of runtime,
# GFLOP/s, and GB/s.
function axpy_gpu_bench_all()
dtype = Float32
nthreads = 1024
nblocks = 500_000
len = nthreads * nblocks # vector length
a = convert(dtype, 3.1415)
# Device input vectors (all ones; the values don't affect the timing).
xgpu = CUDA.ones(dtype, len)
ygpu = CUDA.ones(dtype, len)
# Each timed call synchronizes internally (CUDA.@sync in the variants), so
# the measured times include full device execution.
t_broadcast_gpu = @belapsed axpy_broadcast!($ygpu, $a, $xgpu) samples=10 evals=2
t_cuda_kernel = @belapsed axpy_cuda!($ygpu, $a, $xgpu; nthreads = $nthreads,
nblocks = $nblocks) samples=10 evals=2
t_cublas = @belapsed axpy_cublas!($ygpu, $a, $xgpu) samples=10 evals=2
backend = CUDABackend()
# KernelAbstractions variant: construct the kernel, launch over the full
# vector, then synchronize the backend explicitly.
t_ka = @belapsed begin
kernel = axpy_ka!($backend, $nthreads, $(size(ygpu)))
kernel($ygpu, $a, $xgpu, ndrange = $(size(ygpu)))
KernelAbstractions.synchronize($backend)
end evals=2 samples=10
# Derive throughput metrics from the measured times.
times = [t_broadcast_gpu, t_cuda_kernel, t_cublas, t_ka]
flops = axpy_flops.(times; len)
bandwidths = axpy_bandwidth.(times; dtype, len)
labels = ["Broadcast", "CUDA kernel", "CUBLAS", "KernelAbstractions"]
# One row per variant: runtime converted to milliseconds for display.
data = hcat(labels, 1e3 .* times, flops, bandwidths)
pretty_table(data;
header = (["Variant", "Runtime", "FLOPS", "Bandwidth"],
["", "ms", "GFLOP/s", "GB/s"]))
println("Theoretical Memory Bandwidth of NVIDIA A100: 1555 GB/s")
return nothing
end;
axpy_gpu_bench_all()
┌────────────────────┬─────────┬─────────┬───────────┐ │ Variant │ Runtime │ FLOPS │ Bandwidth │ │ │ ms │ GFLOP/s │ GB/s │ ├────────────────────┼─────────┼─────────┼───────────┤ │ Broadcast │ 5.01772 │ 204.077 │ 1224.46 │ │ CUDA kernel │ 4.61014 │ 222.119 │ 1332.72 │ │ CUBLAS │ 4.59217 │ 222.988 │ 1337.93 │ │ KernelAbstractions │ 4.62176 │ 221.561 │ 1329.36 │ └────────────────────┴─────────┴─────────┴───────────┘ Theoretical Memory Bandwidth of NVIDIA A100: 1555 GB/s