Matrix-vector product
This example illustrates different ways of computing matrix-vector products using the Julia language.
This page comes from a single Julia file: mul-mat-vec.jl
.
You can access the source code for such Julia documentation using the 'Edit on GitHub' link in the top right. You can view the corresponding notebook in nbviewer here: mul-mat-vec.ipynb
, or open it in binder here: mul-mat-vec.ipynb
.
Setup
Add the Julia packages used in this demo. Change false
to true
in the following code block if you are using any of the following packages for the first time.
if false
import Pkg
Pkg.add([
"BenchmarkTools"
"InteractiveUtils"
])
end
Tell Julia to use the following packages. Run Pkg.add()
in the preceding code block first, if needed.
using BenchmarkTools: @benchmark
using InteractiveUtils: versioninfo
Overview of matrix-vector multiplication
The product between a matrix and a compatible vector is such a basic method in linear algebra that, of course, Julia has a function *
built-in for it.
In practice one should simply call that method via A * x
or possibly *(A, x)
.
This demo explores other ways of coding the product, to explore techniques for writing efficient code.
We write each method as a function because the most reliable way to benchmark different methods is to use functions.
Built-in *
Conventional high-level matrix-vector multiply function:
function mul0(A::Matrix, x::Vector)
@boundscheck size(A,2) == length(x) || error("DimensionMismatch(A,x)")
return A * x
end;
Double loop over m,n
This is the textbook version.
function mul_mn(A::Matrix, x::Vector)
(M,N) = size(A)
y = similar(x, M)
for m in 1:M
inprod = zero(eltype(x)) # accumulator
for n in 1:N
inprod += A[m,n] * x[n]
end
y[m] = inprod
end
return y
end;
Using @inbounds
function mul_mn_inbounds(A::Matrix, x::Vector)
(M,N) = size(A)
@assert N == length(x)
y = similar(x, M)
for m in 1:M
inprod = zero(x[1]) # accumulator
for n in 1:N
@inbounds inprod += A[m,n] * x[n]
end
@inbounds y[m] = inprod
end
return y
end;
Double loop over n,m
We expect this way to be faster because of cache access over m
.
function mul_nm(A::Matrix, x::Vector)
(M,N) = size(A)
y = zeros(eltype(x), M)
for n in 1:N
for m in 1:M
y[m] += A[m,n] * x[n]
end
end
return y
end;
With @inbounds
function mul_nm_inbounds(A::Matrix, x::Vector)
(M,N) = size(A)
@assert N == length(x)
y = zeros(eltype(x), M)
for n in 1:N
for m in 1:M
@inbounds y[m] += A[m,n] * x[n]
end
end
return y
end;
With @inbounds
and @simd
function mul_nm_inbounds_simd(A::Matrix, x::Vector)
(M,N) = size(A)
@assert N == length(x)
y = zeros(eltype(x), M)
for n in 1:N
@simd for m in 1:M
@inbounds y[m] += A[m,n] * x[n]
end
end
return y
end;
And with @views
function mul_nm_inbounds_simd_views(A::Matrix, x::Vector)
(M,N) = size(A)
@assert N == length(x)
y = zeros(eltype(x), M)
for n in 1:N
@simd for m in 1:M
@inbounds @views y[m] += A[m,n] * x[n]
end
end
return y
end;
Row versions
Loop over m
.
function mul_row(A::Matrix, x::Vector)
(M,N) = size(A)
y = similar(x, M)
for m in 1:M
y[m] = transpose(A[m,:]) * x
end
return y
end;
with @inbounds
function mul_row_inbounds(A::Matrix, x::Vector)
(M,N) = size(A)
@assert N == length(x)
y = similar(x, M)
for m in 1:M
@inbounds y[m] = transpose(A[m,:]) * x
end
return y
end;
with @views
function mul_row_views(A::Matrix, x::Vector)
(M,N) = size(A)
y = similar(x, M)
for m in 1:M
@views y[m] = transpose(A[m,:]) * x
end
return y
end;
with both
function mul_row_inbounds_views(A::Matrix, x::Vector)
(M,N) = size(A)
@assert N == length(x)
y = similar(x, M)
for m in 1:M
@inbounds @views y[m] = transpose(A[m,:]) * x
end
return y
end;
Col versions
Loop over n
:
function mul_col(A::Matrix, x::Vector)
(M,N) = size(A)
y = zeros(eltype(x), M)
for n in 1:N
y += A[:,n] * x[n]
end
return y
end;
with broadcast via @.
to coalesce operations:
function mul_col_dot(A::Matrix, x::Vector)
(M,N) = size(A)
y = zeros(eltype(x), M)
for n in 1:N
@. y += A[:,n] * x[n]
end
return y
end;
and with @views
function mul_col_dot_views(A::Matrix, x::Vector)
(M,N) = size(A)
y = zeros(eltype(x), M)
for n in 1:N
@views @. y += A[:,n] * x[n]
end
return y
end;
Test and time each version
The results will depend on the computer used, of course.
M = 2^11
N = M - 4 # non-square to stress test
A = randn(Float32, M, N)
x = randn(Float32, N);
flist = (mul0,
mul_mn, mul_mn_inbounds,
mul_nm, mul_nm_inbounds, mul_nm_inbounds_simd, mul_nm_inbounds_simd_views,
mul_row, mul_row_inbounds, mul_row_views, mul_row_inbounds_views,
mul_col, mul_col_dot, mul_col_dot_views,
);
for f in flist # warm-up and test each version
@assert A * x ≈ f(A, x)
end;
out = Vector{String}(undef, length(flist))
for (i, f) in enumerate(flist) # benchmark timing for each
b = @benchmark $f($A,$x)
tim = round(minimum(b.times)/10^6, digits=1) # in ms
tim = lpad(tim, 4)
name = rpad(f, 27)
alloc = lpad(b.allocs, 5)
mem = round(b.memory/2^10, digits=1)
tmp = "$name : $tim ms $alloc alloc $mem KiB"
out[i] = tmp
isinteractive() && println(tmp)
end
out
14-element Vector{String}:
"mul0 : 0.1 ms 3 alloc 8.1 KiB"
"mul_mn : 19.5 ms 3 alloc 8.1 KiB"
"mul_mn_inbounds : 31.4 ms 3 alloc 8.1 KiB"
"mul_nm : 0.4 ms 3 alloc 8.1 KiB"
"mul_nm_inbounds : 0.2 ms 3 alloc 8.1 KiB"
"mul_nm_inbounds_simd : 0.2 ms 3 alloc 8.1 KiB"
"mul_nm_inbounds_simd_views : 0.2 ms 3 alloc 8.1 KiB"
"mul_row : 31.7 ms 6147 alloc 16520.1 KiB"
"mul_row_inbounds : 31.8 ms 6147 alloc 16520.1 KiB"
"mul_row_views : 24.7 ms 3 alloc 8.1 KiB"
"mul_row_inbounds_views : 29.5 ms 3 alloc 8.1 KiB"
"mul_col : 5.1 ms 18399 alloc 49447.3 KiB"
"mul_col_dot : 1.6 ms 6135 alloc 16487.8 KiB"
"mul_col_dot_views : 0.3 ms 3 alloc 8.1 KiB"
The following results were for a 2017 iMac with 4.2GHz quad-core Intel Core i7 with macOS Mojave 10.14.6 and Julia 1.6.2.
As expected, simple A*x
is the fastest, but one can come quite close to that using proper double loop order with @inbounds
or using "dots" and @views
to coalesce. Without @views
the vector versions have huge memory overhead!
[
"mul0 : 0.9 ms 1 alloc 16.1 KiB"
"mul_mn : 22.5 ms 1 alloc 16.1 KiB"
"mul_mn_inbounds : 22.0 ms 1 alloc 16.1 KiB"
"mul_nm : 3.1 ms 1 alloc 16.1 KiB"
"mul_nm_inbounds : 1.5 ms 1 alloc 16.1 KiB"
"mul_nm_inbounds_simd : 1.5 ms 1 alloc 16.1 KiB"
"mul_nm_inbounds_simd_views : 1.5 ms 1 alloc 16.1 KiB"
"mul_row : 32.8 ms 2049 alloc 33040.1 KiB"
"mul_row_inbounds : 32.7 ms 2049 alloc 33040.1 KiB"
"mul_row_views : 22.4 ms 1 alloc 16.1 KiB"
"mul_row_inbounds_views : 22.4 ms 1 alloc 16.1 KiB"
"mul_col : 16.0 ms 6133 alloc 98894.6 KiB"
"mul_col_dot : 7.0 ms 2045 alloc 32975.6 KiB"
"mul_col_dot_views : 1.5 ms 1 alloc 16.1 KiB"
];
Reproducibility
This page was generated with the following version of Julia:
using InteractiveUtils: versioninfo
io = IOBuffer(); versioninfo(io); split(String(take!(io)), '\n')
11-element Vector{SubString{String}}:
"Julia Version 1.11.1"
"Commit 8f5b7ca12ad (2024-10-16 10:53 UTC)"
"Build Info:"
" Official https://julialang.org/ release"
"Platform Info:"
" OS: Linux (x86_64-linux-gnu)"
" CPU: 4 × AMD EPYC 7763 64-Core Processor"
" WORD_SIZE: 64"
" LLVM: libLLVM-16.0.6 (ORCJIT, znver3)"
"Threads: 1 default, 0 interactive, 1 GC (on 4 virtual cores)"
""
And with the following package versions
import Pkg; Pkg.status()
Status `~/work/book-la-demo/book-la-demo/docs/Project.toml`
[6e4b80f9] BenchmarkTools v1.5.0
[aaaa29a8] Clustering v0.15.7
[35d6a980] ColorSchemes v3.27.1
⌅ [3da002f7] ColorTypes v0.11.5
⌃ [c3611d14] ColorVectorSpace v0.10.0
[717857b8] DSP v0.7.10
[72c85766] Demos v0.1.0 `~/work/book-la-demo/book-la-demo`
[e30172f5] Documenter v1.7.0
[4f61f5a4] FFTViews v0.3.2
[7a1cc6ca] FFTW v1.8.0
[587475ba] Flux v0.14.25
[a09fc81d] ImageCore v0.10.4
[71a99df6] ImagePhantoms v0.8.1
[b964fa9f] LaTeXStrings v1.4.0
[7031d0ef] LazyGrids v1.0.0
[599c1a8e] LinearMapsAA v0.12.0
[98b081ad] Literate v2.20.1
[7035ae7a] MIRT v0.18.2
[170b2178] MIRTjim v0.25.0
[eb30cadb] MLDatasets v0.7.18
[efe261a4] NFFT v0.13.5
[6ef6ca0d] NMF v1.0.3
[15e1cf62] NPZ v0.4.3
[0b1bfda6] OneHotArrays v0.2.5
[429524aa] Optim v1.10.0
[91a5bcdd] Plots v1.40.9
[f27b6e38] Polynomials v4.0.11
[2913bbd2] StatsBase v0.34.3
[d6d074c3] VideoIO v1.1.0
[b77e0a4c] InteractiveUtils v1.11.0
[37e2e46d] LinearAlgebra v1.11.0
[44cfe95a] Pkg v1.11.0
[9a3f8284] Random v1.11.0
Info Packages marked with ⌃ and ⌅ have new versions available. Those with ⌃ may be upgradable, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated`
This page was generated using Literate.jl.