Matrix-vector product

This example illustrates different ways of computing matrix-vector products using the Julia language.

This page comes from a single Julia file: mul-mat-vec.jl.


Setup

Add the Julia packages used in this demo. Change false to true in the following code block if you are using any of these packages for the first time.

if false
    import Pkg
    Pkg.add([
        "BenchmarkTools"
        "InteractiveUtils"
    ])
end

Tell Julia to use the following packages. Run Pkg.add() in the preceding code block first, if needed.

using BenchmarkTools: @benchmark
using InteractiveUtils: versioninfo

Overview of matrix-vector multiplication

The product of a matrix with a compatible vector is such a basic operation in linear algebra that, of course, Julia has a built-in function * for it.

In practice, one should simply call that function via A * x (or, equivalently, *(A, x)).

This demo codes the product in several other ways, to illustrate techniques for writing efficient Julia.

We write each method as a function, because functions are the most reliable way to benchmark alternatives: Julia compiles a specialized version of each function for the argument types involved, whereas code that touches non-constant globals is not optimized.
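
Relatedly, the @benchmark macro used below interpolates its arguments with $ for the same reason; a small sketch of the difference (Ademo and xdemo are illustrative names, not part of this demo):

using BenchmarkTools: @benchmark
Ademo = randn(100, 100) # non-constant global
xdemo = randn(100)
@benchmark Ademo * xdemo # slower: untyped globals are resolved at run time
@benchmark $Ademo * $xdemo # faster: values are spliced into the benchmark itself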

Built-in *

Conventional high-level matrix-vector multiply function:

function mul0(A::Matrix, x::Vector)
    @boundscheck size(A,2) == length(x) || throw(DimensionMismatch("size(A,2) ≠ length(x)"))
    return A * x
end;
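
When the output vector is preallocated, the built-in in-place variant LinearAlgebra.mul! avoids allocating a new result on each call; a minimal sketch (sizes are illustrative):

using LinearAlgebra: mul!
A1 = randn(Float32, 8, 6); x1 = randn(Float32, 6)
y1 = Vector{Float32}(undef, size(A1, 1)) # preallocated output
mul!(y1, A1, x1) # writes A1 * x1 into y1 without allocating a new vector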

Double loop over m,n

This is the textbook version: for each row index m, accumulate the inner product of the m-th row of A with x.

function mul_mn(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m in 1:M
        inprod = zero(eltype(x)) # accumulator
        for n in 1:N
            inprod += A[m,n] * x[n]
        end
        y[m] = inprod
    end
    return y
end;
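
A subtlety here: zero(eltype(x)) gives the accumulator the element type of x, so if eltype(A) is wider (say a Float64 matrix with a Float32 vector), then inprod changes type at the first +=, making the loop type-unstable. A variant that promotes the accumulator type (mul_mn_promote is our name, not part of the original demo):

function mul_mn_promote(A::Matrix, x::Vector)
    (M,N) = size(A)
    T = promote_type(eltype(A), eltype(x)) # widest of the two element types
    y = Vector{T}(undef, M)
    for m in 1:M
        inprod = zero(T) # accumulator in the promoted type
        for n in 1:N
            inprod += A[m,n] * x[n]
        end
        y[m] = inprod
    end
    return y
end;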

Using @inbounds

function mul_mn_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    @assert N == length(x)
    y = similar(x, M)
    for m in 1:M
        inprod = zero(eltype(x)) # accumulator (eltype avoids indexing an empty x)
        for n in 1:N
            @inbounds inprod += A[m,n] * x[n]
        end
        @inbounds y[m] = inprod
    end
    return y
end;

Double loop over n,m

We expect this loop order to be faster because Julia stores arrays in column-major order, so the inner loop over m accesses consecutive memory locations, making better use of the cache.
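
To see the column-major layout concretely, a quick illustration:

B = [1 3 5;
     2 4 6] # 2×3 matrix
vec(B) # [1, 2, 3, 4, 5, 6]: entries within each column are adjacent in memory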

function mul_nm(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n in 1:N
        for m in 1:M
            y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

With @inbounds

function mul_nm_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    @assert N == length(x)
    y = zeros(eltype(x), M)
    for n in 1:N
        for m in 1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

With @inbounds and @simd

function mul_nm_inbounds_simd(A::Matrix, x::Vector)
    (M,N) = size(A)
    @assert N == length(x)
    y = zeros(eltype(x), M)
    for n in 1:N
        @simd for m in 1:M
            @inbounds y[m] += A[m,n] * x[n]
        end
    end
    return y
end;
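
To check whether the compiler actually emits SIMD instructions, one can inspect the generated code; the output is machine-dependent, but vector types such as <8 x float> indicate success. A sketch:

using InteractiveUtils: @code_llvm
Asmall = randn(Float32, 64, 64)
xsmall = randn(Float32, 64)
@code_llvm debuginfo=:none mul_nm_inbounds_simd(Asmall, xsmall)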

And with @views

Here @views is actually a no-op, because the loop body uses only scalar indexing and there are no slices to turn into views; we include it to confirm that it does no harm.

function mul_nm_inbounds_simd_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    @assert N == length(x)
    y = zeros(eltype(x), M)
    for n in 1:N
        @simd for m in 1:M
            @inbounds @views y[m] += A[m,n] * x[n]
        end
    end
    return y
end;

Row versions

Loop over m, computing y[m] as the inner product of the m-th row of A with x.

function mul_row(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m in 1:M
        y[m] = transpose(A[m,:]) * x
    end
    return y
end;
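
The slice A[m,:] copies the row into a new vector on every iteration, which explains the large allocation counts reported below; a view avoids the copy. A quick check (the matrix size is illustrative):

using BenchmarkTools: @ballocated
Abig = randn(Float32, 1000, 1000)
@ballocated $Abig[2,:] # thousands of bytes: slicing copies the row
@ballocated @view $Abig[2,:] # little or nothing: a view merely wraps the parent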

With @inbounds

function mul_row_inbounds(A::Matrix, x::Vector)
    (M,N) = size(A)
    @assert N == length(x)
    y = similar(x, M)
    for m in 1:M
        @inbounds y[m] = transpose(A[m,:]) * x
    end
    return y
end;

With @views

function mul_row_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = similar(x, M)
    for m in 1:M
        @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;

With both

function mul_row_inbounds_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    @assert N == length(x)
    y = similar(x, M)
    for m in 1:M
        @inbounds @views y[m] = transpose(A[m,:]) * x
    end
    return y
end;

Col versions

Loop over n, accumulating x[n] times the n-th column of A into y:

function mul_col(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n in 1:N
        y += A[:,n] * x[n]
    end
    return y
end;
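
The line y += A[:,n] * x[n] is shorthand for y = y + A[:,n] * x[n], so every iteration allocates a column copy, a scaled copy, and a brand-new y. A small illustration of the rebinding:

y = zeros(3); v = ones(3)
y += v # equivalent to y = y + v: allocates a new array and rebinds y
y .+= v # in-place broadcast: mutates the existing array instead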

With broadcast via @. to fuse the multiply and the update of y in place (the column slice A[:,n] still allocates a copy; the @views version below removes that too):

function mul_col_dot(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n in 1:N
        @. y += A[:,n] * x[n]
    end
    return y
end;

And with @views

function mul_col_dot_views(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n in 1:N
        @views @. y += A[:,n] * x[n]
    end
    return y
end;
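
For comparison, LinearAlgebra provides axpy! for exactly this fused update (y ← a*x + y). A sketch of a column version built on it (mul_col_axpy is our name; it is not part of the benchmark list below):

using LinearAlgebra: axpy!
function mul_col_axpy(A::Matrix, x::Vector)
    (M,N) = size(A)
    y = zeros(eltype(x), M)
    for n in 1:N
        axpy!(x[n], view(A,:,n), y) # y .+= x[n] .* A[:,n], without the column copy
    end
    return y
end;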

Test and time each version

The results will depend on the computer used, of course.

M = 2^11
N = M - 4 # non-square to stress test
A = randn(Float32, M, N)
x = randn(Float32, N);

flist = (mul0,
    mul_mn, mul_mn_inbounds,
    mul_nm, mul_nm_inbounds, mul_nm_inbounds_simd, mul_nm_inbounds_simd_views,
    mul_row, mul_row_inbounds, mul_row_views, mul_row_inbounds_views,
    mul_col, mul_col_dot, mul_col_dot_views,
);

for f in flist # warm-up and test each version
    @assert A * x ≈ f(A, x)
end;

out = Vector{String}(undef, length(flist))
for (i, f) in enumerate(flist) # benchmark timing for each
    b = @benchmark $f($A,$x)
    tim = round(minimum(b.times)/10^6, digits=1) # convert ns to ms
    tim = lpad(tim, 4)
    name = rpad(f, 27)
    alloc = lpad(b.allocs, 5)
    mem = round(b.memory/2^10, digits=1) # bytes to KiB
    tmp = "$name : $tim ms $alloc alloc $mem KiB"
    out[i] = tmp
    isinteractive() && println(tmp)
end
out
14-element Vector{String}:
 "mul0                        :  0.1 ms     3 alloc 8.1 KiB"
 "mul_mn                      : 19.5 ms     3 alloc 8.1 KiB"
 "mul_mn_inbounds             : 31.4 ms     3 alloc 8.1 KiB"
 "mul_nm                      :  0.4 ms     3 alloc 8.1 KiB"
 "mul_nm_inbounds             :  0.2 ms     3 alloc 8.1 KiB"
 "mul_nm_inbounds_simd        :  0.2 ms     3 alloc 8.1 KiB"
 "mul_nm_inbounds_simd_views  :  0.2 ms     3 alloc 8.1 KiB"
 "mul_row                     : 31.7 ms  6147 alloc 16520.1 KiB"
 "mul_row_inbounds            : 31.8 ms  6147 alloc 16520.1 KiB"
 "mul_row_views               : 24.7 ms     3 alloc 8.1 KiB"
 "mul_row_inbounds_views      : 29.5 ms     3 alloc 8.1 KiB"
 "mul_col                     :  5.1 ms 18399 alloc 49447.3 KiB"
 "mul_col_dot                 :  1.6 ms  6135 alloc 16487.8 KiB"
 "mul_col_dot_views           :  0.3 ms     3 alloc 8.1 KiB"

For comparison, the following results were from a 2017 iMac with a 4.2 GHz quad-core Intel Core i7, running macOS Mojave 10.14.6 and Julia 1.6.2.

As expected, the built-in A * x is fastest, but one can come quite close to it by using the proper (n,m) loop order with @inbounds, or by using broadcast "dots" together with @views to fuse operations. Without @views, the slice-based row and column versions have huge memory overhead!

[
"mul0                       :  0.9 ms     1 alloc    16.1 KiB"
"mul_mn                     : 22.5 ms     1 alloc    16.1 KiB"
"mul_mn_inbounds            : 22.0 ms     1 alloc    16.1 KiB"
"mul_nm                     :  3.1 ms     1 alloc    16.1 KiB"
"mul_nm_inbounds            :  1.5 ms     1 alloc    16.1 KiB"
"mul_nm_inbounds_simd       :  1.5 ms     1 alloc    16.1 KiB"
"mul_nm_inbounds_simd_views :  1.5 ms     1 alloc    16.1 KiB"
"mul_row                    : 32.8 ms  2049 alloc 33040.1 KiB"
"mul_row_inbounds           : 32.7 ms  2049 alloc 33040.1 KiB"
"mul_row_views              : 22.4 ms     1 alloc    16.1 KiB"
"mul_row_inbounds_views     : 22.4 ms     1 alloc    16.1 KiB"
"mul_col                    : 16.0 ms  6133 alloc 98894.6 KiB"
"mul_col_dot                :  7.0 ms  2045 alloc 32975.6 KiB"
"mul_col_dot_views          :  1.5 ms     1 alloc    16.1 KiB"
];

Reproducibility

This page was generated with the following version of Julia:

using InteractiveUtils: versioninfo
io = IOBuffer(); versioninfo(io); split(String(take!(io)), '\n')
11-element Vector{SubString{String}}:
 "Julia Version 1.11.1"
 "Commit 8f5b7ca12ad (2024-10-16 10:53 UTC)"
 "Build Info:"
 "  Official https://julialang.org/ release"
 "Platform Info:"
 "  OS: Linux (x86_64-linux-gnu)"
 "  CPU: 4 × AMD EPYC 7763 64-Core Processor"
 "  WORD_SIZE: 64"
 "  LLVM: libLLVM-16.0.6 (ORCJIT, znver3)"
 "Threads: 1 default, 0 interactive, 1 GC (on 4 virtual cores)"
 ""

And with the following package versions:

import Pkg; Pkg.status()
Status `~/work/book-la-demo/book-la-demo/docs/Project.toml`
  [6e4b80f9] BenchmarkTools v1.5.0
  [aaaa29a8] Clustering v0.15.7
  [35d6a980] ColorSchemes v3.27.1
⌅ [3da002f7] ColorTypes v0.11.5
⌃ [c3611d14] ColorVectorSpace v0.10.0
  [717857b8] DSP v0.7.10
  [72c85766] Demos v0.1.0 `~/work/book-la-demo/book-la-demo`
  [e30172f5] Documenter v1.7.0
  [4f61f5a4] FFTViews v0.3.2
  [7a1cc6ca] FFTW v1.8.0
  [587475ba] Flux v0.14.25
  [a09fc81d] ImageCore v0.10.4
  [71a99df6] ImagePhantoms v0.8.1
  [b964fa9f] LaTeXStrings v1.4.0
  [7031d0ef] LazyGrids v1.0.0
  [599c1a8e] LinearMapsAA v0.12.0
  [98b081ad] Literate v2.20.1
  [7035ae7a] MIRT v0.18.2
  [170b2178] MIRTjim v0.25.0
  [eb30cadb] MLDatasets v0.7.18
  [efe261a4] NFFT v0.13.5
  [6ef6ca0d] NMF v1.0.3
  [15e1cf62] NPZ v0.4.3
  [0b1bfda6] OneHotArrays v0.2.5
  [429524aa] Optim v1.10.0
  [91a5bcdd] Plots v1.40.9
  [f27b6e38] Polynomials v4.0.11
  [2913bbd2] StatsBase v0.34.3
  [d6d074c3] VideoIO v1.1.0
  [b77e0a4c] InteractiveUtils v1.11.0
  [37e2e46d] LinearAlgebra v1.11.0
  [44cfe95a] Pkg v1.11.0
  [9a3f8284] Random v1.11.0
Info Packages marked with ⌃ and ⌅ have new versions available. Those with ⌃ may be upgradable, but those with ⌅ are restricted by compatibility constraints from upgrading. To see why use `status --outdated`

This page was generated using Literate.jl.